Description
Description of the bug
Hi,
I am using the PyMuPDF library to translate PDFs while keeping the overall structure intact. This has been working successfully, but while trying to translate an RTL language I got this error from the insert_htmlbox function when processing the PDF:
int too large to convert to float
The bbox for which this occurred is:
[393.83990478515625, 245.69000244140625, 393.83990478515625, 256.7300109863281]
I even tried rounding the values to 2 decimal places, but still got the same issue. This bbox value is obtained using this code snippet:
pages_blocks = []
for page_num in range(len(doc)):
page = doc.load_page(page_num)
blocks = page.get_text("dict")["blocks"]
page_blocks = []
for block in blocks:
if "lines" in block:
for line in block["lines"]:
for span in line["spans"]:
if span["text"].strip():
bbox = span["bbox"]
if isinstance(bbox, (list, tuple)) and len(bbox) == 4:
bbox = [float(coord) for coord in bbox]
if span["text"]:
page_blocks.append({
"text": span["text"],
"bbox": bbox,
})
pages_blocks.append(page_blocks)
return pages_blocks
Note that this bbox value works:
[393.83990478515625, 245.69000244140625, 400.83990478515625, 256.7300109863281]
The only difference is the whole part of the third value (in the failing bbox the third value is equal to the first one), so I am unsure what exactly the constraints are here.
How to reproduce the bug
Code to reproduce output:
import fitz
class TranslatePdfClient:
    """Reproduction client that re-inserts each text span of a PDF as an HTML box.

    The "translation" is a no-op (the extracted text is written back unchanged);
    the class exists to reproduce a failure raised while calling
    ``page.insert_htmlbox`` for certain span bounding boxes.
    """

    def extract_text_blocks(self, doc):
        """Collect the non-blank text spans of every page in *doc*.

        Args:
            doc: An opened document supporting ``len(doc)`` and
                ``doc.load_page(n)``; each page must provide
                ``get_text("dict")`` returning a dict with a ``"blocks"`` list.

        Returns:
            A list with one entry per page. Each entry is a list of dicts with
            the keys ``"text"``, ``"bbox"`` (converted to a list of floats when
            it is a 4-item list/tuple, otherwise passed through unchanged) and
            ``"font_properties"`` (``"size"``, ``"color"`` as ``#rrggbb``,
            ``"font-family"``).
        """
        pages_blocks = []
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            page_blocks = []
            for block in page.get_text("dict")["blocks"]:
                # Blocks without a "lines" entry have no spans to collect.
                if "lines" not in block:
                    continue
                for line in block["lines"]:
                    for span in line["spans"]:
                        text = span["text"]
                        # Skip empty / whitespace-only spans.
                        if not text.strip():
                            continue
                        bbox = span["bbox"]
                        if isinstance(bbox, (list, tuple)) and len(bbox) == 4:
                            bbox = [float(coord) for coord in bbox]
                        page_blocks.append({
                            "text": text,
                            "bbox": bbox,
                            "font_properties": {
                                "size": span["size"],
                                "color": "#%06x" % span["color"],
                                "font-family": "%s" % span["font"],
                            },
                        })
            pages_blocks.append(page_blocks)
        return pages_blocks

    @staticmethod
    def _build_html(text, font_properties):
        """Return the HTML snippet used to re-insert *text* with its font styling."""
        # Local import keeps the block self-contained without touching the
        # file-level imports.
        from html import escape

        # The font is rendered 2 units smaller than the extracted size.
        adjusted_size = "%g" % (font_properties["size"] - 2)
        style = (
            f'font-size:{adjusted_size}px; '
            f'font-family:{font_properties["font-family"]}; '
            f'color:{font_properties["color"]}; overflow:visible;'
        )
        # The extracted text is plain text, not markup: escape it so that
        # characters such as "<" or "&" are not interpreted as HTML.
        return f'\n<div style="{style}">\n{escape(text, quote=False)}\n</div>\n'

    def _reinsert_block(self, page, block, scale):
        """Redact one extracted span on *page* and write its text back as HTML."""
        text = block["text"]
        bbox = block["bbox"]
        if not (isinstance(bbox, (list, tuple)) and len(bbox) == 4):
            print(f"Invalid bbox: {bbox}")
            return
        try:
            translated_text = text  # hardcoding for bug reproduction only
            if not any(c.isalpha() for c in text):
                print(f'No need to translate non-alpha text: {text}')
                return
            # Sample bboxes from the report (in the crashing one the first and
            # third coordinates are equal, i.e. the rectangle has zero width):
            # bbox = [388.3199157714844, 250.489990234375, 400.7757873535156, 261.5299987792969]  # works
            # bbox = [393.83990478515625, 245.69000244140625, 393.83990478515625, 256.7300109863281]  # crashes
            # bbox = [393.83990478515625, 245.69000244140625, 400.83990478515625, 256.7300109863281]  # works
            rect = fitz.Rect(bbox)
            # Redact the original span area before writing the replacement.
            # This must happen per span: applying redactions later would also
            # hit the freshly inserted text.
            page.add_redact_annot(rect, text="")
            page.apply_redactions(images=0, graphics=0, text=0)
            html_snippet = self._build_html(translated_text, block["font_properties"])
            page.insert_htmlbox(rect, html_snippet, scale_low=scale)
        except Exception as e:
            print(f"Error processing block: '{text}' with bbox: {bbox}")
            print(e)
            raise

    def translate(self, source_language, target_language):
        """Rewrite every text span of ``test.pdf`` and save it as ``output.pdf``.

        Args:
            source_language: Not used by this reproduction.
            target_language: Not used by this reproduction.

        Any exception is reported on stdout (together with the MuPDF warnings)
        and then swallowed; nothing is returned.
        """
        scale = 0.5
        try:
            # The context manager closes the document even when processing a
            # block fails; previously close() was only reached on success.
            with fitz.open('test.pdf') as doc:
                pages_blocks = self.extract_text_blocks(doc)
                print("doc", doc)
                for page_num, page_blocks in enumerate(pages_blocks):
                    page = doc.load_page(page_num)
                    for block in page_blocks:
                        self._reinsert_block(page, block, scale)
                doc.save('output.pdf', garbage=4, clean=True, deflate=True, deflate_images=True, deflate_fonts=True)
        except Exception as e:
            print(fitz.TOOLS.mupdf_warnings())
            print(f"Error processing block: {str(e)}")
            print(e)
# Script entry point: run the reproduction only when executed directly.
if __name__ == "__main__":
    TranslatePdfClient().translate('arabic', 'english')
test file:
When opening the file directly, I get this error:
Error processing block: 'ر' with bbox: [393.83990478515625, 245.69000244140625, 393.83990478515625, 256.7300109863281]
'NoneType' object has no attribute 'y1'
Error processing block: 'NoneType' object has no attribute 'y1'
'NoneType' object has no attribute 'y1'
But when I use a file stream, as I do in production, I get:
Error processing block: 'ر' with bbox: [393.83990478515625, 245.69000244140625, 393.83990478515625, 256.7300109863281]
int too large to convert to float
file: test.pdf
PyMuPDF version
1.24.11
Operating system
Linux
Python version
3.10