Some fixes

JorjMcKie · JorjMcKie · commit 9df602b98a34 · 2024-08-22T12:42:16.000-04:00
* Extend the list of known bullet point Unicodes
* Fix typo for detecting a "quad" drawing
diff --git a/docs/src/changes.rst b/docs/src/changes.rst
@@ -4,6 +4,21 @@
 Change Log
 ===========================================================================
 
+Changes in version 0.0.11
+--------------------------
+
+Fixes:
+~~~~~~~
+
+* `90 <https://github.com/pymupdf/RAG/issues/90>`_ "'Quad' object has no attribute 'tl'"
+* `88 <https://github.com/pymupdf/RAG/issues/88>`_ "Bug in is_significant function"
+
+
+Improvements:
+~~~~~~~~~~~~~~
+* Extended the list of known bullet point characters.
+
+
 Changes in version 0.0.10
 --------------------------
 
diff --git a/pymupdf4llm/README.md b/pymupdf4llm/README.md
@@ -33,15 +33,15 @@ pathlib.Path("output.md").write_bytes(md_text.encode())
 
 Instead of the filename string as above, one can also provide a PyMuPDF `Document`. By default, all pages in the PDF will be processed. If desired, the parameter `pages=[...]` can be used to provide a list of zero-based page numbers to consider.
 
-**New features as of v0.0.2:**
+**Feature Overview:**
 
 * Support for pages with **_multiple text columns_**.
 * Support for **_image and vector graphics extraction_**:
 
     1. Specify `pymupdf4llm.to_markdown("input.pdf", write_images=True)`. Default is `False`.
-    2. Each image or vector graphic on the page will be extracted and stored as a PNG image named `"input.pdf-pno-index.png"` in the folder of `"input.pdf"`. Where `pno` is the 0-based page number and `index` is some sequence number.
-    3. The image files will have width and height equal to the values on the page.
-    4. Any text contained in the images or graphics will not be extracted, but become visible as image parts.
+    2. Each image or vector graphic on the page will be extracted and stored as an image named `"input.pdf-pno-index.extension"` in a folder of your choice. The image `extension` can be chosen to represent a PyMuPDF-supported image format (for instance "png" or "jpg"), `pno` is the 0-based page number, and `index` is some sequence number.
+    3. The image files will have width and height equal to the values on the page. The desired resolution can be chosen via parameter `dpi` (default: `dpi=150`).
+    4. Any text contained in the images or graphics will be extracted and **also become visible as part of the generated image**. This behavior can be changed via `force_text=False` (text only appears as part of the image).
 
 * Support for **page chunks**: Instead of returning one large string for the whole document, a list of dictionaries can be generated: one for each page. Specify `data = pymupdf4llm.to_markdown("input.pdf", page_chunks=True)`. Then, for instance the first item, `data[0]` will contain a dictionary for the first page with the text and some metadata.
 
diff --git a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py
@@ -69,7 +69,9 @@ def sanitize_spans(line):
         Returns:
             A list of sorted, and potentially cleaned-up spans
         """
-        line.sort(key=lambda s: s["bbox"].x0)  # sort left to right
+        # sort ascending horizontally
+        line.sort(key=lambda s: s["bbox"].x0)
+        # join spans, delete duplicates
         for i in range(len(line) - 1, 0, -1):  # iterate back to front
             s0 = line[i - 1]
             s1 = line[i]
@@ -78,13 +80,17 @@ def sanitize_spans(line):
             delta = s1["size"] * 0.1
             if s0["bbox"].x1 + delta < s1["bbox"].x0:
                 continue  # all good: no joining neded
+
+            # We need to join bbox and text of two consecutive spans
+            # On occasion, spans may also be duplicated.
+            if s0["text"] != s1["text"] or s0["bbox"] != s1["bbox"]:
+                s0["text"] += s1["text"]
             s0["bbox"] |= s1["bbox"]  # join boundary boxes
-            s0["text"] += s1["text"]  # join the text
             del line[i]  # delete the joined-in span
             line[i - 1] = s0  # update the span
         return line
 
-    if clip is None:  # use TextPage if not provided
+    if clip is None:  # use TextPage rect if not provided
         clip = textpage.rect
     # extract text blocks - if bbox is not empty
     blocks = [
@@ -126,10 +132,7 @@ def sanitize_spans(line):
         sbbox = s["bbox"]  # this bbox
         sbbox0 = line[-1]["bbox"]  # previous bbox
         # if any of top or bottom coordinates are close enough, join...
-        if (
-            abs(sbbox.y1 - sbbox0.y1) <= y_delta
-            or abs(sbbox.y0 - sbbox0.y0) <= y_delta
-        ):
+        if abs(sbbox.y1 - sbbox0.y1) <= y_delta or abs(sbbox.y0 - sbbox0.y0) <= y_delta:
             line.append(s)  # append to this line
             lrect |= sbbox  # extend line rectangle
             continue
@@ -150,9 +153,7 @@ def sanitize_spans(line):
     return nlines
 
 
-def get_text_lines(
-    page, *, textpage=None, clip=None, sep="\t", tolerance=3, ocr=False
-):
+def get_text_lines(page, *, textpage=None, clip=None, sep="\t", tolerance=3, ocr=False):
     """Extract text by line keeping natural reading sequence.
 
     Notes:
diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
@@ -40,15 +40,15 @@
 if fitz.pymupdf_version_tuple < (1, 24, 2):
     raise NotImplementedError("PyMuPDF version 1.24.2 or later is needed.")
 
-bullet = (
+bullet = [
     "- ",
     "* ",
     chr(0xF0A7),
     chr(0xF0B7),
     chr(0xB7),
     chr(8226),
-    chr(9679),
-)
+] + list(map(chr, range(9642, 9680)))
+
 GRAPHICS_TEXT = "\n![](%s)\n"
 
 
@@ -193,7 +193,7 @@ def is_significant(box, paths):
         for itm in p["items"]:
             if itm[0] in ("l", "c"):  # line or curve
                 points.extend(itm[1:])  # append all the points
-            elif itm[0] == "q":  # quad
+            elif itm[0] == "qu":  # quad
                 q = itm[1]
                 # follow corners anti-clockwise
                 points.extend([q.ul, q.ll, q.lr, q.ur, q.ul])