From d1c6e30126c43f5e078be26bba666c2cf6919ddd Mon Sep 17 00:00:00 2001 From: Jorj McKie Date: Sat, 7 Aug 2021 12:55:50 -0400 Subject: [PATCH] Upload v1.18.16 --- PKG-INFO | 6 +- README.md | 22 +-- changes.rst | 23 ++- fitz/__main__.py | 370 +++++++++++++++++++++++++++++++++++++++++- fitz/fitz.i | 58 +++++-- fitz/helper-devices.i | 24 ++- fitz/helper-fields.i | 8 +- fitz/helper-stext.i | 6 + fitz/helper-xobject.i | 2 +- fitz/utils.py | 20 ++- fitz/version.i | 6 +- setup.py | 2 +- 12 files changed, 493 insertions(+), 54 deletions(-) diff --git a/PKG-INFO b/PKG-INFO index 344ec917c..e46ae84e2 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 1.1 Name: PyMuPDF -Version: 1.18.15 +Version: 1.18.16 Author: Jorj McKie Author-email: jorj.x.mckie@outlook.de License: GNU AFFERO GPL 3.0 @@ -10,7 +10,7 @@ Home-page: https://github.com/pymupdf/PyMuPDF Download-url: https://github.com/pymupdf/PyMuPDF Summary: PyMuPDF is a Python binding for the document renderer and toolkit MuPDF Description: - Release date: July 10, 2021 + Release date: August 8, 2021 Authors ======= @@ -21,7 +21,7 @@ Description: Introduction ============ - PyMuPDF (current version 1.18.15) is a Python binding with support for `MuPDF `_ (current version 1.18.*), a lightweight PDF, XPS, and E-book viewer, renderer and toolkit, which is maintained and developed by Artifex Software, Inc. + PyMuPDF (current version 1.18.16) is a Python binding with support for `MuPDF `_ (current version 1.18.*), a lightweight PDF, XPS, and E-book viewer, renderer and toolkit, which is maintained and developed by Artifex Software, Inc. MuPDF can access files in PDF, XPS, OpenXPS, CBZ, EPUB and FB2 (e-books) formats, and it is known for its top performance and high rendering quality. 
diff --git a/README.md b/README.md index f392549b8..2de975b0f 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,17 @@ -# PyMuPDF 1.18.15 +# PyMuPDF 1.18.16 ![logo](https://github.com/pymupdf/PyMuPDF/blob/master/demo/pymupdf.jpg) -Release date: July 10, 2021 +Release date: August 8, 2021 On **[PyPI](https://pypi.org/project/PyMuPDF)** since August 2016: [![Downloads](https://static.pepy.tech/personalized-badge/pymupdf?period=total&units=international_system&left_color=black&right_color=orange&left_text=Downloads)](https://pepy.tech/project/pymupdf) - -# Authors -* [Jorj X. McKie](mailto:jorj.x.mckie@outlook.de) -* [Ruikai Liu](mailto:lrk700@gmail.com) + +# Author +[Jorj X. McKie](mailto:jorj.x.mckie@outlook.de), based on original code by [Ruikai Liu](mailto:lrk700@gmail.com). # Introduction -PyMuPDF (current version 1.18.15) is a Python binding with support for [MuPDF](https://mupdf.com/) (current version 1.18.*), a lightweight PDF, XPS, and E-book viewer, renderer, and toolkit, which is maintained and developed by Artifex Software, Inc. +PyMuPDF (current version 1.18.16) is a Python binding with support for [MuPDF](https://mupdf.com/) (current version 1.18.*), a lightweight PDF, XPS, and E-book viewer, renderer, and toolkit, which is maintained and developed by Artifex Software, Inc. MuPDF can access files in PDF, XPS, OpenXPS, CBZ, EPUB and FB2 (e-books) formats, and it is known for its top performance and high rendering quality. @@ -36,19 +35,20 @@ For **PDF documents,** there exists a plethora of additional features: they can * Images and fonts can be extracted or inserted. > You may want to have a look at [this](https://github.com/pymupdf/PyMuPDF-Utilities/blob/master/examples/image-maintenance.py) cool GUI example script, which lets you **_insert, delete, replace_** or **_re-position_** images under your visual control. 
- > Since v1.18.8 there is a new experimental `Document` method `subset_fonts()`, which automatically builds subsets based on the usage of all eligible fonts in the document. Especially for new documents, this can lead to significant file size reductions. The method was developed in cooperation with our user @cuteufo - again thanks a lot for the contribution. + > Since v1.18.8 there is a `Document` method `subset_fonts()`, which automatically builds subsets based on the usage of all eligible fonts in the document. Especially for new documents, this can lead to significant file size reductions. The method was developed in cooperation with our user @cuteufo - again thanks a lot for the contribution. * Embedded files are fully supported. * PDFs can be reformatted to support double-sided printing, posterizing, applying logos or watermarks * Password protection is fully supported: decryption, encryption, encryption method selection, permmission level and user / owner password setting. * Support of the **PDF Optional Content** concept for images, text and drawings. * Low-level PDF structures can be accessed and modified. -* PyMuPDF can also be used as a **module in the command line** using ``"python -m fitz ..."``. This is a versatile utility, which we will further develop going forward. It currently supports PDF document +* **Command line module** ``"python -m fitz ..."``. A versatile utility with the following features - **encryption / decryption / optimization** - - creating **sub-documents** + - creation of **sub-documents** - document **joining** - **image / font extraction** - - full support of **embedded files**. 
+ - full support of **embedded files** + - **_layout-preserving text extraction_** (all documents) Have a look at the basic [demos](https://github.com/pymupdf/PyMuPDF-Utilities/tree/master/demo), the [examples](https://github.com/pymupdf/PyMuPDF-Utilities/tree/master/examples) (which contain complete, working programs), and the **recipes** section of our [Wiki](https://github.com/pymupdf/PyMuPDF/wiki) sidebar, which contains more than a dozen of guides in How-To-style. diff --git a/changes.rst b/changes.rst index fac70d931..a71560c4e 100644 --- a/changes.rst +++ b/changes.rst @@ -1,19 +1,36 @@ Change Logs =============== +Changes in Version 1.18.16 +--------------------------- +* **Fixed** issue `#1184 `_. Existing widget fonts in a PDF are now accepted (i.e. not forcibly changed to a Base-14 font). + +* **Fixed** issue `#1154 `_. Text search hits should now be correct when ``clip`` is specified. + +* **Fixed** issue `#1152 `_. + +* **Fixed** issue `#1146 `_. + +* **Added** :attr:`Link.flags` and :meth:`Link.set_flags` to the :ref:`Link` class. Implements enhancement request `#1187 `_. + +* **Added** option to *simulate* :meth:`TextWriter.fill_textbox` output for predicting the number of lines that a given text would occupy in the textbox. + +* **Added** text output support as subcommand ``gettext`` to the ``fitz`` CLI module. Most importantly, original **physical text layout** reproduction is now supported. + + Changes in Version 1.18.15 --------------------------- * **Fixed** issue `#1088 `_. Removing an annotation's fill color should now work again both ways, using the ``fill_color=[]`` argument in :meth:`Annot.update` as well as ``fill=[]`` in :meth:`Annot.set_colors`. * **Fixed** issue `#1081 `_. :meth:`Document.subset_fonts`: fixed an error which created wrong character widths for some fonts. -* **Fixed** issue `#1078 `_. 
:meth:`Page.get_text` and other methods related to text extraction: changed the default value of the :ref:`TextPage` ``flags`` parameter. All whitespace and ligatures are now preserved. +* **Fixed** issue `#1078 `_. :meth:`Page.get_text` and other methods related to text extraction: changed the default value of the :ref:`TextPage` ``flags`` parameter. All whitespace and :data:`ligatures` are now preserved. * **Fixed** issue `#1085 `_. The old *snake_cased* alias of ``fitz.detTextlength`` is now defined correctly. -* **Changed** :meth:`Document.subset_fonts` will now prefix fonts that were successfully subsetted with an appropriate six letter uppercase tag as prescribed by the PDF specification. +* **Changed** :meth:`Document.subset_fonts` will now correctly prefix font subsets with an appropriate six letter uppercase tag, complying with the PDF specification. -* **Added** new method :meth:`Widget.button_states` which returns the possible values that a button-type field can have when being set "on" or "off". +* **Added** new method :meth:`Widget.button_states` which returns the possible values that a button-type field can have when being set to "on" or "off". * **Added** support of text with **Small Capital** letters to the :ref:`Font` and :ref:`TextWriter` classes. This is reflected by an additional bool parameter ``small_caps`` in various of their methods. diff --git a/fitz/__main__.py b/fitz/__main__.py index b76df875b..d28d10e93 100644 --- a/fitz/__main__.py +++ b/fitz/__main__.py @@ -1,15 +1,21 @@ # ----------------------------------------------------------------------------- # Copyright 2020-2021, Harald Lieder, mailto:harald.lieder@outlook.com # License: GNU AFFERO GPL 3.0, https://www.gnu.org/licenses/agpl-3.0.html - # Part of "PyMuPDF", a Python binding for "MuPDF" (http://mupdf.com), a # lightweight PDF, XPS, and E-book viewer, renderer and toolkit which is # maintained and developed by Artifex Software, Inc. https://artifex.com. 
# ----------------------------------------------------------------------------- +import argparse +import bisect import os import sys import fitz +from fitz.fitz import ( + TEXT_INHIBIT_SPACES, + TEXT_PRESERVE_LIGATURES, + TEXT_PRESERVE_WHITESPACE, +) mycenter = lambda x: (" %s " % x).center(75, "-") @@ -540,12 +546,299 @@ def extract_objects(args): doc.close() +def page_simple(page, textout, GRID, fontsize, noformfeed, skip_empty, flags): + eop = b"\n" if noformfeed else bytes([12]) + text = page.get_text("text", flags=flags) + if not text: + if not skip_empty: + textout.write(eop) # write formfeed + return + textout.write(text.encode("utf8")) + textout.write(eop) + return + + +def page_blocksort(page, textout, GRID, fontsize, noformfeed, skip_empty, flags): + eop = b"\n" if noformfeed else bytes([12]) + blocks = page.get_text("blocks", flags=flags) + if blocks == []: + if not skip_empty: + textout.write(eop) # write formfeed + return + blocks.sort(key=lambda b: (b[3], b[0])) + for b in blocks: + textout.write(b[4].encode("utf8")) + textout.write(eop) + return + + +def page_layout(page, textout, GRID, fontsize, noformfeed, skip_empty, flags): + left = page.rect.width # left most used coordinate + right = 0 # rightmost coordinate + rowheight = page.rect.height # smallest row height in use + chars = [] # all chars here + rows = set() # bottom coordinates of lines + eop = b"\n" if noformfeed else bytes([12]) + + # -------------------------------------------------------------------- + def find_line_index(values: list[int], value: int) -> int: + """Find the right row coordinate. + + Args: + values: (list) y-coordinates of rows. + value: (int) lookup for this value (y-origin of char). + Returns: + y-ccordinate of appropriate line for value. 
+ """ + i = bisect.bisect_right(values, value) + if i: + return values[i - 1] + raise RuntimeError("Line for %g not found in %s" % (value, values)) + + # -------------------------------------------------------------------- + def curate_rows(rows, GRID): + rows = list(rows) + rows.sort() # sort ascending + nrows = [rows[0]] + for h in rows[1:]: + if h >= nrows[-1] + GRID: # only keep significant differences + nrows.append(h) + return nrows # curated list of line bottom coordinates + + def process_blocks(blocks, rows, chars, rowheight, left, right, page): + for block in blocks: + for line in block["lines"]: + if line["dir"] != (1, 0): # ignore non-horizontal text + continue + x0, y0, x1, y1 = line["bbox"] + if y1 < 0 or y0 > page.rect.height: # ignore if outside CropBox + continue + # upd row height + height = y1 - y0 + + if rowheight > height: + rowheight = height + for span in line["spans"]: + if span["size"] <= fontsize: + continue + for c in span["chars"]: + x0, _, x1, _ = c["bbox"] + cwidth = x1 - x0 + ox, oy = c["origin"] + oy = int(round(oy)) + rows.add(oy) + ch = c["c"] + if left > ox and ch != " ": + left = ox # update left coordinate + if right < x1: + right = x1 # update right coordinate + # handle ligatures: + if cwidth == 0 and chars != []: # potential ligature + old_ch, old_ox, old_oy, old_cwidth = chars[-1] + if old_oy == oy: # ligature + if old_ch != chr(0xFB00): # previous "ff" char lig? + lig = joinligature(old_ch + ch) # no + # convert to one of the 3-char ligatures: + elif ch == "i": + lig = chr(0xFB03) # "ffi" + elif ch == "l": + lig = chr(0xFB04) # "ffl" + else: # something wrong, leave old char in place + lig = old_ch + chars[-1] = (lig, old_ox, old_oy, old_cwidth) + continue + chars.append((ch, ox, oy, cwidth)) # all chars on page + return left, right + + def joinligature(lig): + """Return ligature character for a given pair / triple of characters. + + Args: + lig: (str) 2/3 characters, e.g. "ff" + Returns: + Ligature, e.g. 
"ff" -> chr(0xFB00) + """ + + if lig == "ff": + return chr(0xFB00) + elif lig == "fi": + return chr(0xFB01) + elif lig == "fl": + return chr(0xFB02) + elif lig == "ffi": + return chr(0xFB03) + elif lig == "ffl": + return chr(0xFB04) + elif lig == "ft": + return chr(0xFB05) + elif lig == "st": + return chr(0xFB06) + return lig + + # -------------------------------------------------------------------- + def make_textline(left, slot, minslot, lchars): + """Produce the text of one output line. + + Args: + left: (float) left most coordinate used on page + slot: (float) avg width of one character in any font in use. + minslot: (float) min width for the characters in this line. + chars: (list[tuple]) characters of this line. + Returns: + text: (str) text string for this line + """ + text = "" # we output this + old_x1 = 0 # end coordinate of last char + old_ox = 0 # x-origin of last char + if minslot <= fitz.EPSILON: + raise RuntimeError("program error: minslot too small = %g" % minslot) + + for c in lchars: # loop over characters + char, ox, _, cwidth = c + ox = ox - left # its (relative) start coordinate + x1 = ox + cwidth # ending coordinate + + # eliminate overprint effect + if ( + old_ox <= ox < old_x1 + and char == text[-1] + and abs(ox - old_ox) <= cwidth * 0.1 + ): + continue + + # omit spaces overlapping previous char + if char == " " and (old_x1 - ox) / cwidth > 0.8: + continue + + # close enough to previous? 
+ if ox < old_x1 + minslot: # assume char adjacent to previous + text += char # append to output + old_x1 = x1 # new end coord + old_ox = ox # new origin.x + continue + + # else next char starts after some gap: + # fill in right number of spaces, so char is positioned + # in the right slot of the line + if char == " ": # rest relevant for non-space only + continue + delta = int(ox / slot) - len(text) + if ox > old_x1 and delta > 1: + text += " " * delta + # now append char + text += char + old_x1 = x1 # new end coordinate + old_ox = ox # new origin + return text.rstrip() + + # extract page text by single characters ("rawdict") + blocks = page.get_text("rawdict", flags=flags)["blocks"] + + if blocks == []: + if not skip_empty: + textout.write(eop) # write formfeed + return + left, right = process_blocks(blocks, rows, chars, rowheight, left, right, page) + + # compute list of line coordinates - ignoring small (GRID) differences + rows = curate_rows(rows, GRID) + + # sort all chars by x-coordinates, so every line will receive char info + # sorted from left to right. + chars.sort(key=lambda c: c[1]) + + # populate the lines with their char info + lines = {} # key: y1-ccordinate, value: char list + for c in chars: + _, _, oy, _ = c + y = find_line_index(rows, oy) # y-coord of the right line + lchars = lines.get(y, []) # read line chars so far + lchars.append(c) # append this char + lines[y] = lchars # write back to line + + # ensure line coordinates are ascending + keys = list(lines.keys()) + keys.sort() + + # ------------------------------------------------------------------------- + # Compute "char resolution" for the page: the char width corresponding to + # 1 text char position on output - call it 'slot'. + # For each line, compute median of its char widths. The minimum across all + # lines is 'slot'. + # The minimum char width of each line is used to determine if spaces must + # be inserted in between two characters. 
+ # ------------------------------------------------------------------------- + slot = right - left + minslots = {} + for k in keys: + lchars = lines[k] + ccount = len(lchars) + if ccount < 2: + minslots[k] = 1 + continue + widths = [c[3] for c in lchars] + widths.sort() + i = int(ccount / 2 + 0.5) # index of median value + this_slot = widths[i] # take median value + if this_slot < slot: + slot = this_slot + minslots[k] = min(widths) + + # compute line advance in text output + rowheight = rowheight * (rows[-1] - rows[0]) / (rowheight * len(rows)) * 1.2 + rowpos = rows[0] # first line positioned here + textout.write(b"\n") + for k in keys: # walk through the lines + while rowpos < k: # honor distance between lines + textout.write(b"\n") + rowpos += rowheight + text = make_textline(left, slot, minslots[k], lines[k]) + textout.write((text + "\n").encode("utf8")) + rowpos = k + rowheight + + textout.write(eop) # write formfeed + + +def gettext(args): + doc = open_file(args.input, args.password, pdf=False) + pagel = get_list(args.pages, doc.page_count + 1) + output = args.output + if output == None: + filename, _ = os.path.splitext(doc.name) + output = filename + ".txt" + textout = open(output, "wb") + flags = TEXT_PRESERVE_LIGATURES | TEXT_PRESERVE_WHITESPACE + if args.convert_white: + flags ^= TEXT_PRESERVE_WHITESPACE + if args.noligatures: + flags ^= TEXT_PRESERVE_LIGATURES + if args.extra_spaces: + flags ^= TEXT_INHIBIT_SPACES + func = { + "simple": page_simple, + "blocks": page_blocksort, + "layout": page_layout, + } + for pno in pagel: + page = doc[pno - 1] + func[args.mode]( + page, + textout, + args.grid, + args.fontsize, + args.noformfeed, + args.skip_empty, + flags=flags, + ) + + textout.close() + + def main(): """Define command configurations.""" - import argparse - parser = argparse.ArgumentParser( - description=mycenter("Basic PyMuPDF Functions"), prog="fitz" + prog="fitz", + description=mycenter("Basic PyMuPDF Functions"), ) subps = parser.add_subparsers( 
title="Subcommands", help="Enter 'command -h' for subcommand specific help" @@ -760,6 +1053,75 @@ def main(): ) ps_embed_copy.set_defaults(func=embedded_copy) + # ------------------------------------------------------------------------- + # 'textlayout' command + # ------------------------------------------------------------------------- + ps_gettext = subps.add_parser( + "gettext", description=mycenter("extract text in various formatting modes") + ) + ps_gettext.add_argument("input", type=str, help="input document filename") + ps_gettext.add_argument("-password", help="password for input document") + ps_gettext.add_argument( + "-mode", + type=str, + help="mode: simple, block sort, or layout (default)", + choices=("simple", "blocks", "layout"), + default="layout", + ) + ps_gettext.add_argument( + "-pages", + type=str, + help="select pages, format: 1,5-7,50-N", + default="1-N", + ) + ps_gettext.add_argument( + "-noligatures", + action="store_true", + help="expand ligature characters (default False)", + default=False, + ) + ps_gettext.add_argument( + "-convert-white", + action="store_true", + help="convert whitespace characters to white (default False)", + default=False, + ) + ps_gettext.add_argument( + "-extra-spaces", + action="store_true", + help="fill gaps with spaces (default False)", + default=False, + ) + ps_gettext.add_argument( + "-noformfeed", + action="store_true", + help="write linefeeds, no formfeeds (default False)", + default=False, + ) + ps_gettext.add_argument( + "-skip-empty", + action="store_true", + help="suppress pages with no text (default False)", + default=False, + ) + ps_gettext.add_argument( + "-output", + help="store text in this file (default inputfilename.txt)", + ) + ps_gettext.add_argument( + "-grid", + type=float, + help="merge lines if closer than this (default 2)", + default=2, + ) + ps_gettext.add_argument( + "-fontsize", + type=float, + help="only include text with a larger fontsize (default 3)", + default=3, + ) + 
ps_gettext.set_defaults(func=gettext) + # ------------------------------------------------------------------------- # start program # ------------------------------------------------------------------------- diff --git a/fitz/fitz.i b/fitz/fitz.i index 023afb04f..6c031b0bc 100644 --- a/fitz/fitz.i +++ b/fitz/fitz.i @@ -5603,6 +5603,11 @@ def get_oc_items(self) -> list: } return rc; } + %pythoncode %{ + def _get_texttrace(self): + """Return low-level text information of the page.""" + return self._getTexttrace() + %} //---------------------------------------------------------------- @@ -5970,21 +5975,23 @@ except: txtpy = PySequence_ITEM(linklist, (Py_ssize_t) i); text = JM_StrAsChar(txtpy); Py_CLEAR(txtpy); - if (!text) THROWMSG(gctx, "bad linklist item"); - annot = pdf_add_object_drop(gctx, page->doc, - JM_pdf_obj_from_str(gctx, page->doc, text)); - ind_obj = pdf_new_indirect(gctx, page->doc, pdf_to_num(gctx, annot), 0); - pdf_array_push_drop(gctx, annots, ind_obj); - pdf_drop_obj(gctx, annot); + if (!text) { + PySys_WriteStderr("skipping bad link / annot item %i.\n", i); + continue; + } + fz_try(gctx) { + annot = pdf_add_object_drop(gctx, page->doc, + JM_pdf_obj_from_str(gctx, page->doc, text)); + ind_obj = pdf_new_indirect(gctx, page->doc, pdf_to_num(gctx, annot), 0); + pdf_array_push_drop(gctx, annots, ind_obj); + pdf_drop_obj(gctx, annot); + } + fz_catch(gctx) { + PySys_WriteStderr("skipping bad link / annot item %i.\n", i); + } } } fz_catch(gctx) { - if (text) { - PySys_WriteStderr("%s (%i): '%s'\n", fz_caught_message(gctx), i, text); - } - else if (i >= 0) { - PySys_WriteStderr("%s (%i)\n", fz_caught_message(gctx), i); - } PyErr_Clear(); return NULL; } @@ -8983,8 +8990,8 @@ struct Annot s = "[%g %g %g %g]" % tuple(stroke) doc.xref_set_key(self.xref, "C", s) - if self.type[0] not in fill_annots: - print("warning: annot type '%s' has no fill color" % self.type[0]) + if fill and self.type[0] not in fill_annots: + print("Warning: fill color ignored for annot 
type '%s'." % self.type[1]) return if fill in ([], ()): doc.xref_set_key(self.xref, "IC", "[]") @@ -9690,6 +9697,27 @@ struct Link def border(self): return self._border(self.parent.parent.this, self.xref) + @property + def flags(self)->int: + CheckParent(self) + doc = self.parent.parent + if not doc.is_pdf: + return 0 + f = doc.xref_get_key(self.xref, "F") + if f[1] != "null": + return int(f[1]) + return 0 + + def set_flags(self, flags): + CheckParent(self) + doc = self.parent.parent + if not doc.is_pdf: + raise ValueError("not a PDF") + if not type(flags) is int: + raise ValueError("bad 'flags' value") + doc.xref_set_key(self.xref, "F", str(flags)) + return None + def set_border(self, border=None, width=0, dashes=None, style=None): if type(border) is not dict: border = {"width": width, "style": style, "dashes": dashes} @@ -10140,7 +10168,7 @@ struct TextPage { last_char = ch->c; linerect = fz_union_rect(linerect, cbbox); } - if (last_char != 10) { + if (last_char != 10 && !fz_is_empty_rect(linerect)) { fz_append_byte(gctx, res, 10); } blockrect = fz_union_rect(blockrect, linerect); diff --git a/fitz/helper-devices.i b/fitz/helper-devices.i index fec611c54..53e5cf463 100644 --- a/fitz/helper-devices.i +++ b/fitz/helper-devices.i @@ -191,6 +191,7 @@ jm_trace_text_linewidth(fz_context *ctx, fz_device *dev_, const fz_path *path, c static void jm_trace_text_span(fz_context *ctx, PyObject *out, fz_text_span *span, int type, fz_matrix ctm, fz_rect scissor, fz_colorspace *colorspace, const float *color, float alpha) { + fz_font *out_font = NULL; int i, n; const char *fontname = JM_font_name(ctx, span->font); PyObject *chars = PyTuple_New(span->len); @@ -210,6 +211,12 @@ jm_trace_text_span(fz_context *ctx, PyObject *out, fz_text_span *span, int type, asc = 1 + dsc; } + int fflags = 0; + int mono = fz_font_is_monospaced(ctx, span->font); + fflags += mono * TEXT_FONT_MONOSPACED; + fflags += fz_font_is_italic(ctx, span->font) * TEXT_FONT_ITALIC; + fflags += 
fz_font_is_serif(ctx, span->font) * TEXT_FONT_SERIFED; + fflags += fz_font_is_bold(ctx, span->font) * TEXT_FONT_BOLD; fz_matrix mat = trace_text_ptm; fz_matrix ctm_rot = fz_concat(ctm, trace_text_rot); mat = fz_concat(mat, ctm_rot); @@ -238,16 +245,25 @@ jm_trace_text_span(fz_context *ctx, PyObject *out, fz_text_span *span, int type, span->items[i].ucs, span->items[i].gid, char_orig.x, char_orig.y, adv)); } - if (space_adv == 0) { - space_adv = fz_advance_glyph(ctx, span->font, fz_encode_character_by_glyph_name(ctx, span->font, "space"), span->wmode); - space_adv *= fsize; + if (!space_adv) { + if (!mono) { + space_adv = fz_advance_glyph(ctx, span->font, + fz_encode_character_with_fallback(ctx, span->font, 32, 0, 0, &out_font), + span->wmode); + space_adv *= fsize; + if (!space_adv) { + space_adv = last_adv; + } + } else { + space_adv = last_adv; + } } - // make the span dictionary PyObject *span_dict = PyDict_New(); DICT_SETITEMSTR_DROP(span_dict, "dir", JM_py_from_point(fz_normalize_vector(dir))); DICT_SETITEM_DROP(span_dict, dictkey_font, Py_BuildValue("s",fontname)); DICT_SETITEM_DROP(span_dict, dictkey_wmode, PyLong_FromLong((long) span->wmode)); + DICT_SETITEM_DROP(span_dict, dictkey_flags, PyLong_FromLong((long) fflags)); DICT_SETITEMSTR_DROP(span_dict, "bidi", PyLong_FromLong((long) span->bidi_level)); DICT_SETITEMSTR_DROP(span_dict, "ascender", PyFloat_FromDouble(asc)); DICT_SETITEMSTR_DROP(span_dict, "descender", PyFloat_FromDouble(dsc)); diff --git a/fitz/helper-fields.i b/fitz/helper-fields.i index 0aecf23ae..dc7680a96 100644 --- a/fitz/helper-fields.i +++ b/fitz/helper-fields.i @@ -940,13 +940,15 @@ class Widget(object): def _adjust_font(self): - """Ensure text_font is from our list and correctly spelled. + """Ensure text_font is correctly spelled if empty or from our list. + + Otherwise assume the font is in an existing field. 
""" if not self.text_font: self.text_font = "Helv" return - valid_fonts = ("Cour", "TiRo", "Helv", "ZaDb") - for f in valid_fonts: + doc = self.parent.parent + for f in doc.FormFonts + ["Cour", "TiRo", "Helv", "ZaDb"]: if self.text_font.lower() == f.lower(): self.text_font = f return diff --git a/fitz/helper-stext.i b/fitz/helper-stext.i index a12849153..f7526b510 100644 --- a/fitz/helper-stext.i +++ b/fitz/helper-stext.i @@ -327,6 +327,7 @@ JM_search_stext_page(fz_context *ctx, fz_stext_page *page, const char *needle) fz_stext_char *ch; fz_buffer *buffer = NULL; const char *haystack, *begin, *end; + fz_rect rect = page->mediabox; int c, inside; if (strlen(needle) == 0) Py_RETURN_NONE; @@ -349,6 +350,10 @@ JM_search_stext_page(fz_context *ctx, fz_stext_page *page, const char *needle) } for (line = block->u.t.first_line; line; line = line->next) { for (ch = line->first_char; ch; ch = ch->next) { + if (!fz_is_infinite_rect(rect) && + !fz_contains_rect(rect, JM_char_bbox(ctx, line, ch))) { + goto next_char; + } try_new_match: if (!inside) { if (haystack >= begin) inside = 1; @@ -364,6 +369,7 @@ try_new_match: } } haystack += fz_chartorune(&c, haystack); +next_char:; } assert(*haystack == '\n'); ++haystack; diff --git a/fitz/helper-xobject.i b/fitz/helper-xobject.i index a1a964718..bb613bdf7 100644 --- a/fitz/helper-xobject.i +++ b/fitz/helper-xobject.i @@ -38,7 +38,7 @@ fz_buffer *JM_read_contents(fz_context * ctx, pdf_obj * pageref) //----------------------------------------------------------------------------- // Make an XObject from a PDF page -// For a positive xref assume that that object can be used instead +// For a positive xref assume that its object can be used instead //----------------------------------------------------------------------------- pdf_obj *JM_xobject_from_page(fz_context * ctx, pdf_document * pdfout, fz_page * fsrcpage, int xref, pdf_graft_map *gmap) { diff --git a/fitz/utils.py b/fitz/utils.py index 331d4dba1..a15bb1f85 100644 --- 
a/fitz/utils.py +++ b/fitz/utils.py @@ -391,6 +391,8 @@ def search_for(*args, **kwargs) -> list: page, text = args quads = kwargs.get("quads", 0) clip = kwargs.get("clip") + if clip != None: + clip = Rect(clip) flags = kwargs.get( "flags", TEXT_DEHYPHENATE | TEXT_PRESERVE_WHITESPACE | TEXT_PRESERVE_LIGATURES ) @@ -398,7 +400,6 @@ def search_for(*args, **kwargs) -> list: CheckParent(page) tp = page.get_textpage(clip=clip, flags=flags) # create TextPage rlist = tp.search(text, quads=quads) - tp = None return rlist @@ -4066,11 +4067,13 @@ def fill_textbox( font = Font("helv") def textlen(x): + """Return length of a string.""" return font.text_length( x, fontsize=fontsize, small_caps=small_caps ) # abbreviation def char_lengths(x): + """Return list of single character lengths for a string.""" return font.char_lengths(x, fontsize=fontsize, small_caps=small_caps) def append_this(pos, text): @@ -4137,6 +4140,7 @@ def output_justify(start, line): else: lheight = lineheight + LINEHEIGHT = fontsize * lheight # effective line height width = std_width # available horizontal space # starting point of text @@ -4163,7 +4167,7 @@ def output_justify(start, line): for line in text: textlines.extend(line.splitlines()) - max_lines = int((rect.y1 - pos.y) / (lheight * fontsize)) + max_lines = int((rect.y1 - pos.y) / LINEHEIGHT) new_lines = [] # the final list of textbox lines no_justify = [] # no justify for these line numbers @@ -4208,6 +4212,11 @@ def output_justify(start, line): if len(words) == 0: break + # ------------------------------------------------------------------------- + # List of lines created. Each item is (text, tl), where 'tl' is the PDF + # output length (float) and 'text' is the text. Except for justified text, + # this is output-ready. + # ------------------------------------------------------------------------- nlines = len(new_lines) if nlines > max_lines: msg = "Only fitting %i of %i lines." 
% (max_lines, nlines) @@ -4216,7 +4225,6 @@ def output_justify(start, line): elif warn == False: raise ValueError(msg) - lh = fontsize * lheight start = Point() for i, (line, tl) in enumerate(new_lines): if i > max_lines: # do not exceed space @@ -4235,7 +4243,7 @@ def output_justify(start, line): ): output_justify(start, line) start.x = std_start - start.y += lh + start.y += LINEHEIGHT continue if i > 0 or pos.x == std_start: # left, center, right alignments @@ -4243,9 +4251,9 @@ def output_justify(start, line): append_this(start, line) start.x = std_start - start.y += lh + start.y += LINEHEIGHT - return new_lines[max_lines:] # return non-written lines + return new_lines[i + 1 :] # return non-written lines # ------------------------------------------------------------------------ diff --git a/fitz/version.i b/fitz/version.i index e7324f33c..c536b7adb 100644 --- a/fitz/version.i +++ b/fitz/version.i @@ -1,6 +1,6 @@ %pythoncode %{ VersionFitz = "1.18.0" -VersionBind = "1.18.15" -VersionDate = "2021-07-10 00:00:01" -version = (VersionBind, VersionFitz, "20210710000001") +VersionBind = "1.18.16" +VersionDate = "2021-08-05 00:00:01" +version = (VersionBind, VersionFitz, "20210805000001") %} \ No newline at end of file diff --git a/setup.py b/setup.py index 5049d97bc..e234c2e73 100644 --- a/setup.py +++ b/setup.py @@ -155,7 +155,7 @@ def load_libraries(): setup( name="PyMuPDF", - version="1.18.15", + version="1.18.16", description="Python bindings for the PDF rendering library MuPDF", long_description=long_desc, classifiers=classifier,