Skip to content

Word delimiter support, fixes #2637, #2556, #2553, #2522, #2661

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 11 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 79 additions & 17 deletions fitz/fitz.i
Original file line number Diff line number Diff line change
Expand Up @@ -4030,7 +4030,7 @@ if rbgroups:
if not type(x) in (list, tuple):
raise ValueError("bad RBGroup '%s'" % x)
s = set(x).difference(ocgs)
if f != set():
if s != set():
raise ValueError("bad OCGs in RBGroup: %s" % s)

if basestate:
Expand Down Expand Up @@ -5542,8 +5542,6 @@ struct Page {
r = pdf_annot_rect(gctx, annot);
r = fz_make_rect(p.x, p.y, p.x + r.x1 - r.x0, p.y + r.y1 - r.y0);
pdf_set_annot_rect(gctx, annot, r);
int flags = PDF_ANNOT_IS_PRINT;
pdf_set_annot_flags(gctx, annot, flags);

if (icon)
pdf_set_annot_icon_name(gctx, annot, icon);
Expand All @@ -5554,7 +5552,6 @@ struct Page {
pdf_dict_put_text_string(gctx, annot_obj, PDF_NAME(Contents), filename);
pdf_update_annot(gctx, annot);
pdf_set_annot_rect(gctx, annot, r);
pdf_set_annot_flags(gctx, annot, flags);
JM_add_annot_id(gctx, annot, "A");
}
fz_always(gctx) {
Expand Down Expand Up @@ -7616,10 +7613,9 @@ def insert_font(self, fontname="helv", fontfile=None, fontbuffer=None,
annot_xrefs = [a[0] for a in self.annot_xrefs() if a[1] not in skip_types]
else:
annot_xrefs = [a[0] for a in self.annot_xrefs() if a[1] in types and a[1] not in skip_types]

for xref in annot_xrefs:
annot = self.load_annot(xref)
annot._yielded=True
yield annot
yield self.load_annot(xref)


def widgets(self, types=None):
Expand Down Expand Up @@ -11533,13 +11529,18 @@ struct TextPage {
extractIMGINFO(int hashes=0)
{
fz_stext_block *block;
int block_n = -1;
int block_n = -1, leave = 0;
fz_stext_page *this_tpage = (fz_stext_page *) $self;
PyObject *rc = NULL, *block_dict = NULL;
fz_pixmap *pix = NULL;
fz_rect bbox;
fz_try(gctx) {
rc = PyList_New(0);
for (block = this_tpage->first_block; block; block = block->next) {
bbox = block->bbox;
if (JM_ignore_rect(bbox)) {
continue; // guard against nonsense block bbox
}
block_n++;
if (block->type == FZ_STEXT_BLOCK_TEXT) {
continue;
Expand All @@ -11556,7 +11557,7 @@ struct TextPage {
block_dict = PyDict_New();
DICT_SETITEM_DROP(block_dict, dictkey_number, Py_BuildValue("i", block_n));
DICT_SETITEM_DROP(block_dict, dictkey_bbox,
JM_py_from_rect(block->bbox));
JM_py_from_rect(bbox));
DICT_SETITEM_DROP(block_dict, dictkey_matrix,
JM_py_from_matrix(block->u.i.transform));
DICT_SETITEM_DROP(block_dict, dictkey_width,
Expand Down Expand Up @@ -11695,7 +11696,7 @@ struct TextPage {
fz_rect wbbox = fz_empty_rect; // word bbox
fz_stext_page *this_tpage = (fz_stext_page *) $self;
fz_rect tp_rect = this_tpage->mediabox;

int word_delimiter = 0;
PyObject *lines = NULL;
fz_try(gctx) {
buff = fz_new_buffer(gctx, 64);
Expand All @@ -11717,10 +11718,11 @@ struct TextPage {
!fz_is_infinite_rect(tp_rect)) {
continue;
}
if (ch->c == 32 && buflen == 0)
word_delimiter = JM_is_word_delimiter(ch->c);
if (word_delimiter && buflen == 0)
continue; // skip spaces at line start
if (ch->c == 32) {
if (!fz_is_empty_rect(wbbox)) {
if (word_delimiter) { // encountered end of word
if (!fz_is_empty_rect(wbbox)) { // output word
word_n = JM_append_word(gctx, lines, buff, &wbbox,
block_n, line_n, word_n);
}
Expand Down Expand Up @@ -11806,10 +11808,10 @@ struct TextPage {
fz_print_stext_page_as_xhtml(gctx, out, this_tpage, 0);
break;
default:
JM_print_stext_page_as_text(gctx, out, this_tpage);
JM_print_stext_page_as_text(gctx, res, this_tpage);
break;
}
text = JM_UnicodeFromBuffer(gctx, res);
text = JM_EscapeStrFromBuffer(gctx, res);

}
fz_always(gctx) {
Expand Down Expand Up @@ -12168,6 +12170,7 @@ struct TextWriter
morph: tuple(Point, Matrix), apply a matrix with a fixpoint.
matrix: Matrix to be used instead of 'morph' argument.
render_mode: (int) PDF render mode operator 'Tr'.
border_width: (float) stroke line Width. Relevant for render mode > 0.
"""

CheckParent(page)
Expand All @@ -12185,6 +12188,8 @@ struct TextWriter
opacity = self.opacity
if color is None:
color = self.color
if render_mode < 0:
render_mode = 0
%}

%pythonappend write_text%{
Expand Down Expand Up @@ -12234,7 +12239,7 @@ struct TextWriter
temp = line.split()
fsize = float(temp[1])
if render_mode != 0:
w = fsize * 0.05
w = fsize * border_width
else:
w = 1
new_cont_lines.append("%g w" % w)
Expand All @@ -12257,7 +12262,7 @@ struct TextWriter
repair_mono_font(page, font)
%}
PyObject *write_text(struct Page *page, PyObject *color=NULL, float opacity=-1, int overlay=1,
PyObject *morph=NULL, PyObject *matrix=NULL, int render_mode=0, int oc=0)
PyObject *morph=NULL, PyObject *matrix=NULL, int render_mode=0, int oc=0, float border_width=0.05)
{
pdf_page *pdfpage = pdf_page_from_fz_page(gctx, (fz_page *) page);
pdf_obj *resources = NULL;
Expand Down Expand Up @@ -14412,6 +14417,63 @@ struct Tools
}


FITZEXCEPTION(set_word_delimiters, !result)
%pythonprepend set_word_delimiters %{
"""Set characters to be word delimiters."""
# Python-side argument normalization, executed before the C function below.
# None is treated as "no delimiters".
if delims == None:
delims = []
# Only indexable objects with at most 64 entries are accepted.
# NOTE(review): the limit presumably matches the capacity of the C-side
# 'word_delimiters' array (one slot is needed for the 0 terminator) - confirm.
# The length is checked before duplicates are removed.
if not hasattr(delims, "__getitem__") or len(delims) > 64:
raise ValueError("bad delimiter value(s)")

# Convert every item to its integer code point; the set removes duplicates.
try:
delims = set([ord(c) for c in delims])
except:
# Items that ord() cannot handle: print a hint, then re-raise the error.
print("bad delimiter value(s)")
raise
# The C function reads its argument with the tuple API, so hand over a tuple.
delims = tuple(delims)
%}
// Replace the global list of additional word delimiters.
//
// delims: tuple of integer character codes. A missing (NULL) argument or an
//         empty tuple empties the list. This function does not bounds-check:
//         it relies on the caller limiting the number of entries so that the
//         values plus the terminating 0 fit into 'word_delimiters'.
//
// Returns a new reference to False if the list was emptied, a new reference
// to True if delimiters were stored, or NULL from the fz_catch branch.
PyObject *set_word_delimiters(PyObject *delims=NULL)
{
    // Guard the declared default: PyTuple_Size must not be handed NULL.
    int i, len = delims ? (int) PyTuple_Size(delims) : 0;
    if (!len) {
        word_delimiters[0] = 0;  // set list to empty
        // Py_False is a borrowed singleton - the caller needs its own reference.
        Py_RETURN_FALSE;
    }

    fz_try(gctx) {
        for (i = 0; i < len; i++) {
            word_delimiters[i] = (int) PyLong_AsLong(PyTuple_GET_ITEM(delims, (Py_ssize_t) i));
            word_delimiters[i+1] = 0;  // keep the list 0-terminated at every step
        }
    }
    fz_always(gctx) {
        // Discard any Python error raised by the conversions above.
        PyErr_Clear();
    }
    fz_catch(gctx) {
        return NULL;
    }
    // Same ownership rule as for Py_False above.
    Py_RETURN_TRUE;
}


FITZEXCEPTION(get_word_delimiters, !result)
%pythonprepend get_word_delimiters %{"""Get the word delimiting characters."""%}
PyObject *get_word_delimiters()
{
int delim, i = 0;
PyObject *rc = PyList_New(0);
while (1) {
delim = word_delimiters[i];
if (!delim) {
break;
}
PyList_Append(rc, Py_BuildValue("C", delim));
i++;
}
return rc;
}


FITZEXCEPTION(set_icc, !result)
%pythonprepend set_icc
%{"""Set ICC color handling on or off."""%}
Expand Down
8 changes: 7 additions & 1 deletion fitz/helper-devices.i
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ jm_lineart_path(fz_context *ctx, jm_lineart_device *dev, const fz_path *path)
DICT_SETITEM_DROP(dev_pathdict, dictkey_items, PyList_New(0));
fz_walk_path(ctx, path, &trace_path_walker, dev);
// Check if any items were added ...
if (!PyList_Size(PyDict_GetItem(dev_pathdict, dictkey_items))) {
if (!PyDict_GetItem(dev_pathdict, dictkey_items) || !PyList_Size(PyDict_GetItem(dev_pathdict, dictkey_items))) {
Py_CLEAR(dev_pathdict);
}
}
Expand Down Expand Up @@ -468,6 +468,9 @@ jm_lineart_clip_path(fz_context *ctx, fz_device *dev_, const fz_path *path, int
trace_device_ctm = ctm; //fz_concat(ctm, trace_device_ptm);
path_type = CLIP_PATH;
jm_lineart_path(ctx, dev, path);
if (!dev_pathdict) {
return;
}
DICT_SETITEM_DROP(dev_pathdict, dictkey_type, PyUnicode_FromString("clip"));
DICT_SETITEMSTR_DROP(dev_pathdict, "even_odd", JM_BOOL(even_odd));
if (!PyDict_GetItemString(dev_pathdict, "closePath")) {
Expand All @@ -489,6 +492,9 @@ jm_lineart_clip_stroke_path(fz_context *ctx, fz_device *dev_, const fz_path *pat
trace_device_ctm = ctm; //fz_concat(ctm, trace_device_ptm);
path_type = CLIP_STROKE_PATH;
jm_lineart_path(ctx, dev, path);
if (!dev_pathdict) {
return;
}
DICT_SETITEM_DROP(dev_pathdict, dictkey_type, PyUnicode_FromString("clip"));
DICT_SETITEMSTR_DROP(dev_pathdict, "even_odd", Py_BuildValue("s", NULL));
if (!PyDict_GetItemString(dev_pathdict, "closePath")) {
Expand Down
34 changes: 30 additions & 4 deletions fitz/helper-geo-c.i
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,8 @@ JM_rect_from_py(PyObject *r)

for (i = 0; i < 4; i++) {
if (JM_FLOAT_ITEM(r, i, &f[i]) == 1) return fz_infinite_rect;
if (f[i] < FZ_MIN_INF_RECT) f[i] = FZ_MIN_INF_RECT;
if (f[i] > FZ_MAX_INF_RECT) f[i] = FZ_MAX_INF_RECT;
if (f[i] <= FZ_MIN_INF_RECT) f[i] = FZ_MIN_INF_RECT;
if (f[i] >= FZ_MAX_INF_RECT) f[i] = FZ_MAX_INF_RECT;
}

return fz_make_rect((float) f[0], (float) f[1], (float) f[2], (float) f[3]);
Expand All @@ -92,6 +92,32 @@ JM_py_from_rect(fz_rect r)
return Py_BuildValue("ffff", r.x0, r.y0, r.x1, r.y1);
}

//-----------------------------------------------------------------------------
// Ignore this rect (generalizes infinite, empty etc.)
//-----------------------------------------------------------------------------
int JM_ignore_rect(fz_rect r)
{
    // Rectangles flagged by fz_is_infinite_rect / fz_is_empty_rect are
    // always ignored.
    if (fz_is_infinite_rect(r)) return 1;
    if (fz_is_empty_rect(r)) return 1;

    // Otherwise ignore the rectangle as soon as one coordinate reaches or
    // passes either of the FZ_MIN_INF_RECT / FZ_MAX_INF_RECT limits.
    return (r.x0 <= FZ_MIN_INF_RECT || FZ_MAX_INF_RECT <= r.x0 ||
            r.y0 <= FZ_MIN_INF_RECT || FZ_MAX_INF_RECT <= r.y0 ||
            r.x1 <= FZ_MIN_INF_RECT || FZ_MAX_INF_RECT <= r.x1 ||
            r.y1 <= FZ_MIN_INF_RECT || FZ_MAX_INF_RECT <= r.y1) ? 1 : 0;
}

//-----------------------------------------------------------------------------
// Ignore this irect (integer variant; generalizes infinite, empty etc.)
//-----------------------------------------------------------------------------
int JM_ignore_irect(fz_irect r)
{
    // Result is 1 ("ignore") or 0 ("usable").
    int ignore = 1;

    if (fz_is_infinite_irect(r) || fz_is_empty_irect(r)) {
        // flagged as infinite or empty by the MuPDF predicates
    } else if (r.x0 <= FZ_MIN_INF_RECT || r.x0 >= FZ_MAX_INF_RECT) {
        // left edge reaches a limit
    } else if (r.y0 <= FZ_MIN_INF_RECT || r.y0 >= FZ_MAX_INF_RECT) {
        // top edge reaches a limit
    } else if (r.x1 <= FZ_MIN_INF_RECT || r.x1 >= FZ_MAX_INF_RECT) {
        // right edge reaches a limit
    } else if (r.y1 <= FZ_MIN_INF_RECT || r.y1 >= FZ_MAX_INF_RECT) {
        // bottom edge reaches a limit
    } else {
        ignore = 0;  // every coordinate lies strictly between the limits
    }
    return ignore;
}

//-----------------------------------------------------------------------------
// PySequence to fz_irect. Default: infinite irect
//-----------------------------------------------------------------------------
Expand All @@ -105,8 +131,8 @@ JM_irect_from_py(PyObject *r)

for (i = 0; i < 4; i++) {
if (JM_INT_ITEM(r, i, &x[i]) == 1) return fz_infinite_irect;
if (x[i] < FZ_MIN_INF_RECT) x[i] = FZ_MIN_INF_RECT;
if (x[i] > FZ_MAX_INF_RECT) x[i] = FZ_MAX_INF_RECT;
if (x[i] <= FZ_MIN_INF_RECT) x[i] = FZ_MIN_INF_RECT;
if (x[i] >= FZ_MAX_INF_RECT) x[i] = FZ_MAX_INF_RECT;
}

return fz_make_irect(x[0], x[1], x[2], x[3]);
Expand Down
3 changes: 3 additions & 0 deletions fitz/helper-globals.i
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ static int subset_fontnames = 0;
// Unset ascender / descender corrections
static int skip_quad_corrections = 0;

// Additional word delimiting characters
static int word_delimiters[65] = {0};

// constants: error messages
static const char MSG_BAD_ANNOT_TYPE[] = "bad annot type";
static const char MSG_BAD_APN[] = "bad or missing annot AP/N";
Expand Down
Loading