Skip to content

Commit

Permalink
Fix #2553
Browse files Browse the repository at this point in the history
Previously, the output of plain text converted characters via fz_chartorune and "words", "blocks", "dict" and "rawdict" handled character conversions differently, using Python raw unicode decoding.
Yet another, slightly different behavior was used in page.get_textbox(), which is plain text extraction from within a rectangle, independent of using a clip.

This fix ensures that plain text extraction and textbox extraction deliver the same output.
This is checked via comparing the set of characters produced in each of the cases.
  • Loading branch information
JorjMcKie committed Oct 24, 2023
1 parent 6df96dc commit ebc0361
Show file tree
Hide file tree
Showing 6 changed files with 105 additions and 81 deletions.
20 changes: 6 additions & 14 deletions fitz/fitz.i
Original file line number Diff line number Diff line change
Expand Up @@ -11827,10 +11827,10 @@ struct TextPage {
fz_print_stext_page_as_xhtml(gctx, out, this_tpage, 0);
break;
default:
JM_print_stext_page_as_text(gctx, out, this_tpage);
JM_print_stext_page_as_text(gctx, res, this_tpage);
break;
}
text = JM_UnicodeFromBuffer(gctx, res);
text = JM_EscapeStrFromBuffer(gctx, res);

}
fz_always(gctx) {
Expand All @@ -11845,28 +11845,20 @@ struct TextPage {


//----------------------------------------------------------------
// method extractRect()
// method extractTextbox()
//----------------------------------------------------------------
FITZEXCEPTION(extractTextbox, !result)
PyObject *extractTextbox(PyObject *rect)
{
fz_stext_page *this_tpage = (fz_stext_page *) $self;
fz_rect area = JM_rect_from_py(rect);
PyObject *rc = NULL;
char *found = NULL;
fz_try(gctx) {
char *found = JM_copy_rectangle(gctx, this_tpage, area);
if (found) {
rc = JM_UnicodeFromStr(found);
JM_Free(found);
} else {
rc = EMPTY_STRING;
}
rc = JM_copy_rectangle(gctx, this_tpage, area);
}
fz_catch(gctx) {
if (found) JM_Free(found);
return EMPTY_STRING;
return NULL;
}

return rc;
}

Expand Down
33 changes: 16 additions & 17 deletions fitz/helper-stext.i
Original file line number Diff line number Diff line change
Expand Up @@ -460,7 +460,7 @@ no_more_matches:;
// character (which else leads to 2 new-lines).
//-----------------------------------------------------------------------------
void
JM_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page)
JM_print_stext_page_as_text(fz_context *ctx, fz_buffer *buff, fz_stext_page *page)
{
fz_stext_block *block;
fz_stext_line *line;
Expand All @@ -480,14 +480,11 @@ JM_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page
if (fz_is_infinite_rect(rect) ||
JM_rects_overlap(rect, chbbox)) {
last_char = ch->c;
n = fz_runetochar(utf, ch->c);
for (i = 0; i < n; i++) {
fz_write_byte(ctx, out, utf[i]);
}
JM_append_rune(ctx, buff, ch->c);
}
}
if (last_char != 10 && last_char > 0) {
fz_write_string(ctx, out, "\n");
fz_append_string(ctx, buff, "\n");
}
}
}
Expand Down Expand Up @@ -794,18 +791,17 @@ void JM_make_textpage_dict(fz_context *ctx, fz_stext_page *tp, PyObject *page_di


//---------------------------------------------------------------------
char *
PyObject *
JM_copy_rectangle(fz_context *ctx, fz_stext_page *page, fz_rect area)
{
fz_stext_block *block;
fz_stext_line *line;
fz_stext_char *ch;
fz_buffer *buffer;
unsigned char *s;
int need_new_line = 0;

buffer = fz_new_buffer(ctx, 1024);
PyObject *rc = NULL;
fz_try(ctx) {
buffer = fz_new_buffer(ctx, 1024);
for (block = page->first_block; block; block = block->next) {
if (block->type != FZ_STEXT_BLOCK_TEXT)
continue;
Expand All @@ -819,24 +815,27 @@ JM_copy_rectangle(fz_context *ctx, fz_stext_page *page, fz_rect area)
fz_append_string(ctx, buffer, "\n");
need_new_line = 0;
}
fz_append_rune(ctx, buffer, ch->c < 32 ? FZ_REPLACEMENT_CHARACTER : ch->c);
JM_append_rune(ctx, buffer, ch->c);
}
}
if (line_had_text)
need_new_line = 1;
}
}
fz_terminate_buffer(ctx, buffer);
rc = JM_EscapeStrFromBuffer(ctx, buffer);
if (!rc) {
rc = EMPTY_STRING;
PyErr_Clear();
}
}
fz_always(ctx) {
fz_drop_buffer(ctx, buffer);
}
fz_catch(ctx) {
fz_drop_buffer(ctx, buffer);
fz_rethrow(ctx);
}


fz_buffer_extract(ctx, buffer, &s); /* take over the data */
fz_drop_buffer(ctx, buffer);
return (char*)s;
return rc;
}
//---------------------------------------------------------------------

Expand Down
62 changes: 35 additions & 27 deletions src/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11903,8 +11903,8 @@ def _extractText(self, format_):
elif format_ == 4:
mupdf.fz_print_stext_page_as_xhtml(out, this_tpage, 0)
else:
JM_print_stext_page_as_text(out, this_tpage)
text = JM_UnicodeFromBuffer(res)
JM_print_stext_page_as_text(res, this_tpage)
text = JM_EscapeStrFromBuffer(res)
return text

def _getNewBlockList(self, page_dict, raw):
Expand Down Expand Up @@ -12096,10 +12096,7 @@ def extractTextbox(self, rect):
assert isinstance(this_tpage, mupdf.FzStextPage)
area = JM_rect_from_py(rect)
found = JM_copy_rectangle(this_tpage, area);
if (found):
rc = JM_UnicodeFromStr(found)
else:
rc = ''
rc = PyUnicode_DecodeRawUnicodeEscape(found)
return rc

def extractWORDS(self, delimiters=None):
Expand Down Expand Up @@ -13929,12 +13926,24 @@ def JM_annot_set_border( border, doc, annot_obj):
mupdf.pdf_dict_put_int( obj, PDF_NAME('I'), nclouds)


def make_escape(ch):
    """Return the text to emit for the Unicode code point ``ch``.

    The result consists of ASCII characters only, so that it survives a
    later UTF-8 encode followed by a 'raw_unicode_escape' decode (see
    PyUnicode_DecodeRawUnicodeEscape), which interprets every non-escape
    byte as Latin-1.

    Args:
        ch: integer code point of the character.

    Returns:
        str: the character itself for printable ASCII (32..127) and
        line feed (10); a 4-digit or 8-digit unicode escape sequence for
        everything else. Backslash is always escaped, and code points in
        the surrogate range are replaced by the escape for U+FFFD.
    """
    if ch == 92:
        # Escape the backslash itself so it can never start an accidental
        # unicode escape sequence when the text is decoded.
        return "\\u005c"
    elif 32 <= ch <= 127 or ch == 10:
        return chr(ch)
    elif 0xd800 <= ch <= 0xdfff:  # orphaned surrogate
        # Emit the *escape* for U+FFFD, not the character: the literal
        # character would be UTF-8 encoded to three bytes and then decoded
        # byte-wise as Latin-1, producing garbage instead of U+FFFD.
        return "\\ufffd"
    elif ch <= 0xffff:
        return "\\u%04x" % ch
    else:
        return "\\U%08x" % ch


def JM_append_rune(buff, ch):
"""
APPEND non-ascii runes in unicode escape format to fz_buffer.
No need for special processing in pure Python.
"""
mupdf.fz_append_string(buff, chr(ch))
mupdf.fz_append_string(buff, make_escape(ch))


def JM_append_word(lines, buff, wbbox, block_n, line_n, word_n):
Expand Down Expand Up @@ -14244,9 +14253,10 @@ def JM_compress_buffer(inbuffer):
return buf;



def JM_copy_rectangle(page, area):
need_new_line = 0
buffer_ = mupdf.fz_new_buffer(1024)
buffer = io.StringIO()
for block in page:
if block.m_internal.type != mupdf.FZ_STEXT_BLOCK_TEXT:
continue
Expand All @@ -14257,17 +14267,13 @@ def JM_copy_rectangle(page, area):
if JM_rects_overlap(area, r):
line_had_text = 1
if need_new_line:
mupdf.fz_append_string(buffer_, "\n")
buffer.write("\n")
need_new_line = 0
mupdf.fz_append_rune(
buffer_,
FZ_REPLACEMENT_CHARACTER if ch.m_internal.c < 32 else ch.m_internal.c,
)
buffer.write(make_escape(ch.m_internal.c))
if line_had_text:
need_new_line = 1
mupdf.fz_terminate_buffer(buffer_)

s = mupdf.fz_buffer_extract(buffer_) # take over the data
s = buffer.getvalue() # take over the data
return s


Expand Down Expand Up @@ -16303,16 +16309,16 @@ def JM_point_from_py(p):
return mupdf.FzPoint(x, y)


def JM_print_stext_page_as_text(out, page):
def JM_print_stext_page_as_text(res, page):
'''
Plain text output. An identical copy of fz_print_stext_page_as_text,
but lines within a block are concatenated by space instead a new-line
character (which else leads to 2 new-lines).
'''
if 1 and g_use_extra:
return extra.JM_print_stext_page_as_text( out, page)
return extra.JM_print_stext_page_as_text(res, page)

assert isinstance(out, mupdf.FzOutput)
assert isinstance(res, mupdf.FzBuffer)
assert isinstance(page, mupdf.FzStextPage)
rect = mupdf.FzRect(page.m_internal.mediabox)
last_char = 0
Expand Down Expand Up @@ -16340,14 +16346,10 @@ def JM_print_stext_page_as_text(out, page):
):
#raw += chr(ch.m_internal.c)
last_char = ch.m_internal.c
utf = mupdf.fz_runetochar2(last_char)
#log( '{=last_char!r utf!r}')
for c in utf:
assert isinstance(c, int), f'{type(c)=} {c=}'
assert 0 <= c < 256, f'{utf=} {c=}'
mupdf.fz_write_byte(out, c)
JM_append_rune(res, last_char)
if last_char != 10 and last_char > 0:
mupdf.fz_write_string(out, "\n")
mupdf.fz_append_string(res, "\n")


def JM_put_script(annot_obj, key1, key2, value):
Expand Down Expand Up @@ -17153,8 +17155,14 @@ def ENSURE_OPERATION( pdf):


def PyUnicode_DecodeRawUnicodeEscape(s, errors='strict'):
# fixme: handle escape sequencies
ret = s.decode('utf8', errors=errors)
# FIXED: handle raw unicode escape sequences
if not s:
return ""
if isinstance(s, str):
rc = s.encode("utf8", errors=errors)
elif isinstance(s, bytes):
rc = s[:]
ret = rc.decode('raw_unicode_escape', errors=errors)
z = ret.find(chr(0))
if z >= 0:
ret = ret[:z]
Expand Down
47 changes: 24 additions & 23 deletions src/extra.i
Original file line number Diff line number Diff line change
Expand Up @@ -2554,34 +2554,37 @@ static int JM_rects_overlap(const fz_rect a, const fz_rect b)
}

//
void ll_JM_print_stext_page_as_text(fz_output *out, fz_stext_page *page)
void JM_append_rune(fz_buffer *buff, int ch);
void ll_JM_print_stext_page_as_text(fz_buffer *res, fz_stext_page *page)
{
fz_stext_block *block;
fz_stext_line *line;
fz_stext_char *ch;
fz_rect rect = page->mediabox;
fz_rect chbbox;
int last_char = 0;
char utf[10];
int i, n;

for (block = page->first_block; block; block = block->next) {
if (block->type == FZ_STEXT_BLOCK_TEXT) {
for (line = block->u.t.first_line; line; line = line->next) {

for (block = page->first_block; block; block = block->next)
{
if (block->type == FZ_STEXT_BLOCK_TEXT)
{
for (line = block->u.t.first_line; line; line = line->next)
{
last_char = 0;
for (ch = line->first_char; ch; ch = ch->next) {
for (ch = line->first_char; ch; ch = ch->next)
{
chbbox = JM_char_bbox(line, ch);
if (mupdf::ll_fz_is_infinite_rect(rect) ||
JM_rects_overlap(rect, chbbox)) {
JM_rects_overlap(rect, chbbox))
{
last_char = ch->c;
n = mupdf::ll_fz_runetochar(utf, ch->c);
for (i = 0; i < n; i++) {
mupdf::ll_fz_write_byte(out, utf[i]);
}
JM_append_rune(res, last_char);
}
}
if (last_char != 10 && last_char > 0) {
mupdf::ll_fz_write_string(out, "\n");
if (last_char != 10 && last_char > 0)
{
mupdf::ll_fz_append_string(res, "\n");
}
}
}
Expand All @@ -2592,11 +2595,11 @@ void ll_JM_print_stext_page_as_text(fz_output *out, fz_stext_page *page)
// but lines within a block are concatenated by space instead a new-line
// character (which else leads to 2 new-lines).
//-----------------------------------------------------------------------------
void JM_print_stext_page_as_text(mupdf::FzOutput& out, mupdf::FzStextPage& page)
void JM_print_stext_page_as_text(mupdf::FzBuffer& res, mupdf::FzStextPage& page)
{
if (0)
{
return ll_JM_print_stext_page_as_text(out.m_internal, page.m_internal);
return ll_JM_print_stext_page_as_text(res.m_internal, page.m_internal);
}

fz_rect rect = page.m_internal->mediabox;
Expand All @@ -2616,14 +2619,12 @@ void JM_print_stext_page_as_text(mupdf::FzOutput& out, mupdf::FzStextPage& page)
)
{
last_char = ch.m_internal->c;
char utf[10];
int n = mupdf::ll_fz_runetochar(utf, ch.m_internal->c);
mupdf::ll_fz_write_data( out.m_internal, utf, n);
JM_append_rune(res.m_internal, last_char);
}
}
if (last_char != 10 && last_char > 0)
{
mupdf::fz_write_string( out, "\n");
mupdf::ll_fz_append_string(res.m_internal, "\n");
}
}
}
Expand Down Expand Up @@ -3312,15 +3313,15 @@ static int JM_char_font_flags(fz_font *font, fz_stext_line *line, fz_stext_char
void JM_append_rune(fz_buffer *buff, int ch)
{
char text[32];
if (ch == 92) // prevent accidental "\u", "\U"
if (ch == 92) // prevent accidental "\u", "\U" sequences
{
mupdf::ll_fz_append_string(buff, "\\u005c");
}
else if ((ch >= 32 && ch <= 255) || ch == 10)
{
mupdf::ll_fz_append_byte(buff, ch);
}
else if (ch >= 0xd800 && ch <= 0xdfff) // surrogate Unicodes prohibited
else if (ch >= 0xd800 && ch <= 0xdfff) // orphaned surrogate Unicodes
{
mupdf::ll_fz_append_string(buff, "\\ufffd");
}
Expand Down Expand Up @@ -4363,7 +4364,7 @@ mupdf::FzDevice JM_new_texttrace_device(PyObject* out);
fz_rect JM_char_bbox(const mupdf::FzStextLine& line, const mupdf::FzStextChar& ch);

static fz_quad JM_char_quad( fz_stext_line *line, fz_stext_char *ch);
void JM_print_stext_page_as_text(mupdf::FzOutput& out, mupdf::FzStextPage& page);
void JM_print_stext_page_as_text(mupdf::FzBuffer& res, mupdf::FzStextPage& page);

void set_small_glyph_heights(int on);
mupdf::FzRect JM_cropbox(mupdf::PdfObj& page_obj);
Expand Down
Binary file added tests/resources/test_2553.pdf
Binary file not shown.
Loading

0 comments on commit ebc0361

Please sign in to comment.