Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TST: reader.get_fields #1004

Merged
merged 5 commits into from
Jun 17, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion PyPDF2/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -451,7 +451,6 @@ def readStringFromStream( # TODO: PEP8
tok = b""
else:
msg = rf"Unexpected escaped string: {tok.decode('utf8')}"
# if.strict: PdfReadError(msg)
logger.warning(msg)
txt += tok
return createStringObject(txt, forced_encoding)
Expand Down
8 changes: 8 additions & 0 deletions tests/test_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,11 @@ def test_parse_encoding_advanced_encoding_not_implemented():
with pytest.warns(PdfReadWarning, match="Advanced encoding .* not implemented yet"):
for page in reader.pages:
page.extract_text()


def test_get_font_width_from_default():  # L40
    """Smoke test: extracting text from every page of tika-908104.pdf must not raise."""
    pdf_bytes = get_pdf_from_url(
        "https://corpora.tika.apache.org/base/docs/govdocs1/908/908104.pdf",
        name="tika-908104.pdf",
    )
    document = PdfReader(BytesIO(pdf_bytes))
    # There is no assertion: the test passes as long as extraction completes.
    for current_page in document.pages:
        current_page.extract_text()
26 changes: 25 additions & 1 deletion tests/test_filters.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import string
from io import BytesIO
from itertools import product as cartesian_product

import pytest

from PyPDF2.errors import PdfReadError, PdfStreamError
from PyPDF2 import PdfReader
from PyPDF2.errors import PdfReadError, PdfReadWarning, PdfStreamError
from PyPDF2.filters import (
ASCII85Decode,
ASCIIHexDecode,
Expand All @@ -13,6 +15,8 @@
)
from PyPDF2.generic import ArrayObject, DictionaryObject, NumberObject

from . import get_pdf_from_url

filter_inputs = (
# "", '', """""",
string.ascii_lowercase,
Expand Down Expand Up @@ -192,3 +196,23 @@ def test_CCITTFaxDecode():
b"\x04\x00\x01\x00\x00\x00\x00\x00\x00\x00\x17\x01\x04\x00\x01\x00"
b"\x00\x00\x00\x00\x00\x00\x00\x00"
)


def test_decompress_zlib_error():
    """Open tika-952445.pdf expecting a startxref warning, then extract text from every page."""
    url = "https://corpora.tika.apache.org/base/docs/govdocs1/952/952445.pdf"
    name = "tika-952445.pdf"
    # The code under this context manager must emit a PdfReadWarning whose
    # message matches the regular expression below.
    with pytest.warns(PdfReadWarning, match=r"incorrect startxref pointer\(3\)"):
        reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
    # NOTE(review): indentation was lost in this capture; the page loop is
    # assumed to sit outside the warns block -- confirm against the repository.
    for page in reader.pages:
        page.extract_text()
    # NOTE(review): the commented-out assertion below refers to `exc`, which
    # this test never binds; it looks like a leftover -- confirm before deleting.
    # assert exc.value.args[0] == "Could not find xref table at specified location"


def test_lzw_decode_neg1():
    """Text extraction on tika-921632.pdf must raise PdfReadError with the LZWDecode stop-code message."""
    pdf_bytes = get_pdf_from_url(
        "https://corpora.tika.apache.org/base/docs/govdocs1/921/921632.pdf",
        name="tika-921632.pdf",
    )
    document = PdfReader(BytesIO(pdf_bytes))
    with pytest.raises(PdfReadError) as raised:
        for current_page in document.pages:
            current_page.extract_text()
    # The first exception argument carries the message being checked.
    message = raised.value.args[0]
    assert message == "Missed the stop code in LZWDecode!"
22 changes: 22 additions & 0 deletions tests/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -451,3 +451,25 @@ def test_text_string_write_to_stream():
reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
for page in reader.pages:
page.compress_content_streams()


def test_name_object_read_from_stream_unicode_error():  # L588
    """Smoke test: every page of tika-974966.pdf must yield text without raising."""
    stream = BytesIO(
        get_pdf_from_url(
            "https://corpora.tika.apache.org/base/docs/govdocs1/974/974966.pdf",
            name="tika-974966.pdf",
        )
    )
    document = PdfReader(stream)
    # No assertion is made on the extracted text; completing the loop is the check.
    for current_page in document.pages:
        current_page.extract_text()


def test_bool_repr():
    """Write the field report for tika-932449.pdf to a file and check that fields are returned.

    The report file is created in the current working directory and is
    removed in a ``finally`` clause, so a failing assertion (or an error in
    ``get_fields``) no longer leaves ``tmp-fields-report.txt`` behind.
    """
    url = "https://corpora.tika.apache.org/base/docs/govdocs1/932/932449.pdf"
    name = "tika-932449.pdf"
    report_path = "tmp-fields-report.txt"

    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
    try:
        with open(report_path, "w") as fp:
            fields = reader.get_fields(fileobj=fp)
        assert fields
    finally:
        # cleanup -- runs on success and failure; the existence check covers
        # the case where open() itself failed and no file was created.
        if os.path.exists(report_path):
            os.remove(report_path)
8 changes: 8 additions & 0 deletions tests/test_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,3 +258,11 @@ def test_extract_text_page_pdf_impossible_decode_xform():
):
for page in reader.pages:
page.extract_text()


def test_extract_text_operator_t_star():  # L1266, L1267
    """Smoke test: text extraction over all pages of tika-967943.pdf must not raise."""
    pdf_url = "https://corpora.tika.apache.org/base/docs/govdocs1/967/967943.pdf"
    cache_name = "tika-967943.pdf"
    raw_pdf = get_pdf_from_url(pdf_url, name=cache_name)
    document = PdfReader(BytesIO(raw_pdf))
    # The test only requires that extraction finishes for each page.
    for current_page in document.pages:
        current_page.extract_text()
33 changes: 32 additions & 1 deletion tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,8 @@ def test_get_form(src, expected, expected_get_fields):
fields = reader.get_form_text_fields()
assert fields == expected

fields = reader.get_fields()
with open("tmp-fields-report.txt", "w") as f:
fields = reader.get_fields(fileobj=f)
assert fields == expected_get_fields
if fields:
for field in fields.values():
Expand All @@ -352,6 +353,9 @@ def test_get_form(src, expected, expected_get_fields):
field.additional_actions,
]

# cleanup
os.remove("tmp-fields-report.txt")


@pytest.mark.parametrize(
("src", "page_nb"),
Expand Down Expand Up @@ -751,6 +755,14 @@ def test_extract_text_pdf15():
page.extract_text()


def test_extract_text_xref_table_21_bytes_clrf():
    """Smoke test: every page of tika-956939.pdf must allow text extraction without raising."""
    # pdf/0264cf510015b2a4b395a15cb23c001e.pdf
    pdf_url = "https://corpora.tika.apache.org/base/docs/govdocs1/956/956939.pdf"
    raw_pdf = get_pdf_from_url(pdf_url, name="tika-956939.pdf")
    document = PdfReader(BytesIO(raw_pdf))
    for current_page in document.pages:
        current_page.extract_text()


def test_get_fields():
url = "https://corpora.tika.apache.org/base/docs/govdocs1/972/972486.pdf"
name = "tika-972486.pdf"
Expand Down Expand Up @@ -783,3 +795,22 @@ def test_get_fields_read_else_block3():
with pytest.raises(PdfReadError) as exc:
PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
assert exc.value.args[0] == "Could not find xref table at specified location"


def test_metadata_is_none():
    """The reader built from tika-963692.pdf must report its metadata as None."""
    stream = BytesIO(
        get_pdf_from_url(
            "https://corpora.tika.apache.org/base/docs/govdocs1/963/963692.pdf",
            name="tika-963692.pdf",
        )
    )
    document = PdfReader(stream)
    assert document.metadata is None


def test_get_fields_read_write_report():
    """Write the field report for tika-909655.pdf to a file and check that fields are returned.

    The report file is created in the current working directory and is
    removed in a ``finally`` clause, so a failing assertion (or an error in
    ``get_fields``) no longer leaves ``tmp-fields-report.txt`` behind.
    """
    url = "https://corpora.tika.apache.org/base/docs/govdocs1/909/909655.pdf"
    name = "tika-909655.pdf"
    report_path = "tmp-fields-report.txt"

    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
    try:
        with open(report_path, "w") as fp:
            fields = reader.get_fields(fileobj=fp)
        assert fields
    finally:
        # cleanup -- runs on success and failure; the existence check covers
        # the case where open() itself failed and no file was created.
        if os.path.exists(report_path):
            os.remove(report_path)