Skip to content

Commit 2a5a199

Browse files
authored
MAINT: Consistent usage of warnings / log messages (#1164)
1 parent 89033cb commit 2a5a199

File tree

14 files changed

+206
-125
lines changed

14 files changed

+206
-125
lines changed

PyPDF2/_cmap.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from typing import Any, Dict, List, Tuple, Union, cast
44

55
from ._codecs import adobe_glyphs, charset_encoding
6+
from ._utils import logger_warning
67
from .errors import PdfReadWarning
78
from .generic import DecodedStreamObject, DictionaryObject
89

@@ -330,9 +331,9 @@ def compute_space_width(
330331
st += 1
331332
w = w[2:]
332333
else:
333-
warnings.warn(
334+
logger_warning(
334335
"unknown widths : \n" + (ft1["/W"]).__repr__(),
335-
PdfReadWarning,
336+
__name__,
336337
)
337338
break
338339
try:

PyPDF2/_page.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,11 +51,12 @@
5151
TransformationMatrixType,
5252
deprecate_no_replacement,
5353
deprecate_with_replacement,
54+
logger_warning,
5455
matrix_multiply,
5556
)
5657
from .constants import PageAttributes as PG
5758
from .constants import Ressources as RES
58-
from .errors import PageSizeNotDefinedError, PdfReadWarning
59+
from .errors import PageSizeNotDefinedError
5960
from .generic import (
6061
ArrayObject,
6162
ContentStream,
@@ -1430,9 +1431,9 @@ def process_operation(operator: bytes, operands: List) -> None:
14301431
text = self.extract_xform_text(xobj[operands[0]], orientations, space_width) # type: ignore
14311432
output += text
14321433
except Exception:
1433-
warnings.warn(
1434+
logger_warning(
14341435
f" impossible to decode XFormObject {operands[0]}",
1435-
PdfReadWarning,
1436+
__name__,
14361437
)
14371438
finally:
14381439
text = ""

PyPDF2/_reader.py

Lines changed: 21 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@
3030
import os
3131
import re
3232
import struct
33-
import warnings
3433
import zlib
3534
from io import BytesIO
3635
from pathlib import Path
@@ -54,6 +53,7 @@
5453
b_,
5554
deprecate_no_replacement,
5655
deprecate_with_replacement,
56+
logger_warning,
5757
read_non_whitespace,
5858
read_previous_line,
5959
read_until_whitespace,
@@ -70,7 +70,7 @@
7070
from .constants import PageAttributes as PG
7171
from .constants import PagesAttributes as PA
7272
from .constants import TrailerKeys as TK
73-
from .errors import PdfReadError, PdfReadWarning, PdfStreamError
73+
from .errors import PdfReadError, PdfStreamError
7474
from .generic import (
7575
ArrayObject,
7676
ContentStream,
@@ -258,10 +258,10 @@ def __init__(
258258
Dict[Any, Any]
259259
] = None # map page indirect_ref number to Page Number
260260
if hasattr(stream, "mode") and "b" not in stream.mode: # type: ignore
261-
warnings.warn(
261+
logger_warning(
262262
"PdfReader stream/file object is not in binary mode. "
263263
"It may not be read correctly.",
264-
PdfReadWarning,
264+
__name__,
265265
)
266266
if isinstance(stream, (str, Path)):
267267
with open(stream, "rb") as fh:
@@ -836,7 +836,7 @@ def _build_destination(
836836
try:
837837
return Destination(title, page, typ, *array) # type: ignore
838838
except PdfReadError:
839-
warnings.warn(f"Unknown destination: {title} {array}", PdfReadWarning)
839+
logger_warning(f"Unknown destination: {title} {array}", __name__)
840840
if self.strict:
841841
raise
842842
# create a link to first Page
@@ -1091,11 +1091,11 @@ def _get_object_from_stream(
10911091
except PdfStreamError as exc:
10921092
# Stream object cannot be read. Normally, a critical error, but
10931093
# Adobe Reader doesn't complain, so continue (in strict mode?)
1094-
warnings.warn(
1094+
logger_warning(
10951095
f"Invalid stream (index {i}) within object "
10961096
f"{indirect_reference.idnum} {indirect_reference.generation}: "
10971097
f"{exc}",
1098-
PdfReadWarning,
1098+
__name__,
10991099
)
11001100

11011101
if self.strict:
@@ -1162,10 +1162,10 @@ def get_object(self, indirect_reference: IndirectObject) -> Optional[PdfObject]:
11621162
retval, indirect_reference.idnum, indirect_reference.generation
11631163
)
11641164
else:
1165-
warnings.warn(
1165+
logger_warning(
11661166
f"Object {indirect_reference.idnum} {indirect_reference.generation} "
11671167
"not defined.",
1168-
PdfReadWarning,
1168+
__name__,
11691169
)
11701170
if self.strict:
11711171
raise PdfReadError("Could not find object.")
@@ -1207,9 +1207,9 @@ def read_object_header(self, stream: StreamType) -> Tuple[int, int]:
12071207
read_non_whitespace(stream)
12081208
stream.seek(-1, 1)
12091209
if extra and self.strict:
1210-
warnings.warn(
1210+
logger_warning(
12111211
f"Superfluous whitespace found in object header {idnum} {generation}", # type: ignore
1212-
PdfReadWarning,
1212+
__name__,
12131213
)
12141214
return int(idnum), int(generation)
12151215

@@ -1250,7 +1250,7 @@ def cache_indirect_object(
12501250
if self.strict:
12511251
raise PdfReadError(msg)
12521252
else:
1253-
warnings.warn(msg)
1253+
logger_warning(msg, __name__)
12541254
self.resolved_objects[(generation, idnum)] = obj
12551255
return obj
12561256

@@ -1276,8 +1276,8 @@ def read(self, stream: StreamType) -> None:
12761276
if self.strict and xref_issue_nr:
12771277
raise PdfReadError("Broken xref table")
12781278
else:
1279-
warnings.warn(
1280-
f"incorrect startxref pointer({xref_issue_nr})", PdfReadWarning
1279+
logger_warning(
1280+
f"incorrect startxref pointer({xref_issue_nr})", __name__
12811281
)
12821282

12831283
# read all cross reference tables and their trailers
@@ -1335,7 +1335,7 @@ def _find_startxref_pos(self, stream: StreamType) -> int:
13351335
if not line.startswith(b"startxref"):
13361336
raise PdfReadError("startxref not found")
13371337
startxref = int(line[9:].strip())
1338-
warnings.warn("startxref on same line as offset", PdfReadWarning)
1338+
logger_warning("startxref on same line as offset", __name__)
13391339
else:
13401340
line = read_previous_line(stream)
13411341
if line[:9] != b"startxref":
@@ -1355,9 +1355,9 @@ def _read_standard_xref_table(self, stream: StreamType) -> None:
13551355
if firsttime and num != 0:
13561356
self.xref_index = num
13571357
if self.strict:
1358-
warnings.warn(
1358+
logger_warning(
13591359
"Xref table not zero-indexed. ID numbers for objects will be corrected.",
1360-
PdfReadWarning,
1360+
__name__,
13611361
)
13621362
# if table not zero indexed, could be due to error from when PDF was created
13631363
# which will lead to mismatched indices later on, only warned and corrected if self.strict==True
@@ -1474,9 +1474,10 @@ def _read_xref_other_error(
14741474
"/Prev=0 in the trailer (try opening with strict=False)"
14751475
)
14761476
else:
1477-
warnings.warn(
1477+
logger_warning(
14781478
"/Prev=0 in the trailer - assuming there"
1479-
" is no previous xref table"
1479+
" is no previous xref table",
1480+
__name__,
14801481
)
14811482
return None
14821483
# bad xref character at startxref. Let's see if we can find
@@ -1502,7 +1503,7 @@ def _read_xref_other_error(
15021503
# no xref table found at specified location
15031504
if "/Root" in self.trailer and not self.strict:
15041505
# if Root has been already found, just raise warning
1505-
warnings.warn("Invalid parent xref., rebuild xref", PdfReadWarning)
1506+
logger_warning("Invalid parent xref., rebuild xref", __name__)
15061507
try:
15071508
self._rebuild_xref_table(stream)
15081509
return None

PyPDF2/_writer.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@
3535
import struct
3636
import time
3737
import uuid
38-
import warnings
3938
from hashlib import md5
4039
from typing import (
4140
Any,
@@ -49,8 +48,6 @@
4948
cast,
5049
)
5150

52-
from PyPDF2.errors import PdfReadWarning
53-
5451
from ._page import PageObject, _VirtualList
5552
from ._reader import PdfReader
5653
from ._security import _alg33, _alg34, _alg35
@@ -60,6 +57,7 @@
6057
b_,
6158
deprecate_bookmark,
6259
deprecate_with_replacement,
60+
logger_warning,
6361
)
6462
from .constants import AnnotationDictionaryAttributes
6563
from .constants import CatalogAttributes as CA
@@ -780,9 +778,10 @@ def write(self, stream: StreamType) -> None:
780778
the write method and the tell method, similar to a file object.
781779
"""
782780
if hasattr(stream, "mode") and "b" not in stream.mode:
783-
warnings.warn(
781+
logger_warning(
784782
f"File <{stream.name}> to write to is not in binary mode. " # type: ignore
785-
"It may not be written to correctly."
783+
"It may not be written to correctly.",
784+
__name__,
786785
)
787786

788787
if not self._root:
@@ -966,10 +965,10 @@ def _resolve_indirect_object(self, data: IndirectObject) -> IndirectObject:
966965
real_obj = data.pdf.get_object(data)
967966

968967
if real_obj is None:
969-
warnings.warn(
968+
logger_warning(
970969
f"Unable to resolve [{data.__class__.__name__}: {data}], "
971970
"returning NullObject instead",
972-
PdfReadWarning,
971+
__name__,
973972
)
974973
real_obj = NullObject()
975974

@@ -1703,8 +1702,9 @@ def _set_page_layout(self, layout: Union[NameObject, LayoutType]) -> None:
17031702
"""
17041703
if not isinstance(layout, NameObject):
17051704
if layout not in self._valid_layouts:
1706-
warnings.warn(
1707-
f"Layout should be one of: {'', ''.join(self._valid_layouts)}"
1705+
logger_warning(
1706+
f"Layout should be one of: {'', ''.join(self._valid_layouts)}",
1707+
__name__,
17081708
)
17091709
layout = NameObject(layout)
17101710
self._root_object.update({NameObject("/PageLayout"): layout})
@@ -1803,7 +1803,9 @@ def set_page_mode(self, mode: PagemodeType) -> None:
18031803
mode_name: NameObject = mode
18041804
else:
18051805
if mode not in self._valid_modes:
1806-
warnings.warn(f"Mode should be one of: {', '.join(self._valid_modes)}")
1806+
logger_warning(
1807+
f"Mode should be one of: {', '.join(self._valid_modes)}", __name__
1808+
)
18071809
mode_name = NameObject(mode)
18081810
self._root_object.update({NameObject("/PageMode"): mode_name})
18091811

PyPDF2/generic.py

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@
3535
import hashlib
3636
import logging
3737
import re
38-
import warnings
3938
from enum import IntFlag
4039
from io import BytesIO
4140
from typing import (
@@ -74,12 +73,7 @@
7473
from .constants import StreamAttributes as SA
7574
from .constants import TypArguments as TA
7675
from .constants import TypFitArguments as TF
77-
from .errors import (
78-
STREAM_TRUNCATED_PREMATURELY,
79-
PdfReadError,
80-
PdfReadWarning,
81-
PdfStreamError,
82-
)
76+
from .errors import STREAM_TRUNCATED_PREMATURELY, PdfReadError, PdfStreamError
8377

8478
logger = logging.getLogger(__name__)
8579
ObjectPrefix = b"/<[tf(n%"
@@ -813,7 +807,7 @@ def read_unsized_from_steam(stream: StreamType, pdf: Any) -> bytes: # PdfReader
813807
if pdf is not None and pdf.strict:
814808
raise PdfReadError(msg)
815809
else:
816-
warnings.warn(msg, PdfReadWarning)
810+
logger_warning(msg, __name__)
817811

818812
pos = stream.tell()
819813
s = read_non_whitespace(stream)

docs/user/suppress-warnings.md

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,20 @@
1-
# Suppress Warnings and Log messages
1+
# Exceptions, Warnings, and Log messages
22

33
PyPDF2 makes use of 3 mechanisms to show that something went wrong:
44

5-
* **Exceptions**: Error-cases the client should explicitly handle. In the
6-
`strict=True` mode, most log messages will become exceptions. This can be
7-
useful in applications where you can force to user to fix the broken PDF.
8-
* **Warnings**: Avoidable issues, such as using deprecated classes / functions / parameters
9-
* **Log messages**: Nothing the client can do, but they should know it happened.
5+
* **Log messages** are informative messages that can be used for post-mortem
6+
analysis. Most of the time, users can ignore them. They come in different
7+
*levels*, such as info / warning / error indicating the severity.
8+
Examples are PDF files that do not fully comply with the standard but that PyPDF2 can still handle.
9+
* **Warnings** are avoidable issues, such as using deprecated classes /
10+
functions / parameters. Another example is missing capabilities of PyPDF2.
11+
In those cases, PyPDF2 users should adjust their code. Warnings
12+
are issued by the `warnings` module - those are different from the log-level
13+
"warning".
14+
* **Exceptions** are error-cases that PyPDF2 users should explicitly handle.
15+
In the `strict=True` mode, most log messages with the warning level will
16+
become exceptions. This can be useful in applications where you can force the
17+
user to fix the broken PDF.
1018

1119

1220
## Exceptions

tests/__init__.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import os
22
import ssl
33
import urllib.request
4+
from typing import List
45

56

67
def get_pdf_from_url(url: str, name: str) -> bytes:
@@ -30,3 +31,22 @@ def get_pdf_from_url(url: str, name: str) -> bytes:
3031
with open(cache_path, "rb") as fp:
3132
data = fp.read()
3233
return data
34+
35+
36+
def _strip_position(line: str) -> str:
37+
"""
38+
Remove the location information.
39+
40+
The message
41+
WARNING PyPDF2._reader:_utils.py:364 Xref table not zero-indexed.
42+
43+
becomes
44+
Xref table not zero-indexed.
45+
"""
46+
line = ".py".join(line.split(".py:")[1:])
47+
line = " ".join(line.split(" ")[1:])
48+
return line
49+
50+
51+
def normalize_warnings(caplog_text: str) -> List[str]:
    """
    Turn captured log text into a list of bare messages.

    The text is stripped of surrounding whitespace, split on newlines, and
    every resulting line is passed through ``_strip_position``.

    :param caplog_text: captured log output (presumably pytest's
        ``caplog.text`` - confirm against callers).
    :return: one stripped message per captured line.
    """
    captured_lines = caplog_text.strip().split("\n")
    return list(map(_strip_position, captured_lines))

tests/bench.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
import os
22

3-
import pytest
4-
53
import PyPDF2
64
from PyPDF2 import PdfReader, Transformation
75
from PyPDF2.generic import Destination
@@ -127,7 +125,6 @@ def text_extraction(pdf_path):
127125
return text
128126

129127

130-
@pytest.mark.filterwarnings("ignore::PyPDF2.errors.PdfReadWarning")
131128
def test_text_extraction(benchmark):
132129
file_path = os.path.join(SAMPLE_ROOT, "009-pdflatex-geotopo/GeoTopo.pdf")
133130
benchmark(text_extraction, file_path)

0 commit comments

Comments
 (0)