Skip to content

Commit

Permalink
Merge branch 'main' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
MartinThoma authored Apr 24, 2022
2 parents 25befd4 + 6729b80 commit b033b6d
Show file tree
Hide file tree
Showing 16 changed files with 470 additions and 55 deletions.
11 changes: 7 additions & 4 deletions PyPDF2/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
from ._version import __version__
from .merger import PdfFileMerger
from .pagerange import PageRange, parse_filename_page_ranges
from .pdf import PdfFileReader, PdfFileWriter
from PyPDF2._version import __version__
from PyPDF2.merger import PdfFileMerger
from PyPDF2.pagerange import PageRange, parse_filename_page_ranges
from PyPDF2.papersizes import PaperSize
from PyPDF2.pdf import PdfFileReader, PdfFileWriter
from PyPDF2 import pdf

__all__ = [
"__version__",
"PageRange",
"PaperSize",
"parse_filename_page_ranges",
"pdf",
"PdfFileMerger",
Expand Down
70 changes: 37 additions & 33 deletions PyPDF2/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1217,39 +1217,43 @@ def decode_pdfdocencoding(byte_array):
retval += c
return retval

# PDFDocEncoding Character Set: Table D.2 of PDF Reference 1.7
# C.1 Predefined encodings sorted by character name of another PDF reference
# Some indices have '\u0000' although they should have something else:
# 22: should be '\u0017'
_pdfDocEncoding = (
u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'),
u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'),
u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'), u_('\u0000'),
u_('\u02d8'), u_('\u02c7'), u_('\u02c6'), u_('\u02d9'), u_('\u02dd'), u_('\u02db'), u_('\u02da'), u_('\u02dc'),
u_('\u0020'), u_('\u0021'), u_('\u0022'), u_('\u0023'), u_('\u0024'), u_('\u0025'), u_('\u0026'), u_('\u0027'),
u_('\u0028'), u_('\u0029'), u_('\u002a'), u_('\u002b'), u_('\u002c'), u_('\u002d'), u_('\u002e'), u_('\u002f'),
u_('\u0030'), u_('\u0031'), u_('\u0032'), u_('\u0033'), u_('\u0034'), u_('\u0035'), u_('\u0036'), u_('\u0037'),
u_('\u0038'), u_('\u0039'), u_('\u003a'), u_('\u003b'), u_('\u003c'), u_('\u003d'), u_('\u003e'), u_('\u003f'),
u_('\u0040'), u_('\u0041'), u_('\u0042'), u_('\u0043'), u_('\u0044'), u_('\u0045'), u_('\u0046'), u_('\u0047'),
u_('\u0048'), u_('\u0049'), u_('\u004a'), u_('\u004b'), u_('\u004c'), u_('\u004d'), u_('\u004e'), u_('\u004f'),
u_('\u0050'), u_('\u0051'), u_('\u0052'), u_('\u0053'), u_('\u0054'), u_('\u0055'), u_('\u0056'), u_('\u0057'),
u_('\u0058'), u_('\u0059'), u_('\u005a'), u_('\u005b'), u_('\u005c'), u_('\u005d'), u_('\u005e'), u_('\u005f'),
u_('\u0060'), u_('\u0061'), u_('\u0062'), u_('\u0063'), u_('\u0064'), u_('\u0065'), u_('\u0066'), u_('\u0067'),
u_('\u0068'), u_('\u0069'), u_('\u006a'), u_('\u006b'), u_('\u006c'), u_('\u006d'), u_('\u006e'), u_('\u006f'),
u_('\u0070'), u_('\u0071'), u_('\u0072'), u_('\u0073'), u_('\u0074'), u_('\u0075'), u_('\u0076'), u_('\u0077'),
u_('\u0078'), u_('\u0079'), u_('\u007a'), u_('\u007b'), u_('\u007c'), u_('\u007d'), u_('\u007e'), u_('\u0000'),
u_('\u2022'), u_('\u2020'), u_('\u2021'), u_('\u2026'), u_('\u2014'), u_('\u2013'), u_('\u0192'), u_('\u2044'),
u_('\u2039'), u_('\u203a'), u_('\u2212'), u_('\u2030'), u_('\u201e'), u_('\u201c'), u_('\u201d'), u_('\u2018'),
u_('\u2019'), u_('\u201a'), u_('\u2122'), u_('\ufb01'), u_('\ufb02'), u_('\u0141'), u_('\u0152'), u_('\u0160'),
u_('\u0178'), u_('\u017d'), u_('\u0131'), u_('\u0142'), u_('\u0153'), u_('\u0161'), u_('\u017e'), u_('\u0000'),
u_('\u20ac'), u_('\u00a1'), u_('\u00a2'), u_('\u00a3'), u_('\u00a4'), u_('\u00a5'), u_('\u00a6'), u_('\u00a7'),
u_('\u00a8'), u_('\u00a9'), u_('\u00aa'), u_('\u00ab'), u_('\u00ac'), u_('\u0000'), u_('\u00ae'), u_('\u00af'),
u_('\u00b0'), u_('\u00b1'), u_('\u00b2'), u_('\u00b3'), u_('\u00b4'), u_('\u00b5'), u_('\u00b6'), u_('\u00b7'),
u_('\u00b8'), u_('\u00b9'), u_('\u00ba'), u_('\u00bb'), u_('\u00bc'), u_('\u00bd'), u_('\u00be'), u_('\u00bf'),
u_('\u00c0'), u_('\u00c1'), u_('\u00c2'), u_('\u00c3'), u_('\u00c4'), u_('\u00c5'), u_('\u00c6'), u_('\u00c7'),
u_('\u00c8'), u_('\u00c9'), u_('\u00ca'), u_('\u00cb'), u_('\u00cc'), u_('\u00cd'), u_('\u00ce'), u_('\u00cf'),
u_('\u00d0'), u_('\u00d1'), u_('\u00d2'), u_('\u00d3'), u_('\u00d4'), u_('\u00d5'), u_('\u00d6'), u_('\u00d7'),
u_('\u00d8'), u_('\u00d9'), u_('\u00da'), u_('\u00db'), u_('\u00dc'), u_('\u00dd'), u_('\u00de'), u_('\u00df'),
u_('\u00e0'), u_('\u00e1'), u_('\u00e2'), u_('\u00e3'), u_('\u00e4'), u_('\u00e5'), u_('\u00e6'), u_('\u00e7'),
u_('\u00e8'), u_('\u00e9'), u_('\u00ea'), u_('\u00eb'), u_('\u00ec'), u_('\u00ed'), u_('\u00ee'), u_('\u00ef'),
u_('\u00f0'), u_('\u00f1'), u_('\u00f2'), u_('\u00f3'), u_('\u00f4'), u_('\u00f5'), u_('\u00f6'), u_('\u00f7'),
u_('\u00f8'), u_('\u00f9'), u_('\u00fa'), u_('\u00fb'), u_('\u00fc'), u_('\u00fd'), u_('\u00fe'), u_('\u00ff')
u_('\u0000'), u_('\u0001'), u_('\u0002'), u_('\u0003'), u_('\u0004'), u_('\u0005'), u_('\u0006'), u_('\u0007'), # 0 - 7
u_('\u0008'), u_('\u0009'), u_('\u000a'), u_('\u000b'), u_('\u000c'), u_('\u000d'), u_('\u000e'), u_('\u000f'), # 8 - 15
u_('\u0010'), u_('\u0011'), u_('\u0012'), u_('\u0013'), u_('\u0014'), u_('\u0015'), u_('\u0000'), u_('\u0017'), # 16 - 23
u_('\u02d8'), u_('\u02c7'), u_('\u02c6'), u_('\u02d9'), u_('\u02dd'), u_('\u02db'), u_('\u02da'), u_('\u02dc'), # 24 - 31
u_('\u0020'), u_('\u0021'), u_('\u0022'), u_('\u0023'), u_('\u0024'), u_('\u0025'), u_('\u0026'), u_('\u0027'), # 32 - 39
u_('\u0028'), u_('\u0029'), u_('\u002a'), u_('\u002b'), u_('\u002c'), u_('\u002d'), u_('\u002e'), u_('\u002f'), # 40 - 47
u_('\u0030'), u_('\u0031'), u_('\u0032'), u_('\u0033'), u_('\u0034'), u_('\u0035'), u_('\u0036'), u_('\u0037'), # 48 - 55
u_('\u0038'), u_('\u0039'), u_('\u003a'), u_('\u003b'), u_('\u003c'), u_('\u003d'), u_('\u003e'), u_('\u003f'), # 56 - 63
u_('\u0040'), u_('\u0041'), u_('\u0042'), u_('\u0043'), u_('\u0044'), u_('\u0045'), u_('\u0046'), u_('\u0047'), # 64 - 71
u_('\u0048'), u_('\u0049'), u_('\u004a'), u_('\u004b'), u_('\u004c'), u_('\u004d'), u_('\u004e'), u_('\u004f'), # 72 - 79
u_('\u0050'), u_('\u0051'), u_('\u0052'), u_('\u0053'), u_('\u0054'), u_('\u0055'), u_('\u0056'), u_('\u0057'), # 80 - 87
u_('\u0058'), u_('\u0059'), u_('\u005a'), u_('\u005b'), u_('\u005c'), u_('\u005d'), u_('\u005e'), u_('\u005f'), # 88 - 95
u_('\u0060'), u_('\u0061'), u_('\u0062'), u_('\u0063'), u_('\u0064'), u_('\u0065'), u_('\u0066'), u_('\u0067'), # 96 - 103
u_('\u0068'), u_('\u0069'), u_('\u006a'), u_('\u006b'), u_('\u006c'), u_('\u006d'), u_('\u006e'), u_('\u006f'), # 104 - 111
u_('\u0070'), u_('\u0071'), u_('\u0072'), u_('\u0073'), u_('\u0074'), u_('\u0075'), u_('\u0076'), u_('\u0077'), # 112 - 119
u_('\u0078'), u_('\u0079'), u_('\u007a'), u_('\u007b'), u_('\u007c'), u_('\u007d'), u_('\u007e'), u_('\u0000'), # 120 - 127
u_('\u2022'), u_('\u2020'), u_('\u2021'), u_('\u2026'), u_('\u2014'), u_('\u2013'), u_('\u0192'), u_('\u2044'), # 128 - 135
u_('\u2039'), u_('\u203a'), u_('\u2212'), u_('\u2030'), u_('\u201e'), u_('\u201c'), u_('\u201d'), u_('\u2018'), # 136 - 143
u_('\u2019'), u_('\u201a'), u_('\u2122'), u_('\ufb01'), u_('\ufb02'), u_('\u0141'), u_('\u0152'), u_('\u0160'), # 144 - 151
u_('\u0178'), u_('\u017d'), u_('\u0131'), u_('\u0142'), u_('\u0153'), u_('\u0161'), u_('\u017e'), u_('\u0000'), # 152 - 159
u_('\u20ac'), u_('\u00a1'), u_('\u00a2'), u_('\u00a3'), u_('\u00a4'), u_('\u00a5'), u_('\u00a6'), u_('\u00a7'), # 160 - 167
u_('\u00a8'), u_('\u00a9'), u_('\u00aa'), u_('\u00ab'), u_('\u00ac'), u_('\u0000'), u_('\u00ae'), u_('\u00af'), # 168 - 175
u_('\u00b0'), u_('\u00b1'), u_('\u00b2'), u_('\u00b3'), u_('\u00b4'), u_('\u00b5'), u_('\u00b6'), u_('\u00b7'), # 176 - 183
u_('\u00b8'), u_('\u00b9'), u_('\u00ba'), u_('\u00bb'), u_('\u00bc'), u_('\u00bd'), u_('\u00be'), u_('\u00bf'), # 184 - 191
u_('\u00c0'), u_('\u00c1'), u_('\u00c2'), u_('\u00c3'), u_('\u00c4'), u_('\u00c5'), u_('\u00c6'), u_('\u00c7'), # 192 - 199
u_('\u00c8'), u_('\u00c9'), u_('\u00ca'), u_('\u00cb'), u_('\u00cc'), u_('\u00cd'), u_('\u00ce'), u_('\u00cf'), # 200 - 207
u_('\u00d0'), u_('\u00d1'), u_('\u00d2'), u_('\u00d3'), u_('\u00d4'), u_('\u00d5'), u_('\u00d6'), u_('\u00d7'), # 208 - 215
u_('\u00d8'), u_('\u00d9'), u_('\u00da'), u_('\u00db'), u_('\u00dc'), u_('\u00dd'), u_('\u00de'), u_('\u00df'), # 216 - 223
u_('\u00e0'), u_('\u00e1'), u_('\u00e2'), u_('\u00e3'), u_('\u00e4'), u_('\u00e5'), u_('\u00e6'), u_('\u00e7'), # 224 - 231
u_('\u00e8'), u_('\u00e9'), u_('\u00ea'), u_('\u00eb'), u_('\u00ec'), u_('\u00ed'), u_('\u00ee'), u_('\u00ef'), # 232 - 239
u_('\u00f0'), u_('\u00f1'), u_('\u00f2'), u_('\u00f3'), u_('\u00f4'), u_('\u00f5'), u_('\u00f6'), u_('\u00f7'), # 240 - 247
u_('\u00f8'), u_('\u00f9'), u_('\u00fa'), u_('\u00fb'), u_('\u00fc'), u_('\u00fd'), u_('\u00fe'), u_('\u00ff') # 248 - 255
)

assert len(_pdfDocEncoding) == 256
Expand All @@ -1259,5 +1263,5 @@ def decode_pdfdocencoding(byte_array):
char = _pdfDocEncoding[i]
if char == u_("\u0000"):
continue
assert char not in _pdfDocEncoding_rev
assert char not in _pdfDocEncoding_rev, str(char) + " at " + str(i) + " already at " + str(_pdfDocEncoding_rev[char])
_pdfDocEncoding_rev[char] = i
48 changes: 48 additions & 0 deletions PyPDF2/papersizes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
"""Helper to get paper sizes."""

from collections import namedtuple

Dimensions = namedtuple("Dimensions", ["width", "height"])


class PaperSize(object):
    """(width, height) of the paper in portrait mode in pixels at 72 ppi."""

    # How the values are calculated:
    # 1. Get the size of the paper in mm
    # 2. Convert it to inches (25.4 millimeters are equal to 1 inch)
    # 3. Convert it to pixels at 72 dpi (1 inch is equal to 72 pixels)

    # All DIN-A paper sizes follow this pattern:
    # 2 x A(n) = A(n-1)
    # So the width of the next bigger one is the height of the smaller one
    # (e.g. A3.width == A4.height).
    # The ratio width:height is always approximately 1:2**0.5
    # Additionally, A0 is defined to have an area of 1 m**2
    # Be aware of rounding issues!
    A0 = Dimensions(2384, 3370)  # 841mm x 1189mm
    A1 = Dimensions(1684, 2384)
    A2 = Dimensions(1191, 1684)
    A3 = Dimensions(842, 1191)
    A4 = Dimensions(595, 842)  # Printer paper, documents - by far the most common
    A5 = Dimensions(420, 595)  # Paperback books
    A6 = Dimensions(298, 420)  # Post cards
    A7 = Dimensions(210, 298)
    A8 = Dimensions(147, 210)

    # Envelopes
    C4 = Dimensions(649, 918)


# The DIN-A sizes A0 through A8, ordered from the largest to the smallest.
_din_a = [getattr(PaperSize, "A%d" % index) for index in range(9)]
42 changes: 31 additions & 11 deletions PyPDF2/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ def set_need_appearances_writer(self):
self._root_object["/AcroForm"][need_appearances] = BooleanObject(True)

except Exception as e:
print('set_need_appearances_writer() catch : ', repr(e))
logger.error('set_need_appearances_writer() catch : ', repr(e))

def addPage(self, page):
"""
Expand Down Expand Up @@ -371,7 +371,7 @@ def appendPagesFromReader(self, reader, after_page_append=None):
# Trigger callback, pass writer page as parameter
if callable(after_page_append): after_page_append(writer_page)

def updatePageFormFieldValues(self, page, fields):
def updatePageFormFieldValues(self, page, fields, flags=0):
'''
Update the form field values for a given page from a fields dictionary.
Copy field texts and values from fields to page.
Expand All @@ -381,6 +381,9 @@ def updatePageFormFieldValues(self, page, fields):
and field data will be updated.
:param fields: a Python dictionary of field names (/T) and text
values (/V)
:param flags: An integer (0 to 7). The first bit sets ReadOnly, the
second bit sets Required, the third bit sets NoExport. See
PDF Reference Table 8.70 for details.
'''
# Iterate through pages, update field values
for j in range(0, len(page[PG.ANNOTS])):
Expand All @@ -394,6 +397,8 @@ def updatePageFormFieldValues(self, page, fields):
writer_annot.update({
NameObject("/V"): TextStringObject(fields[field])
})
if flags:
writer_annot.update({NameObject("/Ff"): NumberObject(flags)})
elif writer_parent_annot.get('/T') == field:
writer_parent_annot.update({
NameObject("/V"): TextStringObject(fields[field])
Expand Down Expand Up @@ -424,7 +429,7 @@ def cloneDocumentFromReader(self, reader, after_page_append=None):
self.cloneReaderDocumentRoot(reader)
self.appendPagesFromReader(reader, after_page_append)

def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True):
def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True, permissions_flag=-1):
"""
Encrypt this PDF file with the PDF Standard encryption handler.
Expand All @@ -436,6 +441,13 @@ def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True):
:param bool use_128bit: flag as to whether to use 128bit
encryption. When false, 40bit encryption will be used. By default,
this flag is on.
:param unsigned int permissions_flag: permissions as described in
TABLE 3.20 of the PDF 1.7 specification. A bit value of 1 means the
permission is granted. Hence an integer value of -1 will set all
flags.
Bit position 3 is for printing, 4 is for modifying content, 5 and 6
control annotations, 9 for form fields, 10 for extraction of
text and graphics.
"""
import random
import time
Expand All @@ -449,8 +461,7 @@ def encrypt(self, user_pwd, owner_pwd = None, use_128bit = True):
V = 1
rev = 2
keylen = int(40 / 8)
# permit everything:
P = -1
P = permissions_flag
O = ByteStringObject(_alg33(owner_pwd, user_pwd, rev, keylen))
ID_1 = ByteStringObject(md5(b_(repr(time.time()))).digest())
ID_2 = ByteStringObject(md5(b_(repr(random.random()))).digest())
Expand Down Expand Up @@ -623,7 +634,7 @@ def _sweepIndirectReferences(self, externMap, data):
newobj = self._sweepIndirectReferences(externMap, newobj)
self._objects[idnum-1] = newobj
return newobj_ido
except ValueError:
except (ValueError, RecursionError):
# Unable to resolve the Object, returning NullObject instead.
warnings.warn("Unable to resolve [{}: {}], returning NullObject instead".format(
data.__class__.__name__, data
Expand Down Expand Up @@ -2071,7 +2082,7 @@ def _pairs(self, array):
def readNextEndLine(self, stream, limit_offset=0):
debug = False
if debug: print(">>readNextEndLine")
line = b_("")
line_parts = []
while True:
# Prevent infinite loops in malformed PDFs
if stream.tell() == 0 or stream.tell() == limit_offset:
Expand All @@ -2098,10 +2109,10 @@ def readNextEndLine(self, stream, limit_offset=0):
break
else:
if debug: print(" x is neither")
line = x + line
if debug: print((" RNEL line:", line))
line_parts.append(x)
if debug: print("leaving RNEL")
return line
line_parts.reverse()
return b"".join(line_parts)

def decrypt(self, password):
"""
Expand Down Expand Up @@ -2766,7 +2777,7 @@ def compressContentStreams(self):
content = ContentStream(content, self.pdf)
self[NameObject("/Contents")] = content.flateEncode()

def extractText(self, Tj_sep="", TJ_sep=" "):
def extractText(self, Tj_sep="", TJ_sep=""):
"""
Locate all text drawing commands, in the order they are provided in the
content stream, and extract the text. This works well for some PDF
Expand Down Expand Up @@ -2808,6 +2819,15 @@ def extractText(self, Tj_sep="", TJ_sep=" "):
if isinstance(i, TextStringObject):
text += TJ_sep
text += i
elif isinstance(i, NumberObject):
# a positive value decreases and the negative value increases
# space
if int(i) < 0:
if len(text) == 0 or text[-1] != " ":
text += " "
else:
if len(text) > 1 and text[-1] == " ":
text = text[:-1]
text += "\n"
return text

Expand Down
19 changes: 18 additions & 1 deletion Resources/crazyones.txt
Original file line number Diff line number Diff line change
@@ -1 +1,18 @@
The Cr azy Ones Octob er 14, 1998 Heres to the crazy ones. The mis˝ts. The reb els. The troublemak ers. The round p egs in the square holes. The ones who see things di˙eren tly . Theyre not fond of rules. And they ha v e no resp ect for the status quo. Y ou can quote them, disagree with them, glorify or vilify them. Ab out the only thing y ou cant do is ignore them. Because they c hange things. They in v en t. They imagine. They heal. They explore. They create. They inspire. They push the h uman race forw ard. Ma yb e they ha v e to b e crazy . Ho w else can y ou stare at an empt y can v as and see a w ork of art? Or sit in silence and hear a song thats nev er b een written? Or gaze at a red planet and see a lab oratory on wheels? W e mak e to ols for these kinds of p eople. While some see them as the crazy ones, w e see genius. Because the p eople who are crazy enough to think they can c hange the w orld, are the ones who do.
The Crazy Ones
October 14, 1998
Heres to the crazy ones. The mis˝ts. The reb els. The troublemakers.
The round p egs in the square holes.
The ones who see things di˙erently. Theyre not fond of rules. And
they have no resp ect for the status quo. You can quote them,
disagree with them, glorify or vilify them.
Ab out the only thing you cant do is ignore them. Because they change
things. They invent. They imagine. They heal. They explore. They
create. They inspire. They push the human race forward.
Mayb e they have to b e crazy.
How else can you stare at an empty canvas and see a work of art? Or
sit in silence and hear a song thats never b een written? Or gaze at
a red planet and see a lab oratory on wheels?
We make to ols for these kinds of p eople.
While some see them as the crazy ones, we see genius. Because the
p eople who are crazy enough to think they can change the world,
are the ones who do.
30 changes: 30 additions & 0 deletions Tests/test_papersizes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from PyPDF2 import papersizes
import pytest


def test_din_a0():
    """Check A0 against its definition: 841mm x 1189mm, roughly 1 m**2."""
    dim = papersizes.PaperSize.A0

    # 72 pixels are 1 inch and 25.4 millimeters are 1 inch
    conversion_factor = 72 / 25.4  # pixels per millimeter

    # float() keeps the division below a true division on Python 2 as well
    area_square_pixels = float(dim.width) * dim.height
    area_square_inch = area_square_pixels / 72**2
    area_square_mm = area_square_inch * 25.4**2

    # 841mm * 1189mm = 999949 mm**2; rounding to whole pixels costs a little
    assert abs(area_square_mm - 999949) < 100

    # Compare each side with its own nominal length. abs() is required so
    # that a value which is far too small fails as well.
    assert abs(dim.width - 841 * conversion_factor) < 1
    assert abs(dim.height - 1189 * conversion_factor) < 1



@pytest.mark.parametrize("dimensions", papersizes._din_a)
def test_din_a_ratio(dimensions):
    """Each DIN-A size has a height of roughly width * 2**0.5."""
    ideal_height = dimensions.width * 2**0.5
    deviation = abs(dimensions.height - ideal_height)
    assert deviation <= 2.5


@pytest.mark.parametrize(
    "dimensions_a, dimensions_b",
    list(zip(papersizes._din_a, papersizes._din_a[1:])),
)
def test_din_a_doubling(dimensions_a, dimensions_b):
    """The height of a size is about twice the width of the next entry."""
    doubled_width = 2 * dimensions_b.width
    deviation = abs(dimensions_a.height - doubled_width)
    assert deviation <= 4
Loading

0 comments on commit b033b6d

Please sign in to comment.