Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Support R6 decrypting #1015

Merged
merged 7 commits into from
Jun 26, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 34 additions & 54 deletions PyPDF2/_encryption.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
import hashlib
import random
import struct
from typing import Dict, Optional, Tuple, Union, cast
from typing import Optional, Tuple, Union, cast

from PyPDF2.errors import DependencyError
from PyPDF2.generic import (
Expand Down Expand Up @@ -226,15 +226,6 @@ def _padding(data: bytes) -> bytes:
return (data + _PADDING)[:32]


def _bytes(value: Union[bytes, str]) -> bytes:
if isinstance(value, bytes):
return value
try:
return value.encode("latin-1")
except Exception: # noqa
return value.encode("utf-8")


class AlgV4:
@staticmethod
def compute_key(
Expand Down Expand Up @@ -642,10 +633,13 @@ def __init__(
self.StrF = StrF
self.EFF = EFF

# 1 => owner password
# 2 => user password
self._password_type = 0
MartinThoma marked this conversation as resolved.
Show resolved Hide resolved
self._key: Optional[bytes] = None
# keep key
self._user_keys: Dict = {}
self._owner_keys: Dict = {}

def verified(self) -> bool:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess `verified` means that the file was decrypted? Would `is_decrypted` be a better name?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, I think this should rather be the private `_verified` if users do not need to call it.

return self._password_type != 0

def decrypt_object(self, obj: PdfObject, idnum: int, generation: int) -> PdfObject:
"""
Expand Down Expand Up @@ -690,7 +684,7 @@ def decrypt_object(self, obj: PdfObject, idnum: int, generation: int) -> PdfObje
key_hash.update(b"sAlT")
aes128_key = key_hash.digest()[: min(n + 5, 16)]

# for V=5 use AES-256
# for AES-256
aes256_key = key

stmCrypt = self._get_crypt(self.StmF, rc4_key, aes128_key, aes256_key)
Expand All @@ -713,45 +707,32 @@ def _get_crypt(
else:
return CryptRC4(rc4_key)

def verify(self, user_pwd: Union[bytes, str], owner_pwd: Union[bytes, str]) -> int:
up_bytes = _bytes(user_pwd)
op_bytes = _bytes(owner_pwd)

key = self._user_keys.get(up_bytes)
if key:
self._key = key
return 1

key = self._owner_keys.get(op_bytes)
if key:
self._key = key
return 2

rc = 0
if self.algV <= 4:
key, rc = self.verify_v4(up_bytes, op_bytes)
def verify(self, password: Union[bytes, str]) -> int:
if isinstance(password, str):
try:
pwd = password.encode("latin-1")
except Exception: # noqa
pwd = password.encode("utf-8")
Comment on lines +718 to +721
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why did you choose latin-1 as the default?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be honest, I didn't think too much about it; I just copied it from the previous code. 😀

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hehe, ok. Thanks for the honesty ❤️

else:
key, rc = self.verify_v5(up_bytes, op_bytes)
pwd = password

if rc == 1:
self._key = key
self._user_keys[up_bytes] = key
elif rc == 2:
key, rc = self.verify_v4(pwd) if self.algV <= 4 else self.verify_v5(pwd)
if rc != 0:
self._password_type = rc
self._key = key
self._owner_keys[op_bytes] = key

return rc

def verify_v4(self, user_pwd: bytes, owner_pwd: bytes) -> Tuple[bytes, int]:
def verify_v4(self, password: bytes) -> Tuple[bytes, int]:
R = cast(int, self.entry["/R"])
P = cast(int, self.entry["/P"])
P = (P + 0x100000000) % 0x100000000 # maybe < 0
metadata_encrypted = self.entry.get("/EncryptMetadata", True)
o_entry = cast(ByteStringObject, self.entry["/O"].get_object()).original_bytes
u_entry = cast(ByteStringObject, self.entry["/U"].get_object()).original_bytes

key = AlgV4.verify_user_password(
user_pwd,
# verify owner password first
key = AlgV4.verify_owner_password(
password,
R,
self.key_size,
o_entry,
Expand All @@ -761,9 +742,9 @@ def verify_v4(self, user_pwd: bytes, owner_pwd: bytes) -> Tuple[bytes, int]:
metadata_encrypted,
)
if key:
return key, 1
key = AlgV4.verify_owner_password(
owner_pwd,
return key, 2
key = AlgV4.verify_user_password(
password,
R,
self.key_size,
o_entry,
Expand All @@ -773,26 +754,25 @@ def verify_v4(self, user_pwd: bytes, owner_pwd: bytes) -> Tuple[bytes, int]:
metadata_encrypted,
)
if key:
return key, 2
return key, 1
return b"", 0

def verify_v5(self, user_pwd: bytes, owner_pwd: bytes) -> Tuple[bytes, int]:
def verify_v5(self, password: bytes) -> Tuple[bytes, int]:
# TODO: use SASLprep process
o_entry = cast(ByteStringObject, self.entry["/O"].get_object()).original_bytes
u_entry = cast(ByteStringObject, self.entry["/U"].get_object()).original_bytes
oe_entry = cast(ByteStringObject, self.entry["/OE"].get_object()).original_bytes
ue_entry = cast(ByteStringObject, self.entry["/UE"].get_object()).original_bytes

rc = 0
key = AlgV5.verify_user_password(self.algR, user_pwd, u_entry, ue_entry)
if key:
# verify owner password first
key = AlgV5.verify_owner_password(self.algR, password, o_entry, oe_entry, u_entry)
rc = 2
if not key:
key = AlgV5.verify_user_password(self.algR, password, u_entry, ue_entry)
rc = 1
else:
key = AlgV5.verify_owner_password(self.algR, owner_pwd, o_entry, oe_entry, u_entry)
if key:
rc = 2
if rc == 0:
if not key:
return b"", 0

# verify Perms
perms = cast(ByteStringObject, self.entry["/Perms"].get_object()).original_bytes
P = cast(int, self.entry["/P"])
Expand Down
6 changes: 3 additions & 3 deletions PyPDF2/_merger.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ def merge(
reader = PdfReader(stream, strict=self.strict) # type: ignore[arg-type]
self.inputs.append((stream, reader, my_file))
if encryption_obj is not None:
reader._encryption = encryption_obj
reader.encryption = encryption_obj
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Every attribute that does not start with an underscore is part of the public interface. That means we cannot change the behavior / name without deprecation warnings.

Is there a reason why a user would want to access the encryption attribute directly?


# Find the range of pages to merge.
if pages is None:
Expand Down Expand Up @@ -204,8 +204,8 @@ def _create_stream(
stream = FileIO(fileobj, "rb")
my_file = True
elif isinstance(fileobj, PdfReader):
if hasattr(fileobj, "_encryption"):
encryption_obj = fileobj._encryption
if fileobj.encryption:
encryption_obj = fileobj.encryption
orig_tell = fileobj.stream.tell()
fileobj.stream.seek(0)
stream = BytesIO(fileobj.stream.read())
Expand Down
75 changes: 30 additions & 45 deletions PyPDF2/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
cast,
)

from ._encryption import Encryption
from ._page import PageObject, _VirtualList
from ._utils import (
StrByteType,
Expand All @@ -66,7 +67,6 @@
from .constants import PagesAttributes as PA
from .constants import TrailerKeys as TK
from .errors import (
DependencyError,
PdfReadError,
PdfReadWarning,
PdfStreamError,
Expand Down Expand Up @@ -254,8 +254,26 @@ def __init__(
self.stream = stream

self._override_encryption = False
if password is not None and self.decrypt(password) == 0:
raise PdfReadError("Wrong password")
self.encryption: Optional[Encryption] = None
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consistent with my question in _merger, I would prefer this to be the private attribute `_encryption` instead of the public attribute `encryption` — unless, of course, there is a reason why users would want to access it.

if self.is_encrypted:
self._override_encryption = True
# Some documents may not have a /ID, use two empty
# byte strings instead. Solves
# https://github.com/mstamy2/PyPDF2/issues/608
id_entry = self.trailer.get(TK.ID)
id1_entry = id_entry[0].get_object().original_bytes if id_entry else b""
encryptEntry = cast(DictionaryObject, self.trailer[TK.ENCRYPT].get_object())
self.encryption = Encryption.read(encryptEntry, id1_entry)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
encryptEntry = cast(DictionaryObject, self.trailer[TK.ENCRYPT].get_object())
self.encryption = Encryption.read(encryptEntry, id1_entry)
encrypt_entry = cast(DictionaryObject, self.trailer[TK.ENCRYPT].get_object())
self.encryption = Encryption.read(encrypt_entry, id1_entry)


# try empty password if no password provided
pwd = password if password is not None else b""
if self.encryption.verify(pwd) == 0 and password is not None:
# raise if password provided
raise PdfReadError("Wrong password")
self._override_encryption = False
else:
if password is not None:
raise PdfReadError("Not encrypted file")

@property
def metadata(self) -> Optional[DocumentInformation]:
Expand Down Expand Up @@ -344,17 +362,7 @@ def _get_num_pages(self) -> int:
# the PDF file's page count is used in this case. Otherwise,
# the original method (flattened page count) is used.
if self.is_encrypted:
try:
self._override_encryption = True
self.decrypt("")
return self.trailer[TK.ROOT]["/Pages"]["/Count"] # type: ignore
except DependencyError as e:
# make dependency error clear to users
raise e
except Exception:
raise PdfReadError("File has not been decrypted")
finally:
self._override_encryption = False
return self.trailer[TK.ROOT]["/Pages"]["/Count"] # type: ignore
else:
if self.flattened_pages is None:
self._flatten()
Expand Down Expand Up @@ -1047,13 +1055,13 @@ def get_object(self, indirect_reference: IndirectObject) -> Optional[PdfObject]:
retval = read_object(self.stream, self) # type: ignore

# override encryption is used for the /Encrypt dictionary
if not self._override_encryption and self.is_encrypted:
if not self._override_encryption and self.encryption is not None:
# if we don't have the encryption key:
if not hasattr(self, "_encryption"):
raise PdfReadError("file has not been decrypted")
if not self.encryption.verified():
raise PdfReadError("File has not been decrypted")
# otherwise, decrypt here...
retval = cast(PdfObject, retval)
retval = self._encryption.decrypt_object(
retval = self.encryption.decrypt_object(
retval, indirect_reference.idnum, indirect_reference.generation
)
else:
Expand Down Expand Up @@ -1562,15 +1570,12 @@ def decrypt(self, password: Union[str, bytes]) -> int:
:return: ``0`` if the password failed, ``1`` if the password matched the user
password, and ``2`` if the password matched the owner password.
:rtype: int
:raises NotImplementedError: if document uses an unsupported encryption
method.
"""

self._override_encryption = True
try:
return self._decrypt(password)
finally:
self._override_encryption = False
if not self.encryption:
raise PdfReadError("Not encrypted file")
# TODO: raise Exception for wrong password
return self.encryption.verify(password)

def decode_permissions(self, permissions_code: int) -> Dict[str, bool]:
# Takes the permissions as an integer, returns the allowed access
Expand All @@ -1587,26 +1592,6 @@ def decode_permissions(self, permissions_code: int) -> Dict[str, bool]:
) # bit 12
return permissions

def _decrypt(self, password: Union[str, bytes]) -> int:
# already got the KEY
if hasattr(self, "_encryption"):
return 3
from PyPDF2._encryption import Encryption

# Some documents may not have a /ID, use two empty
# byte strings instead. Solves
# https://github.com/mstamy2/PyPDF2/issues/608
id_entry = self.trailer.get(TK.ID)
id1_entry = id_entry[0].get_object().original_bytes if id_entry else b""
encryptEntry = cast(DictionaryObject, self.trailer[TK.ENCRYPT].get_object())
encryption = Encryption.read(encryptEntry, id1_entry)
# maybe password is owner password
# TODO: add/modify api to set owner password
rr = encryption.verify(password, password)
if rr > 0:
self._encryption = encryption
return rr

@property
def is_encrypted(self) -> bool:
"""
Expand Down