Skip to content

Commit

Permalink
Reworked Archive.add(); addresses #3126.
Browse files Browse the repository at this point in the history
src/__init__.py:
    Simplified the logic so we systematically switch on the type of
    `content`. This also avoids behaviour depending on whether str(content)
    (when content is not itself a string) happens to match an existing file or
    directory.

    Also removed the local variables and pass params explicitly to local fn
    `make_subarch()`.

    Also use `assert` if we are passed incorrect types, instead of explicitly
    raising an exception.

tests/test_general.py:
    Added test_archive_3126() for #3126.
  • Loading branch information
julian-smith-artifex-com committed Feb 14, 2024
1 parent ab6600c commit b6a12af
Show file tree
Hide file tree
Showing 2 changed files with 86 additions and 98 deletions.
172 changes: 75 additions & 97 deletions src/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1678,22 +1678,24 @@ def add( self, content, path=None):
Add a sub-archive.

Args:
content: content to be added. May be one of Archive, folder
name, file name, raw bytes (bytes, bytearray), zipfile,
tarfile, or a sequence of any of these types.
content:
The content to be added. May be one of:
`str` - must be path of directory or file.
`bytes`, `bytearray`, `io.BytesIO` - raw data.
`zipfile.Zipfile`.
`tarfile.TarFile`.
`fitz.Archive`.
A two-item tuple `(data, name)`.
List or tuple (but not tuple with length 2) of the above.
path: (str) a "virtual" path name, under which the elements
of content can be retrieved. Use it to e.g. cope with
duplicate element names.
'''
def bin_ok(x):
def is_binary_data(x):
return isinstance(x, (bytes, bytearray, io.BytesIO))

entries = []
mount = None
fmt = None

def make_subarch():
subarch = {"fmt": fmt, "entries": entries, "path": mount}
def make_subarch(entries, mount, fmt):
subarch = dict(fmt=fmt, entries=entries, path=mount)
if fmt != "tree" or self._subarchives == []:
self._subarchives.append(subarch)
else:
Expand All @@ -1703,102 +1705,78 @@ def make_subarch():
else:
ltree["entries"].extend(subarch["entries"])
self._subarchives[-1] = ltree
return

if isinstance(content, zipfile.ZipFile):
fmt = "zip"
entries = content.namelist()
mount = path
if isinstance(content, pathlib.Path):
content = str(content)

if isinstance(content, str):
if os.path.isdir(content):
self._add_dir(content, path)
return make_subarch(os.listdir(content), path, 'dir')
elif os.path.isfile(content):
assert isinstance(path, str) and path != '', \
f'Need name for binary content, but {path=}.'
with open(content) as f:
ff = f.read()
self._add_treeitem(ff, path)
return make_subarch([path], None, 'tree')
else:
raise ValueError(f'Not a file or directory: {content!r}')

elif is_binary_data(content):
assert isinstance(path, str) and path != '' \
f'Need name for binary content, but {path=}.'
self._add_treeitem(content, path)
return make_subarch([path], None, 'tree')

elif isinstance(content, zipfile.ZipFile):
filename = getattr(content, "filename", None)
fp = getattr(content, "fp", None)
if filename:
self._add_ziptarfile(filename, 1, path)
if filename is None:
fp = content.fp.getvalue()
self._add_ziptarmemory(fp, 1, path)
else:
self._add_ziptarmemory(fp.getvalue(), 1, path)
return make_subarch()
self._add_ziptarfile(filename, 1, path)
return make_subarch(content.namelist(), path, 'zip')

if isinstance(content, tarfile.TarFile):
fmt = "tar"
entries = content.getnames()
mount = path
elif isinstance(content, tarfile.TarFile):
filename = getattr(content.fileobj, "name", None)
fp = content.fileobj
if not isinstance(fp, io.BytesIO) and not filename:
fp = fp.fileobj
if filename:
self._add_ziptarfile(filename, 0, path)
else:
if filename is None:
fp = content.fileobj
if not isinstance(fp, io.BytesIO):
fp = fp.fileobj
self._add_ziptarmemory(fp.getvalue(), 0, path)
return make_subarch()
else:
self._add_ziptarfile(filename, 0, path)
return make_subarch(content.getnames(), path, 'tar')

if isinstance(content, Archive):
fmt = "multi"
mount = path
elif isinstance(content, Archive):
self._add_arch(content, path)
return make_subarch()

if bin_ok(content):
if not (path and type(path) is str):
raise ValueError("need name for binary content")
fmt = "tree"
mount = None
entries = [path]
self._add_treeitem(content, path)
return make_subarch()

if hasattr(content, "name"):
content = content.name
elif isinstance(content, pathlib.Path):
content = str(content)

if os.path.isdir(str(content)):
a0 = str(content)
fmt = "dir"
mount = path
entries = os.listdir(a0)
self._add_dir(a0, path)
return make_subarch()

if os.path.isfile(str(content)):
if not (path and type(path) is str):
raise ValueError("need name for binary content")
a0 = str(content)
_ = open(a0, "rb")
ff = _.read()
_.close()
fmt = "tree"
mount = None
entries = [path]
self._add_treeitem(ff, path)
return make_subarch()

if type(content) is str or not getattr(content, "__getitem__", None):
raise ValueError("bad archive content")

#----------------------------------------
# handling sequence types here
#----------------------------------------

if len(content) == 2: # covers the tree item plus path
return make_subarch([], path, 'multi')

if isinstance(content, tuple) and len(content) == 2:
# covers the tree item plus path
data, name = content
if bin_ok(data) or os.path.isfile(str(data)):
if not type(name) is str:
raise ValueError(f"bad item name {name}")
mount = path
fmt = "tree"
if bin_ok(data):
self._add_treeitem(data, name, path=mount)
else:
_ = open(str(data), "rb")
ff = _.read()
_.close()
self._add_treeitem(ff, name, path=mount)
entries = [name]
return make_subarch()

# deal with sequence of disparate items
for item in content:
self.add(item, path)
assert isinstance(name, str), f'Unexpected {type(name)=}'
if is_binary_data(data):
self._add_treeitem(data, name, path=path)
elif isinstance(data, str):
if os.path.isfile(data):
with open(data, 'rb') as f:
ff = f.read()
self._add_treeitem(ff, name, path=path)
else:
assert 0, f'Unexpected {type(data)=}.'
return make_subarch([name], path, 'tree')

elif hasattr(content, '__getitem__'):
# Deal with sequence of disparate items.
for item in content:
self.add(item, path)
return

else:
raise TypeError(f'Unrecognised type {type(content)}.')
assert 0

@property
def entry_list( self):
Expand Down
12 changes: 11 additions & 1 deletion tests/test_general.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import os

import fitz
import pathlib
import pickle

scriptdir = os.path.abspath(os.path.dirname(__file__))
Expand Down Expand Up @@ -87,7 +88,7 @@ def test_annot_clean_contents():
annot = page.add_highlight_annot((10, 10, 20, 20))

# the annotation appearance will not start with command b"q"
assert annot._getAP().startswith(b"q") == False


# invoke appearance stream cleaning and reformatting
annot.clean_contents()
Expand Down Expand Up @@ -872,3 +873,12 @@ def test_xml():
def test_3112_set_xml_metadata():
document = fitz.Document()
document.set_xml_metadata('hello world')

def test_archive_3126():
if not hasattr(fitz, 'mupdf'):
print(f'Not running because known to fail with classic.')
return
p = os.path.abspath(f'{__file__}/../../tests/resources')
p = pathlib.Path(p)
archive = fitz.Archive(p)

0 comments on commit b6a12af

Please sign in to comment.