Skip to content

Add support for zstd compression where provided by stdlib #26

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 85 additions & 21 deletions tests/test_zipstream.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from zipstream import ZipStream


PY313 = sys.version_info < (3, 14)
PY36 = sys.version_info < (3, 7)
PY35 = sys.version_info < (3, 6)

Expand All @@ -30,6 +31,14 @@
("mbyte", 1024 * 1024),
]

# Compression methods shared by the parametrized tests
COMPRESS_TYPES = [
    zipfile.ZIP_STORED,
    zipfile.ZIP_LZMA,
    zipfile.ZIP_DEFLATED,
    zipfile.ZIP_BZIP2,
]
# ZIP_ZSTANDARD is only added when the interpreter is new enough (PY313 is false)
if not PY313:
    COMPRESS_TYPES += [zipfile.ZIP_ZSTANDARD]

# Patch is_dir onto ZipInfo objects in 3.5 to make testing easier
@pytest.fixture(autouse=PY35)
Expand Down Expand Up @@ -107,12 +116,7 @@ def _gen_rand():
# Tests start
################################

@pytest.mark.parametrize("ct", [
zipfile.ZIP_STORED,
zipfile.ZIP_LZMA,
zipfile.ZIP_DEFLATED,
zipfile.ZIP_BZIP2
])
@pytest.mark.parametrize("ct", COMPRESS_TYPES)
def test_zipstream_compression(caplog, files, ct):
"""Test that all types of compression properly compress and extract"""
caplog.set_level(logging.WARNING)
Expand All @@ -135,12 +139,7 @@ def test_zipstream_compression(caplog, files, ct):
_verify_zip_contains(zf, f)


@pytest.mark.parametrize("ct", [
zipfile.ZIP_STORED,
zipfile.ZIP_LZMA,
zipfile.ZIP_DEFLATED,
zipfile.ZIP_BZIP2
])
@pytest.mark.parametrize("ct", COMPRESS_TYPES)
@pytest.mark.parametrize("cl", [None, 2])
def test_mixed_compression_and_getinfo(ct, cl):
"""Test that files are compressed using the correct method and level and
Expand All @@ -159,11 +158,14 @@ def test_mixed_compression_and_getinfo(ct, cl):
zs.add(b"3c", arcname="3c", compress_type=zipfile.ZIP_DEFLATED, compress_level=TEST_CL)
zs.add(b"4", arcname="4", compress_type=zipfile.ZIP_BZIP2)
zs.add(b"4c", arcname="4c", compress_type=zipfile.ZIP_BZIP2, compress_level=TEST_CL)
if not PY313:
zs.add(b"5", arcname="5", compress_type=zipfile.ZIP_ZSTANDARD)
zs.add(b"5c", arcname="5c", compress_type=zipfile.ZIP_ZSTANDARD, compress_level=TEST_CL)

zf = _get_zip(zs)
zinfos = zf.infolist()
fullinfos = zs.info_list()
assert len(zinfos) == len(fullinfos) == 9
assert len(zinfos) == len(fullinfos) == 9 + (0 if PY313 else 2)

def assert_zinfo(idx, name, compress_type, compress_level):
zi = zinfos[idx]
Expand All @@ -189,6 +191,9 @@ def assert_zinfo(idx, name, compress_type, compress_level):
assert_zinfo(6, "3c", zipfile.ZIP_DEFLATED, TEST_CL)
assert_zinfo(7, "4", zipfile.ZIP_BZIP2, cl)
assert_zinfo(8, "4c", zipfile.ZIP_BZIP2, TEST_CL)
if not PY313:
assert_zinfo(9, "5", zipfile.ZIP_ZSTANDARD, cl)
assert_zinfo(10, "5c", zipfile.ZIP_ZSTANDARD, TEST_CL)


@pytest.mark.parametrize("zip64", [False, True])
Expand Down Expand Up @@ -368,6 +373,34 @@ def test_invalid_compression(ct):
zs.add(".", arcname=".", compress_type=ct)


@pytest.mark.skipif(PY313, reason="Tests zstd compress_level (Python 3.14+ only)")
def test_invalid_zstd_compression():
    """Test that zstd compression levels outside the valid bounds cause an error"""
    zstd = zipfile.ZIP_ZSTANDARD

    # Omitting the level entirely must always be accepted
    ZipStream(compress_type=zstd)

    from compression.zstd import CompressionParameter
    lowest, highest = CompressionParameter.compression_level.bounds()

    # Levels on or inside the bounds are accepted
    for level in (lowest, lowest + 1, 0, highest - 1, highest):
        ZipStream(compress_type=zstd, compress_level=level)

    # Levels one step outside the bounds are rejected however they are supplied
    for level in (lowest - 1, highest + 1):
        with pytest.raises(ValueError):
            ZipStream(compress_type=zstd, compress_level=level)
        with pytest.raises(ValueError):
            ZipStream().add_path(".", compress_type=zstd, compress_level=level)
        with pytest.raises(ValueError):
            ZipStream().add(".", arcname=".", compress_type=zstd, compress_level=level)

        # Type given to the constructor, bad level given to add()
        typed_stream = ZipStream(compress_type=zstd)
        with pytest.raises(ValueError):
            typed_stream.add(".", arcname=".", compress_level=level)

        # Bad level given to the constructor, type given to add()
        leveled_stream = ZipStream(compress_level=level)
        with pytest.raises(ValueError):
            leveled_stream.add(".", arcname=".", compress_type=zstd)


def test_multibyte_and_non_ascii_characters_in_filenames():
zs = ZipStream(sized=True)
zs.add(None, "☆/")
Expand Down Expand Up @@ -734,12 +767,7 @@ def custom_walk(path):
[b"a", b"list", b"of", b"bytes"],
_gen_rand()
])
@pytest.mark.parametrize("ct", [
zipfile.ZIP_STORED,
zipfile.ZIP_LZMA,
zipfile.ZIP_DEFLATED,
zipfile.ZIP_BZIP2
])
@pytest.mark.parametrize("ct", COMPRESS_TYPES)
def test_adding_data(caplog, data, ct):
"""Test adding non-files with different compression methods"""
caplog.set_level(logging.WARNING)
Expand Down Expand Up @@ -1173,6 +1201,36 @@ def fakelocaltime(_=None):
assert zinfos[0].date_time == (2107, 12, 31, 23, 59, 58)


@pytest.mark.skipif(PY313, reason="Tests zstd compress_level (Python 3.14+ only)")
def test_zstd_uses_compression_level():
    """Test that the zstd compression level is applied

    The same payload is added three times (level -7, the default level, and
    level 22) and the reported compressed sizes are expected to shrink as
    the level increases.
    """
    zs = ZipStream(compress_type=zipfile.ZIP_ZSTANDARD)
    test = b"a"*1024
    zs.add(test, "-7.txt", compress_level=-7)
    zs.add(test, "default.txt")
    zs.add(test, "22.txt", compress_level=22)

    # Generate the whole archive; the output itself isn't inspected, only the
    # per-file information recorded while streaming
    bytes(zs)
    info = list(zs.info_list())
    assert len(info) == zs.num_streamed() == 3

    for x in info:
        assert x["size"] == 1024
        assert x["compress_type"] == zipfile.ZIP_ZSTANDARD
        assert x["CRC"] == 2085984185

    assert info[0]["name"] == "-7.txt"
    assert info[1]["name"] == "default.txt"
    assert info[2]["name"] == "22.txt"

    # check compress level set
    assert info[0]["compress_level"] == -7
    assert info[1]["compress_level"] is None
    assert info[2]["compress_level"] == 22

    # check different compressed sizes for each level (in decreasing order as level increases)
    assert info[0]["compressed_size"] > info[1]["compressed_size"] > info[2]["compressed_size"]

def test_info_list(monkeypatch):
faketime = (1980, 1, 1, 0, 0, 0)

Expand Down Expand Up @@ -1228,8 +1286,8 @@ def fakelocaltime(_=None):
assert len([x for x in info2 if not x["streamed"]]) == zs.num_queued() == 0
assert len([x for x in info2 if x["streamed"]]) == zs.num_streamed() == 3

# Make sure any information that ws provided up-front hasn't changed
# (except for the "streamed" key which mush got False -> True)
# Make sure any information that was provided up-front hasn't changed
# (except for the "streamed" key which must go False -> True)
for pre, post in zip(info, info2):
for k, v in pre.items():
if k == "streamed":
Expand Down Expand Up @@ -1525,6 +1583,9 @@ def test_sized_zipstream(monkeypatch, files, zip64):
ZipStream(sized=True, compress_type=zipfile.ZIP_LZMA)
with pytest.raises(ValueError):
ZipStream(sized=True, compress_type=zipfile.ZIP_BZIP2)
if not PY313:
with pytest.raises(ValueError):
ZipStream(sized=True, compress_type=zipfile.ZIP_ZSTANDARD)

with pytest.raises(ValueError):
ZipStream.from_path(".", sized=True, compress_type=zipfile.ZIP_DEFLATED)
Expand All @@ -1546,6 +1607,9 @@ def test_sized_zipstream(monkeypatch, files, zip64):
szs.add("invalid", "invalid", compress_type=zipfile.ZIP_LZMA)
with pytest.raises(ValueError):
szs.add("invalid", "invalid", compress_type=zipfile.ZIP_BZIP2)
if not PY313:
with pytest.raises(ValueError):
szs.add("invalid", "invalid", compress_type=zipfile.ZIP_ZSTANDARD)

assert szs.sized
calculated = len(szs)
Expand Down
74 changes: 50 additions & 24 deletions zipstream/ng.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@
)


# Constants for compatibility modes
PY313_COMPAT = sys.version_info < (3, 14) # disable zstd
PY36_COMPAT = sys.version_info < (3, 7) # disable compress_level
PY35_COMPAT = sys.version_info < (3, 6) # backport ZipInfo functions, stringify path-like objects

# Size of chunks to read out of files
# Note that when compressing data the compressor will operate on bigger chunks
# than this - it keeps a cache as new chunks are fed to it.
Expand All @@ -51,16 +56,22 @@
# (includes "/" regardless of platform as per ZIP format specification)
PATH_SEPARATORS = set(x for x in (os.sep, os.altsep, "/") if x)

# Constants for compatibility modes
PY36_COMPAT = sys.version_info < (3, 7) # disable compress_level
PY35_COMPAT = sys.version_info < (3, 6) # backport ZipInfo functions, stringify path-like objects
# zstd-related constants
if not PY313_COMPAT:
from zipfile import ZIP_ZSTANDARD, ZSTANDARD_VERSION
from compression.zstd import CompressionParameter
ZSTD_LEVEL_BOUNDS = CompressionParameter.compression_level.bounds()


__all__ = [
# Defined classes
"ZipStream", "ZipStreamInfo",
# Compression constants (imported from zipfile)
"ZIP_STORED", "ZIP_DEFLATED", "BZIP2_VERSION", "ZIP_BZIP2", "LZMA_VERSION", "ZIP_LZMA",
"ZIP_STORED",
"ZIP_DEFLATED",
"ZIP_BZIP2", "BZIP2_VERSION",
"ZIP_LZMA", "LZMA_VERSION",
*(["ZIP_ZSTANDARD", "ZSTANDARD_VERSION"] if not PY313_COMPAT else []),
# Helper functions
"walk"
]
Expand Down Expand Up @@ -93,6 +104,24 @@ def _check_compression(compress_type, compress_level):
raise ValueError(
"compress_level must be between 1 and 9 when using ZIP_BZIP2"
)
elif not PY313_COMPAT and compress_type == ZIP_ZSTANDARD:
if not ZSTD_LEVEL_BOUNDS[0] <= compress_level <= ZSTD_LEVEL_BOUNDS[1]:
raise ValueError(
"compress_level must be between {} and {} when using ZIP_ZSTANDARD".format(
*ZSTD_LEVEL_BOUNDS
)
)


def _min_version_for_compress_type(compress_type, min_version=0):
    """Return min_version raised to whatever version compress_type requires

    Compression types without a version constant checked here leave
    min_version unchanged.
    """
    if compress_type == ZIP_BZIP2:
        return max(BZIP2_VERSION, min_version)
    if compress_type == ZIP_LZMA:
        return max(LZMA_VERSION, min_version)
    # The zstd constants are only defined when not running in PY313_COMPAT mode
    if not PY313_COMPAT and compress_type == ZIP_ZSTANDARD:
        return max(ZSTANDARD_VERSION, min_version)
    return min_version


def _timestamp_to_dos(ts):
Expand Down Expand Up @@ -177,11 +206,7 @@ def FileHeader(self, zip64):
file_size = 0xFFFFFFFF
compress_size = 0xFFFFFFFF

if self.compress_type == ZIP_BZIP2:
min_version = max(BZIP2_VERSION, min_version)
elif self.compress_type == ZIP_LZMA:
min_version = max(LZMA_VERSION, min_version)

min_version = _min_version_for_compress_type(self.compress_type, min_version)
self.extract_version = max(min_version, self.extract_version)
self.create_version = max(min_version, self.create_version)
filename, flag_bits = self._encodeFilenameFlags()
Expand Down Expand Up @@ -315,11 +340,7 @@ def _central_directory_header_data(self):
) + extra_data
min_version = ZIP64_VERSION

if self.compress_type == ZIP_BZIP2:
min_version = max(BZIP2_VERSION, min_version)
elif self.compress_type == ZIP_LZMA:
min_version = max(LZMA_VERSION, min_version)

min_version = _min_version_for_compress_type(self.compress_type, min_version)
extract_version = max(min_version, self.extract_version)
create_version = max(min_version, self.create_version)
filename, flag_bits = self._encodeFilenameFlags()
Expand Down Expand Up @@ -502,19 +523,24 @@ def __init__(self, *, compress_type=ZIP_STORED, compress_level=None, sized=False

compress_type:
The ZIP compression method to use when writing the archive, and
should be ZIP_STORED, ZIP_DEFLATED, ZIP_BZIP2 or ZIP_LZMA;
unrecognized values will cause NotImplementedError to be raised. If
ZIP_DEFLATED, ZIP_BZIP2 or ZIP_LZMA is specified but the
corresponding module (zlib, bz2 or lzma) is not available,
RuntimeError is raised. The default is ZIP_STORED.
should be ZIP_STORED, ZIP_DEFLATED, ZIP_BZIP2, ZIP_LZMA, or
ZIP_ZSTANDARD (Python 3.14+); unrecognized values will cause
NotImplementedError to be raised.
If ZIP_DEFLATED, ZIP_BZIP2, ZIP_LZMA, or ZIP_ZSTANDARD is specified
but the corresponding module (zlib, bz2, lzma, or compression.zstd)
is not available, RuntimeError is raised. The default is ZIP_STORED.

compress_level:
Controls the compression level to use when writing files to the
archive. When using ZIP_STORED or ZIP_LZMA it has no effect. When
using ZIP_DEFLATED integers 0 through 9 are accepted (see zlib for
more information). When using ZIP_BZIP2 integers 1 through 9 are
accepted (see bz2 for more information). Raises a ValueError if the
provided value isn't valid for the `compress_type`.
archive. When using ZIP_STORED or ZIP_LZMA it has no effect.
When using ZIP_DEFLATED integers 0 through 9 are accepted (see zlib
for more information).
When using ZIP_BZIP2 integers 1 through 9 are accepted (see bz2 for
more information).
When using ZIP_ZSTANDARD integers -7 through 22 are common (see
compression.zstd.CompressionParameter for more information).
Raises a ValueError if the provided value isn't valid for the
`compress_type`.

Only available in Python 3.7+ (raises a ValueError if used on a
lower version)
Expand Down
Loading