Skip to content

Commit

Permalink
pythongh-103477: Read and write gzip header and trailer with zlib
Browse files Browse the repository at this point in the history
RHEL, SLES and Ubuntu for IBM zSystems (aka s390x) ship with a zlib
optimization [1] that significantly improves deflate and inflate
performance on this platform by using a specialized CPU instruction.

This instruction not only compresses the data, but also computes a
checksum. At the moment Pyhton's gzip support performs compression and
checksum calculation separately, which creates unnecessary overhead on
s390x.

The reason is that Python needs to write specific values into gzip
header; and when this support was introduced in year 1997, there was
indeed no better way to do this.

Since v1.2.2.1 (2011) zlib provides inflateGetHeader() and
deflateSetHeader() functions for that, so Python does not have to deal
with the exact header and trailer formats anymore.

Add the new interfaces to zlibmodule.c that make use of these
functions:

* Add mtime argument to zlib.compress().
* Add mtime and fname arguments to zlib.compressobj().
* Add gz_header_mtime and gz_header_done propeties to ZlibDecompressor.

In Python modules, replace raw streams with gzip streams, make use of
the new interfaces, and remove all mentions of crc32.

📜🤖 NEWS entry added by blurb_it.

[1] madler/zlib#410
  • Loading branch information
iii-i committed Oct 9, 2023
1 parent 5e7edac commit b06e7e6
Show file tree
Hide file tree
Showing 11 changed files with 219 additions and 236 deletions.
2 changes: 2 additions & 0 deletions Include/internal/pycore_global_objects_fini_generated.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions Include/internal/pycore_global_strings.h
Original file line number Diff line number Diff line change
Expand Up @@ -429,6 +429,7 @@ struct _Py_global_strings {
STRUCT_FOR_ID(fix_imports)
STRUCT_FOR_ID(flags)
STRUCT_FOR_ID(flush)
STRUCT_FOR_ID(fname)
STRUCT_FOR_ID(follow_symlinks)
STRUCT_FOR_ID(format)
STRUCT_FOR_ID(frequency)
Expand Down Expand Up @@ -548,6 +549,7 @@ struct _Py_global_strings {
STRUCT_FOR_ID(modules)
STRUCT_FOR_ID(mro)
STRUCT_FOR_ID(msg)
STRUCT_FOR_ID(mtime)
STRUCT_FOR_ID(mycmp)
STRUCT_FOR_ID(n)
STRUCT_FOR_ID(n_arg)
Expand Down
2 changes: 2 additions & 0 deletions Include/internal/pycore_runtime_init_generated.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions Include/internal/pycore_unicodeobject_generated.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

206 changes: 41 additions & 165 deletions Lib/gzip.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,12 +217,15 @@ def __init__(self, filename=None, mode=None,
FutureWarning, 2)
self.mode = WRITE
self._init_write(filename)
if mtime is None:
mtime = int(time.time())
self.compress = zlib.compressobj(compresslevel,
zlib.DEFLATED,
-zlib.MAX_WBITS,
16 + zlib.MAX_WBITS,
zlib.DEF_MEM_LEVEL,
0)
self._write_mtime = mtime
0,
mtime=mtime,
fname=self._encode_fname())
self._buffer_size = _WRITE_BUFFER_SIZE
self._buffer = io.BufferedWriter(_WriteBufferStream(self),
buffer_size=self._buffer_size)
Expand All @@ -231,9 +234,6 @@ def __init__(self, filename=None, mode=None,

self.fileobj = fileobj

if self.mode == WRITE:
self._write_gzip_header(compresslevel)

@property
def mtime(self):
"""Last modification time read from stream, or None"""
Expand All @@ -245,7 +245,6 @@ def __repr__(self):

def _init_write(self, filename):
self.name = filename
self.crc = zlib.crc32(b"")
self.size = 0
self.writebuf = []
self.bufsize = 0
Expand All @@ -256,9 +255,7 @@ def tell(self):
self._buffer.flush()
return super().tell()

def _write_gzip_header(self, compresslevel):
self.fileobj.write(b'\037\213') # magic header
self.fileobj.write(b'\010') # compression method
def _encode_fname(self):
try:
# RFC 1952 requires the FNAME field to be Latin-1. Do not
# include filenames that cannot be represented that way.
Expand All @@ -269,24 +266,7 @@ def _write_gzip_header(self, compresslevel):
fname = fname[:-3]
except UnicodeEncodeError:
fname = b''
flags = 0
if fname:
flags = FNAME
self.fileobj.write(chr(flags).encode('latin-1'))
mtime = self._write_mtime
if mtime is None:
mtime = time.time()
write32u(self.fileobj, int(mtime))
if compresslevel == _COMPRESS_LEVEL_BEST:
xfl = b'\002'
elif compresslevel == _COMPRESS_LEVEL_FAST:
xfl = b'\004'
else:
xfl = b'\000'
self.fileobj.write(xfl)
self.fileobj.write(b'\377')
if fname:
self.fileobj.write(fname + b'\000')
return fname

def write(self,data):
self._check_not_closed()
Expand All @@ -311,7 +291,6 @@ def _write_raw(self, data):
if length > 0:
self.fileobj.write(self.compress.compress(data))
self.size += length
self.crc = zlib.crc32(data, self.crc)
self.offset += length

return length
Expand Down Expand Up @@ -355,9 +334,6 @@ def close(self):
if self.mode == WRITE:
self._buffer.flush()
fileobj.write(self.compress.flush())
write32u(fileobj, self.crc)
# self.size may exceed 2 GiB, or even 4 GiB
write32u(fileobj, self.size & 0xffffffff)
elif self.mode == READ:
self._buffer.close()
finally:
Expand Down Expand Up @@ -427,78 +403,17 @@ def readline(self, size=-1):
return self._buffer.readline(size)


def _read_exact(fp, n):
'''Read exactly *n* bytes from `fp`
This method is required because fp may be unbuffered,
i.e. return short reads.
'''
data = fp.read(n)
while len(data) < n:
b = fp.read(n - len(data))
if not b:
raise EOFError("Compressed file ended before the "
"end-of-stream marker was reached")
data += b
return data


def _read_gzip_header(fp):
'''Read a gzip header from `fp` and progress to the end of the header.
Returns last mtime if header was present or None otherwise.
'''
magic = fp.read(2)
if magic == b'':
return None

if magic != b'\037\213':
raise BadGzipFile('Not a gzipped file (%r)' % magic)

(method, flag, last_mtime) = struct.unpack("<BBIxx", _read_exact(fp, 8))
if method != 8:
raise BadGzipFile('Unknown compression method')

if flag & FEXTRA:
# Read & discard the extra field, if present
extra_len, = struct.unpack("<H", _read_exact(fp, 2))
_read_exact(fp, extra_len)
if flag & FNAME:
# Read and discard a null-terminated string containing the filename
while True:
s = fp.read(1)
if not s or s==b'\000':
break
if flag & FCOMMENT:
# Read and discard a null-terminated string containing a comment
while True:
s = fp.read(1)
if not s or s==b'\000':
break
if flag & FHCRC:
_read_exact(fp, 2) # Read & discard the 16-bit header CRC
return last_mtime


class _GzipReader(_compression.DecompressReader):
def __init__(self, fp):
super().__init__(_PaddedFile(fp), zlib._ZlibDecompressor,
wbits=-zlib.MAX_WBITS)
wbits=16 + zlib.MAX_WBITS)
# Set flag indicating start of a new member
self._new_member = True
self._last_mtime = None

def _init_read(self):
self._crc = zlib.crc32(b"")
self._stream_size = 0 # Decompressed size of unconcatenated stream

def _read_gzip_header(self):
last_mtime = _read_gzip_header(self._fp)
if last_mtime is None:
return False
self._last_mtime = last_mtime
return True

def read(self, size=-1):
if size < 0:
return self.readall()
Expand All @@ -512,33 +427,41 @@ def read(self, size=-1):
while True:
if self._decompressor.eof:
# Ending case: we've come to the end of a member in the file,
# so finish up this member, and read a new gzip header.
# Check the CRC and file size, and set the flag so we read
# a new member
# so finish up this member and set the flag, so that we read a
# new member
self._read_eof()
self._new_member = True
self._decompressor = self._decomp_factory(
**self._decomp_args)

if self._new_member:
# If the _new_member flag is set, we have to
# jump to the next member, if there is one.
self._init_read()
if not self._read_gzip_header():
self._size = self._pos
return b""
self._new_member = False

# Read a chunk of data from the file
if self._decompressor.needs_input:
buf = self._fp.read(READ_BUFFER_SIZE)
uncompress = self._decompressor.decompress(buf, size)
if self._new_member:
# If the _new_member flag is set, we have to
# jump to the next member, if there is one.
self._init_read()
if not buf:
self._size = self._pos
return b""
self._new_member = False
compressed_buf = buf
else:
uncompress = self._decompressor.decompress(b"", size)
assert not self._new_member
compressed_buf = b""
try:
uncompress = self._decompressor.decompress(compressed_buf, size)
except zlib.error:
if self._decompressor.gz_header_done == 1:
raise
raise BadGzipFile()

if self._decompressor.gz_header_done == 1:
self._last_mtime = self._decompressor.gz_header_mtime

if self._decompressor.unused_data != b"":
# Prepend the already read bytes to the fileobj so they can
# be seen by _read_eof() and _read_gzip_header()
# be seen by _read_eof()
self._fp.prepend(self._decompressor.unused_data)

if uncompress != b"":
Expand All @@ -547,23 +470,12 @@ def read(self, size=-1):
raise EOFError("Compressed file ended before the "
"end-of-stream marker was reached")

self._crc = zlib.crc32(uncompress, self._crc)
self._stream_size += len(uncompress)
self._pos += len(uncompress)
return uncompress

def _read_eof(self):
# We've read to the end of the file
# We check that the computed CRC and size of the
# uncompressed data matches the stored values. Note that the size
# stored is the true file size mod 2**32.
crc32, isize = struct.unpack("<II", _read_exact(self._fp, 8))
if crc32 != self._crc:
raise BadGzipFile("CRC check failed %s != %s" % (hex(crc32),
hex(self._crc)))
elif isize != (self._stream_size & 0xffffffff):
raise BadGzipFile("Incorrect length of data produced")

# Gzip files can be padded with zeroes and still have archives.
# Consume all zero bytes and set the file position to the first
# non-zero byte. See http://www.gzip.org/#faq8
Expand All @@ -578,68 +490,32 @@ def _rewind(self):
self._new_member = True


def _create_simple_gzip_header(compresslevel: int,
mtime = None) -> bytes:
"""
Write a simple gzip header with no extra fields.
:param compresslevel: Compresslevel used to determine the xfl bytes.
:param mtime: The mtime (must support conversion to a 32-bit integer).
:return: A bytes object representing the gzip header.
"""
if mtime is None:
mtime = time.time()
if compresslevel == _COMPRESS_LEVEL_BEST:
xfl = 2
elif compresslevel == _COMPRESS_LEVEL_FAST:
xfl = 4
else:
xfl = 0
# Pack ID1 and ID2 magic bytes, method (8=deflate), header flags (no extra
# fields added to header), mtime, xfl and os (255 for unknown OS).
return struct.pack("<BBBBLBB", 0x1f, 0x8b, 8, 0, int(mtime), xfl, 255)


def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=None):
"""Compress data in one shot and return the compressed string.
compresslevel sets the compression level in range of 0-9.
mtime can be used to set the modification time. The modification time is
set to the current time by default.
"""
if mtime == 0:
# Use zlib as it creates the header with 0 mtime by default.
# This is faster and with less overhead.
return zlib.compress(data, level=compresslevel, wbits=31)
header = _create_simple_gzip_header(compresslevel, mtime)
trailer = struct.pack("<LL", zlib.crc32(data), (len(data) & 0xffffffff))
# Wbits=-15 creates a raw deflate block.
return (header + zlib.compress(data, level=compresslevel, wbits=-15) +
trailer)
if mtime is None:
mtime = int(time.time())
return zlib.compress(data, level=compresslevel, wbits=31, mtime=mtime)


def decompress(data):
"""Decompress a gzip compressed string in one shot.
Return the decompressed string.
"""
decompressed_members = []
while True:
fp = io.BytesIO(data)
if _read_gzip_header(fp) is None:
return b"".join(decompressed_members)
# Use a zlib raw deflate compressor
do = zlib.decompressobj(wbits=-zlib.MAX_WBITS)
# Read all the data except the header
decompressed = do.decompress(data[fp.tell():])
if not do.eof or len(do.unused_data) < 8:
while data:
do = zlib.decompressobj(wbits=16 + zlib.MAX_WBITS)
decompressed = do.decompress(data)
if not do.eof:
raise EOFError("Compressed file ended before the end-of-stream "
"marker was reached")
crc, length = struct.unpack("<II", do.unused_data[:8])
if crc != zlib.crc32(decompressed):
raise BadGzipFile("CRC check failed")
if length != (len(decompressed) & 0xffffffff):
raise BadGzipFile("Incorrect length of data produced")
decompressed_members.append(decompressed)
data = do.unused_data[8:].lstrip(b"\x00")
data = do.unused_data.lstrip(b"\x00")
return b"".join(decompressed_members)


def main():
Expand Down
Loading

0 comments on commit b06e7e6

Please sign in to comment.