Skip to content

Commit cc0172c

Browse files
Issue #1159051: GzipFile now raises EOFError when reading a corrupted file
with truncated header or footer. Added tests for reading truncated gzip, bzip2, and lzma files.
2 parents 791c97a + 57f9b7a commit cc0172c

File tree

5 files changed

+81
-44
lines changed

5 files changed

+81
-44
lines changed

Lib/gzip.py

Lines changed: 37 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -65,9 +65,6 @@ def write32u(output, value):
6565
# or unsigned.
6666
output.write(struct.pack("<L", value))
6767

68-
def read32(input):
69-
return struct.unpack("<I", input.read(4))[0]
70-
7168
class _PaddedFile:
7269
"""Minimal read-only file object that prepends a string to the contents
7370
of an actual file. Shouldn't be used outside of gzip.py, as it lacks
@@ -281,27 +278,31 @@ def _init_read(self):
281278
self.crc = zlib.crc32(b"") & 0xffffffff
282279
self.size = 0
283280

281+
def _read_exact(self, n):
282+
data = self.fileobj.read(n)
283+
while len(data) < n:
284+
b = self.fileobj.read(n - len(data))
285+
if not b:
286+
raise EOFError("Compressed file ended before the "
287+
"end-of-stream marker was reached")
288+
data += b
289+
return data
290+
284291
def _read_gzip_header(self):
285292
magic = self.fileobj.read(2)
286293
if magic == b'':
287-
raise EOFError("Reached EOF")
294+
return False
288295

289296
if magic != b'\037\213':
290297
raise OSError('Not a gzipped file')
291-
method = ord( self.fileobj.read(1) )
298+
299+
method, flag, self.mtime = struct.unpack("<BBIxx", self._read_exact(8))
292300
if method != 8:
293301
raise OSError('Unknown compression method')
294-
flag = ord( self.fileobj.read(1) )
295-
self.mtime = read32(self.fileobj)
296-
# extraflag = self.fileobj.read(1)
297-
# os = self.fileobj.read(1)
298-
self.fileobj.read(2)
299302

300303
if flag & FEXTRA:
301304
# Read & discard the extra field, if present
302-
xlen = ord(self.fileobj.read(1))
303-
xlen = xlen + 256*ord(self.fileobj.read(1))
304-
self.fileobj.read(xlen)
305+
# struct.unpack() always returns a tuple; take its single item (XLEN) so that _read_exact() receives an int byte count.
self._read_exact(struct.unpack("<H", self._read_exact(2))[0])
305306
if flag & FNAME:
306307
# Read and discard a null-terminated string containing the filename
307308
while True:
@@ -315,12 +316,13 @@ def _read_gzip_header(self):
315316
if not s or s==b'\000':
316317
break
317318
if flag & FHCRC:
318-
self.fileobj.read(2) # Read & discard the 16-bit header CRC
319+
self._read_exact(2) # Read & discard the 16-bit header CRC
319320

320321
unused = self.fileobj.unused()
321322
if unused:
322323
uncompress = self.decompress.decompress(unused)
323324
self._add_read_data(uncompress)
325+
return True
324326

325327
def write(self,data):
326328
self._check_closed()
@@ -354,20 +356,16 @@ def read(self, size=-1):
354356

355357
readsize = 1024
356358
if size < 0: # get the whole thing
357-
try:
358-
while True:
359-
self._read(readsize)
360-
readsize = min(self.max_read_chunk, readsize * 2)
361-
except EOFError:
362-
size = self.extrasize
359+
while self._read(readsize):
360+
readsize = min(self.max_read_chunk, readsize * 2)
361+
size = self.extrasize
363362
else: # just get some more of it
364-
try:
365-
while size > self.extrasize:
366-
self._read(readsize)
367-
readsize = min(self.max_read_chunk, readsize * 2)
368-
except EOFError:
369-
if size > self.extrasize:
370-
size = self.extrasize
363+
while size > self.extrasize:
364+
if not self._read(readsize):
365+
if size > self.extrasize:
366+
size = self.extrasize
367+
break
368+
readsize = min(self.max_read_chunk, readsize * 2)
371369

372370
offset = self.offset - self.extrastart
373371
chunk = self.extrabuf[offset: offset + size]
@@ -385,12 +383,9 @@ def read1(self, size=-1):
385383
if self.extrasize <= 0 and self.fileobj is None:
386384
return b''
387385

388-
try:
389-
# For certain input data, a single call to _read() may not return
390-
# any data. In this case, retry until we get some data or reach EOF.
391-
while self.extrasize <= 0:
392-
self._read()
393-
except EOFError:
386+
# For certain input data, a single call to _read() may not return
387+
# any data. In this case, retry until we get some data or reach EOF.
388+
while self.extrasize <= 0 and self._read():
394389
pass
395390
if size < 0 or size > self.extrasize:
396391
size = self.extrasize
@@ -413,12 +408,9 @@ def peek(self, n):
413408
if self.extrasize == 0:
414409
if self.fileobj is None:
415410
return b''
416-
try:
417-
# Ensure that we don't return b"" if we haven't reached EOF.
418-
while self.extrasize == 0:
419-
# 1024 is the same buffering heuristic used in read()
420-
self._read(max(n, 1024))
421-
except EOFError:
411+
# Ensure that we don't return b"" if we haven't reached EOF.
412+
# 1024 is the same buffering heuristic used in read()
413+
while self.extrasize == 0 and self._read(max(n, 1024)):
422414
pass
423415
offset = self.offset - self.extrastart
424416
remaining = self.extrasize
@@ -431,13 +423,14 @@ def _unread(self, buf):
431423

432424
def _read(self, size=1024):
433425
if self.fileobj is None:
434-
raise EOFError("Reached EOF")
426+
return False
435427

436428
if self._new_member:
437429
# If the _new_member flag is set, we have to
438430
# jump to the next member, if there is one.
439431
self._init_read()
440-
self._read_gzip_header()
432+
if not self._read_gzip_header():
433+
return False
441434
self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
442435
self._new_member = False
443436

@@ -454,7 +447,7 @@ def _read(self, size=1024):
454447
self.fileobj.prepend(self.decompress.unused_data, True)
455448
self._read_eof()
456449
self._add_read_data( uncompress )
457-
raise EOFError('Reached EOF')
450+
return False
458451

459452
uncompress = self.decompress.decompress(buf)
460453
self._add_read_data( uncompress )
@@ -470,6 +463,7 @@ def _read(self, size=1024):
470463
# a new member on the next call
471464
self._read_eof()
472465
self._new_member = True
466+
return True
473467

474468
def _add_read_data(self, data):
475469
self.crc = zlib.crc32(data, self.crc) & 0xffffffff
@@ -484,8 +478,7 @@ def _read_eof(self):
484478
# We check that the computed CRC and size of the
485479
# uncompressed data match the stored values. Note that the size
486480
# stored is the true file size mod 2**32.
487-
crc32 = read32(self.fileobj)
488-
isize = read32(self.fileobj) # may exceed 2GB
481+
crc32, isize = struct.unpack("<II", self._read_exact(8))
489482
if crc32 != self.crc:
490483
raise OSError("CRC check failed %s != %s" % (hex(crc32),
491484
hex(self.crc)))

Lib/test/test_bz2.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -569,6 +569,19 @@ def testSeekBackwardsBytesIO(self):
569569
bz2f.seek(-150, 1)
570570
self.assertEqual(bz2f.read(), self.TEXT[500-150:])
571571

572+
def test_read_truncated(self):
573+
# Drop the eos_magic field (6 bytes) and CRC (4 bytes).
574+
truncated = self.DATA[:-10]
575+
with BZ2File(BytesIO(truncated)) as f:
576+
self.assertRaises(EOFError, f.read)
577+
with BZ2File(BytesIO(truncated)) as f:
578+
self.assertEqual(f.read(len(self.TEXT)), self.TEXT)
579+
self.assertRaises(EOFError, f.read, 1)
580+
# Incomplete 4-byte file header, and block header of at least 146 bits.
581+
for i in range(22):
582+
with BZ2File(BytesIO(truncated[:i])) as f:
583+
self.assertRaises(EOFError, f.read, 1)
584+
572585

573586
class BZ2CompressorTest(BaseTest):
574587
def testCompress(self):

Lib/test/test_gzip.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,20 @@ def test_decompress(self):
389389
datac = gzip.compress(data)
390390
self.assertEqual(gzip.decompress(datac), data)
391391

392+
def test_read_truncated(self):
393+
data = data1*50
394+
# Drop the CRC (4 bytes) and file size (4 bytes).
395+
truncated = gzip.compress(data)[:-8]
396+
with gzip.GzipFile(fileobj=io.BytesIO(truncated)) as f:
397+
self.assertRaises(EOFError, f.read)
398+
with gzip.GzipFile(fileobj=io.BytesIO(truncated)) as f:
399+
self.assertEqual(f.read(len(data)), data)
400+
self.assertRaises(EOFError, f.read, 1)
401+
# Incomplete 10-byte header.
402+
for i in range(2, 10):
403+
with gzip.GzipFile(fileobj=io.BytesIO(truncated[:i])) as f:
404+
self.assertRaises(EOFError, f.read, 1)
405+
392406

393407
class TestOpen(BaseTest):
394408
def test_binary_modes(self):

Lib/test/test_lzma.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -669,6 +669,20 @@ def test_read_incomplete(self):
669669
with LZMAFile(BytesIO(COMPRESSED_XZ[:128])) as f:
670670
self.assertRaises(EOFError, f.read)
671671

672+
def test_read_truncated(self):
673+
# Drop stream footer: CRC (4 bytes), index size (4 bytes),
674+
# flags (2 bytes) and magic number (2 bytes).
675+
truncated = COMPRESSED_XZ[:-12]
676+
with LZMAFile(BytesIO(truncated)) as f:
677+
self.assertRaises(EOFError, f.read)
678+
with LZMAFile(BytesIO(truncated)) as f:
679+
self.assertEqual(f.read(len(INPUT)), INPUT)
680+
self.assertRaises(EOFError, f.read, 1)
681+
# Incomplete 12-byte header.
682+
for i in range(12):
683+
with LZMAFile(BytesIO(truncated[:i])) as f:
684+
self.assertRaises(EOFError, f.read, 1)
685+
672686
def test_read_bad_args(self):
673687
f = LZMAFile(BytesIO(COMPRESSED_XZ))
674688
f.close()

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,9 @@ Core and Builtins
220220
Library
221221
-------
222222

223+
- Issue #1159051: GzipFile now raises EOFError when reading a corrupted file
224+
with truncated header or footer.
225+
223226
- Issue #16993: shutil.which() now preserves the case of the path and extension
224227
on Windows.
225228

0 commit comments

Comments
 (0)