Skip to content

Commit e84ff1b

Browse files
author
wangkebo
committed
processing malformed docx: media file not in [Content_Types].xml
1 parent 0cf6d71 commit e84ff1b

File tree

2 files changed

+6
-1
lines changed

2 files changed

+6
-1
lines changed

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
lxml>=3.1.0
22
typing-extensions
3+
python-magic

src/docx/opc/pkgreader.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from docx.opc.packuri import PACKAGE_URI, PackURI
66
from docx.opc.phys_pkg import PhysPkgReader
77
from docx.opc.shared import CaseInsensitiveDict
8+
import magic
89

910

1011
class PackageReader:
@@ -51,7 +52,10 @@ def _load_serialized_parts(phys_reader, pkg_srels, content_types):
5152
sparts = []
5253
part_walker = PackageReader._walk_phys_parts(phys_reader, pkg_srels)
5354
for partname, blob, reltype, srels in part_walker:
54-
content_type = content_types[partname]
55+
try:
56+
content_type = content_types[partname]
57+
except KeyError:
58+
content_type = magic.from_buffer(blob, mime=True)
5559
spart = _SerializedPart(partname, content_type, reltype, blob, srels)
5660
sparts.append(spart)
5761
return tuple(sparts)

0 commit comments

Comments
 (0)