Skip to content

Commit b8c14a7

Browse files
authored
fix: replace UnicodeDecodeError to prevent large payload logging (#4071)
Replace UnicodeDecodeError with UnprocessableEntityError in encoding detection to avoid logging entire file contents. UnicodeDecodeError.object automatically stores complete input data, causing memory issues with large files in logging and error reporting systems.
1 parent 591729c commit b8c14a7

File tree

4 files changed

+100
-10
lines changed

4 files changed

+100
-10
lines changed

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,12 @@
1+
## 0.18.12
2+
3+
### Enhancements
4+
5+
### Features
6+
7+
### Fixes
8+
- **Prevent large file content in encoding exceptions.** Replace UnicodeDecodeError with UnprocessableEntityError in encoding detection to avoid storing entire file content in exception objects, which can cause issues in logging and error reporting systems when processing large files.
9+
110
## 0.18.11
211

312
### Enhancements
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
"""Test encoding detection error handling (PR #4071)."""
2+
3+
import os
4+
import pickle
5+
import sys
6+
import tempfile
7+
from unittest.mock import patch
8+
9+
import pytest
10+
11+
from unstructured.errors import UnprocessableEntityError
12+
from unstructured.file_utils.encoding import detect_file_encoding
13+
14+
15+
def test_charset_detection_failure():
    """Failed encoding detection raises a lightweight UnprocessableEntityError."""
    # ~1 MB (4 bytes x 250_000) of bytes invalid in every encoding we try.
    payload = b"\x80\x81\x82\x83" * 250_000

    with tempfile.NamedTemporaryFile(mode="wb", delete=False) as tmp:
        tmp.write(payload)
        tmp_path = tmp.name

    try:
        no_match = {"encoding": None, "confidence": None}
        # Force both detection paths to fail: charset detection returns no
        # encoding, and the single fallback encoding cannot decode the payload.
        with patch("unstructured.file_utils.encoding.detect", return_value=no_match), patch(
            "unstructured.file_utils.encoding.COMMON_ENCODINGS", ["utf_8"]
        ):
            with pytest.raises(UnprocessableEntityError) as raised:
                detect_file_encoding(filename=tmp_path)

        caught = raised.value
        assert "Unable to determine file encoding" in str(caught)

        # UnicodeDecodeError would expose the whole input via `.object`; the
        # replacement exception must not carry that attribute (prevents memory bloat).
        # See: https://docs.python.org/3/library/exceptions.html#UnicodeError.object
        assert not hasattr(caught, "object")

        # The exception must stay small regardless of input file size.
        assert sys.getsizeof(caught) < 10_000  # Small in-memory footprint
        assert len(pickle.dumps(caught)) < 10_000  # Small serialization footprint
    finally:
        os.unlink(tmp_path)
46+
47+
48+
def test_decode_failure():
    """Decode failure after successful detection raises a lightweight error."""
    # Invalid UTF-16: BOM followed by an odd number of bytes cannot decode.
    payload = b"\xff\xfe" + b"A\x00B\x00" + b"\x00"

    # Detection "succeeds" with high confidence, so the decode step is what fails.
    confident_match = {"encoding": "utf-16", "confidence": 0.95}
    with patch("unstructured.file_utils.encoding.detect", return_value=confident_match):
        with pytest.raises(UnprocessableEntityError) as raised:
            detect_file_encoding(file=payload)

    caught = raised.value
    assert "detected 'utf-16' but decode failed" in str(caught)

    # UnicodeDecodeError would expose the whole input via `.object`; the
    # replacement exception must not carry that attribute (prevents memory bloat).
    # See: https://docs.python.org/3/library/exceptions.html#UnicodeError.object
    assert not hasattr(caught, "object")

    # The exception must stay small regardless of input size.
    assert sys.getsizeof(caught) < 10_000  # Small in-memory footprint
    assert len(pickle.dumps(caught)) < 10_000  # Small serialization footprint

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.18.11" # pragma: no cover
1+
__version__ = "0.18.12" # pragma: no cover

unstructured/file_utils/encoding.py

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from charset_normalizer import detect
44

5+
from unstructured.errors import UnprocessableEntityError
56
from unstructured.partition.common.common import convert_to_bytes
67

78
ENCODE_REC_THRESHOLD = 0.8
@@ -88,17 +89,26 @@ def detect_file_encoding(
8889
except (UnicodeDecodeError, UnicodeError):
8990
continue
9091
else:
91-
raise UnicodeDecodeError(
92-
"Unable to determine the encoding of the file or match it with any "
93-
"of the specified encodings.",
94-
byte_data,
95-
0,
96-
len(byte_data),
97-
"Invalid encoding",
98-
)
92+
# NOTE: Use UnprocessableEntityError instead of UnicodeDecodeError to avoid
93+
# logging the entire file content. UnicodeDecodeError automatically stores
94+
# the complete input data, which can be problematic for large files.
95+
raise UnprocessableEntityError(
96+
"Unable to determine file encoding after trying all common encodings. "
97+
"File may be corrupted or in an unsupported format."
98+
) from None
9999

100100
else:
101-
file_text = byte_data.decode(encoding)
101+
# NOTE: Catch UnicodeDecodeError to avoid logging the entire file content.
102+
# UnicodeDecodeError automatically stores the complete input data in its
103+
# 'object' attribute, which can cause issues with large files in logging
104+
# and error reporting systems.
105+
try:
106+
file_text = byte_data.decode(encoding)
107+
except (UnicodeDecodeError, UnicodeError):
108+
raise UnprocessableEntityError(
109+
f"File encoding detection failed: detected '{encoding}' but decode failed. "
110+
f"File may be corrupted or in an unsupported format."
111+
) from None
102112

103113
formatted_encoding = format_encoding_str(encoding)
104114

0 commit comments

Comments
 (0)