|  | 
|  | 1 | +"""Test encoding detection error handling (PR #4071).""" | 
|  | 2 | + | 
|  | 3 | +import os | 
|  | 4 | +import pickle | 
|  | 5 | +import sys | 
|  | 6 | +import tempfile | 
|  | 7 | +from unittest.mock import patch | 
|  | 8 | + | 
|  | 9 | +import pytest | 
|  | 10 | + | 
|  | 11 | +from unstructured.errors import UnprocessableEntityError | 
|  | 12 | +from unstructured.file_utils.encoding import detect_file_encoding | 
|  | 13 | + | 
|  | 14 | + | 
def test_charset_detection_failure():
    """Check that a failed encoding detection raises a small, content-free error.

    Writes ~1 MB of bytes that are invalid as UTF-8 to a temporary file, patches
    the detector to report no encoding and the candidate-encoding list to UTF-8
    only, and expects ``detect_file_encoding`` to raise
    ``UnprocessableEntityError``. The raised exception must not carry the file
    content, whether as an ``object`` attribute or inside its args/attributes.
    """
    large_data = b"\x80\x81\x82\x83" * 250_000  # 1MB of invalid UTF-8

    # The file must outlive this ``with`` block (delete=False) because it is passed
    # to the function under test by path; it is removed in the ``finally`` below.
    with tempfile.NamedTemporaryFile(mode="wb", delete=False) as f:
        f.write(large_data)
        temp_file_path = f.name

    try:
        detect_patch = patch(
            "unstructured.file_utils.encoding.detect",
            return_value={"encoding": None, "confidence": None},
        )
        # UTF-8 is the only candidate left, and it cannot decode the payload above.
        encodings_patch = patch("unstructured.file_utils.encoding.COMMON_ENCODINGS", ["utf_8"])

        with detect_patch, encodings_patch, pytest.raises(UnprocessableEntityError) as exc_info:
            detect_file_encoding(filename=temp_file_path)
    finally:
        os.unlink(temp_file_path)

    exception = exc_info.value

    assert "Unable to determine file encoding" in str(exception)

    # Ensure no .object attribute that would store file content (prevents memory bloat)
    # See: https://docs.python.org/3/library/exceptions.html#UnicodeError.object
    assert not hasattr(exception, "object")

    # Exception should be lightweight regardless of file size.
    # sys.getsizeof() is shallow, so measuring the exception alone would pass even if
    # it referenced the whole payload; include its args and instance attributes too.
    held_objects = [exception, *exception.args, *vars(exception).values()]
    exception_memory = sum(sys.getsizeof(obj) for obj in held_objects)
    serialized_size = len(pickle.dumps(exception))

    assert exception_memory < 10_000  # Small in-memory footprint
    assert serialized_size < 10_000  # Small serialization footprint
|  | 46 | + | 
|  | 47 | + | 
def test_decode_failure():
    """Check that a decode failure after detection raises a small, content-free error.

    Patches the detector to report ``utf-16`` with high confidence for a byte
    string that is not valid UTF-16, and expects ``detect_file_encoding`` to
    raise ``UnprocessableEntityError``. The raised exception must not carry the
    input bytes, whether as an ``object`` attribute or inside its args/attributes.
    """
    # Invalid UTF-16: BOM followed by odd number of bytes
    invalid_utf16 = b"\xff\xfe" + b"A\x00B\x00" + b"\x00"

    detect_patch = patch(
        "unstructured.file_utils.encoding.detect",
        return_value={"encoding": "utf-16", "confidence": 0.95},
    )
    with detect_patch, pytest.raises(UnprocessableEntityError) as exc_info:
        detect_file_encoding(file=invalid_utf16)

    exception = exc_info.value

    assert "detected 'utf-16' but decode failed" in str(exception)

    # Ensure no .object attribute that would store file content (prevents memory bloat)
    # See: https://docs.python.org/3/library/exceptions.html#UnicodeError.object
    assert not hasattr(exception, "object")

    # Exception should be lightweight.
    # sys.getsizeof() is shallow, so measuring the exception alone would pass even if
    # it referenced the input bytes; include its args and instance attributes too.
    held_objects = [exception, *exception.args, *vars(exception).values()]
    exception_memory = sum(sys.getsizeof(obj) for obj in held_objects)
    serialized_size = len(pickle.dumps(exception))

    assert exception_memory < 10_000  # Small in-memory footprint
    assert serialized_size < 10_000  # Small serialization footprint
0 commit comments