🔧 automatically lower confidence on small byte strings for non-Unicode results (legacy detect function)

Ousret · Ousret · commit f277074e281a · 2025-08-09T06:16:39.000+01:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ### Changed
 - mypy(c) is no longer a required dependency at build time if `CHARSET_NORMALIZER_USE_MYPYC` isn't set to `1`. (#595) (#583)
+- automatically lower confidence on small byte samples that are not Unicode in the output of the legacy `detect` function. (#391)
 
 ### Added
 - Custom build backend to overcome inability to mark mypy as an optional dependency in the build phase.
diff --git a/src/charset_normalizer/legacy.py b/src/charset_normalizer/legacy.py
@@ -4,7 +4,7 @@
 from warnings import warn
 
 from .api import from_bytes
-from .constant import CHARDET_CORRESPONDENCE
+from .constant import CHARDET_CORRESPONDENCE, TOO_SMALL_SEQUENCE
 
 # TODO: remove this check when dropping Python 3.7 support
 if TYPE_CHECKING:
@@ -49,6 +49,22 @@ def detect(
     language = r.language if r is not None and r.language != "Unknown" else ""
     confidence = 1.0 - r.chaos if r is not None else None
 
+    # automatically lower a high confidence (>= 0.9) on small,
+    # BOM-less byte samples not detected as UTF-8 or ASCII.
+    # https://github.com/jawah/charset_normalizer/issues/391
+    if (
+        confidence is not None
+        and confidence >= 0.9
+        and encoding
+        not in {
+            "utf_8",
+            "ascii",
+        }
+        and r.bom is False  # type: ignore[union-attr]
+        and len(byte_str) < TOO_SMALL_SEQUENCE
+    ):
+        confidence -= 0.2
+
     # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the detection/normalization process
     # but chardet does return 'utf-8-sig' and it is a valid codec name.
     if r is not None and encoding == "utf_8" and r.bom:
diff --git a/tests/test_detect_legacy.py b/tests/test_detect_legacy.py
@@ -41,3 +41,20 @@ def test_utf8_sig_not_striped(self):
 
         with self.subTest("Verify that UTF-8-SIG is returned when using legacy detect"):
             self.assertEqual(r["encoding"], "UTF-8-SIG")
+
+    def test_small_payload_confidence_altered(self):
+
+        with self.subTest("Unicode should yield 1. confidence even on small bytes string"):
+            r = detect("#表 10-1 クラスタ設定".encode("utf_16"))
+
+            self.assertTrue(r["confidence"] == 1.0)
+
+        with self.subTest("ShiftJis should not yield 1. confidence on small bytes string"):
+            r = detect("#表 10-1 クラスタ設定".encode("cp932"))
+
+            self.assertTrue(r["confidence"] < 1.0)
+
+        with self.subTest("ShiftJis should yield 1. confidence on sufficient bytes string"):
+            r = detect("#表 10-1 クラスタ設定　…　リソース同居制約".encode("cp932"))
+
+            self.assertTrue(r["confidence"] == 1.0)