Skip to content

Commit f277074

Browse files
committed
🔧 automatically lower confidence on small bytes str on non Unicode results (legacy detect function)
1 parent 15ae241 commit f277074

File tree

3 files changed

+35
-1
lines changed

3 files changed

+35
-1
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
66

77
### Changed
88
- mypy(c) is no longer a required dependency at build time if `CHARSET_NORMALIZER_USE_MYPYC` isn't set to `1`. (#595) (#583)
9+
- Automatically lower the confidence reported by the legacy `detect` function for small byte samples whose detected encoding is not Unicode. (#391)
910

1011
### Added
1112
- Custom build backend to overcome inability to mark mypy as an optional dependency in the build phase.

src/charset_normalizer/legacy.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from warnings import warn
55

66
from .api import from_bytes
7-
from .constant import CHARDET_CORRESPONDENCE
7+
from .constant import CHARDET_CORRESPONDENCE, TOO_SMALL_SEQUENCE
88

99
# TODO: remove this check when dropping Python 3.7 support
1010
if TYPE_CHECKING:
@@ -49,6 +49,22 @@ def detect(
4949
language = r.language if r is not None and r.language != "Unknown" else ""
5050
confidence = 1.0 - r.chaos if r is not None else None
5151

52+
# automatically lower confidence
53+
# on small byte samples, unless the result is utf_8/ascii or a BOM was found.
54+
# https://github.com/jawah/charset_normalizer/issues/391
55+
if (
56+
confidence is not None
57+
and confidence >= 0.9
58+
and encoding
59+
not in {
60+
"utf_8",
61+
"ascii",
62+
}
63+
and r.bom is False # type: ignore[union-attr]
64+
and len(byte_str) < TOO_SMALL_SEQUENCE
65+
):
66+
confidence -= 0.2
67+
5268
# Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the detection/normalization process
5369
# but chardet does return 'utf-8-sig' and it is a valid codec name.
5470
if r is not None and encoding == "utf_8" and r.bom:

tests/test_detect_legacy.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,3 +41,20 @@ def test_utf8_sig_not_striped(self):
4141

4242
with self.subTest("Verify that UTF-8-SIG is returned when using legacy detect"):
4343
self.assertEqual(r["encoding"], "UTF-8-SIG")
44+
45+
def test_small_payload_confidence_altered(self):
46+
47+
with self.subTest("Unicode should yield 1. confidence even on small bytes string"):
48+
r = detect("#表 10-1 クラスタ設定".encode("utf_16"))
49+
50+
self.assertTrue(r["confidence"] == 1.0)
51+
52+
with self.subTest("ShiftJis should not yield 1. confidence on small bytes string"):
53+
r = detect("#表 10-1 クラスタ設定".encode("cp932"))
54+
55+
self.assertTrue(r["confidence"] < 1.0)
56+
57+
with self.subTest("ShiftJis should yield 1. confidence on sufficient bytes string"):
58+
r = detect("#表 10-1 クラスタ設定 … リソース同居制約".encode("cp932"))
59+
60+
self.assertTrue(r["confidence"] == 1.0)

0 commit comments

Comments
 (0)