File tree Expand file tree Collapse file tree 3 files changed +35
-1
lines changed
Expand file tree Collapse file tree 3 files changed +35
-1
lines changed Original file line number Diff line number Diff line change @@ -6,6 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
66
77### Changed
88- mypy(c) is no longer a required dependency at build time if ` CHARSET_NORMALIZER_USE_MYPYC ` isn't set to ` 1 ` . (#595 ) (#583 )
9+ - automatically lower the confidence reported by the legacy `detect` function on small byte samples that are not Unicode. (#391)
910
1011### Added
1112- Custom build backend to overcome inability to mark mypy as an optional dependency in the build phase.
Original file line number Diff line number Diff line change 44from warnings import warn
55
66from .api import from_bytes
7- from .constant import CHARDET_CORRESPONDENCE
7+ from .constant import CHARDET_CORRESPONDENCE , TOO_SMALL_SEQUENCE
88
99# TODO: remove this check when dropping Python 3.7 support
1010if TYPE_CHECKING :
@@ -49,6 +49,22 @@ def detect(
4949 language = r .language if r is not None and r .language != "Unknown" else ""
5050 confidence = 1.0 - r .chaos if r is not None else None
5151
52+ # automatically lower confidence
53+ # on small bytes samples.
54+ # https://github.com/jawah/charset_normalizer/issues/391
55+ if (
56+ confidence is not None
57+ and confidence >= 0.9
58+ and encoding
59+ not in {
60+ "utf_8" ,
61+ "ascii" ,
62+ }
63+ and r .bom is False # type: ignore[union-attr]
64+ and len (byte_str ) < TOO_SMALL_SEQUENCE
65+ ):
66+ confidence -= 0.2
67+
5268 # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig gets stripped in the detection/normalization process
5369 # but chardet does return 'utf-8-sig' and it is a valid codec name.
5470 if r is not None and encoding == "utf_8" and r .bom :
Original file line number Diff line number Diff line change @@ -41,3 +41,20 @@ def test_utf8_sig_not_striped(self):
4141
4242 with self .subTest ("Verify that UTF-8-SIG is returned when using legacy detect" ):
4343 self .assertEqual (r ["encoding" ], "UTF-8-SIG" )
44+
45+ def test_small_payload_confidence_altered (self ):
46+
47+ with self .subTest ("Unicode should yield 1. confidence even on small bytes string" ):
48+ r = detect ("#表 10-1 クラスタ設定" .encode ("utf_16" ))
49+
50+ self .assertTrue (r ["confidence" ] == 1.0 )
51+
52+ with self .subTest ("ShiftJis should not yield 1. confidence on small bytes string" ):
53+ r = detect ("#表 10-1 クラスタ設定" .encode ("cp932" ))
54+
55+ self .assertTrue (r ["confidence" ] < 1.0 )
56+
57+ with self .subTest ("ShiftJis should yield 1. confidence on sufficient bytes string" ):
58+ r = detect ("#表 10-1 クラスタ設定 … リソース同居制約" .encode ("cp932" ))
59+
60+ self .assertTrue (r ["confidence" ] == 1.0 )
You can’t perform that action at this time.
0 commit comments