File tree Expand file tree Collapse file tree 3 files changed +35
-1
lines changed Expand file tree Collapse file tree 3 files changed +35
-1
lines changed Original file line number Diff line number Diff line change @@ -6,6 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
6
6
7
7
### Changed
8
8
- mypy(c) is no longer a required dependency at build time if ` CHARSET_NORMALIZER_USE_MYPYC ` isn't set to ` 1 ` . (#595 ) (#583 )
9
+ - automatically lower the confidence reported by the legacy ` detect ` function for small byte samples that are not Unicode. (#391 )
9
10
10
11
### Added
11
12
- Custom build backend to overcome inability to mark mypy as an optional dependency in the build phase.
Original file line number Diff line number Diff line change 4
4
from warnings import warn
5
5
6
6
from .api import from_bytes
7
- from .constant import CHARDET_CORRESPONDENCE
7
+ from .constant import CHARDET_CORRESPONDENCE , TOO_SMALL_SEQUENCE
8
8
9
9
# TODO: remove this check when dropping Python 3.7 support
10
10
if TYPE_CHECKING :
@@ -49,6 +49,22 @@ def detect(
49
49
language = r .language if r is not None and r .language != "Unknown" else ""
50
50
confidence = 1.0 - r .chaos if r is not None else None
51
51
52
+ # automatically lower confidence
53
+ # on small bytes samples.
54
+ # https://github.com/jawah/charset_normalizer/issues/391
55
+ if (
56
+ confidence is not None
57
+ and confidence >= 0.9
58
+ and encoding
59
+ not in {
60
+ "utf_8" ,
61
+ "ascii" ,
62
+ }
63
+ and r .bom is False # type: ignore[union-attr]
64
+ and len (byte_str ) < TOO_SMALL_SEQUENCE
65
+ ):
66
+ confidence -= 0.2
67
+
52
68
# Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig gets stripped in the detection/normalization process
53
69
# but chardet does return 'utf-8-sig' and it is a valid codec name.
54
70
if r is not None and encoding == "utf_8" and r .bom :
Original file line number Diff line number Diff line change @@ -41,3 +41,20 @@ def test_utf8_sig_not_striped(self):
41
41
42
42
with self .subTest ("Verify that UTF-8-SIG is returned when using legacy detect" ):
43
43
self .assertEqual (r ["encoding" ], "UTF-8-SIG" )
44
+
45
+ def test_small_payload_confidence_altered (self ):
46
+
47
+ with self .subTest ("Unicode should yield 1. confidence even on small bytes string" ):
48
+ r = detect ("#表 10-1 クラスタ設定" .encode ("utf_16" ))
49
+
50
+ self .assertTrue (r ["confidence" ] == 1.0 )
51
+
52
+ with self .subTest ("ShiftJis should not yield 1. confidence on small bytes string" ):
53
+ r = detect ("#表 10-1 クラスタ設定" .encode ("cp932" ))
54
+
55
+ self .assertTrue (r ["confidence" ] < 1.0 )
56
+
57
+ with self .subTest ("ShiftJis should yield 1. confidence on sufficient bytes string" ):
58
+ r = detect ("#表 10-1 クラスタ設定 … リソース同居制約" .encode ("cp932" ))
59
+
60
+ self .assertTrue (r ["confidence" ] == 1.0 )
You can’t perform that action at this time.
0 commit comments