This repository has been archived by the owner on Nov 30, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
check_compat.py
66 lines (43 loc) · 1.76 KB
/
check_compat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
from json import load
from os.path import exists
from typing import Dict, Optional
def cp_name(name: Optional[str]) -> str:
    """Return a normalised, comparable form of an encoding label.

    ``None`` is mapped to the placeholder ``'N/A'``. Any other label is
    lower-cased and then passed through a fixed sequence of substring
    rewrites so that differently spelled labels (``windows-1252`` /
    ``cp1252``, ``ISO-8859-1`` / ``iso8859_1``, ``IBM855`` / ``cp855``)
    end up as the same string.
    """
    if name is None:
        return 'N/A'

    # Applied strictly in this order: hyphens must already be underscores
    # before the 'iso_' prefix can be collapsed.
    rewrites = (
        ('windows-', 'cp'),
        ('-', '_'),
        ('iso_', 'iso'),
        ('ibm', 'cp'),
    )

    normalised = name.lower()
    for old, new in rewrites:
        normalised = normalised.replace(old, new)
    return normalised
# Ordered (first, second) pairs of normalised names that are accepted as
# equivalent even though the strings differ. The relation is intentionally
# one-directional: ("euc_kr", "cp949") is accepted, ("cp949", "euc_kr") is not.
_ACCEPTED_PAIRS = frozenset({
    ("euc_jp", "euc_jis_2004"),
    ("shift_jis", "cp932"),
    ("shift_jis", "shift_jis_2004"),
    ("euc_kr", "cp949"),
    ("maccyrillic", "mac_cyrillic"),
    ("iso8859_1", "cp1252"),
    ("iso8859_7", "cp1253"),
})


def is_equivalent(cp_a: Optional[str], cp_b: Optional[str]) -> bool:
    """Tell whether two encoding labels should be counted as the same result.

    Both labels are first normalised with ``cp_name``. They are equivalent
    when the normalised names are identical, or when the ordered pair
    ``(cp_a, cp_b)`` is one of the explicitly accepted pairs above.

    :param cp_a: first encoding label, or ``None`` when there is none.
    :param cp_b: second encoding label, or ``None`` when there is none.
    :return: ``True`` if the two labels are considered equivalent.
    """
    cp_name_a: str = cp_name(cp_a)
    cp_name_b: str = cp_name(cp_b)

    # No dedicated entry is needed for IBM855 vs cp855: cp_name() rewrites
    # every 'ibm' to 'cp', so both sides already normalise to 'cp855' and
    # fall through to the plain equality check.
    if (cp_name_a, cp_name_b) in _ACCEPTED_PAIRS:
        return True

    return cp_name_a == cp_name_b
if __name__ == "__main__":
    # Compare the encodings stored in the 2.7 and 3.8 result dumps, print one
    # CSV-like line per file on which they are not equivalent, then print the
    # percentage of files on which they agree.
    dump_27_path = "./results/dump-2.7.json"
    dump_38_path = "./results/dump-3.8.json"

    if not (exists(dump_27_path) and exists(dump_38_path)):
        print("Missing either 2.7 or 3.8 dump")
        # `exit` is a helper injected by the `site` module and is not
        # guaranteed to exist (e.g. under `python -S`); raising SystemExit
        # gives the same exit status without that dependency.
        raise SystemExit(1)

    r27: Dict[str, Optional[str]]
    r38: Dict[str, Optional[str]]

    # Read the dumps as UTF-8 explicitly instead of relying on the
    # platform's locale-dependent default encoding.
    with open(dump_27_path, "r", encoding="utf-8") as fp:
        r27 = load(fp)

    with open(dump_38_path, "r", encoding="utf-8") as fp:
        r38 = load(fp)

    c: int = 0  # number of files whose two results are not equivalent

    print("file;chardet;charset_normalizer")

    for file, apparent_encoding in r27.items():
        # A file that is absent from the 3.8 dump is handled like a null
        # result (None) rather than aborting the whole report on KeyError.
        other_encoding = r38.get(file)

        if not is_equivalent(apparent_encoding, other_encoding):
            print(f"{file};{apparent_encoding};{other_encoding}")
            c += 1

    print("EOF;EOF")

    # With an empty 2.7 dump there is nothing to disagree on; report zero
    # mismatches instead of dividing by zero.
    mismatch_share = round(c / len(r27), 3) if r27 else 0.0
    print("Ratio ", (1.0 - mismatch_share) * 100.)