Skip to content

Commit f63ce12

Browse files
committed
Replace MacRoman detection with a table constructed from Windows-1252's table; disable Hungarian detection as it causes more misdetections
1 parent 7f46c07 commit f63ce12

File tree

4 files changed

+66
-211
lines changed

4 files changed

+66
-211
lines changed

lib/Web/Encoding/UnivCharDet.pm

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -51,7 +51,6 @@ package Web::Encoding::UnivCharDet::UniversalDetector;
5151
our $VERSION = '1.0';
5252
use Web::Encoding::UnivCharDet::CharsetProber;
5353
use Web::Encoding::UnivCharDet::UTFCharsetProber;
54-
use Web::Encoding::UnivCharDet::MacCharsetProber;
5554

5655
sub new ($$) {
5756
my $self = bless {
@@ -132,8 +131,6 @@ sub handle_data ($$) {
132131
$self->{charset_probers}->[2]
133132
||= Web::Encoding::UnivCharDet::CharsetProber::Latin1->new
134133
unless $self->{lang_filter} & Web::Encoding::UnivCharDet::Defs::FILTER_NON_CJK;
135-
$self->{charset_probers}->[3]
136-
||= Web::Encoding::UnivCharDet::MacCharsetProber::MacRoman->new;
137134
}
138135
} else {
139136
if ($self->{input_state} eq 'pure ascii' and

lib/Web/Encoding/UnivCharDet/CharsetProber.pm

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -309,14 +309,15 @@ sub handle_data ($$) {
309309
Web::Encoding::UnivCharDet::Defs::Windows_1252SpanishModel,
310310
Web::Encoding::UnivCharDet::Defs::Windows_1252PortugueseModel,
311311
Web::Encoding::UnivCharDet::Defs::Windows_1252GermanModel,
312-
Web::Encoding::UnivCharDet::Defs::Iso_8859_2HungarianModel,
312+
#Web::Encoding::UnivCharDet::Defs::Iso_8859_2HungarianModel,
313313
Web::Encoding::UnivCharDet::Defs::Iso_8859_2CroatianModel,
314314
Web::Encoding::UnivCharDet::Defs::Iso_8859_2PolishModel,
315315
Web::Encoding::UnivCharDet::Defs::Iso_8859_2CzechModell,
316-
Web::Encoding::UnivCharDet::Defs::Windows_1250HungarianModel,
316+
#Web::Encoding::UnivCharDet::Defs::Windows_1250HungarianModel,
317317
Web::Encoding::UnivCharDet::Defs::Windows_1250CroatianModel,
318318
Web::Encoding::UnivCharDet::Defs::Windows_1250PolishModel,
319319
#Web::Encoding::UnivCharDet::Defs::Windows_1250CzechModel,
320+
Web::Encoding::UnivCharDet::Defs::MacRomanSpanishModel,
320321

321322
#Web::Encoding::UnivCharDet::Defs::Win1250HungarianModel,
322323
#Web::Encoding::UnivCharDet::Defs::Latin2HungarianModel,

lib/Web/Encoding/UnivCharDet/Defs3.pm

Lines changed: 63 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -83,6 +83,48 @@ my $Windows_1252Portuguese_CharToOrderMap = [
8383
100, 41, 51, 27, 37, 30, 38,SYM, 48,101, 32, 47, 39,102,103,104, # FX */
8484
];
8585

86+
=pod
87+
88+
my $MacRomanFrench_CharToOrderMap = [
89+
254, 254, 254, 254, 254, 254, 254, 254, 254, 254, CTR, 254, 254, 252, 254, 254,
90+
254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
91+
253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
92+
251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 253, 253, 253, 253, 253, 253,
93+
253, 2, 18, 11, 10, 0, 17, 16, 19, 3, 25, 27, 7, 13, 4, 9,
94+
12, 20, 6, 1, 5, 8, 15, 30, 21, 24, 28, 253, 253, 253, 253, 253,
95+
253, 2, 18, 11, 10, 0, 17, 16, 19, 3, 25, 27, 7, 13, 4, 9,
96+
12, 20, 6, 1, 5, 8, 15, 30, 21, 24, 28, 253, 253, 253, 253, 254,
97+
43, 48, 29, 14, 46, 39, 38, 41, 23, 33, 43, 51, 48, 29, 14, 22,
98+
26, 40, 42, 56, 32, 36, 46, 44, 49, 31, 39, 71, 50, 35, 37, 38,
99+
253, 253, 253, 253, 253, 253, 253, 55, 253, 253, 253, 253, 253, SYM, 45, 54,
100+
SYM, 253, SYM, SYM, 253, 52, SYM, SYM, SYM, SYM, SYM, 253, 253, SYM, 45, 54,
101+
253, 253, 253, SYM, 68, SYM, SYM, 253, 253, 253, 253, 23, 51, 69, 34, 34,
102+
253, 253, 253, 253, 253, 253, 253, SYM, 57, 57, SYM, 253, 253, 253, SYM, SYM,
103+
253, 253, 253, 253, 253, 33, 26, 41, 40, 22, 42, 32, 36, 56, 44, 31,
104+
SYM, 49, 50, 37, 35, SYM, 253, 253, 253, SYM, SYM, SYM, 253, SYM, SYM, SYM,
105+
];
106+
107+
=cut
108+
109+
my $MacRomanSpanish_CharToOrderMap = [
110+
254, 254, 254, 254, 254, 254, 254, 254, 254, 254, CTR, 254, 254, 252, 254, 254,
111+
254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
112+
253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
113+
251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 253, 253, 253, 253, 253, 253,
114+
253, 1, 14, 9, 8, 0, 18, 15, 20, 5, 24, 30, 7, 12, 3, 2,
115+
13, 22, 6, 4, 10, 11, 16, 31, 27, 17, 25, 253, 253, 253, 253, 253,
116+
253, 1, 14, 9, 8, 0, 18, 15, 20, 5, 24, 30, 7, 12, 3, 2,
117+
13, 22, 6, 4, 10, 11, 16, 31, 27, 17, 25, 253, 253, 253, 253, 254,
118+
40, 46, 38, 26, 28, 34, 32, 23, 36, 35, 40, 41, 46, 38, 26, 37,
119+
49, 48, 21, 53, 42, 47, 28, 19, 44, 57, 34, 54, 29, 52, 56, 32,
120+
253, 253, 253, 253, 253, 253, 253, 43, 253, 253, 253, 253, 253, SYM, 33, 59,
121+
SYM, 253, SYM, SYM, 253, 67, SYM, SYM, SYM, SYM, SYM, 253, 253, SYM, 33, 59,
122+
253, 253, 253, SYM, 65, SYM, SYM, 253, 253, 253, 253, 36, 41, 54, 55, 55,
123+
253, 253, 253, 253, 253, 253, 253, SYM, 68, 66, SYM, 253, 253, 253, SYM, SYM,
124+
253, 253, 253, 253, 253, 35, 49, 23, 48, 37, 21, 42, 47, 53, 19, 57,
125+
SYM, 44, 29, 56, 52, SYM, 253, 253, 253, SYM, SYM, SYM, 253, SYM, SYM, SYM,
126+
];
127+
86128
my $FrenchLangModel = [
87129
3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,0,1,3,2,0,2,3,3,2,1,0,1,1,0,2,1,
88130
3,3,3,3,2,3,2,3,3,3,2,3,3,3,3,2,2,3,3,3,3,1,3,0,3,1,0,3,1,0,2,0,1,1,2,0,0,2,
@@ -275,7 +317,19 @@ sub Windows_1252PortugueseModel () { +{
275317
} }
276318

277319

278-
my $Iso_8859_2_CharToOrderMap = [
320+
sub MacRomanSpanishModel () { +{
321+
char_to_order_map => $MacRomanSpanish_CharToOrderMap,
322+
precedence_matrix => $SpanishLangModel,
323+
freq_char_count => 33,
324+
typical_positive_ratio => 0.9990026288941288,
325+
keep_english_letter => 1,
326+
charset_name => "macintosh",
327+
debug_name => "MacRomanSpanishModel",
328+
} }
329+
330+
=pod
331+
332+
my $Iso_8859_2Hungarian_CharToOrderMap = [
279333
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X */
280334
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X */
281335
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X */
@@ -294,7 +348,7 @@ my $Iso_8859_2_CharToOrderMap = [
294348
50, 43, 72, 24, 41, 27, 23,SYM, 53, 73, 30, 31, 29, 48, 56,SYM, # FX */
295349
];
296350
297-
my $Windows_1250_CharToOrderMap = [
351+
my $Windows_1250Hungarian_CharToOrderMap = [
298352
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X */
299353
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X */
300354
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X */
@@ -348,6 +402,8 @@ my $HungarianLangModel = [
348402
3,1,2,3,3,3,3,3,1,0,3,0,1,1,1,2,1,3,0,2,0,1,3,0,0,2,1,0,1,0,1,0,
349403
];
350404
405+
=cut
406+
351407
my $Iso_8859_2Croatian_CharToOrderMap = [
352408
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X */
353409
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X */
@@ -736,9 +792,10 @@ my $CzechLangModel = [
736792
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
737793
];
738794

795+
=pod
739796
740797
sub Iso_8859_2HungarianModel () { +{
741-
char_to_order_map => $Iso_8859_2_CharToOrderMap,
798+
char_to_order_map => $Iso_8859_2Hungarian_CharToOrderMap,
742799
precedence_matrix => $HungarianLangModel,
743800
freq_char_count => 32,
744801
typical_positive_ratio => 0.999000688384496,
@@ -749,7 +806,7 @@ sub Iso_8859_2HungarianModel () { +{
749806
} }
750807
751808
sub Windows_1250HungarianModel () { +{
752-
char_to_order_map => $Windows_1250_CharToOrderMap,
809+
char_to_order_map => $Windows_1250Hungarian_CharToOrderMap,
753810
precedence_matrix => $HungarianLangModel,
754811
freq_char_count => 32,
755812
typical_positive_ratio => 0.999000688384496,
@@ -759,6 +816,8 @@ sub Windows_1250HungarianModel () { +{
759816
debug_only => 0,
760817
} }
761818
819+
=cut
820+
762821
sub Iso_8859_2CroatianModel () { +{
763822
char_to_order_map => $Iso_8859_2Croatian_CharToOrderMap,
764823
precedence_matrix => $CroatianLangModel,

lib/Web/Encoding/UnivCharDet/MacCharsetProber.pm

Lines changed: 0 additions & 202 deletions
This file was deleted.

0 commit comments

Comments
 (0)