@@ -77,6 +77,10 @@ sub reset ($) {
7777 # delete $self->{nbsp_found};
7878 delete $self -> {esc_found };
7979 delete $self -> {binary_found };
80+ $self -> {win1250_refs } = 0;
81+ $self -> {win1252_refs } = 0;
82+ $self -> {unicode_refs } = 0;
83+ delete $self -> {resolve_latin1_refs };
8084} # reset
8185
8286sub handle_data ($$) {
@@ -116,6 +120,7 @@ sub handle_data ($$) {
116120
117121 my $length = length $_ [1];
118122 my $zero = 0;
123+ my $high = 0;
119124 for my $i (0..($length - 1)) {
120125 my $c = ord substr $_ [1], $i , 1;
121126 $zero ++ if $c == 0x00;
@@ -125,22 +130,12 @@ sub handle_data ($$) {
125130 if ($c & 0x80 and $c != 0xA0) {
126131 if ($self -> {input_state } ne ' high byte' ) {
127132 $self -> {input_state } = ' high byte' ;
128- delete $self -> {esc_charset_prober };
129- delete $self -> {utf1632_prober };
130-
131- $self -> {charset_probers }-> [0]
132- ||= Web::Encoding::UnivCharDet::CharsetProber::MBCSGroup-> new
133- ($self -> {lang_filter });
134- $self -> {charset_probers }-> [1]
135- ||= Web::Encoding::UnivCharDet::CharsetProber::SBCSGroup-> new
136- if $self -> {lang_filter } & Web::Encoding::UnivCharDet::Defs::FILTER_NON_CJK;
137- $self -> {charset_probers }-> [2]
138- ||= Web::Encoding::UnivCharDet::CharsetProber::Latin1-> new
139- unless $self -> {lang_filter } & Web::Encoding::UnivCharDet::Defs::FILTER_NON_CJK;
140- $self -> {charset_probers }-> [3]
141- ||= Web::Encoding::UnivCharDet::CharsetProber::Vietnamese-> new
142- if $self -> {lang_filter } & Web::Encoding::UnivCharDet::Defs::FILTER_NON_CJK;
133+ $high = 1;
143134 }
135+ delete $self -> {amp };
136+ } elsif ($c == 0x26) {
137+ $self -> {amp } = ' ' ;
138+ $self -> {last_char } = $c ;
144139 } else {
145140 if ($self -> {input_state } eq ' pure ascii' ) {
146141 if ($c == 0x1B or $c == 0x0E or $c == 0x0F) {
@@ -154,11 +149,88 @@ sub handle_data ($$) {
154149 $c == 0x7F) {
155150 $self -> {binary_found } = 1;
156151 }
152+ $self -> {last_char } = $c ;
153+ }
154+ if (defined $self -> {amp }) {
155+ if ($c == 0x3B) {
156+ if (defined $Web::Encoding::UnivCharDet::Defs::Latin1Entities -> {$self -> {amp }}) {
157+ $self -> {win1252_refs }++;
158+ if ($self -> {amp } =~ / ^#([0-9]+)$ / and
159+ $Web::Encoding::UnivCharDet::Defs::Windows1250Refs -> {$1 }) {
160+ $self -> {win1250_refs }++;
161+ }
162+ } elsif ($self -> {amp } =~ / ^#([0-9]+)$ / ) {
163+ my $cc = $1 ;
164+ if ($cc > 0xFF) {
165+ if ($Web::Encoding::UnivCharDet::Defs::Windows1250Refs -> {$cc }) {
166+ $self -> {win1250_refs }++;
167+ } else {
168+ $self -> {unicode_refs }++;
169+ }
170+ } elsif (0x80 <= $cc ) {
171+ if ($Web::Encoding::UnivCharDet::Defs::Windows1250Refs -> {$cc }) {
172+ $self -> {win1250_refs }++;
173+ }
174+ $self -> {win1252_refs }++;
175+ }
176+ }
177+ delete $self -> {amp };
178+ } elsif ($c == 0x23 and $self -> {amp } eq ' ' ) { # &#
179+ $self -> {amp } .= chr $c ;
180+ } elsif (10 < length $self -> {amp }) {
181+ delete $self -> {amp };
182+ } elsif (0x30 <= $c and $c <= 0x39) {
183+ $self -> {amp } .= chr $c ;
184+ } elsif (0x41 <= $c and $c <= 0x5A) {
185+ $self -> {amp } .= chr $c ;
186+ } elsif (0x61 <= $c and $c <= 0x7A) {
187+ $self -> {amp } .= chr $c ;
188+ } else {
189+ delete $self -> {amp };
190+ }
157191 }
158- $self -> {last_char } = $c ;
159192 }
160193 } # $i
161194
195+ if ($self -> {input_state } eq ' pure ascii' and
196+ $self -> {unicode_refs } < 10 and
197+ $self -> {win1250_refs } + $self -> {win1252_refs } > 10) {
198+ $self -> {input_state } = ' high byte' ;
199+ $high = 1;
200+ }
201+
202+ if ($high ) {
203+ delete $self -> {esc_charset_prober };
204+ delete $self -> {utf1632_prober };
205+
206+ $self -> {charset_probers }-> [0]
207+ ||= Web::Encoding::UnivCharDet::CharsetProber::MBCSGroup-> new
208+ ($self -> {lang_filter });
209+ $self -> {charset_probers }-> [1]
210+ ||= Web::Encoding::UnivCharDet::CharsetProber::SBCSGroup-> new
211+ if $self -> {lang_filter } & Web::Encoding::UnivCharDet::Defs::FILTER_NON_CJK;
212+ $self -> {charset_probers }-> [2]
213+ ||= Web::Encoding::UnivCharDet::CharsetProber::Latin1-> new
214+ unless $self -> {lang_filter } & Web::Encoding::UnivCharDet::Defs::FILTER_NON_CJK;
215+ $self -> {charset_probers }-> [3]
216+ ||= Web::Encoding::UnivCharDet::CharsetProber::Vietnamese-> new
217+ if $self -> {lang_filter } & Web::Encoding::UnivCharDet::Defs::FILTER_NON_CJK;
218+ } # $high
219+
220+ if ($self -> {win1252_refs } > 10 and $self -> {unicode_refs } < 10) {
221+ for (grep { defined $_ } @{$self -> {charset_probers }}) {
222+ $_ -> set_resolve_latin1_refs (1);
223+ }
224+ $self -> {resolve_latin1_refs } = ' windows-1252' ;
225+ # } elsif ($self->{win1250_refs} > 10 and $self->{unicode_refs} < 10) {
226+ # for (grep { defined $_ } @{$self->{charset_probers}}) {
227+ # $_->set_resolve_latin1_refs (1);
228+ # }
229+ # $self->{resolve_latin1_refs} = 'windows-1250';
230+ } else {
231+ delete $self -> {resolve_latin1_refs };
232+ }
233+
162234 if ($self -> {utf } and $zero ) {
163235 if ($zero / ($length || 1) > 0.1) { # random threshold
164236 $self -> {charset_probers } = [];
@@ -189,12 +261,51 @@ sub handle_data ($$) {
189261 }
190262 }
191263 } elsif ($self -> {input_state } eq ' high byte' ) {
192- for (grep { defined $_ } @{$self -> {charset_probers }}) {
193- my $st = $_ -> handle_data ($_ [1]);
194- if ($st eq ' found it' ) {
195- $self -> {done } = 1;
196- $self -> {detected_charset } = $_ -> get_charset_name; # non-undef when found
197- return 1;
264+ if (defined $self -> {resolve_latin1_refs }) {
265+ my $x = $_ [1];
266+ if ($self -> {resolve_latin1_refs } eq ' windows-1252' ) {
267+ $x =~ s { &#(12[89]|1[3-9][0-9]|2[0-4][0-9]|25[0-5]);} { pack 'C', $1 } ge ;
268+ $x =~ s { &([A-Za-z0-9]+);} {
269+ if (defined $Web::Encoding::UnivCharDet::Defs::Latin1Entities ->{$1 }) {
270+ chr $Web::Encoding::UnivCharDet::Defs::Latin1Entities ->{$1 };
271+ } else {
272+ ('&'.$1 .';');
273+ }
274+ } ge ;
275+ # } elsif ($self->{resolve_latin1_refs} eq 'windows-1250') {
276+ # $x =~ s{&#([0-9]+);}{
277+ # my $cc = $Web::Encoding::UnivCharDet::Defs::Windows1250Refs->{$1};
278+ # if (defined $cc) {
279+ # pack 'C', $cc;
280+ # } else {
281+ # '&' . $1 . ';';
282+ # }
283+ # }ge;
284+ }
285+ for (grep { defined $_ } @{$self -> {charset_probers }}[0,1]) {
286+ my $st = $_ -> handle_data ($x );
287+ if ($st eq ' found it' ) {
288+ $self -> {done } = 1;
289+ $self -> {detected_charset } = $_ -> get_charset_name; # non-undef when found
290+ return 1;
291+ }
292+ }
293+ for (grep { defined $_ } @{$self -> {charset_probers }}[2,3]) {
294+ my $st = $_ -> handle_data ($_ [1]);
295+ if ($st eq ' found it' ) {
296+ $self -> {done } = 1;
297+ $self -> {detected_charset } = $_ -> get_charset_name; # non-undef when found
298+ return 1;
299+ }
300+ }
301+ } else {
302+ for (grep { defined $_ } @{$self -> {charset_probers }}) {
303+ my $st = $_ -> handle_data ($_ [1]);
304+ if ($st eq ' found it' ) {
305+ $self -> {done } = 1;
306+ $self -> {detected_charset } = $_ -> get_charset_name; # non-undef when found
307+ return 1;
308+ }
198309 }
199310 }
200311 }
@@ -249,7 +360,11 @@ sub get_reported_charset ($) {
249360
250361sub dump_status ($) {
251362 my $self = $_ [0];
252- print " Input state: $self ->{input_state}\n " ;
363+ printf " [%s ] %s (%d %d %d ) %s \n " ,
364+ $self -> {reported } // ' ' ,
365+ $self -> {resolve_latin1_refs } ? ' htmlrefs:' .$self -> {resolve_latin1_refs } : ' ' ,
366+ $self -> {win1250_refs }, $self -> {win1250_refs }, $self -> {unicode_refs },
367+ $self -> {input_state };
253368 $_ -> dump_status for grep { defined $_ }
254369 @{$self -> {charset_probers }},
255370 $self -> {esc_charset_prober },
@@ -266,6 +381,7 @@ sub dump_status_for_json ($) {
266381 @{$self -> {charset_probers }},
267382 $self -> {esc_charset_prober },
268383 $self -> {utf1632_prober }],
384+ htmlrefs => $self -> {resolve_latin1_refs },
269385 reported => $self -> {reported }};
270386} # dump_status_for_json
271387
0 commit comments