Skip to content

Commit 228d763

Browse files
committed
Added support for detection of Latin1 HTML character reference encoded 8-bit bytes
1 parent c5a17b7 commit 228d763

File tree

3 files changed

+608
-58
lines changed

3 files changed

+608
-58
lines changed

lib/Web/Encoding/UnivCharDet.pm

Lines changed: 139 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,10 @@ sub reset ($) {
7777
#delete $self->{nbsp_found};
7878
delete $self->{esc_found};
7979
delete $self->{binary_found};
80+
$self->{win1250_refs} = 0;
81+
$self->{win1252_refs} = 0;
82+
$self->{unicode_refs} = 0;
83+
delete $self->{resolve_latin1_refs};
8084
} # reset
8185

8286
sub handle_data ($$) {
@@ -116,6 +120,7 @@ sub handle_data ($$) {
116120

117121
my $length = length $_[1];
118122
my $zero = 0;
123+
my $high = 0;
119124
for my $i (0..($length - 1)) {
120125
my $c = ord substr $_[1], $i, 1;
121126
$zero++ if $c == 0x00;
@@ -125,22 +130,12 @@ sub handle_data ($$) {
125130
if ($c & 0x80 and $c != 0xA0) {
126131
if ($self->{input_state} ne 'high byte') {
127132
$self->{input_state} = 'high byte';
128-
delete $self->{esc_charset_prober};
129-
delete $self->{utf1632_prober};
130-
131-
$self->{charset_probers}->[0]
132-
||= Web::Encoding::UnivCharDet::CharsetProber::MBCSGroup->new
133-
($self->{lang_filter});
134-
$self->{charset_probers}->[1]
135-
||= Web::Encoding::UnivCharDet::CharsetProber::SBCSGroup->new
136-
if $self->{lang_filter} & Web::Encoding::UnivCharDet::Defs::FILTER_NON_CJK;
137-
$self->{charset_probers}->[2]
138-
||= Web::Encoding::UnivCharDet::CharsetProber::Latin1->new
139-
unless $self->{lang_filter} & Web::Encoding::UnivCharDet::Defs::FILTER_NON_CJK;
140-
$self->{charset_probers}->[3]
141-
||= Web::Encoding::UnivCharDet::CharsetProber::Vietnamese->new
142-
if $self->{lang_filter} & Web::Encoding::UnivCharDet::Defs::FILTER_NON_CJK;
133+
$high = 1;
143134
}
135+
delete $self->{amp};
136+
} elsif ($c == 0x26) {
137+
$self->{amp} = '';
138+
$self->{last_char} = $c;
144139
} else {
145140
if ($self->{input_state} eq 'pure ascii') {
146141
if ($c == 0x1B or $c == 0x0E or $c == 0x0F) {
@@ -154,11 +149,88 @@ sub handle_data ($$) {
154149
$c == 0x7F) {
155150
$self->{binary_found} = 1;
156151
}
152+
$self->{last_char} = $c;
153+
}
154+
if (defined $self->{amp}) {
155+
if ($c == 0x3B) {
156+
if (defined $Web::Encoding::UnivCharDet::Defs::Latin1Entities->{$self->{amp}}) {
157+
$self->{win1252_refs}++;
158+
if ($self->{amp} =~ /^#([0-9]+)$/ and
159+
$Web::Encoding::UnivCharDet::Defs::Windows1250Refs->{$1}) {
160+
$self->{win1250_refs}++;
161+
}
162+
} elsif ($self->{amp} =~ /^#([0-9]+)$/) {
163+
my $cc = $1;
164+
if ($cc > 0xFF) {
165+
if ($Web::Encoding::UnivCharDet::Defs::Windows1250Refs->{$cc}) {
166+
$self->{win1250_refs}++;
167+
} else {
168+
$self->{unicode_refs}++;
169+
}
170+
} elsif (0x80 <= $cc) {
171+
if ($Web::Encoding::UnivCharDet::Defs::Windows1250Refs->{$cc}) {
172+
$self->{win1250_refs}++;
173+
}
174+
$self->{win1252_refs}++;
175+
}
176+
}
177+
delete $self->{amp};
178+
} elsif ($c == 0x23 and $self->{amp} eq '') { # &#
179+
$self->{amp} .= chr $c;
180+
} elsif (10 < length $self->{amp}) {
181+
delete $self->{amp};
182+
} elsif (0x30 <= $c and $c <= 0x39) {
183+
$self->{amp} .= chr $c;
184+
} elsif (0x41 <= $c and $c <= 0x5A) {
185+
$self->{amp} .= chr $c;
186+
} elsif (0x61 <= $c and $c <= 0x7A) {
187+
$self->{amp} .= chr $c;
188+
} else {
189+
delete $self->{amp};
190+
}
157191
}
158-
$self->{last_char} = $c;
159192
}
160193
} # $i
161194

195+
if ($self->{input_state} eq 'pure ascii' and
196+
$self->{unicode_refs} < 10 and
197+
$self->{win1250_refs} + $self->{win1252_refs} > 10) {
198+
$self->{input_state} = 'high byte';
199+
$high = 1;
200+
}
201+
202+
if ($high) {
203+
delete $self->{esc_charset_prober};
204+
delete $self->{utf1632_prober};
205+
206+
$self->{charset_probers}->[0]
207+
||= Web::Encoding::UnivCharDet::CharsetProber::MBCSGroup->new
208+
($self->{lang_filter});
209+
$self->{charset_probers}->[1]
210+
||= Web::Encoding::UnivCharDet::CharsetProber::SBCSGroup->new
211+
if $self->{lang_filter} & Web::Encoding::UnivCharDet::Defs::FILTER_NON_CJK;
212+
$self->{charset_probers}->[2]
213+
||= Web::Encoding::UnivCharDet::CharsetProber::Latin1->new
214+
unless $self->{lang_filter} & Web::Encoding::UnivCharDet::Defs::FILTER_NON_CJK;
215+
$self->{charset_probers}->[3]
216+
||= Web::Encoding::UnivCharDet::CharsetProber::Vietnamese->new
217+
if $self->{lang_filter} & Web::Encoding::UnivCharDet::Defs::FILTER_NON_CJK;
218+
} # $high
219+
220+
if ($self->{win1252_refs} > 10 and $self->{unicode_refs} < 10) {
221+
for (grep { defined $_ } @{$self->{charset_probers}}) {
222+
$_->set_resolve_latin1_refs (1);
223+
}
224+
$self->{resolve_latin1_refs} = 'windows-1252';
225+
#} elsif ($self->{win1250_refs} > 10 and $self->{unicode_refs} < 10) {
226+
# for (grep { defined $_ } @{$self->{charset_probers}}) {
227+
# $_->set_resolve_latin1_refs (1);
228+
# }
229+
# $self->{resolve_latin1_refs} = 'windows-1250';
230+
} else {
231+
delete $self->{resolve_latin1_refs};
232+
}
233+
162234
if ($self->{utf} and $zero) {
163235
if ($zero / ($length || 1) > 0.1) { # random threshold
164236
$self->{charset_probers} = [];
@@ -189,12 +261,51 @@ sub handle_data ($$) {
189261
}
190262
}
191263
} elsif ($self->{input_state} eq 'high byte') {
192-
for (grep { defined $_ } @{$self->{charset_probers}}) {
193-
my $st = $_->handle_data ($_[1]);
194-
if ($st eq 'found it') {
195-
$self->{done} = 1;
196-
$self->{detected_charset} = $_->get_charset_name; # non-undef when found
197-
return 1;
264+
if (defined $self->{resolve_latin1_refs}) {
265+
my $x = $_[1];
266+
if ($self->{resolve_latin1_refs} eq 'windows-1252') {
267+
$x =~ s{&#(12[89]|1[3-9][0-9]|2[0-4][0-9]|25[0-5]);}{pack 'C', $1}ge;
268+
$x =~ s{&([A-Za-z0-9]+);}{
269+
if (defined $Web::Encoding::UnivCharDet::Defs::Latin1Entities->{$1}) {
270+
chr $Web::Encoding::UnivCharDet::Defs::Latin1Entities->{$1};
271+
} else {
272+
('&'.$1.';');
273+
}
274+
}ge;
275+
#} elsif ($self->{resolve_latin1_refs} eq 'windows-1250') {
276+
# $x =~ s{&#([0-9]+);}{
277+
# my $cc = $Web::Encoding::UnivCharDet::Defs::Windows1250Refs->{$1};
278+
# if (defined $cc) {
279+
# pack 'C', $cc;
280+
# } else {
281+
# '&' . $1 . ';';
282+
# }
283+
# }ge;
284+
}
285+
for (grep { defined $_ } @{$self->{charset_probers}}[0,1]) {
286+
my $st = $_->handle_data ($x);
287+
if ($st eq 'found it') {
288+
$self->{done} = 1;
289+
$self->{detected_charset} = $_->get_charset_name; # non-undef when found
290+
return 1;
291+
}
292+
}
293+
for (grep { defined $_ } @{$self->{charset_probers}}[2,3]) {
294+
my $st = $_->handle_data ($_[1]);
295+
if ($st eq 'found it') {
296+
$self->{done} = 1;
297+
$self->{detected_charset} = $_->get_charset_name; # non-undef when found
298+
return 1;
299+
}
300+
}
301+
} else {
302+
for (grep { defined $_ } @{$self->{charset_probers}}) {
303+
my $st = $_->handle_data ($_[1]);
304+
if ($st eq 'found it') {
305+
$self->{done} = 1;
306+
$self->{detected_charset} = $_->get_charset_name; # non-undef when found
307+
return 1;
308+
}
198309
}
199310
}
200311
}
@@ -249,7 +360,11 @@ sub get_reported_charset ($) {
249360

250361
sub dump_status ($) {
251362
my $self = $_[0];
252-
print "Input state: $self->{input_state}\n";
363+
printf "[%s] %s (%d %d %d) %s\n",
364+
$self->{reported} // '',
365+
$self->{resolve_latin1_refs} ? 'htmlrefs:'.$self->{resolve_latin1_refs} : '',
366+
$self->{win1250_refs}, $self->{win1250_refs}, $self->{unicode_refs},
367+
$self->{input_state};
253368
$_->dump_status for grep { defined $_ }
254369
@{$self->{charset_probers}},
255370
$self->{esc_charset_prober},
@@ -266,6 +381,7 @@ sub dump_status_for_json ($) {
266381
@{$self->{charset_probers}},
267382
$self->{esc_charset_prober},
268383
$self->{utf1632_prober}],
384+
htmlrefs => $self->{resolve_latin1_refs},
269385
reported => $self->{reported}};
270386
} # dump_status_for_json
271387

0 commit comments

Comments
 (0)