Skip to content

Commit d685dd2

Browse files
committed
Fixed detection scores for ASCII-only vietnamese texts; Fixed how detected html-escaped encodings are reported; Drop support for detections of html-escaped encodings with no real-world examples
1 parent 574e9ba commit d685dd2

File tree

2 files changed

+96
-44
lines changed

2 files changed

+96
-44
lines changed

lib/Web/Encoding/UnivCharDet.pm

Lines changed: 40 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,8 @@ sub reset ($) {
6565
$self->{done} = 0;
6666
$self->{best_guess} = -1;
6767
$self->{start} = 1;
68-
$self->{detected_charset} = undef;
68+
delete $self->{detected_charset};
69+
delete $self->{font_charset};
6970
$self->{got_data} = undef;
7071
$self->{input_state} = 'pure ascii';
7172
$self->{last_char} = 0x00;
@@ -218,17 +219,25 @@ sub handle_data ($$) {
218219
} # $high
219220

220221
if ($self->{win1252_refs} > 10 and $self->{unicode_refs} < 10) {
221-
for (grep { defined $_ } @{$self->{charset_probers}}) {
222-
$_->set_resolve_latin1_refs (1);
222+
$self->{charset_probers}->[4]
223+
||= Web::Encoding::UnivCharDet::CharsetProber::MBCSGroup->new
224+
($self->{lang_filter}, resolve_latin1_refs => 1);
225+
if ($self->{lang_filter} & Web::Encoding::UnivCharDet::Defs::FILTER_NON_CJK) {
226+
$self->{charset_probers}->[5]
227+
||= Web::Encoding::UnivCharDet::CharsetProber::SBCSGroup->new
228+
(resolve_latin1_refs => 1);
229+
}
230+
if ($self->{lang_filter} & Web::Encoding::UnivCharDet::Defs::FILTER_NON_CJK) {
231+
$self->{charset_probers}->[6]
232+
||= Web::Encoding::UnivCharDet::CharsetProber::Vietnamese->new
233+
(resolve_latin1_refs => 1);
223234
}
224235
$self->{resolve_latin1_refs} = 'windows-1252';
225-
#} elsif ($self->{win1250_refs} > 10 and $self->{unicode_refs} < 10) {
226-
# for (grep { defined $_ } @{$self->{charset_probers}}) {
227-
# $_->set_resolve_latin1_refs (1);
228-
# }
229-
# $self->{resolve_latin1_refs} = 'windows-1250';
230236
} else {
231237
delete $self->{resolve_latin1_refs};
238+
delete $self->{charset_probers}->[4];
239+
delete $self->{charset_probers}->[5];
240+
delete $self->{charset_probers}->[6];
232241
}
233242

234243
if ($self->{utf} and $zero) {
@@ -272,29 +281,21 @@ sub handle_data ($$) {
272281
('&'.$1.';');
273282
}
274283
}ge;
275-
#} elsif ($self->{resolve_latin1_refs} eq 'windows-1250') {
276-
# $x =~ s{&#([0-9]+);}{
277-
# my $cc = $Web::Encoding::UnivCharDet::Defs::Windows1250Refs->{$1};
278-
# if (defined $cc) {
279-
# pack 'C', $cc;
280-
# } else {
281-
# '&' . $1 . ';';
282-
# }
283-
# }ge;
284284
}
285-
for (grep { defined $_ } @{$self->{charset_probers}}[0,1]) {
286-
my $st = $_->handle_data ($x);
285+
for (grep { defined $_ } @{$self->{charset_probers}}[0..3]) {
286+
my $st = $_->handle_data ($_[1]);
287287
if ($st eq 'found it') {
288288
$self->{done} = 1;
289289
$self->{detected_charset} = $_->get_charset_name; # non-undef when found
290290
return 1;
291291
}
292292
}
293-
for (grep { defined $_ } @{$self->{charset_probers}}[2,3]) {
294-
my $st = $_->handle_data ($_[1]);
293+
for (grep { defined $_ } @{$self->{charset_probers}}[4..6]) {
294+
my $st = $_->handle_data ($x);
295295
if ($st eq 'found it') {
296296
$self->{done} = 1;
297-
$self->{detected_charset} = $_->get_charset_name; # non-undef when found
297+
$self->{detected_charset} = 'windows-1252';
298+
$self->{font_charset} = $_->get_charset_name; # non-undef when found
298299
return 1;
299300
}
300301
}
@@ -338,7 +339,16 @@ sub data_end ($) {
338339
}
339340
}
340341
if ($max_prober_confidence > Web::Encoding::UnivCharDet::Defs::MINIMUM_THRESHOLD) {
341-
$self->{reported} = $max_prober->get_charset_name; # or undef
342+
if ($max_prober->{resolve_latin1_refs}) {
343+
$self->{reported} = 'windows-1252';
344+
$self->{font_charset} = $max_prober->get_charset_name; # or undef
345+
if (not defined $self->{font_charset} or
346+
$self->{font_charset} eq 'windows-1252') {
347+
delete $self->{font_charset};
348+
}
349+
} else {
350+
$self->{reported} = $max_prober->get_charset_name; # or undef
351+
}
342352
}
343353
} elsif ($self->{input_state} eq 'pure ascii' or
344354
$self->{input_state} eq 'esc ascii') {
@@ -358,18 +368,22 @@ sub get_reported_charset ($) {
358368
return $_[0]->{reported};
359369
} # get_reported_charset
360370

371+
sub get_reported_font_charset ($) {
372+
return $_[0]->{font_charset};
373+
} # get_reported_font_charset
374+
361375
sub dump_status ($) {
362376
my $self = $_[0];
363377
printf "[%s] %s (%d %d %d) %s\n",
364378
$self->{reported} // '',
365-
$self->{resolve_latin1_refs} ? 'htmlrefs:'.$self->{resolve_latin1_refs} : '',
379+
defined $self->{font_charset} ? 'html:'.$self->{font_charset} : '',
366380
$self->{win1250_refs}, $self->{win1250_refs}, $self->{unicode_refs},
367381
$self->{input_state};
368382
$_->dump_status for grep { defined $_ }
369383
@{$self->{charset_probers}},
370384
$self->{esc_charset_prober},
371385
$self->{utf1632_prober};
372-
print "Reported: @{[$self->{reported} // '']}\n";
386+
print "Reported: @{[$self->{reported} // '']} @{[defined $self->{font_charset} ? 'html:'.$self->{font_charset} : '']}\n";
373387
} # dump_status
374388

375389
sub dump_status_for_json ($) {
@@ -381,7 +395,7 @@ sub dump_status_for_json ($) {
381395
@{$self->{charset_probers}},
382396
$self->{esc_charset_prober},
383397
$self->{utf1632_prober}],
384-
htmlrefs => $self->{resolve_latin1_refs},
398+
font_charset => $self->{font_charset},
385399
reported => $self->{reported}};
386400
} # dump_status_for_json
387401

lib/Web/Encoding/UnivCharDet/CharsetProber.pm

Lines changed: 56 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -233,9 +233,11 @@ package Web::Encoding::UnivCharDet::CharsetProber::SBCSGroup;
233233
push our @ISA, qw(Web::Encoding::UnivCharDet::CharsetProber);
234234
our $VERSION = '1.0';
235235

236-
sub new ($) {
237-
my $self = bless {}, $_[0];
236+
sub new ($;%) {
237+
my $self = bless {}, shift;
238+
my %args = @_;
238239
$self->reset;
240+
$self->set_resolve_latin1_refs (1) if $args{resolve_latin1_refs};
239241
return $self;
240242
} # new
241243

@@ -770,40 +772,63 @@ sub new ($$;%) {
770772
my $self = bless {}, shift;
771773
my $filter = shift;
772774
my %args = @_;
773-
774-
$self->{probers} = [
775-
Web::Encoding::UnivCharDet::CharsetProber::UTF8->new,
776-
$filter & Web::Encoding::UnivCharDet::Defs::FILTER_JAPANESE
775+
776+
if ($args{set_resolve_latin1_refs}) {
777+
$self->{probers} = [
778+
undef,
779+
undef,
780+
undef,
781+
$filter & Web::Encoding::UnivCharDet::Defs::FILTER_CHINESE_SIMPLIFIED
782+
? Web::Encoding::UnivCharDet::CharsetProber::GB18030->new
783+
($filter == Web::Encoding::UnivCharDet::Defs::FILTER_CHINESE_SIMPLIFIED)
784+
: undef,
785+
$filter & Web::Encoding::UnivCharDet::Defs::FILTER_KOREAN
786+
? Web::Encoding::UnivCharDet::CharsetProber::EUCKR->new
787+
($filter == Web::Encoding::UnivCharDet::Defs::FILTER_KOREAN)
788+
: undef,
789+
$filter & Web::Encoding::UnivCharDet::Defs::FILTER_CHINESE_TRADITIONAL
790+
? Web::Encoding::UnivCharDet::CharsetProber::Big5->new
791+
($filter == Web::Encoding::UnivCharDet::Defs::FILTER_CHINESE_TRADITIONAL)
792+
: undef,
793+
undef,
794+
undef,
795+
];
796+
} else {
797+
$self->{probers} = [
798+
Web::Encoding::UnivCharDet::CharsetProber::UTF8->new,
799+
$filter & Web::Encoding::UnivCharDet::Defs::FILTER_JAPANESE
777800
? Web::Encoding::UnivCharDet::CharsetProber::SJIS->new
778801
($filter == Web::Encoding::UnivCharDet::Defs::FILTER_JAPANESE)
779802
: undef,
780-
$filter & Web::Encoding::UnivCharDet::Defs::FILTER_JAPANESE
803+
$filter & Web::Encoding::UnivCharDet::Defs::FILTER_JAPANESE
781804
? Web::Encoding::UnivCharDet::CharsetProber::EUCJP->new
782805
($filter == Web::Encoding::UnivCharDet::Defs::FILTER_JAPANESE)
783806
: undef,
784-
$filter & Web::Encoding::UnivCharDet::Defs::FILTER_CHINESE_SIMPLIFIED
807+
$filter & Web::Encoding::UnivCharDet::Defs::FILTER_CHINESE_SIMPLIFIED
785808
? Web::Encoding::UnivCharDet::CharsetProber::GB18030->new
786809
($filter == Web::Encoding::UnivCharDet::Defs::FILTER_CHINESE_SIMPLIFIED)
787810
: undef,
788-
$filter & Web::Encoding::UnivCharDet::Defs::FILTER_KOREAN
811+
$filter & Web::Encoding::UnivCharDet::Defs::FILTER_KOREAN
789812
? Web::Encoding::UnivCharDet::CharsetProber::EUCKR->new
790813
($filter == Web::Encoding::UnivCharDet::Defs::FILTER_KOREAN)
791814
: undef,
792-
$filter & Web::Encoding::UnivCharDet::Defs::FILTER_CHINESE_TRADITIONAL
815+
$filter & Web::Encoding::UnivCharDet::Defs::FILTER_CHINESE_TRADITIONAL
793816
? Web::Encoding::UnivCharDet::CharsetProber::Big5->new
794817
($filter == Web::Encoding::UnivCharDet::Defs::FILTER_CHINESE_TRADITIONAL)
795818
: undef,
796-
$filter & Web::Encoding::UnivCharDet::Defs::FILTER_CHINESE_TRADITIONAL
819+
$filter & Web::Encoding::UnivCharDet::Defs::FILTER_CHINESE_TRADITIONAL
797820
? Web::Encoding::UnivCharDet::CharsetProber::EUCTW->new
798821
($filter == Web::Encoding::UnivCharDet::Defs::FILTER_CHINESE_TRADITIONAL)
799822
: undef,
800-
$filter & Web::Encoding::UnivCharDet::Defs::FILTER_KOREAN
823+
$filter & Web::Encoding::UnivCharDet::Defs::FILTER_KOREAN
801824
? Web::Encoding::UnivCharDet::JohabCharsetProber->new
802825
($filter == Web::Encoding::UnivCharDet::Defs::FILTER_KOREAN)
803826
: undef,
804-
];
827+
];
828+
}
805829

806830
$self->reset;
831+
$self->set_resolve_latin1_refs (1) if $args{resolve_latin1_refs};
807832
return $self;
808833
} # new
809834

@@ -1352,9 +1377,11 @@ package Web::Encoding::UnivCharDet::CharsetProber::Vietnamese;
13521377
push our @ISA, qw(Web::Encoding::UnivCharDet::CharsetProber);
13531378
our $VERSION = '1.0';
13541379

1355-
sub new ($$) {
1356-
my $self = bless {}, $_[0];
1380+
sub new ($;%) {
1381+
my $self = bless {}, shift;
1382+
my %args = @_;
13571383
$self->reset;
1384+
$self->set_resolve_latin1_refs (1) if $args{resolve_latin1_refs};
13581385
return $self;
13591386
} # new
13601387

@@ -1475,11 +1502,12 @@ sub handle_data ($$) {
14751502
} # TBL
14761503
}
14771504

1478-
my $selected = [grep { $_ == 0 } @{$self->{notme}}];
1505+
my $selected = [grep { $self->{notme}->[$_] == 0 } 0..$#{$self->{notme}}];
14791506
if (@$selected == 0) {
14801507
$self->{state} = 'not me';
14811508
} elsif (@$selected == 1) {
1482-
if ($self->get_confidence > 0.8) {
1509+
if ($self->get_confidence > 0.8 and
1510+
$self->{words}->[$selected->[0]]->[3] == 0) {
14831511
$self->{state} = 'found it';
14841512
}
14851513
}
@@ -1510,7 +1538,8 @@ sub get_confidence ($;$) {
15101538
if ($A1 > 0) {
15111539
my $conf = $self->{probers}->[$charset]->get_confidence;
15121540
if ($conf > 0.9) {
1513-
push @answer, [$charset, $conf, $A4];
1541+
## Words are ASCII-only but there are non-ASCII punctuations
1542+
push @answer, [$charset, 0.6, $A4];
15141543
}
15151544
}
15161545
next;
@@ -1540,6 +1569,15 @@ sub get_confidence ($;$) {
15401569

15411570
#my $conf = 1 - exp(-0.05 * ($A1 + $A2));
15421571
#$raw *= $conf;
1572+
1573+
#my $bad_ratio = ($A3 + $A4) / ($T + 1e-6);
1574+
#my $foreign_penalty_base = 1 / (1 + exp(-12 * ($bad_ratio - 0.35)));
1575+
#my $length_factor = 1 - exp(-0.3 * $T);
1576+
#my $a2_factor = 1 / (1 + exp(-2 * ($A2 - 1)));
1577+
#my $foreign_penalty = $foreign_penalty_base
1578+
# * $length_factor
1579+
# * (1 - 0.5 * $a2_factor);
1580+
#$raw -= 0.6 * $foreign_penalty;
15431581

15441582
$score = 1 / (1 + exp(-5 * ($raw - 0.1)));
15451583
$score = 0 if $score < 0;

0 commit comments

Comments
 (0)