Skip to content

Commit c5a17b7

Browse files
committed
Added vietnamese detection support
1 parent 7efed43 commit c5a17b7

File tree

4 files changed

+543
-24
lines changed

4 files changed

+543
-24
lines changed

lib/Web/Encoding/UnivCharDet.pm

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,9 @@ sub handle_data ($$) {
137137
$self->{charset_probers}->[2]
138138
||= Web::Encoding::UnivCharDet::CharsetProber::Latin1->new
139139
unless $self->{lang_filter} & Web::Encoding::UnivCharDet::Defs::FILTER_NON_CJK;
140+
$self->{charset_probers}->[3]
141+
||= Web::Encoding::UnivCharDet::CharsetProber::Vietnamese->new
142+
if $self->{lang_filter} & Web::Encoding::UnivCharDet::Defs::FILTER_NON_CJK;
140143
}
141144
} else {
142145
if ($self->{input_state} eq 'pure ascii') {

lib/Web/Encoding/UnivCharDet/CharsetProber.pm

Lines changed: 240 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,6 @@ sub reset ($) {
250250
$Web::Encoding::UnivCharDet::Defs::Iso_8859_5BulgarianModel,
251251
$Web::Encoding::UnivCharDet::Defs::Win1251BulgarianModel,
252252
$Web::Encoding::UnivCharDet::Defs::TIS620ThaiModel,
253-
$Web::Encoding::UnivCharDet::Defs::VisciiVietnameseModel,
254253
$Web::Encoding::UnivCharDet::Defs::Windows_1256ArabicModel,
255254
$Web::Encoding::UnivCharDet::Defs::Iso_8859_6ArabicModel,
256255
$Web::Encoding::UnivCharDet::Defs::Georgian_AcademyGeorgianModel,
@@ -348,7 +347,7 @@ sub handle_data ($$) {
348347
$Web::Encoding::UnivCharDet::Defs::Iso_8859_16RomanianModel,
349348
#$Web::Encoding::UnivCharDet::Defs::Iso_8859_9TurkishModel,
350349
$Web::Encoding::UnivCharDet::Defs::Windows_1254TurkishModel,
351-
$Web::Encoding::UnivCharDet::Defs::Windows_1258VietnameseModel,
350+
#$Web::Encoding::UnivCharDet::Defs::Windows_1258VietnameseModel,
352351
$Web::Encoding::UnivCharDet::Defs::Mac_CentraleuropeCzechModel,
353352
$Web::Encoding::UnivCharDet::Defs::Ibm852CzechModel,
354353
$Web::Encoding::UnivCharDet::Defs::Ibm852PolishModel,
@@ -407,15 +406,19 @@ sub get_confidence ($) {
407406
my $cn = {};
408407
for my $i (@$best_i) {
409408
my $charset = $self->{probers}->[$i]->get_charset_name;
410-
$cc->{$charset}++;
411-
$cn->{$charset} //= $i;
409+
if (defined $charset) {
410+
$cc->{$charset}++;
411+
$cn->{$charset} //= $i;
412+
}
412413
}
413414
my $charset = [sort { $cc->{$b} <=> $cc->{$a} || $a cmp $b } keys %$cc]->[0];
414-
$self->{best_guess} = $cn->{$charset};
415-
if ($best_conf < 0.21 and $self->{latin} and $charset eq 'windows-1252') {
416-
$best_conf = 0.21;
417-
} elsif ($best_conf <= 0.01) {
418-
$self->{best_guess} = -1;
415+
if (defined $charset) {
416+
$self->{best_guess} = $cn->{$charset};
417+
if ($best_conf < 0.21 and $self->{latin} and $charset eq 'windows-1252') {
418+
$best_conf = 0.21;
419+
} elsif ($best_conf <= 0.01) {
420+
$self->{best_guess} = -1;
421+
}
419422
}
420423
}
421424
return $best_conf;
@@ -1307,7 +1310,7 @@ sub handle_data ($$) {
13071310
} # handle_data
13081311

13091312
sub get_charset_name ($) {
1310-
return $_[0]->{detected_charset};
1313+
return $_[0]->{detected_charset}; # or undef
13111314
} # get_charset_name
13121315

13131316
sub get_confidence ($) {
@@ -1316,19 +1319,243 @@ sub get_confidence ($) {
13161319

13171320
sub dump_status ($) {
13181321
my $self = $_[0];
1319-
printf "[%s] %s (%s)\n",
1322+
printf " ESC: %s [%s] (%s)\n",
13201323
$self->get_confidence,
1321-
$self->get_charset_name,
1324+
$self->get_charset_name // '',
13221325
$self->{state};
13231326
} # dump_status
13241327

13251328
sub dump_status_for_json ($) {
13261329
my $self = $_[0];
1327-
return {type => $self->get_charset_name,
1330+
return {type => ref $self,
13281331
charset => $self->get_charset_name,
13291332
confidence => $self->get_confidence};
13301333
} # dump_status_for_json
13311334

1335+
package Web::Encoding::UnivCharDet::CharsetProber::Vietnamese;
1336+
push our @ISA, qw(Web::Encoding::UnivCharDet::CharsetProber);
1337+
our $VERSION = '1.0';
1338+
1339+
sub new ($$) {
1340+
my $self = bless {}, $_[0];
1341+
$self->reset;
1342+
return $self;
1343+
} # new
1344+
1345+
sub reset ($) {
1346+
my $self = $_[0];
1347+
$self->{state} = 'detecting';
1348+
delete $self->{detected_charset};
1349+
$self->{vstates} = [$Web::Encoding::UnivCharDet::Defs::VietStateInitial,
1350+
$Web::Encoding::UnivCharDet::Defs::VietStateInitial,
1351+
$Web::Encoding::UnivCharDet::Defs::VietStateInitial,
1352+
$Web::Encoding::UnivCharDet::Defs::VietStateInitial];
1353+
$self->{nonascii} = [0, 0, 0, 0];
1354+
$self->{words} = [[0,0,0,0], [0,0,0,0], [0,0,0,0], [0,0,0,0]];
1355+
$self->{notme} = [0, 0, 0, 0];
1356+
$self->{probers} = [
1357+
map { Web::Encoding::UnivCharDet::CharsetProber::SBCS->new ($_) }
1358+
$Web::Encoding::UnivCharDet::Defs::VisciiVietnameseModel,
1359+
$Web::Encoding::UnivCharDet::Defs::VniVietnameseModel,
1360+
$Web::Encoding::UnivCharDet::Defs::VpsVietnameseModel,
1361+
$Web::Encoding::UnivCharDet::Defs::Vn3VietnameseModel,
1362+
];
1363+
$self->{current} = ['', '', '', ''];
1364+
} # reset
1365+
1366+
my $Tables = [
1367+
[0, $Web::Encoding::UnivCharDet::Defs::VISCIIClassTable],
1368+
[1, $Web::Encoding::UnivCharDet::Defs::VNIClassTable],
1369+
[2, $Web::Encoding::UnivCharDet::Defs::VPSClassTable],
1370+
[3, $Web::Encoding::UnivCharDet::Defs::VN3ClassTable],
1371+
];
1372+
sub handle_data ($$) {
1373+
my $self = $_[0];
1374+
return $self->{state} unless $self->{state} eq 'detecting';
1375+
1376+
for my $i (0..((length $_[1]) - 1)) {
1377+
my $cc = (ord substr $_[1], $i, 1);
1378+
for (@$Tables) {
1379+
my $charset = $_->[0];
1380+
next if $self->{notme}->[$charset];
1381+
1382+
my $cls = ord substr $_->[1], $cc, 1;
1383+
1384+
my $os = $self->{vstates}->[$charset];
1385+
my $ns = ord substr $Web::Encoding::UnivCharDet::Defs::VietStateTable,
1386+
($self->{vstates}->[$charset] * $Web::Encoding::UnivCharDet::Defs::VietStateInputs + $cls);
1387+
$self->{vstates}->[$charset] = $ns;
1388+
1389+
if ((0x20 <= $cc and $cc <= 0x7E) or (0x09 <= $cc and $cc <= 0x0D)) {
1390+
#
1391+
} else {
1392+
$self->{nonascii}->[$charset]++;
1393+
}
1394+
1395+
if (Web::Encoding::UnivCharDet::Defs::IS_VIET_WORD_START ($os, $ns)) {
1396+
$self->{nonascii}->[$charset] = 0;
1397+
$self->{current}->[$charset] = pack 'C', $cc;
1398+
}
1399+
if (Web::Encoding::UnivCharDet::Defs::IS_VIET_VWORD_END ($os, $ns)) {
1400+
if ($self->{nonascii}->[$charset]) {
1401+
$self->{words}->[$charset]->[1]++;
1402+
} else {
1403+
$self->{words}->[$charset]->[0]++;
1404+
}
1405+
$self->{probers}->[$charset]->handle_data ($self->{current}->[$charset]);
1406+
}
1407+
if (Web::Encoding::UnivCharDet::Defs::IS_VIET_FWORD_END ($os, $ns)) {
1408+
if ($self->{nonascii}->[$charset]) {
1409+
$self->{words}->[$charset]->[3]++;
1410+
} else {
1411+
$self->{words}->[$charset]->[2]++;
1412+
}
1413+
}
1414+
if (Web::Encoding::UnivCharDet::Defs::IS_VIET_NOTME ($os, $ns)) {
1415+
$self->{notme}->[$charset] = 1;
1416+
next;
1417+
}
1418+
if (Web::Encoding::UnivCharDet::Defs::IS_VIET_VWORD ($ns)) {
1419+
$self->{current}->[$charset] .= pack 'C', $cc;
1420+
}
1421+
}
1422+
}
1423+
1424+
my $selected = [grep { $_ == 0 } @{$self->{notme}}];
1425+
if (@$selected == 0) {
1426+
$self->{state} = 'not me';
1427+
} elsif (@$selected == 1) {
1428+
if ($self->get_confidence > 0.8) {
1429+
$self->{state} = 'found it';
1430+
}
1431+
}
1432+
1433+
return $self->{state};
1434+
} # handle_data
1435+
1436+
sub get_charset_name ($) {
1437+
my $self = $_[0];
1438+
$self->get_confidence if not defined $self->{detected_charset};
1439+
1440+
return $self->{detected_charset}; # or undef
1441+
} # get_charset_name
1442+
1443+
sub get_confidence ($;$) {
1444+
my $self = $_[0];
1445+
if ($self->{state} eq 'not me' and not defined $_[1]) {
1446+
$self->{detected_charset} = undef;
1447+
return 0.01;
1448+
}
1449+
1450+
my @answer;
1451+
for my $charset (defined $_[1] ? ($_[1]) : (0..3)) {
1452+
next if $self->{notme}->[$charset];
1453+
1454+
my ($A1, $A2, $A3, $A4) = @{$self->{words}->[$charset]};
1455+
if ($A2 == 0 and $A4 == 0) { # ASCII only
1456+
if ($A1 > 0) {
1457+
my $conf = $self->{probers}->[$charset]->get_confidence;
1458+
if ($conf > 0.9) {
1459+
push @answer, [$charset, $conf, $A4];
1460+
}
1461+
}
1462+
next;
1463+
}
1464+
my $F = 0;
1465+
my $T = $A1 + $A2 + $A3 + $A4;
1466+
next if $T == 0;
1467+
1468+
my $score;
1469+
{
1470+
if ($T < 10) {
1471+
if ($A2 >= 1) {
1472+
$score = 0.95;
1473+
} elsif ($A1 >= 3) {
1474+
$score = 0.8;
1475+
} else {
1476+
$score = 0.5;
1477+
}
1478+
last;
1479+
}
1480+
1481+
my $v_ratio = ($A1 + $A2) / ($T + 1e-6);
1482+
my $v_strength = ($A2 * 2 + $A1 * 0.5) / ($T + 1e-6);
1483+
my $penalty = 0.5 * ($A4 / ($T + 1e-6)) + 0.3 * ($F / ($T + 1e-6));
1484+
my $raw = $v_ratio * (0.6 + 0.4 * $v_strength) - $penalty;
1485+
#$raw += 0.15 if $A2 >= 1 && $v_ratio < 0.2;
1486+
1487+
#my $conf = 1 - exp(-0.05 * ($A1 + $A2));
1488+
#$raw *= $conf;
1489+
1490+
$score = 1 / (1 + exp(-5 * ($raw - 0.1)));
1491+
$score = 0 if $score < 0;
1492+
$score = 1 if $score > 1;
1493+
1494+
last;
1495+
}
1496+
1497+
push @answer, [$charset, $score, $A4];
1498+
}
1499+
@answer = sort { $b->[1] <=> $a->[1] } @answer;
1500+
if (@answer > 1 and not $answer[0]->[0] == 1) { # != vni
1501+
if ($answer[0]->[2] == 0) {
1502+
@answer = grep { $_->[2] == 0 } @answer;
1503+
}
1504+
@answer = grep { ($answer[0]->[1] - $_->[1]) < 0.1 } @answer;
1505+
if (@answer > 1) {
1506+
for (@answer) {
1507+
my $sc = $self->{probers}->[$_->[0]]->{seq_counters};
1508+
$_->[3] = $sc->[Web::Encoding::UnivCharDet::CharsetProber::SBCS::POSITIVE_CAT] - $sc->[Web::Encoding::UnivCharDet::CharsetProber::SBCS::NEGATIVE_CAT]/($sc->[Web::Encoding::UnivCharDet::CharsetProber::SBCS::POSITIVE_CAT]+0.1)*4;
1509+
#$_->[4] = $self->{probers}->[$_->[0]]->get_confidence;
1510+
}
1511+
@answer = sort { $b->[3] <=> $a->[3] } @answer;
1512+
}
1513+
}
1514+
unless (defined $_[1]) {
1515+
my $ans = @answer ? ['viscii', 'x-viet-vni', 'x-viet-vps', 'x-viet-tcvn']->[$answer[0]->[0]] : undef;
1516+
$self->{detected_charset} = $ans; # or undef
1517+
}
1518+
1519+
my $c = $answer[0]->[1]; # or undef;
1520+
return $c || 0.01;
1521+
} # get_confidence
1522+
1523+
sub dump_status ($) {
1524+
my $self = $_[0];
1525+
printf " Viet: %s [%s] (%s)\n",
1526+
$self->get_confidence,
1527+
$self->get_charset_name // '',
1528+
$self->{state};
1529+
for my $charset (0..3) {
1530+
printf " %s [%s] (%d %d %d %d / %d)\n",
1531+
$self->get_confidence ($charset),
1532+
['viscii', 'x-viet-vni', 'x-viet-vps', 'x-viet-tcvn']->[$charset],
1533+
@{$self->{words}->[$charset]},
1534+
$self->{vstates}->[$charset];
1535+
}
1536+
for (@{$self->{probers}}) {
1537+
$_->dump_status;
1538+
}
1539+
} # dump_status
1540+
1541+
sub dump_status_for_json ($) {
1542+
my $self = $_[0];
1543+
return {type => ref $self,
1544+
charset => $self->get_charset_name // '',
1545+
confidence => $self->get_confidence,
1546+
probers => [
1547+
{type => 'viscii', charset => 'viscii',
1548+
confidence => $self->get_confidence (0)},
1549+
{type => 'vni', charset => 'x-viet-vni',
1550+
confidence => $self->get_confidence (1)},
1551+
{type => 'vps', charset => 'x-viet-vps',
1552+
confidence => $self->get_confidence (2)},
1553+
{type => 'vn3', charset => 'x-viet-tcvn',
1554+
confidence => $self->get_confidence (3)},
1555+
map { $_->dump_status_for_json } @{$self->{probers}},
1556+
]};
1557+
} # dump_status_for_json
1558+
13321559
1;
13331560

13341561
=head1 LICENSE

0 commit comments

Comments
 (0)