@@ -250,7 +250,6 @@ sub reset ($) {
250250 $Web::Encoding::UnivCharDet::Defs::Iso_8859_5BulgarianModel ,
251251 $Web::Encoding::UnivCharDet::Defs::Win1251BulgarianModel ,
252252 $Web::Encoding::UnivCharDet::Defs::TIS620ThaiModel ,
253- $Web::Encoding::UnivCharDet::Defs::VisciiVietnameseModel ,
254253 $Web::Encoding::UnivCharDet::Defs::Windows_1256ArabicModel ,
255254 $Web::Encoding::UnivCharDet::Defs::Iso_8859_6ArabicModel ,
256255 $Web::Encoding::UnivCharDet::Defs::Georgian_AcademyGeorgianModel ,
@@ -348,7 +347,7 @@ sub handle_data ($$) {
348347 $Web::Encoding::UnivCharDet::Defs::Iso_8859_16RomanianModel ,
349348 # $Web::Encoding::UnivCharDet::Defs::Iso_8859_9TurkishModel,
350349 $Web::Encoding::UnivCharDet::Defs::Windows_1254TurkishModel ,
351- $Web::Encoding::UnivCharDet::Defs::Windows_1258VietnameseModel ,
350+ # $Web::Encoding::UnivCharDet::Defs::Windows_1258VietnameseModel,
352351 $Web::Encoding::UnivCharDet::Defs::Mac_CentraleuropeCzechModel ,
353352 $Web::Encoding::UnivCharDet::Defs::Ibm852CzechModel ,
354353 $Web::Encoding::UnivCharDet::Defs::Ibm852PolishModel ,
@@ -407,15 +406,19 @@ sub get_confidence ($) {
407406 my $cn = {};
408407 for my $i (@$best_i ) {
409408 my $charset = $self -> {probers }-> [$i ]-> get_charset_name;
410- $cc -> {$charset }++;
411- $cn -> {$charset } //= $i ;
409+ if (defined $charset ) {
410+ $cc -> {$charset }++;
411+ $cn -> {$charset } //= $i ;
412+ }
412413 }
413414 my $charset = [sort { $cc -> {$b } <=> $cc -> {$a } || $a cmp $b } keys %$cc ]-> [0];
414- $self -> {best_guess } = $cn -> {$charset };
415- if ($best_conf < 0.21 and $self -> {latin } and $charset eq ' windows-1252' ) {
416- $best_conf = 0.21;
417- } elsif ($best_conf <= 0.01) {
418- $self -> {best_guess } = -1;
415+ if (defined $charset ) {
416+ $self -> {best_guess } = $cn -> {$charset };
417+ if ($best_conf < 0.21 and $self -> {latin } and $charset eq ' windows-1252' ) {
418+ $best_conf = 0.21;
419+ } elsif ($best_conf <= 0.01) {
420+ $self -> {best_guess } = -1;
421+ }
419422 }
420423 }
421424 return $best_conf ;
@@ -1307,7 +1310,7 @@ sub handle_data ($$) {
13071310} # handle_data
13081311
13091312sub get_charset_name ($) {
1310- return $_ [0]-> {detected_charset };
1313+ return $_ [0]-> {detected_charset }; # or undef
13111314} # get_charset_name
13121315
13131316sub get_confidence ($) {
@@ -1316,19 +1319,243 @@ sub get_confidence ($) {
13161319
13171320sub dump_status ($) {
13181321 my $self = $_ [0];
1319- printf " [ %s ] %s (%s )\n " ,
1322+ printf " ESC: %s [ %s ] (%s )\n " ,
13201323 $self -> get_confidence,
1321- $self -> get_charset_name,
1324+ $self -> get_charset_name // ' ' ,
13221325 $self -> {state };
13231326} # dump_status
13241327
13251328sub dump_status_for_json ($) {
13261329 my $self = $_ [0];
1327- return {type => $self -> get_charset_name ,
1330+ return {type => ref $self ,
13281331 charset => $self -> get_charset_name,
13291332 confidence => $self -> get_confidence};
13301333} # dump_status_for_json
13311334
1335+ package Web::Encoding::UnivCharDet::CharsetProber::Vietnamese ;
1336+ push our @ISA , qw( Web::Encoding::UnivCharDet::CharsetProber) ;
1337+ our $VERSION = ' 1.0' ;
1338+
1339+ sub new ($$) {
1340+ my $self = bless {}, $_ [0];
1341+ $self -> reset ;
1342+ return $self ;
1343+ } # new
1344+
1345+ sub reset ($) {
1346+ my $self = $_ [0];
1347+ $self -> {state } = ' detecting' ;
1348+ delete $self -> {detected_charset };
1349+ $self -> {vstates } = [$Web::Encoding::UnivCharDet::Defs::VietStateInitial ,
1350+ $Web::Encoding::UnivCharDet::Defs::VietStateInitial ,
1351+ $Web::Encoding::UnivCharDet::Defs::VietStateInitial ,
1352+ $Web::Encoding::UnivCharDet::Defs::VietStateInitial ];
1353+ $self -> {nonascii } = [0, 0, 0, 0];
1354+ $self -> {words } = [[0,0,0,0], [0,0,0,0], [0,0,0,0], [0,0,0,0]];
1355+ $self -> {notme } = [0, 0, 0, 0];
1356+ $self -> {probers } = [
1357+ map { Web::Encoding::UnivCharDet::CharsetProber::SBCS-> new ($_ ) }
1358+ $Web::Encoding::UnivCharDet::Defs::VisciiVietnameseModel ,
1359+ $Web::Encoding::UnivCharDet::Defs::VniVietnameseModel ,
1360+ $Web::Encoding::UnivCharDet::Defs::VpsVietnameseModel ,
1361+ $Web::Encoding::UnivCharDet::Defs::Vn3VietnameseModel ,
1362+ ];
1363+ $self -> {current } = [' ' , ' ' , ' ' , ' ' ];
1364+ } # reset
1365+
1366+ my $Tables = [
1367+ [0, $Web::Encoding::UnivCharDet::Defs::VISCIIClassTable ],
1368+ [1, $Web::Encoding::UnivCharDet::Defs::VNIClassTable ],
1369+ [2, $Web::Encoding::UnivCharDet::Defs::VPSClassTable ],
1370+ [3, $Web::Encoding::UnivCharDet::Defs::VN3ClassTable ],
1371+ ];
1372+ sub handle_data ($$) {
1373+ my $self = $_ [0];
1374+ return $self -> {state } unless $self -> {state } eq ' detecting' ;
1375+
1376+ for my $i (0..((length $_ [1]) - 1)) {
1377+ my $cc = (ord substr $_ [1], $i , 1);
1378+ for (@$Tables ) {
1379+ my $charset = $_ -> [0];
1380+ next if $self -> {notme }-> [$charset ];
1381+
1382+ my $cls = ord substr $_ -> [1], $cc , 1;
1383+
1384+ my $os = $self -> {vstates }-> [$charset ];
1385+ my $ns = ord substr $Web::Encoding::UnivCharDet::Defs::VietStateTable ,
1386+ ($self -> {vstates }-> [$charset ] * $Web::Encoding::UnivCharDet::Defs::VietStateInputs + $cls );
1387+ $self -> {vstates }-> [$charset ] = $ns ;
1388+
1389+ if ((0x20 <= $cc and $cc <= 0x7E) or (0x09 <= $cc and $cc <= 0x0D)) {
1390+ #
1391+ } else {
1392+ $self -> {nonascii }-> [$charset ]++;
1393+ }
1394+
1395+ if (Web::Encoding::UnivCharDet::Defs::IS_VIET_WORD_START ($os , $ns )) {
1396+ $self -> {nonascii }-> [$charset ] = 0;
1397+ $self -> {current }-> [$charset ] = pack ' C' , $cc ;
1398+ }
1399+ if (Web::Encoding::UnivCharDet::Defs::IS_VIET_VWORD_END ($os , $ns )) {
1400+ if ($self -> {nonascii }-> [$charset ]) {
1401+ $self -> {words }-> [$charset ]-> [1]++;
1402+ } else {
1403+ $self -> {words }-> [$charset ]-> [0]++;
1404+ }
1405+ $self -> {probers }-> [$charset ]-> handle_data ($self -> {current }-> [$charset ]);
1406+ }
1407+ if (Web::Encoding::UnivCharDet::Defs::IS_VIET_FWORD_END ($os , $ns )) {
1408+ if ($self -> {nonascii }-> [$charset ]) {
1409+ $self -> {words }-> [$charset ]-> [3]++;
1410+ } else {
1411+ $self -> {words }-> [$charset ]-> [2]++;
1412+ }
1413+ }
1414+ if (Web::Encoding::UnivCharDet::Defs::IS_VIET_NOTME ($os , $ns )) {
1415+ $self -> {notme }-> [$charset ] = 1;
1416+ next ;
1417+ }
1418+ if (Web::Encoding::UnivCharDet::Defs::IS_VIET_VWORD ($ns )) {
1419+ $self -> {current }-> [$charset ] .= pack ' C' , $cc ;
1420+ }
1421+ }
1422+ }
1423+
1424+ my $selected = [grep { $_ == 0 } @{$self -> {notme }}];
1425+ if (@$selected == 0) {
1426+ $self -> {state } = ' not me' ;
1427+ } elsif (@$selected == 1) {
1428+ if ($self -> get_confidence > 0.8) {
1429+ $self -> {state } = ' found it' ;
1430+ }
1431+ }
1432+
1433+ return $self -> {state };
1434+ } # handle_data
1435+
1436+ sub get_charset_name ($) {
1437+ my $self = $_ [0];
1438+ $self -> get_confidence if not defined $self -> {detected_charset };
1439+
1440+ return $self -> {detected_charset }; # or undef
1441+ } # get_charset_name
1442+
1443+ sub get_confidence ($;$) {
1444+ my $self = $_ [0];
1445+ if ($self -> {state } eq ' not me' and not defined $_ [1]) {
1446+ $self -> {detected_charset } = undef ;
1447+ return 0.01;
1448+ }
1449+
1450+ my @answer ;
1451+ for my $charset (defined $_ [1] ? ($_ [1]) : (0..3)) {
1452+ next if $self -> {notme }-> [$charset ];
1453+
1454+ my ($A1 , $A2 , $A3 , $A4 ) = @{$self -> {words }-> [$charset ]};
1455+ if ($A2 == 0 and $A4 == 0) { # ASCII only
1456+ if ($A1 > 0) {
1457+ my $conf = $self -> {probers }-> [$charset ]-> get_confidence;
1458+ if ($conf > 0.9) {
1459+ push @answer , [$charset , $conf , $A4 ];
1460+ }
1461+ }
1462+ next ;
1463+ }
1464+ my $F = 0;
1465+ my $T = $A1 + $A2 + $A3 + $A4 ;
1466+ next if $T == 0;
1467+
1468+ my $score ;
1469+ {
1470+ if ($T < 10) {
1471+ if ($A2 >= 1) {
1472+ $score = 0.95;
1473+ } elsif ($A1 >= 3) {
1474+ $score = 0.8;
1475+ } else {
1476+ $score = 0.5;
1477+ }
1478+ last ;
1479+ }
1480+
1481+ my $v_ratio = ($A1 + $A2 ) / ($T + 1e-6);
1482+ my $v_strength = ($A2 * 2 + $A1 * 0.5) / ($T + 1e-6);
1483+ my $penalty = 0.5 * ($A4 / ($T + 1e-6)) + 0.3 * ($F / ($T + 1e-6));
1484+ my $raw = $v_ratio * (0.6 + 0.4 * $v_strength ) - $penalty ;
1485+ # $raw += 0.15 if $A2 >= 1 && $v_ratio < 0.2;
1486+
1487+ # my $conf = 1 - exp(-0.05 * ($A1 + $A2));
1488+ # $raw *= $conf;
1489+
1490+ $score = 1 / (1 + exp (-5 * ($raw - 0.1)));
1491+ $score = 0 if $score < 0;
1492+ $score = 1 if $score > 1;
1493+
1494+ last ;
1495+ }
1496+
1497+ push @answer , [$charset , $score , $A4 ];
1498+ }
1499+ @answer = sort { $b -> [1] <=> $a -> [1] } @answer ;
1500+ if (@answer > 1 and not $answer [0]-> [0] == 1) { # != vni
1501+ if ($answer [0]-> [2] == 0) {
1502+ @answer = grep { $_ -> [2] == 0 } @answer ;
1503+ }
1504+ @answer = grep { ($answer [0]-> [1] - $_ -> [1]) < 0.1 } @answer ;
1505+ if (@answer > 1) {
1506+ for (@answer ) {
1507+ my $sc = $self -> {probers }-> [$_ -> [0]]-> {seq_counters };
1508+ $_ -> [3] = $sc -> [Web::Encoding::UnivCharDet::CharsetProber::SBCS::POSITIVE_CAT] - $sc -> [Web::Encoding::UnivCharDet::CharsetProber::SBCS::NEGATIVE_CAT]/($sc -> [Web::Encoding::UnivCharDet::CharsetProber::SBCS::POSITIVE_CAT]+0.1)*4;
1509+ # $_->[4] = $self->{probers}->[$_->[0]]->get_confidence;
1510+ }
1511+ @answer = sort { $b -> [3] <=> $a -> [3] } @answer ;
1512+ }
1513+ }
1514+ unless (defined $_ [1]) {
1515+ my $ans = @answer ? [' viscii' , ' x-viet-vni' , ' x-viet-vps' , ' x-viet-tcvn' ]-> [$answer [0]-> [0]] : undef ;
1516+ $self -> {detected_charset } = $ans ; # or undef
1517+ }
1518+
1519+ my $c = $answer [0]-> [1]; # or undef;
1520+ return $c || 0.01;
1521+ } # get_confidence
1522+
1523+ sub dump_status ($) {
1524+ my $self = $_ [0];
1525+ printf " Viet: %s [%s ] (%s )\n " ,
1526+ $self -> get_confidence,
1527+ $self -> get_charset_name // ' ' ,
1528+ $self -> {state };
1529+ for my $charset (0..3) {
1530+ printf " %s [%s ] (%d %d %d %d / %d )\n " ,
1531+ $self -> get_confidence ($charset ),
1532+ [' viscii' , ' x-viet-vni' , ' x-viet-vps' , ' x-viet-tcvn' ]-> [$charset ],
1533+ @{$self -> {words }-> [$charset ]},
1534+ $self -> {vstates }-> [$charset ];
1535+ }
1536+ for (@{$self -> {probers }}) {
1537+ $_ -> dump_status;
1538+ }
1539+ } # dump_status
1540+
1541+ sub dump_status_for_json ($) {
1542+ my $self = $_ [0];
1543+ return {type => ref $self ,
1544+ charset => $self -> get_charset_name // ' ' ,
1545+ confidence => $self -> get_confidence,
1546+ probers => [
1547+ {type => ' viscii' , charset => ' viscii' ,
1548+ confidence => $self -> get_confidence (0)},
1549+ {type => ' vni' , charset => ' x-viet-vni' ,
1550+ confidence => $self -> get_confidence (1)},
1551+ {type => ' vps' , charset => ' x-viet-vps' ,
1552+ confidence => $self -> get_confidence (2)},
1553+ {type => ' vn3' , charset => ' x-viet-tcvn' ,
1554+ confidence => $self -> get_confidence (3)},
1555+ map { $_ -> dump_status_for_json } @{$self -> {probers }},
1556+ ]};
1557+ } # dump_status_for_json
1558+
133215591;
13331560
13341561=head1 LICENSE
0 commit comments