@@ -65,7 +65,8 @@ sub reset ($) {
6565 $self -> {done } = 0;
6666 $self -> {best_guess } = -1;
6767 $self -> {start } = 1;
68- $self -> {detected_charset } = undef ;
68+ delete $self -> {detected_charset };
69+ delete $self -> {font_charset };
6970 $self -> {got_data } = undef ;
7071 $self -> {input_state } = ' pure ascii' ;
7172 $self -> {last_char } = 0x00;
@@ -218,17 +219,25 @@ sub handle_data ($$) {
218219 } # $high
219220
220221 if ($self -> {win1252_refs } > 10 and $self -> {unicode_refs } < 10) {
221- for (grep { defined $_ } @{$self -> {charset_probers }}) {
222- $_ -> set_resolve_latin1_refs (1);
222+ $self -> {charset_probers }-> [4]
223+ ||= Web::Encoding::UnivCharDet::CharsetProber::MBCSGroup-> new
224+ ($self -> {lang_filter }, resolve_latin1_refs => 1);
225+ if ($self -> {lang_filter } & Web::Encoding::UnivCharDet::Defs::FILTER_NON_CJK) {
226+ $self -> {charset_probers }-> [5]
227+ ||= Web::Encoding::UnivCharDet::CharsetProber::SBCSGroup-> new
228+ (resolve_latin1_refs => 1);
229+ }
230+ if ($self -> {lang_filter } & Web::Encoding::UnivCharDet::Defs::FILTER_NON_CJK) {
231+ $self -> {charset_probers }-> [6]
232+ ||= Web::Encoding::UnivCharDet::CharsetProber::Vietnamese-> new
233+ (resolve_latin1_refs => 1);
223234 }
224235 $self -> {resolve_latin1_refs } = ' windows-1252' ;
225- # } elsif ($self->{win1250_refs} > 10 and $self->{unicode_refs} < 10) {
226- # for (grep { defined $_ } @{$self->{charset_probers}}) {
227- # $_->set_resolve_latin1_refs (1);
228- # }
229- # $self->{resolve_latin1_refs} = 'windows-1250';
230236 } else {
231237 delete $self -> {resolve_latin1_refs };
238+ delete $self -> {charset_probers }-> [4];
239+ delete $self -> {charset_probers }-> [5];
240+ delete $self -> {charset_probers }-> [6];
232241 }
233242
234243 if ($self -> {utf } and $zero ) {
@@ -272,29 +281,21 @@ sub handle_data ($$) {
272281 ('&'.$1 .';');
273282 }
274283 } ge ;
275- # } elsif ($self->{resolve_latin1_refs} eq 'windows-1250') {
276- # $x =~ s{&#([0-9]+);}{
277- # my $cc = $Web::Encoding::UnivCharDet::Defs::Windows1250Refs->{$1};
278- # if (defined $cc) {
279- # pack 'C', $cc;
280- # } else {
281- # '&' . $1 . ';';
282- # }
283- # }ge;
284284 }
285- for (grep { defined $_ } @{$self -> {charset_probers }}[0,1 ]) {
286- my $st = $_ -> handle_data ($x );
285+ for (grep { defined $_ } @{$self -> {charset_probers }}[0..3 ]) {
286+ my $st = $_ -> handle_data ($_ [1] );
287287 if ($st eq ' found it' ) {
288288 $self -> {done } = 1;
289289 $self -> {detected_charset } = $_ -> get_charset_name; # non-undef when found
290290 return 1;
291291 }
292292 }
293- for (grep { defined $_ } @{$self -> {charset_probers }}[2,3 ]) {
294- my $st = $_ -> handle_data ($_ [1] );
293+ for (grep { defined $_ } @{$self -> {charset_probers }}[4..6 ]) {
294+ my $st = $_ -> handle_data ($x );
295295 if ($st eq ' found it' ) {
296296 $self -> {done } = 1;
297- $self -> {detected_charset } = $_ -> get_charset_name; # non-undef when found
297+ $self -> {detected_charset } = ' windows-1252' ;
298+ $self -> {font_charset } = $_ -> get_charset_name; # non-undef when found
298299 return 1;
299300 }
300301 }
@@ -338,7 +339,16 @@ sub data_end ($) {
338339 }
339340 }
340341 if ($max_prober_confidence > Web::Encoding::UnivCharDet::Defs::MINIMUM_THRESHOLD) {
341- $self -> {reported } = $max_prober -> get_charset_name; # or undef
342+ if ($max_prober -> {resolve_latin1_refs }) {
343+ $self -> {reported } = ' windows-1252' ;
344+ $self -> {font_charset } = $max_prober -> get_charset_name; # or undef
345+ if (not defined $self -> {font_charset } or
346+ $self -> {font_charset } eq ' windows-1252' ) {
347+ delete $self -> {font_charset };
348+ }
349+ } else {
350+ $self -> {reported } = $max_prober -> get_charset_name; # or undef
351+ }
342352 }
343353 } elsif ($self -> {input_state } eq ' pure ascii' or
344354 $self -> {input_state } eq ' esc ascii' ) {
@@ -358,18 +368,22 @@ sub get_reported_charset ($) {
358368 return $_ [0]-> {reported };
359369} # get_reported_charset
360370
# Accessor for the detector's "font charset" slot.
#
# Takes the detector object as its only argument and returns whatever is
# stored under its `font_charset` key, or undef when that key is absent.
sub get_reported_font_charset ($) {
  my ($self) = @_;
  return $self->{font_charset};
} # get_reported_font_charset
374+
# Print a human-readable snapshot of the detector state to STDOUT.
#
# Output, in order:
#   1. A summary line: reported charset, the "html:<font charset>" label
#      (empty when no font charset is set), the three reference counters
#      (win1252, win1250, unicode) and the current input state.
#   2. The dump_status output of every defined prober: the entries of
#      charset_probers, then esc_charset_prober, then utf1632_prober.
#   3. A final "Reported:" line repeating the reported charset and label.
#
# Takes the detector object as its only argument; the return value is not
# meaningful.
sub dump_status ($) {
  my $self = $_[0];

  # The same label is used by both the summary line and the final line.
  my $font_label = defined $self->{font_charset}
      ? 'html:' . $self->{font_charset} : '';
  my $reported = $self->{reported} // '';

  # The first counter is win1252_refs; win1250_refs used to be printed
  # twice here, which hid the counter that drives windows-1252 resolution.
  printf "[%s] %s (%d %d %d) %s\n",
      $reported,
      $font_label,
      $self->{win1252_refs}, $self->{win1250_refs}, $self->{unicode_refs},
      $self->{input_state};

  # Prober slots may be undef (not created, or deleted), so skip those.
  my @probers = grep { defined $_ }
      @{$self->{charset_probers}},
      $self->{esc_charset_prober},
      $self->{utf1632_prober};
  $_->dump_status for @probers;

  print "Reported: $reported $font_label\n";
} # dump_status
374388
375389sub dump_status_for_json ($) {
@@ -381,7 +395,7 @@ sub dump_status_for_json ($) {
381395 @{$self -> {charset_probers }},
382396 $self -> {esc_charset_prober },
383397 $self -> {utf1632_prober }],
384- htmlrefs => $self -> {resolve_latin1_refs },
398+ font_charset => $self -> {font_charset },
385399 reported => $self -> {reported }};
386400} # dump_status_for_json
387401
0 commit comments