Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
239 changes: 52 additions & 187 deletions Cargo.lock

Large diffs are not rendered by default.

10 changes: 6 additions & 4 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "charset-normalizer-rs"
version = "1.1.0"
version = "1.2.0"
authors = ["Nikolay Yarovoy <nikolay.yarovoy@gmail.com>"]
edition = "2021"
description = "Truly universal encoding detector in pure Rust - port of Python version"
Expand All @@ -27,7 +27,7 @@ chardetng = { version = "0.1.17", optional = true }
clap = { version = "4.4.2", features = ["derive"], optional = true}
counter = "0.7.0"
dialoguer = { version = "0.10.4", optional = true }
encoding = "0.2.33"
encoding_rs = "0.8.5"
env_logger = { version = "0.11.0", optional = true }
icu_normalizer = "1.3.2"
icu_properties = "1.3.2"
Expand All @@ -42,16 +42,18 @@ unicode_names2 = "2.0.0"

[dev-dependencies]
assert_cmd = "2.0.12"
criterion = "0.3"
predicates = "3.0.3"
criterion = "0.7"
predicates = "3.1.3"

[[bench]]
name = "large_payload"
harness = false
required-features = ["performance"]

[[bench]]
name = "large_datasets"
harness = false
required-features = ["performance"]

[features]
cli = ["clap", "dialoguer", "env_logger"]
Expand Down
139 changes: 79 additions & 60 deletions src/assets.rs
Original file line number Diff line number Diff line change
@@ -1,66 +1,85 @@
use crate::entity::Language;
use ahash::HashMap;
use ahash::HashSet;

use once_cell::sync::Lazy;
use std::iter::FromIterator;

/// Lazily-initialized table of per-language reference alphabets.
///
/// Each element is a tuple `(language, alphabet, have_accents, pure_latin)`.
/// The array holds 41 entries; a language may appear more than once
/// (English has two entries and Japanese has three, each with a different alphabet).
///
/// NOTE(review): the alphabets look ordered from most to least frequent
/// character - confirm against the upstream data before reordering anything.
pub(crate) static LANGUAGES: Lazy<[(Language, &'static str, bool, bool); 41]> = Lazy::new(|| {
[
// language, alphabet, have_accents, pure_latin
(Language::English, "eationsrhldcmufpgwbyvkjxzq", false, true, ),
(Language::English, "eationsrhldcumfpgwybvkxjzq", false, true, ),
(Language::German, "enirstadhulgocmbfkwzpvüäöj", true, true, ),
(Language::French, "easnitrluodcpmévgfbhqàxèyj", true, true, ),
(Language::Dutch, "enairtodslghvmukcpbwjzfyxë", true, true, ),
(Language::Italian, "eiaonltrscdupmgvfbzhqèàkyò", true, true, ),
(Language::Polish, "aioenrzwsctkydpmuljłgbhąęó", true, true, ),
(Language::Spanish, "eaonsrildtcumpbgvfyóhqíjzá", true, true, ),
(Language::Russian, "оаеинстрвлкмдпугяызбйьчхжц", false, false, ),
(Language::Japanese, "人一大亅丁丨竹笑口日今二彳行十土丶寸寺時乙丿乂气気冂巾亠市目儿見八小凵県月彐門間木東山出本中刀分耳又取最言田心思刂前京尹事生厶云会未来白冫楽灬馬尸尺駅明耂者了阝都高卜占厂广店子申奄亻俺上方冖学衣艮食自", false, false, ),
(Language::Japanese, "ーンス・ルトリイアラックドシレジタフロカテマィグバムプオコデニウメサビナブャエュチキズダパミェョハセベガモツネボソノァヴワポペピケゴギザホゲォヤヒユヨヘゼヌゥゾヶヂヲヅヵヱヰヮヽ゠ヾヷヿヸヹヺ", false, false, ),
(Language::Japanese, "のにるたとはしいをでてがなれからさっりすあもこまうくよきんめおけそつだやえどわちみせじばへびずろほげむべひょゆぶごゃねふぐぎぼゅづざぞぬぜぱぽぷぴぃぁぇぺゞぢぉぅゐゝゑ゛゜ゎゔ゚ゟ゙ゕゖ", false, false, ),
(Language::Portuguese, "aeosirdntmuclpgvbfhãqéçází", true, true, ),
(Language::Swedish, "eanrtsildomkgvhfupäcböåyjx", true, true, ),
(Language::Chinese, "的一是不了在人有我他这个们中来上大为和国地到以说时要就出会可也你对生能而子那得于着下自之年过发后作里用道行所然家种事成方多经么去法学如都同现当没动面起看定天分还进好小部其些主样理心她本前开但因只从想实", false, false, ),
(Language::Ukrainian, "оаніирвтесклудмпзяьбгйчхцї", false, false, ),
(Language::Norwegian, "erntasioldgkmvfpubhåyjøcæw", false, true, ),
(Language::Finnish, "aintesloukämrvjhpydögcbfwz", true, true, ),
(Language::Vietnamese, "nhticgaoumlràđsevpbyưdákộế", true, true, ),
(Language::Czech, "oeantsilvrkdumpíchzáyjběéř", true, true, ),
(Language::Hungarian, "eatlsnkriozáégmbyvdhupjöfc", true, true, ),
(Language::Korean, "이다에의는로하을가고지서한은기으년대사시를리도인스일", false, false, ),
(Language::Indonesian, "aneirtusdkmlgpbohyjcwfvzxq", false, true, ),
(Language::Turkish, "aeinrlıkdtsmyuobüşvgzhcpçğ", true, true, ),
(Language::Romanian, "eiarntulocsdpmăfvîgbșțzhâj", true, true, ),
(Language::Farsi, "ایردنهومتبسلکشزفگعخقجآپحطص", false, false, ),
(Language::Arabic, "اليمونرتبةعدسفهكقأحجشطصىخإ", false, false, ),
(Language::Danish, "erntaisdlogmkfvubhpåyøæcjw", false, true, ),
(Language::Serbian, "аиоенрсуткјвдмплгзбaieonцш", false, false, ),
(Language::Lithuanian, "iasoretnukmlpvdjgėbyųšžcąį", false, true, ),
(Language::Slovene, "eaionrsltjvkdpmuzbghčcšžfy", false, true, ),
(Language::Slovak, "oaenirvtslkdmpuchjbzáyýíčé", true, true, ),
(Language::Hebrew, "יוהלרבתמאשנעםדקחפסכגטצןזך", false, false, ),
(Language::Bulgarian, "аиоентрсвлкдпмзгяъубчцйжщх", false, false, ),
(Language::Croatian, "aioenrjstuklvdmpgzbcčhšžćf", true, true, ),
(Language::Hindi, "करसनतमहपयलवजदगबशटअएथभडचधषइ", false, false, ),
(Language::Estonian, "aiestlunokrdmvgpjhäbõüfcöy", true, true, ),
(Language::Thai, "านรอกเงมยลวดทสตะปบคหแจพชขใ", false, false, ),
(Language::Greek, "ατοιενρσκηπςυμλίόάγέδήωχθύ", false, false, ),
(Language::Tamil, "கதபடரமலனவறயளசநஇணஅஆழஙஎஉஒஸ", false, false, ),
(Language::Kazakh, "аыентрлідсмқкобиуғжңзшйпгө", false, false, ),
]
});
/// Number of entries in `LANGUAGES`, computed lazily on first access.
/// This counts table entries, not distinct languages: a language listed
/// several times in `LANGUAGES` is counted once per entry.
pub(crate) static LANGUAGE_SUPPORTED_COUNT: Lazy<usize> = Lazy::new(|| LANGUAGES.len()); // 41
/// Reference data describing one language alphabet.
pub(crate) struct LanguageEntry {
/// Language this entry belongs to.
pub language: Language,
/// Alphabet characters as one string, in their declared order.
pub alphabet: &'static str,
/// The characters of `alphabet` collected into a set
/// (filled in by `LanguageEntry::new`).
pub alphabet_set: HashSet<char>,
/// NOTE(review): presumably true when the language commonly uses
/// accentuated letters - confirm against the callers.
pub have_accents: bool,
/// NOTE(review): presumably true when the alphabet is Latin-script based -
/// confirm against the callers.
pub pure_latin: bool,
}

impl LanguageEntry {
pub fn new(
language: Language,
alphabet: &'static str,
have_accents: bool,
pure_latin: bool,
) -> Self {
Self {
language,
alphabet,
alphabet_set: alphabet.chars().collect(),
have_accents,
pure_latin,
}
}

pub fn get(language: &Language) -> Result<&Self, String> {
for entry in LANGUAGES.iter() {
if entry.language == *language {
return Ok(entry);
}
}
Err(String::from("Language wasn't found"))
}
}

pub(crate) static ENCODING_TO_LANGUAGE: Lazy<HashMap<&'static str, Language>> = Lazy::new(|| {
HashMap::from_iter([
("euc-kr", Language::Korean),
("big5", Language::Chinese),
("hz", Language::Chinese),
("gbk", Language::Chinese),
("gb18030", Language::Chinese),
("euc-jp", Language::Japanese),
("iso-2022-jp", Language::Japanese),
("shift_jis", Language::Japanese),
])
/// Lazily-initialized table of per-language reference alphabets.
///
/// Each element is built with `LanguageEntry::new(language, alphabet,
/// have_accents, pure_latin)`. A language may appear more than once
/// (English has two entries and Japanese has three, each with a different alphabet).
///
/// NOTE(review): the alphabets look ordered from most to least frequent
/// character - confirm against the upstream data before reordering anything.
pub(crate) static LANGUAGES: Lazy<Vec<LanguageEntry>> = Lazy::new(|| {
vec![
// language, alphabet, have_accents, pure_latin
LanguageEntry::new(Language::English, "eationsrhldcmufpgwbyvkjxzq", false, true, ),
LanguageEntry::new(Language::English, "eationsrhldcumfpgwybvkxjzq", false, true, ),
LanguageEntry::new(Language::German, "enirstadhulgocmbfkwzpvüäöj", true, true, ),
LanguageEntry::new(Language::French, "easnitrluodcpmévgfbhqàxèyj", true, true, ),
LanguageEntry::new(Language::Dutch, "enairtodslghvmukcpbwjzfyxë", true, true, ),
LanguageEntry::new(Language::Italian, "eiaonltrscdupmgvfbzhqèàkyò", true, true, ),
LanguageEntry::new(Language::Polish, "aioenrzwsctkydpmuljłgbhąęó", true, true, ),
LanguageEntry::new(Language::Spanish, "eaonsrildtcumpbgvfyóhqíjzá", true, true, ),
LanguageEntry::new(Language::Russian, "оаеинстрвлкмдпугяызбйьчхжц", false, false, ),
LanguageEntry::new(Language::Japanese, "人一大亅丁丨竹笑口日今二彳行十土丶寸寺時乙丿乂气気冂巾亠市目儿見八小凵県月彐門間木東山出本中刀分耳又取最言田心思刂前京尹事生厶云会未来白冫楽灬馬尸尺駅明耂者了阝都高卜占厂广店子申奄亻俺上方冖学衣艮食自", false, false, ),
LanguageEntry::new(Language::Japanese, "ーンス・ルトリイアラックドシレジタフロカテマィグバムプオコデニウメサビナブャエュチキズダパミェョハセベガモツネボソノァヴワポペピケゴギザホゲォヤヒユヨヘゼヌゥゾヶヂヲヅヵヱヰヮヽ゠ヾヷヿヸヹヺ", false, false, ),
LanguageEntry::new(Language::Japanese, "のにるたとはしいをでてがなれからさっりすあもこまうくよきんめおけそつだやえどわちみせじばへびずろほげむべひょゆぶごゃねふぐぎぼゅづざぞぬぜぱぽぷぴぃぁぇぺゞぢぉぅゐゝゑ゛゜ゎゔ゚ゟ゙ゕゖ", false, false, ),
LanguageEntry::new(Language::Portuguese, "aeosirdntmuclpgvbfhãqéçází", true, true, ),
LanguageEntry::new(Language::Swedish, "eanrtsildomkgvhfupäcböåyjx", true, true, ),
LanguageEntry::new(Language::Chinese, "的一是不了在人有我他这个们中来上大为和国地到以说时要就出会可也你对生能而子那得于着下自之年过发后作里用道行所然家种事成方多经么去法学如都同现当没动面起看定天分还进好小部其些主样理心她本前开但因只从想实", false, false, ),
LanguageEntry::new(Language::Ukrainian, "оаніирвтесклудмпзяьбгйчхцї", false, false, ),
LanguageEntry::new(Language::Norwegian, "erntasioldgkmvfpubhåyjøcæw", false, true, ),
LanguageEntry::new(Language::Finnish, "aintesloukämrvjhpydögcbfwz", true, true, ),
LanguageEntry::new(Language::Vietnamese, "nhticgaoumlràđsevpbyưdákộế", true, true, ),
LanguageEntry::new(Language::Czech, "oeantsilvrkdumpíchzáyjběéř", true, true, ),
LanguageEntry::new(Language::Hungarian, "eatlsnkriozáégmbyvdhupjöfc", true, true, ),
LanguageEntry::new(Language::Korean, "이다에의는로하을가고지서한은기으년대사시를리도인스일", false, false, ),
LanguageEntry::new(Language::Indonesian, "aneirtusdkmlgpbohyjcwfvzxq", false, true, ),
LanguageEntry::new(Language::Turkish, "aeinrlıkdtsmyuobüşvgzhcpçğ", true, true, ),
LanguageEntry::new(Language::Romanian, "eiarntulocsdpmăfvîgbșțzhâj", true, true, ),
LanguageEntry::new(Language::Farsi, "ایردنهومتبسلکشزفگعخقجآپحطص", false, false, ),
LanguageEntry::new(Language::Arabic, "اليمونرتبةعدسفهكقأحجشطصىخإ", false, false, ),
LanguageEntry::new(Language::Danish, "erntaisdlogmkfvubhpåyøæcjw", false, true, ),
LanguageEntry::new(Language::Serbian, "аиоенрсуткјвдмплгзбaieonцш", false, false, ),
LanguageEntry::new(Language::Lithuanian, "iasoretnukmlpvdjgėbyųšžcąį", false, true, ),
LanguageEntry::new(Language::Slovene, "eaionrsltjvkdpmuzbghčcšžfy", false, true, ),
LanguageEntry::new(Language::Slovak, "oaenirvtslkdmpuchjbzáyýíčé", true, true, ),
LanguageEntry::new(Language::Hebrew, "יוהלרבתמאשנעםדקחפסכגטצןזך", false, false, ),
LanguageEntry::new(Language::Bulgarian, "аиоентрсвлкдпмзгяъубчцйжщх", false, false, ),
LanguageEntry::new(Language::Croatian, "aioenrjstuklvdmpgzbcčhšžćf", true, true, ),
LanguageEntry::new(Language::Hindi, "करसनतमहपयलवजदगबशटअएथभडचधषइ", false, false, ),
LanguageEntry::new(Language::Estonian, "aiestlunokrdmvgpjhäbõüfcöy", true, true, ),
LanguageEntry::new(Language::Thai, "านรอกเงมยลวดทสตะปบคหแจพชขใ", false, false, ),
LanguageEntry::new(Language::Greek, "ατοιενρσκηπςυμλίόάγέδήωχθύ", false, false, ),
LanguageEntry::new(Language::Tamil, "கதபடரமலனவறயளசநஇணஅஆழஙஎஉஒஸ", false, false, ),
LanguageEntry::new(Language::Kazakh, "аыентрлідсмқкобиуғжңзшйпгө", false, false, ),
]
});
61 changes: 24 additions & 37 deletions src/cd.rs
Original file line number Diff line number Diff line change
@@ -1,16 +1,14 @@
#![allow(unused_variables)]
use crate::assets::{ENCODING_TO_LANGUAGE, LANGUAGES, LANGUAGE_SUPPORTED_COUNT};
use crate::assets::{LanguageEntry, LANGUAGES};
use crate::consts::TOO_SMALL_SEQUENCE;
use crate::enc::{Encoding, IsChunk, WantDecode};
use crate::entity::{CoherenceMatch, CoherenceMatches, Language};
use crate::utils::{
get_language_data, is_accentuated, is_multi_byte_encoding, is_suspiciously_successive_range,
is_unicode_range_secondary, unicode_range,
is_accentuated, is_suspiciously_successive_range, is_unicode_range_secondary, unicode_range,
};
use ahash::{HashMap, HashMapExt, HashSet};
use cached::proc_macro::cached;
use counter::Counter;
use encoding::label::encoding_from_whatwg_label;
use encoding::DecoderTrap;
use ordered_float::OrderedFloat;
use strsim::jaro;

Expand All @@ -20,18 +18,19 @@ use strsim::jaro;

// Return the Unicode ranges associated with a single-byte code page.
pub(crate) fn encoding_unicode_range(iana_name: &str) -> Result<Vec<&str>, String> {
if is_multi_byte_encoding(iana_name) {
let encoder =
Encoding::by_name(iana_name).ok_or("No decoder found for this encoding".to_string())?;

if encoder.is_multi_byte_encoding() {
return Err("Function not supported on multi-byte code page".to_string());
}
let encoder = encoding_from_whatwg_label(iana_name)
.ok_or("No decoder found for this encoding".to_string())?;

let byte_range = 0x40..0xFF; // utf8 range. range.len()==191
let mut result: HashMap<&str, u8> = HashMap::with_capacity(byte_range.len());

byte_range.for_each(|i| {
if let Some(range) = encoder
.decode(&[i], DecoderTrap::Ignore)
.decode(&[i], WantDecode::Yes, IsChunk::No)
.ok()
.and_then(|chunk| chunk.chars().next())
.and_then(unicode_range)
Expand All @@ -55,11 +54,12 @@ pub(crate) fn encoding_unicode_range(iana_name: &str) -> Result<Vec<&str>, Strin
pub(crate) fn unicode_range_languages(primary_range: &str) -> Vec<&'static Language> {
LANGUAGES
.iter()
.filter_map(|(language, characters, _, _)| {
characters
.filter_map(|entry| {
entry
.alphabet
.chars()
.find(|char| unicode_range(*char).unwrap_or_default() == primary_range)
.map(|_| language)
.map(|_| &entry.language)
})
.collect::<Vec<&Language>>()
}
Expand All @@ -68,8 +68,8 @@ pub(crate) fn unicode_range_languages(primary_range: &str) -> Vec<&'static Langu
// Some code pages are heavily linked to particular language(s).
// This function returns that correspondence.
#[cached(size = 128)]
pub(crate) fn encoding_languages(iana_name: String) -> Vec<&'static Language> {
match encoding_unicode_range(&iana_name)
pub(crate) fn encoding_languages(iana_name: &'static str) -> Vec<&'static Language> {
match encoding_unicode_range(iana_name)
.unwrap_or_default()
.iter()
.find(|&&range| !range.contains("Latin"))
Expand All @@ -79,43 +79,30 @@ pub(crate) fn encoding_languages(iana_name: String) -> Vec<&'static Language> {
}
}

// Language association for multi-byte encodings. Some code pages are heavily
// linked to particular language(s); this function looks the encoding name up in
// ENCODING_TO_LANGUAGE and returns the linked language as a one-element vector,
// or an empty vector when the name has no entry.
pub(crate) fn mb_encoding_languages(iana_name: &str) -> Vec<&'static Language> {
    match ENCODING_TO_LANGUAGE.get(iana_name) {
        Some(language) => vec![language],
        None => Vec::new(),
    }
}

// Return the languages associated with the given characters
#[allow(clippy::ptr_arg)]
pub(crate) fn alphabet_languages(
characters: &[char],
ignore_non_latin: bool,
) -> Vec<&'static Language> {
let mut languages: Vec<(&Language, OrderedFloat<f32>)> =
Vec::with_capacity(*LANGUAGE_SUPPORTED_COUNT);
let mut languages: Vec<(&Language, OrderedFloat<f32>)> = Vec::with_capacity(LANGUAGES.len());
let source_characters_set: HashSet<char> = characters.iter().copied().collect();
let source_has_accents = source_characters_set
.iter()
.any(|&char| is_accentuated(char));

for (language, language_characters, target_have_accents, target_pure_latin) in LANGUAGES.iter()
{
if (ignore_non_latin && !target_pure_latin) || (!target_have_accents && source_has_accents)
{
for entry in LANGUAGES.iter() {
if (ignore_non_latin && !entry.pure_latin) || (!entry.have_accents && source_has_accents) {
continue;
}

let language_characters_set: HashSet<char> = language_characters.chars().collect();
let intersection: HashSet<char> = language_characters_set
let intersection_size = entry
.alphabet_set
.intersection(&source_characters_set)
.copied()
.collect();
.count();

let ratio: f32 = intersection.len() as f32 / language_characters_set.len() as f32;
let ratio: f32 = intersection_size as f32 / entry.alphabet_set.len() as f32;
if ratio >= 0.2 {
languages.push((language, OrderedFloat(ratio)));
languages.push((&entry.language, OrderedFloat(ratio)));
}
}
// reverse sort
Expand Down Expand Up @@ -152,8 +139,8 @@ pub(crate) fn characters_popularity_compare(
language: &Language,
ordered_characters: &str,
) -> Result<f32, String> {
let language_data = get_language_data(language)?;
Ok(jaro(ordered_characters, language_data.0) as f32)
let language_data = LanguageEntry::get(language)?;
Ok(jaro(ordered_characters, language_data.alphabet) as f32)
}

// We shall NOT return more than one "English" in CoherenceMatches because it is an alternative
Expand Down
Loading