
Store only secondary weight in diacritic table and remove jamo tailoring bit #1977

Closed · wants to merge 11 commits
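The core data change here: the per-diacritic table stops storing full 32-bit CE32s and keeps only the secondary weights, and a tailored table may now be shorter than the default one. Below is a minimal before/after sketch, not the actual ICU4X provider structs: the field and constant names follow the diff, while the element types and the 64-bit collation element layout (primary:32 | secondary:16 | tertiary:16) are assumptions based on the conventional ICU design.

```rust
// Sketch only; the real structs are zero-copy provider types, not Vec-backed.

// Before: one full 32-bit CE32 per combining diacritic.
struct DiacriticsBefore {
    ce32s: Vec<u32>, // length had to equal COMBINING_DIACRITICS_COUNT exactly
}

// After: only the 16-bit secondary weight is stored; the primary is implicitly zero
// and the tertiary is the common one, so the table shrinks and a shorter,
// tailoring-specific table becomes representable.
struct DiacriticsAfter {
    secondaries: Vec<u16>, // <= OPTIMIZED_DIACRITICS_MAX_COUNT (== for the default data)
}

// Assuming the conventional 64-bit collation element layout, a secondary-only CE
// could be reconstituted from the stored weight like this (illustrative helper,
// not a function from the crate):
fn ce_from_secondary(secondary: u16, common_tertiary: u16) -> u64 {
    ((secondary as u64) << 16) | (common_tertiary as u64)
}
```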
Cargo.lock (3 changes: 3 additions & 0 deletions; generated file, diff not rendered)


experimental/collator/Cargo.toml (6 changes: 4 additions & 2 deletions)
@@ -40,10 +40,12 @@ icu_provider = { version = "0.6", path = "../../provider/core", features = ["mac
icu_locid = { version = "0.6", path = "../../components/locid" }
icu_normalizer = { version = "0.6", path = "../../experimental/normalizer" }
icu_properties = { version = "0.6", path = "../../components/properties" }
+ icu_uniset = { version = "0.5", path = "../../utils/uniset" }
serde = { version = "1.0", default-features = false, features = ["derive", "alloc"], optional = true }
zerovec = { version = "0.7", path = "../../utils/zerovec", features = ["serde"] }
utf8_iter = "1.0"
crabbake = { version = "0.4", path = "../../experimental/crabbake", optional = true, features = ["derive"] }
+ zerofrom = { version = "0.1.0", path = "../../utils/zerofrom" }

[dev-dependencies]
icu_testdata = { version = "0.6", path = "../../provider/testdata" }
@@ -56,5 +58,5 @@ bench = false # This option is required for Benchmark CI

[features]
default = []
- serde = ["dep:serde", "zerovec/serde", "icu_char16trie/serde", "icu_properties/serde", "icu_normalizer/serde", "icu_codepointtrie/serde"]
- datagen = ["serde", "crabbake", "zerovec/crabbake", "icu_char16trie/crabbake", "icu_properties/crabbake", "icu_normalizer/crabbake", "icu_codepointtrie/crabbake"]
+ serde = ["dep:serde", "zerovec/serde", "icu_char16trie/serde", "icu_properties/serde", "icu_normalizer/serde", "icu_uniset/serde", "icu_codepointtrie/serde"]
+ datagen = ["serde", "crabbake", "zerovec/crabbake", "icu_char16trie/crabbake", "icu_properties/crabbake", "icu_normalizer/crabbake", "icu_uniset/crabbake", "icu_codepointtrie/crabbake"]
experimental/collator/src/comparison.rs (54 changes: 41 additions & 13 deletions)
@@ -10,8 +10,8 @@
//! the comparison of collation element sequences.

use crate::elements::{
- CollationElement, CollationElements, NonPrimary, COMBINING_DIACRITICS_COUNT, JAMO_COUNT, NO_CE,
- NO_CE_PRIMARY, NO_CE_SECONDARY, NO_CE_TERTIARY, QUATERNARY_MASK,
+ CollationElement, CollationElements, NonPrimary, JAMO_COUNT, NO_CE, NO_CE_PRIMARY,
+ NO_CE_SECONDARY, NO_CE_TERTIARY, OPTIMIZED_DIACRITICS_MAX_COUNT, QUATERNARY_MASK,
};
use crate::error::CollatorError;
use crate::provider::CollationDataV1Marker;
@@ -27,6 +27,7 @@ use core::cmp::Ordering;
use core::convert::TryFrom;
use icu_locid::Locale;
use icu_normalizer::provider::CanonicalDecompositionDataV1Marker;
+ use icu_normalizer::provider::CanonicalDecompositionTablesV1Marker;
use icu_normalizer::Decomposition;
use icu_properties::provider::CanonicalCombiningClassV1Marker;
use icu_provider::DataPayload;
@@ -72,6 +73,7 @@ pub struct Collator {
options: CollatorOptions,
reordering: Option<DataPayload<CollationReorderingV1Marker>>,
decompositions: DataPayload<CanonicalDecompositionDataV1Marker>,
+ tables: DataPayload<CanonicalDecompositionTablesV1Marker>,
ccc: DataPayload<CanonicalCombiningClassV1Marker>,
lithuanian_dot_above: bool,
}
@@ -90,6 +92,7 @@ impl Collator {
+ ResourceProvider<CollationMetadataV1Marker>
+ ResourceProvider<CollationReorderingV1Marker>
+ ResourceProvider<CanonicalDecompositionDataV1Marker>
+ + ResourceProvider<CanonicalDecompositionTablesV1Marker>
+ ResourceProvider<CanonicalCombiningClassV1Marker>
+ ?Sized,
{
@@ -176,9 +179,10 @@
.load_resource(&DataRequest::default())?
.take_payload()?;

+ let tailored_diacritics = metadata.tailored_diacritics();
let diacritics: DataPayload<CollationDiacriticsV1Marker> = data_provider
.load_resource(&DataRequest {
- options: if metadata.tailored_diacritics() {
+ options: if tailored_diacritics {
resource_options
} else {
ResourceOptions::default()
@@ -187,8 +191,19 @@
})?
.take_payload()?;

- if diacritics.get().ce32s.len() != COMBINING_DIACRITICS_COUNT {
- return Err(CollatorError::MalformedData);
+ if tailored_diacritics {
+ // In the tailored case we accept a shorter table in which case the tailoring is
+ // responsible for supplying the missing values in the trie.
+ // As of June 2022, none of the collations actually use a shortened table.
+ // Vietnamese and Ewe load a full-length alternative table and the rest use
+ // the default one.
+ if diacritics.get().secondaries.len() > OPTIMIZED_DIACRITICS_MAX_COUNT {
+ return Err(CollatorError::MalformedData);
+ }
+ } else {
+ if diacritics.get().secondaries.len() != OPTIMIZED_DIACRITICS_MAX_COUNT {
+ return Err(CollatorError::MalformedData);
+ }
}

let jamo: DataPayload<CollationJamoV1Marker> = data_provider
@@ -203,6 +218,10 @@
.load_resource(&DataRequest::default())?
.take_payload()?;

+ let tables: DataPayload<CanonicalDecompositionTablesV1Marker> = data_provider
+ .load_resource(&DataRequest::default())?
+ .take_payload()?;

let ccc: DataPayload<CanonicalCombiningClassV1Marker> =
icu_properties::maps::get_canonical_combining_class(data_provider)?;

@@ -246,6 +265,7 @@ impl Collator {
options: merged_options,
reordering,
decompositions,
+ tables,
ccc,
lithuanian_dot_above: metadata.lithuanian_dot_above(),
})
@@ -272,11 +292,15 @@ impl Collator {
return Decomposition::new(
decode_utf16(left.iter().copied()).map(utf16_error_to_replacement),
self.decompositions.get(),
+ self.tables.get(),
+ None,
&self.ccc.get().code_point_trie,
)
.cmp(Decomposition::new(
decode_utf16(right.iter().copied()).map(utf16_error_to_replacement),
self.decompositions.get(),
+ self.tables.get(),
+ None,
&self.ccc.get().code_point_trie,
));
}
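The `&[u16]` comparison path above decodes potentially ill-formed UTF-16 with `decode_utf16` and maps unpaired surrogates to U+FFFD before wrapping the chars in `Decomposition`. A self-contained sketch of that decoding pattern follows; only the helper's name, `utf16_error_to_replacement`, appears in the diff, so its exact signature here is an assumption.

```rust
use core::char::{decode_utf16, DecodeUtf16Error, REPLACEMENT_CHARACTER};

// Map an unpaired-surrogate error to U+FFFD, as the UTF-16 path above does.
fn utf16_error_to_replacement(result: Result<char, DecodeUtf16Error>) -> char {
    result.unwrap_or(REPLACEMENT_CHARACTER)
}

// Turn a potentially ill-formed UTF-16 slice into a lossy char iterator.
fn chars_lossy(units: &[u16]) -> impl Iterator<Item = char> + '_ {
    decode_utf16(units.iter().copied()).map(utf16_error_to_replacement)
}

fn main() {
    // 0xD800 is an unpaired high surrogate and decodes to U+FFFD.
    let units = [0x0061, 0xD800, 0x0062];
    let decoded: String = chars_lossy(&units).collect();
    assert_eq!(decoded, "a\u{FFFD}b");
}
```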
@@ -301,11 +325,15 @@ impl Collator {
return Decomposition::new(
left.chars(),
self.decompositions.get(),
+ self.tables.get(),
+ None,
&self.ccc.get().code_point_trie,
)
.cmp(Decomposition::new(
right.chars(),
self.decompositions.get(),
+ self.tables.get(),
+ None,
&self.ccc.get().code_point_trie,
));
}
@@ -330,11 +358,15 @@ impl Collator {
return Decomposition::new(
left.chars(),
self.decompositions.get(),
+ self.tables.get(),
+ None,
&self.ccc.get().code_point_trie,
)
.cmp(Decomposition::new(
right.chars(),
self.decompositions.get(),
+ self.tables.get(),
+ None,
&self.ccc.get().code_point_trie,
));
}
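Each of the three early-return paths above wraps both inputs in `Decomposition` iterators (now also handed the decomposition tables payload) and compares them with `Iterator::cmp`, i.e. lexicographically by decomposed code point. A toy sketch of that comparison shape, using plain `chars()` in place of real decomposition data:

```rust
use core::cmp::Ordering;

// Compare two strings as raw code point sequences via Iterator::cmp.
// The PR compares Decomposition iterators instead, so canonically equivalent
// inputs (precomposed vs. decomposed accents) come out Equal there.
fn cmp_as_code_points(left: &str, right: &str) -> Ordering {
    left.chars().cmp(right.chars())
}

fn main() {
    assert_eq!(cmp_as_code_points("abc", "abc"), Ordering::Equal);
    assert_eq!(cmp_as_code_points("ab", "abc"), Ordering::Less); // a strict prefix sorts first
}
```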
@@ -406,11 +438,9 @@ impl Collator {
tailoring.get(),
<&[<u32 as AsULE>::ULE; JAMO_COUNT]>::try_from(self.jamo.get().ce32s.as_ule_slice())
.unwrap(), // length already validated
- <&[<u32 as AsULE>::ULE; COMBINING_DIACRITICS_COUNT]>::try_from(
- self.diacritics.get().ce32s.as_ule_slice(),
- )
- .unwrap(), // length already validated
+ &self.diacritics.get().secondaries,
self.decompositions.get(),
+ self.tables.get(),
&self.ccc.get().code_point_trie,
numeric_primary,
self.lithuanian_dot_above,
@@ -421,11 +451,9 @@ impl Collator {
tailoring.get(),
<&[<u32 as AsULE>::ULE; JAMO_COUNT]>::try_from(self.jamo.get().ce32s.as_ule_slice())
.unwrap(), // length already validated
- <&[<u32 as AsULE>::ULE; COMBINING_DIACRITICS_COUNT]>::try_from(
- self.diacritics.get().ce32s.as_ule_slice(),
- )
- .unwrap(), // length already validated
+ &self.diacritics.get().secondaries,
self.decompositions.get(),
+ self.tables.get(),
&self.ccc.get().code_point_trie,
numeric_primary,
self.lithuanian_dot_above,
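For reference, the diacritics-table length check added in comparison.rs above reduces to a simple rule: a tailored table may be shorter than the optimized maximum (the tailoring's trie supplies anything missing), while the untailored default table must be exactly that long. Restated as a standalone predicate (the constant's actual value is not shown in this diff, so it is a parameter here):

```rust
// Sketch of the validation rule applied during Collator construction above.
fn diacritics_table_len_ok(len: usize, tailored_diacritics: bool, max_count: usize) -> bool {
    if tailored_diacritics {
        len <= max_count // shorter is fine; the tailoring fills in the rest
    } else {
        len == max_count // default data must be full length
    }
}
```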