From 4619124940c10b574c1ebe192479b5b6d8d76bd2 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Wed, 26 Jul 2023 01:19:34 -0700 Subject: [PATCH] Very rough checkpoint --- Cargo.lock | 11 +- provider/datagen/Cargo.toml | 2 +- provider/datagen/src/lib.rs | 231 +++++++++++++++---------- provider/datagen/src/source.rs | 6 +- provider/datagen/tests/test-options.rs | 110 ++++++++++++ 5 files changed, 264 insertions(+), 96 deletions(-) create mode 100644 provider/datagen/tests/test-options.rs diff --git a/Cargo.lock b/Cargo.lock index c3f385609e6..da37f93326a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -831,6 +831,13 @@ version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91" +[[package]] +name = "elsa" +version = "1.8.1" +dependencies = [ + "stable_deref_trait", +] + [[package]] name = "elsa" version = "1.8.1" @@ -1452,7 +1459,7 @@ dependencies = [ "crlify", "databake", "displaydoc", - "elsa", + "elsa 1.8.1", "eyre", "icu", "icu_calendar", @@ -1587,7 +1594,7 @@ name = "icu_ffi_coverage" version = "0.0.0" dependencies = [ "diplomat_core", - "elsa", + "elsa 1.8.1 (registry+https://github.com/rust-lang/crates.io-index)", "lazy_static", "rustdoc-types", "serde_json", diff --git a/provider/datagen/Cargo.toml b/provider/datagen/Cargo.toml index 2a11a471fc4..d7070dc62d1 100644 --- a/provider/datagen/Cargo.toml +++ b/provider/datagen/Cargo.toml @@ -70,7 +70,7 @@ syn = {version = "2", features = ["parsing"], optional = true } # Other external dependencies displaydoc = { version = "0.2.3", default-features = false } -elsa = "1.7" +elsa = { path = "../../../elsa" } itertools = "0.10" lazy_static = "1" log = "0.4" diff --git a/provider/datagen/src/lib.rs b/provider/datagen/src/lib.rs index dd580ae3f53..cd6ffbd246a 100644 --- a/provider/datagen/src/lib.rs +++ b/provider/datagen/src/lib.rs @@ -71,9 +71,14 @@ mod source; mod testutil; mod transform; +use elsa::sync::FrozenMap; pub 
use error::{is_missing_cldr_error, is_missing_icuexport_error}; +use icu_locid::langid; +use icu_locid::LanguageIdentifier; use icu_locid_transform::fallback::LocaleFallbackConfig; use icu_locid_transform::fallback::LocaleFallbacker; +use icu_locid_transform::fallback::LocaleFallbackerWithConfig; +use options::{FallbackMode, LocaleInclude}; #[allow(deprecated)] // ugh pub use registry::{all_keys, all_keys_with_experimental, deserialize_and_measure, key}; pub use source::SourceData; @@ -128,6 +133,107 @@ pub(crate) mod rayon_prelude { impl IntoParallelIterator for T {} } +/// Selection strategy that [`LocaleIncluder`] derives from the locale and fallback options. +enum LocaleIncluderInner { + All, + ExplicitWithExtensions { + explicit: HashSet, + }, + AncestorsAndDescendants { + explicit: HashSet, + implicit: HashSet, + }, +} + +/// Decides which locales are kept for export, walking fallback chains with the borrowed fallbacker. +struct LocaleIncluder<'a> { + fallbacker_with_config: &'a LocaleFallbackerWithConfig<'a>, + inner: LocaleIncluderInner, +} + +impl<'a> LocaleIncluder<'a> { + /// Builds the includer; outside `All` and `Preresolved`, every locale on the fallback chain of each explicit locale is recorded in `implicit`, together with the default locale. + pub fn new( + fallbacker_with_config: &'a LocaleFallbackerWithConfig<'a>, + locale_include: LocaleInclude, + fallback_mode: FallbackMode, + ) -> Self { + if matches!(locale_include, LocaleInclude::All) { + return Self { + fallbacker_with_config, + inner: LocaleIncluderInner::All, + }; + } + let LocaleInclude::Explicit(explicit) = locale_include else { + unreachable!("Pre-processed LocaleInclued has only 2 variants") + }; + if matches!(fallback_mode, FallbackMode::Preresolved) { + return Self { + fallbacker_with_config, + inner: LocaleIncluderInner::ExplicitWithExtensions { explicit }, + }; + } + let explicit: HashSet = explicit.into_iter().map(|d| d.into()).collect(); + let mut implicit = HashSet::new(); + // TODO: Make including the default locale configurable + implicit.insert(DataLocale::default()); + for locale in explicit.iter() { + let mut iter = fallbacker_with_config.fallback_for(locale.clone()); + while !iter.get().is_empty() { + implicit.insert(iter.get().clone()); + // Advance the chain; without this step the loop never terminates. + iter.step(); + } + } + Self { + fallbacker_with_config, + inner: LocaleIncluderInner::AncestorsAndDescendants { 
explicit, implicit }, + } + } + + /// Returns `supported_locales` unchanged for `All`; otherwise returns it extended with the explicit locales. + pub fn get_locales_for_possible_inclusion( + &self, + supported_locales: HashSet, + ) -> HashSet { + match &self.inner { + LocaleIncluderInner::All => supported_locales, + LocaleIncluderInner::ExplicitWithExtensions { explicit } => supported_locales + .into_iter() + .chain(explicit.iter().map(|langid| langid.into())) + .collect(), + LocaleIncluderInner::AncestorsAndDescendants { explicit, .. } => supported_locales + .into_iter() + .chain(explicit.iter().cloned()) + .collect(), + } + } + + /// Returns whether `probe` is selected: always for `All`; by language identifier membership for `ExplicitWithExtensions`; otherwise when `probe` is in `implicit` or any locale on its fallback chain is in `explicit`. + pub fn matches(&self, probe: &DataLocale) -> bool { + match &self.inner { + LocaleIncluderInner::All => true, + LocaleIncluderInner::ExplicitWithExtensions { explicit } => { + explicit.contains(&probe.get_langid()) + } + LocaleIncluderInner::AncestorsAndDescendants { explicit, implicit } => { + if implicit.contains(probe) { + return true; + } + let mut iter = self.fallbacker_with_config.fallback_for(probe.clone()); + while !iter.get().is_empty() { + if explicit.contains(iter.get()) { + return true; + } + // Advance to the next locale in the chain; without this step the loop never terminates. + iter.step(); + } + false + } + } + } +} + /// [`DataProvider`] backed by [`SourceData`] /// /// If `source` does not contain a specific data source, `DataProvider::load` will @@ -186,9 +292,8 @@ impl DatagenProvider { let mut provider = Self { source }; - if provider.source.options.fallback == options::FallbackMode::Runtime { - provider.source.fallbacker = - Some(icu_locid_transform::fallback::LocaleFallbacker::try_new_unstable(&provider)?); + if provider.source.options.fallback != options::FallbackMode::Preresolved { + provider.source.fallbacker = Some(LocaleFallbacker::try_new_unstable(&provider)?); } Ok(provider) @@ -217,33 +322,7 @@ impl DatagenProvider { &self, supported: Vec, ) -> Vec { - match &self.source.options.locales { - options::LocaleInclude::All => supported, - options::LocaleInclude::Explicit(set) => supported - .into_iter() - .filter(|l| { - if let Some(fallbacker) = &self.source.fallbacker { - // Include any UND-* - if l.get_langid().language 
== Language::UND { - return true; - } - let mut chain = fallbacker - .for_config(Default::default()) - .fallback_for(l.clone()); - while !chain.get().is_empty() { - if set.contains(&chain.get().get_langid()) { - return true; - } - chain.step(); - } - false - } else { - set.contains(&l.get_langid()) - } - }) - .collect(), - _ => unreachable!("resolved: {:?}", self.source.options.locales), - } + supported } /// Exports data for the set of keys to the given exporter. @@ -294,9 +364,15 @@ impl DatagenProvider { let fallbacker_with_config = fallbacker.for_config(LocaleFallbackConfig::from_key(key)); - // TODO: Don't use .cloned() - let locale_groups = fallbacker_with_config - .sort_locales_into_groups(supported_locales.iter().cloned()); + + let locale_includer = LocaleIncluder::new( + &fallbacker_with_config, + provider.source.options.locales.clone(), + provider.source.options.fallback, + ); + + let locales_for_possible_inclusion = + locale_includer.get_locales_for_possible_inclusion(supported_locales); fn get_payload( provider: &DatagenProvider, @@ -320,8 +396,9 @@ impl DatagenProvider { ) { (options::FallbackMode::Hybrid, _) | (options::FallbackMode::PreferredForExporter, None) => { - supported_locales + locales_for_possible_inclusion .into_par_iter() + .filter(|locale| locale_includer.matches(locale)) .try_for_each(|locale| { let payload = get_payload(provider, key, &locale)?; exporter.put_payload(key, &locale, &payload) @@ -335,68 +412,45 @@ impl DatagenProvider { Some(BuiltInFallbackMode::Standard), ) => { let payloads = - RwLock::new(HashMap::>::new()); - locale_groups.into_par_iter().try_for_each(|group| { - group.into_par_iter().try_for_each(|locale| { + FrozenMap::>>::new(); + locales_for_possible_inclusion + .into_par_iter() + .filter(|locale| locale_includer.matches(locale)) + .try_for_each(|locale| { let payload = get_payload(provider, key, &locale)?; - let mut iter = fallbacker_with_config.fallback_for(locale.clone()); - loop { - if 
payloads.read().expect("poison").get(iter.get()) - == Some(&payload) - { - // Found a match: don't need to write anything - return Ok(()); - } - if iter.get().is_empty() { - break; - } - iter.step(); - } - payloads - .write() - .expect("poison") - .insert(locale.clone(), payload); + payloads.insert(locale, Box::new(payload)); Ok::<(), DataError>(()) }) - })?; - for (locale, payload) in payloads.into_inner().expect("poison").into_iter() - { - exporter.put_payload(key, &locale, &payload)?; - } - } - (options::FallbackMode::Preresolved, _) => match &provider - .source - .options - .locales - { - options::LocaleInclude::Explicit(requested_locales) => { - requested_locales.into_par_iter().try_for_each(|locale| { - let mut iter = fallbacker_with_config.fallback_for(locale.into()); - loop { - match get_payload(provider, key, iter.get()) { - Ok(payload) => { - return exporter.put_payload(key, iter.get(), &payload); - } - Err(DataError { - kind: DataErrorKind::MissingLocale, - .. - }) => { - // continue out of the match statement - } - Err(e) => return Err(e), - }; - if iter.get().is_empty() { - // could not find anything - return Err(DataError::custom( - "Couldn't find anything in Explicit fallback mode", - )); - } - iter.step(); + .map_err(|e| e.with_key(key))?; + let payloads = payloads.into_iter().collect::>(); + // Skip payloads equal to the nearest ancestor that has data; everything else is exported. + 'outer: for (locale, payload) in payloads.iter() { + let mut iter = fallbacker_with_config.fallback_for(locale.clone()); + while !iter.get().is_empty() { + iter.step(); + if let Some(parent_payload) = payloads.get(iter.get()) { + if parent_payload == payload { + // Found a match: don't need to write anything + continue 'outer; + } + // The nearest ancestor with data differs, so this payload must be exported + break; } - })?; + } + // Did not find a match: export this payload + exporter.put_payload(key, locale, payload)?; } - _ => unreachable!("checked in constructor"), - }, + } + (options::FallbackMode::Preresolved, _) => { + locales_for_possible_inclusion + .into_par_iter() + .filter(|locale| locale_includer.matches(locale)) + .try_for_each(|locale| { + let payload = 
get_payload(provider, key, &locale)?; + exporter.put_payload(key, &locale, &payload) + }) + .map_err(|e| e.with_key(key))?; + } // Because icu_provider::datagen::FallbackMode is non_exhaustive (options::FallbackMode::PreferredForExporter, _) => { panic!("Unexpected preferred fallback mode needs to be handled") diff --git a/provider/datagen/src/source.rs b/provider/datagen/src/source.rs index 15eefa943d7..65b3c37ddcf 100644 --- a/provider/datagen/src/source.rs +++ b/provider/datagen/src/source.rs @@ -6,6 +6,7 @@ use crate::options::{LocaleInclude, Options}; use crate::transform::cldr::source::CldrCache; pub use crate::transform::cldr::source::CoverageLevel; use elsa::sync::FrozenMap; +use icu_locid::LanguageIdentifier; use icu_provider::prelude::*; use std::any::Any; use std::collections::{BTreeMap, HashSet}; @@ -33,6 +34,7 @@ pub struct SourceData { // TODO: move this out when we decide we can break the exhaustiveness of DatagenProvider pub(crate) options: Options, pub(crate) fallbacker: Option, + pub(crate) implied_locales: Vec, } #[cfg(feature = "networking")] @@ -70,8 +72,7 @@ impl SourceData { /// Creates a `SourceData` that does not have CLDR or ICU export sources set. pub fn offline() -> Self { - let mut options = Options::default(); - options.locales = LocaleInclude::All; + let options = Options::default(); Self { cldr_paths: None, icuexport_paths: None, @@ -81,6 +82,7 @@ impl SourceData { segmenter_lstm_paths: Arc::new(SerdeCache::new(AbstractFs::new_lstm_fallback())), options, fallbacker: None, + implied_locales: Vec::new(), } } diff --git a/provider/datagen/tests/test-options.rs b/provider/datagen/tests/test-options.rs new file mode 100644 index 00000000000..0617b2ffdf0 --- /dev/null +++ b/provider/datagen/tests/test-options.rs @@ -0,0 +1,110 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +use std::collections::{HashMap, HashSet}; +use std::path::Path; + +use elsa::sync::FrozenMap; +use icu_datagen::options::{FallbackMode, LocaleInclude, Options}; +use icu_datagen::{DatagenProvider, SourceData}; +use icu_decimal::provider::DecimalSymbolsV1Marker; +use icu_locid::langid; +use icu_provider::datagen::{DataExporter, ExportMarker}; +use icu_provider::prelude::*; +use postcard::ser_flavors::{AllocVec, Flavor}; + +#[derive(Default)] +struct TestingExporter { + data: FrozenMap<(DataKey, DataLocale), Vec>, +} + +impl<'a> DataExporter for &'a mut TestingExporter { + fn put_payload( + &self, + key: DataKey, + locale: &DataLocale, + payload: &DataPayload, + ) -> Result<(), DataError> { + let mut serializer = postcard::Serializer { + output: AllocVec::new(), + }; + payload.serialize(&mut serializer)?; + let output = serializer + .output + .finalize() + .expect("Failed to finalize serializer output"); + println!("Putting: {key}/{locale}"); + self.data.insert((key, locale.clone()), output); + Ok(()) + } +} + +impl TestingExporter { + pub fn take_map_and_reset(&mut self) -> HashMap<(DataKey, DataLocale), Vec> { + core::mem::take(&mut self.data).into_iter().collect() + } +} + +#[test] +fn test_fallback_options() { + simple_logger::SimpleLogger::new() + .env() + .with_level(log::LevelFilter::Info) + .init() + .unwrap(); + + let data_root = Path::new(concat!(core::env!("CARGO_MANIFEST_DIR"), "/tests/data/")); + + let source = SourceData::offline() + .with_cldr(data_root.join("cldr"), Default::default()) + .unwrap() + .with_icuexport(data_root.join("icuexport")) + .unwrap(); + + let decimal_symbols_key: HashSet = [DecimalSymbolsV1Marker::KEY].into_iter().collect(); + + let mut testing_exporter = TestingExporter::default(); + + let mut options = Options::default(); + + options.locales = LocaleInclude::All; + options.fallback = FallbackMode::Hybrid; + DatagenProvider::try_new(options.clone(), source.clone()) + .unwrap() + .export(decimal_symbols_key.clone(), &mut 
testing_exporter) + .unwrap(); + let data_all_hybrid = testing_exporter.take_map_and_reset(); + + options.fallback = FallbackMode::RuntimeManual; + DatagenProvider::try_new(options.clone(), source.clone()) + .unwrap() + .export(decimal_symbols_key.clone(), &mut testing_exporter) + .unwrap(); + let data_all_runtime = testing_exporter.take_map_and_reset(); + + options.locales = LocaleInclude::Explicit( + [langid!("en-GB"), langid!("sr-ME"), langid!("ar")] + .into_iter() + .collect(), + ); + options.fallback = FallbackMode::Hybrid; + DatagenProvider::try_new(options.clone(), source.clone()) + .unwrap() + .export(decimal_symbols_key.clone(), &mut testing_exporter) + .unwrap(); + let data_explicit_hybrid = testing_exporter.take_map_and_reset(); + + options.fallback = FallbackMode::RuntimeManual; + DatagenProvider::try_new(options.clone(), source.clone()) + .unwrap() + .export(decimal_symbols_key.clone(), &mut testing_exporter) + .unwrap(); + let data_explicit_runtime = testing_exporter.take_map_and_reset(); + + // options.fallback = FallbackMode::Preresolved; + // DatagenProvider::try_new(options, source).unwrap() + // .export(decimal_symbols_key, &mut testing_exporter) + // .unwrap(); + // let data_explicit_preresolved = testing_exporter.take_map_and_reset(); +}