Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Using std maps in datagen #2152

Merged
merged 4 commits into from
Jul 11, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions components/datetime/src/provider/time_zones.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ pub struct TimeZoneFormatsV1<'data> {

/// TimeZone ID in BCP47 format
#[repr(transparent)]
#[derive(Debug, Clone, Copy, Eq, Ord, PartialEq, PartialOrd, yoke::Yokeable, ULE)]
#[derive(Debug, Clone, Copy, Eq, Ord, PartialEq, PartialOrd, yoke::Yokeable, ULE, Hash)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct TimeZoneBcp47Id(pub TinyAsciiStr<8>);
Expand Down Expand Up @@ -75,7 +75,7 @@ impl<'a> zerovec::maps::ZeroMapKV<'a> for TimeZoneBcp47Id {

/// MetaZone ID in a compact format
#[repr(transparent)]
#[derive(Debug, Clone, Copy, Eq, Ord, PartialEq, PartialOrd, yoke::Yokeable, ULE)]
#[derive(Debug, Clone, Copy, Eq, Ord, PartialEq, PartialOrd, yoke::Yokeable, ULE, Hash)]
#[cfg_attr(feature = "datagen", derive(serde::Serialize))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct MetaZoneId(pub TinyAsciiStr<4>);
Expand Down
6 changes: 3 additions & 3 deletions provider/adapters/src/filter/impls.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ where
/// use icu_provider::prelude::*;
/// use icu_provider_adapters::filter::Filterable;
///
/// let provider = HelloWorldProvider::new_with_placeholder_data()
/// let provider = HelloWorldProvider
/// .filterable("Demo no-English filter")
/// .filter_by_langid(|langid| langid.language != language!("en"));
///
Expand Down Expand Up @@ -102,7 +102,7 @@ where
/// use icu_provider_adapters::filter::Filterable;
///
/// let allowlist = vec![langid!("de"), langid!("zh")];
/// let provider = HelloWorldProvider::new_with_placeholder_data()
/// let provider = HelloWorldProvider
/// .filterable("Demo German+Chinese filter")
/// .filter_by_langid_allowlist_strict(&allowlist);
///
Expand Down Expand Up @@ -162,7 +162,7 @@ where
/// use icu_provider::prelude::*;
/// use icu_provider_adapters::filter::Filterable;
///
/// let provider = HelloWorldProvider::new_with_placeholder_data()
/// let provider = HelloWorldProvider
/// .filterable("Demo require-langid filter")
/// .require_langid();
///
Expand Down
2 changes: 1 addition & 1 deletion provider/adapters/src/filter/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
//! use icu_provider_adapters::filter::Filterable;
//!
//! // Only return German data from a HelloWorldProvider:
//! HelloWorldProvider::new_with_placeholder_data()
//! HelloWorldProvider
//! .filterable("Demo German-only filter")
//! .filter_by_langid(|langid| langid.language == language!("de"));
//! ```
Expand Down
10 changes: 5 additions & 5 deletions provider/adapters/src/fork/by_key.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ use crate::helpers::result_is_err_missing_resource_key;
///
/// let forking_provider = ForkByKeyProvider(
/// DummyBufferProvider,
/// HelloWorldProvider::new_with_placeholder_data().into_json_provider(),
/// HelloWorldProvider.into_json_provider(),
/// );
///
/// let data_provider = forking_provider.as_deserializing();
Expand Down Expand Up @@ -74,11 +74,11 @@ use crate::helpers::result_is_err_missing_resource_key;
/// use icu_provider_adapters::fork::by_key::ForkByKeyProvider;
///
/// let forking_provider = ForkByKeyProvider(
/// HelloWorldProvider::new_with_placeholder_data()
/// HelloWorldProvider
/// .into_json_provider()
/// .filterable("Chinese")
/// .filter_by_langid(|langid| langid.language == language!("zh")),
/// HelloWorldProvider::new_with_placeholder_data()
/// HelloWorldProvider
/// .into_json_provider()
/// .filterable("German")
/// .filter_by_langid(|langid| langid.language == language!("de")),
Expand Down Expand Up @@ -204,11 +204,11 @@ where
///
/// let forking_provider = MultiForkByKeyProvider {
/// providers: vec![
/// HelloWorldProvider::new_with_placeholder_data()
/// HelloWorldProvider
/// .into_json_provider()
/// .filterable("Chinese")
/// .filter_by_langid(|langid| langid.language == language!("zh")),
/// HelloWorldProvider::new_with_placeholder_data()
/// HelloWorldProvider
/// .into_json_provider()
/// .filterable("German")
/// .filter_by_langid(|langid| langid.language == language!("de")),
Expand Down
2 changes: 0 additions & 2 deletions provider/blob/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ zerovec = { version = "0.7", path = "../../utils/zerovec", features = ["serde",

# For the export feature
log = { version = "0.4", optional = true }
litemap = { version = "0.4", path = "../../utils/litemap/", optional = true }

[dev-dependencies]
icu_locid = { version = "0.6", path = "../../components/locid", features = ["serde"] }
Expand All @@ -50,7 +49,6 @@ export = [
"log",
"postcard/alloc",
"std",
"litemap",
"icu_provider/datagen",
"zerovec/serde",
]
Expand Down
1 change: 0 additions & 1 deletion provider/core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ displaydoc = { version = "0.2.3", default-features = false }
yoke = { version = "0.6.0", path = "../../utils/yoke", features = ["derive"] }
zerofrom = { version = "0.1.0", path = "../../utils/zerofrom", features = ["derive"] }
zerovec = { version = "0.7.0", path = "../../utils/zerovec", features = ["derive"]}
litemap = { version = "0.4.0", path = "../../utils/litemap" }
stable_deref_trait = { version = "1.2.0", default-features = false }

log = { version = "0.4", optional = true }
Expand Down
2 changes: 1 addition & 1 deletion provider/core/src/buf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ impl DataMarker for BufferMarker {
/// use icu_provider::hello_world::*;
/// use icu_provider::prelude::*;
///
/// let buffer_provider = HelloWorldProvider::new_with_placeholder_data().into_json_provider();
/// let buffer_provider = HelloWorldProvider.into_json_provider();
///
/// let data_provider = buffer_provider.as_deserializing();
///
Expand Down
84 changes: 35 additions & 49 deletions provider/core/src/hello_world.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@ use crate::zerofrom::{self, *};
use alloc::borrow::Cow;
use alloc::string::String;
use core::fmt::Debug;
use icu_locid::locale;
use litemap::LiteMap;

/// A struct containing "Hello World" in the requested language.
#[derive(Debug, PartialEq, Clone, Yokeable, ZeroFrom)]
Expand Down Expand Up @@ -61,9 +59,7 @@ impl ResourceMarker for HelloWorldV1Marker {
/// use icu_provider::hello_world::*;
/// use icu_provider::prelude::*;
///
/// let provider = HelloWorldProvider::new_with_placeholder_data();
///
/// let german_hello_world: DataPayload<HelloWorldV1Marker> = provider
/// let german_hello_world: DataPayload<HelloWorldV1Marker> = HelloWorldProvider
/// .load_resource(&DataRequest {
/// options: locale!("de").into(),
/// metadata: Default::default(),
Expand All @@ -75,42 +71,30 @@ impl ResourceMarker for HelloWorldV1Marker {
/// assert_eq!("Hallo Welt", german_hello_world.get().message);
/// ```
#[derive(Debug, PartialEq, Default)]
pub struct HelloWorldProvider {
map: LiteMap<ResourceOptions, Cow<'static, str>>,
}
pub struct HelloWorldProvider;

impl HelloWorldProvider {
pub const DATA: &'static [(icu_locid::Locale, &'static str)] = &[
(locale!("bn"), "ওহে বিশ্ব"),
(locale!("cs"), "Ahoj světe"),
(locale!("de"), "Hallo Welt"),
(locale!("el"), "Καλημέρα κόσμε"),
(locale!("en"), "Hello World"),
(locale!("eo"), "Saluton, Mondo"),
(locale!("fa"), "سلام دنیا‎"),
(locale!("fi"), "hei maailma"),
(locale!("is"), "Halló, heimur"),
(locale!("ja"), "こんにちは世界"),
(locale!("la"), "Ave, munde"),
(locale!("pt"), "Olá, mundo"),
(locale!("ro"), "Salut, lume"),
(locale!("ru"), "Привет, мир"),
(locale!("vi"), "Xin chào thế giới"),
(locale!("zh"), "你好世界"),
// Data from https://en.wiktionary.org/wiki/Hello_World#Translations
// Keep this sorted!
const DATA: &'static [(&'static str, &'static str)] = &[
("bn", "ওহে বিশ্ব"),
("cs", "Ahoj světe"),
("de", "Hallo Welt"),
("el", "Καλημέρα κόσμε"),
("en", "Hello World"),
("eo", "Saluton, Mondo"),
("fa", "سلام دنیا‎"),
("fi", "hei maailma"),
("is", "Halló, heimur"),
("ja", "こんにちは世界"),
("la", "Ave, munde"),
("pt", "Olá, mundo"),
("ro", "Salut, lume"),
("ru", "Привет, мир"),
("vi", "Xin chào thế giới"),
("zh", "你好世界"),
];

/// Creates a [`HelloWorldProvider`] pre-populated with hardcoded data from Wiktionary.
pub fn new_with_placeholder_data() -> HelloWorldProvider {
// Data from https://en.wiktionary.org/wiki/Hello_World#Translations
HelloWorldProvider {
map: Self::DATA
.iter()
.cloned()
.map(|(loc, value)| (loc.into(), value.into()))
.collect(),
}
}

/// Converts this provider into one that serves JSON blobs of the same data.
pub fn into_json_provider(self) -> HelloWorldJsonProvider {
HelloWorldJsonProvider(self)
Expand All @@ -122,11 +106,14 @@ impl ResourceProvider<HelloWorldV1Marker> for HelloWorldProvider {
&self,
req: &DataRequest,
) -> Result<DataResponse<HelloWorldV1Marker>, DataError> {
let data = self
.map
.get(&req.options)
.map(|s| HelloWorldV1 { message: s.clone() })
.ok_or_else(|| DataErrorKind::MissingLocale.with_key(HelloWorldV1Marker::KEY))?;
#[allow(clippy::indexing_slicing)] // binary_search
let data = Self::DATA
.binary_search_by(|(k, _)| req.options.strict_cmp(k.as_bytes()).reverse())
.map(|i| Self::DATA[i].1)
.map(|s| HelloWorldV1 {
message: Cow::Borrowed(s),
})
.map_err(|_| DataErrorKind::MissingLocale.with_req(HelloWorldV1Marker::KEY, req))?;
let metadata = DataResponseMetadata {
data_langid: Some(req.options.get_langid()),
..Default::default()
Expand Down Expand Up @@ -170,20 +157,19 @@ impl BufferProvider for HelloWorldJsonProvider {
#[cfg(feature = "datagen")]
impl IterableResourceProvider<HelloWorldV1Marker> for HelloWorldProvider {
fn supported_options(&self) -> Result<Vec<ResourceOptions>, DataError> {
Ok(self
.map
.iter_keys()
.cloned()
#[allow(clippy::unwrap_used)] // datagen
Ok(Self::DATA
.iter()
.map(|(s, _)| s.parse::<icu_locid::LanguageIdentifier>().unwrap())
.map(ResourceOptions::from)
.collect())
}
}

#[test]
fn test_iter() {
let provider = HelloWorldProvider::new_with_placeholder_data();
let mut supported_langids: Vec<ResourceOptions> = provider.supported_options().unwrap();
supported_langids.sort();
use icu_locid::locale;
let supported_langids: Vec<ResourceOptions> = HelloWorldProvider.supported_options().unwrap();

assert_eq!(
supported_langids,
Expand Down
13 changes: 13 additions & 0 deletions provider/core/src/resource.rs
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,19 @@ pub struct ResourceKey {
hash: ResourceKeyHash,
}

// I don't think this lint makes sense, see https://github.com/rust-lang/rust-clippy/issues/2627.
// It's not possible to violate the hash invariant (k1 == k2 -> hash(k1) == hash(k2)) without
// using non-determinism or unsafe code when PartialEq is derived.
#[allow(clippy::derive_hash_xor_eq)]

// Custom implementation to avoid hashing the path.
impl core::hash::Hash for ResourceKey {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Issue: I feel that ResourceKeyHash should be used instead of ResourceKey when a hash is needed, which is why there isn't already an impl Hash for ResourceKey.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But ResourceKeyHash doesn't have the path, which I need for databake. This is like saying you should store f(x) is a hash set instead of x.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I cannot approve this PR given that my PR #2115 conflicts with the Hash impl. We can't merge both.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the conflict exactly?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need to decide on what to do with the ResourceKeyMetadata.

  1. Does it affect PartialEq? (then it needs to go into Hash)
  2. Is it ignored for PartialEq? (then match statements get less efficient)
  3. Does it go into the path string itself? (then it gets into Hash and PartialEq automatically)

I don't want to lock ourselves into having a Hash impl if the impl would be inefficient.

tl;dr, let's reach consensus on #2115 and then we can move on.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is independent of #2115. Hashing on hash (aka path) only is valid, even if PartialEq compares more fields. Hash is free to produce collisions as long as it produces equal hashes for PartialEq values. Hash collisions don't affect correctness, they're a performance problem, but that doesn't matter here because a) it's only used in datagen and b) we don't have keys that differ only by metadata anyway.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok.

#[inline]
fn hash<H: core::hash::Hasher>(&self, state: &mut H) {
self.hash.hash(state)
}
}

#[cfg(test)]
static_assertions::const_assert_eq!(24, core::mem::size_of::<ResourceKey>());

Expand Down
1 change: 0 additions & 1 deletion provider/datagen/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@ icu_codepointtrie_builder = { path = "../../utils/codepointtrie/builder" }
icu_locid = { version = "0.6", path = "../../components/locid", features = ["std"]}
icu_uniset = { version = "0.5", path = "../../utils/uniset", features = ["serde"] }
itertools = "0.10"
litemap = { version = "0.4.0", path = "../../utils/litemap" }
log = "0.4"
rayon = "1.5"
serde = { version = "1.0", default-features = false, features = ["derive", "alloc"] }
Expand Down
36 changes: 12 additions & 24 deletions provider/datagen/src/databake.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ use databake::{quote, CrateEnv, TokenStream};
use icu_provider::datagen::*;
use icu_provider::prelude::*;
use itertools::Itertools;
use litemap::LiteMap;
use rayon::prelude::*;
use std::collections::HashMap;
use std::fs::File;
Expand All @@ -31,8 +30,8 @@ pub(crate) struct BakedDataExporter {
mod_directory: PathBuf,
pretty: bool,
insert_feature_gates: bool,
// Temporary storage for put_payload: key -> (marker path, options -> bake)
data: Mutex<LiteMap<ResourceKey, (SyncTokenStream, LiteMap<ResourceOptions, SyncTokenStream>)>>,
// Temporary storage for put_payload: key -> (marker path, bake -> [options])
data: Mutex<HashMap<ResourceKey, (SyncTokenStream, HashMap<SyncTokenStream, Vec<String>>)>>,
// All mod.rs files in the module tree. Because generation is parallel,
// this will be non-deterministic and have to be sorted later.
mod_files: Mutex<HashMap<PathBuf, Vec<String>>>,
Expand Down Expand Up @@ -117,15 +116,15 @@ impl DataExporter for BakedDataExporter {
payload: &DataPayload<ExportMarker>,
) -> Result<(), DataError> {
let (payload, marker_type) = payload.tokenize(&self.dependencies);
let payload_string = payload.to_string();
let mut map = self.data.lock().expect("poison");
if !map.contains_key(&key) {
map.insert(key, (marker_type.to_string(), LiteMap::new()));
}
map.get_mut(&key)
.unwrap()
self.data
.lock()
.expect("poison")
.entry(key)
.or_insert_with(|| (marker_type.to_string(), Default::default()))
.1
.insert(options.clone(), payload_string);
.entry(payload.to_string())
.or_default()
.push(options.to_string());
Ok(())
}

Expand All @@ -136,21 +135,11 @@ impl DataExporter for BakedDataExporter {
.expect("poison")
.remove(&key)
.ok_or_else(|| DataError::custom("No data").with_key(key))?;
let mut sorted = raw.into_tuple_vec();
// Sort by payload bake so we can deduplicate.
sorted.sort_by(|(_, a), (_, b)| a.cmp(b));

let mut statics = Vec::new();
let mut all_options = Vec::new();

for (payload_bake_string, group) in &sorted
.into_iter()
.group_by(|(_, payload_bake_string)| payload_bake_string.clone())
{
// These are sorted because we stably sorted earlier.
let options = group
.map(|(options, _)| options.to_string())
.collect::<Vec<_>>();
for (payload_bake_string, mut options) in raw {
options.sort();
let ident_string = options
.iter()
.map(|options| {
Expand All @@ -171,7 +160,6 @@ impl DataExporter for BakedDataExporter {
statics.push((ident_string, payload_bake_string));
}

// Not necessary for functionality, but prettier
statics.sort_unstable_by(|(a, _), (b, _)| a.cmp(b));

let statics = statics
Expand Down
4 changes: 2 additions & 2 deletions provider/datagen/src/registry.rs
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ macro_rules! create_datagen_provider {
icu_provider_adapters::make_forking_provider!(
icu_provider_adapters::fork::by_key::ForkByKeyProvider,
[
icu_provider::hello_world::HelloWorldProvider::new_with_placeholder_data(),
icu_provider::hello_world::HelloWorldProvider,
$(<$constructor>::from(__source)),+,
]
)
Expand Down Expand Up @@ -202,7 +202,7 @@ macro_rules! create_datagen_provider {
icu_provider_adapters::make_forking_provider!(
icu_provider_adapters::fork::by_key::ForkByKeyProvider,
[
icu_provider::hello_world::HelloWorldProvider::new_with_placeholder_data(),
icu_provider::hello_world::HelloWorldProvider,
$(<$constructor>::from(__source)),+,
]
)
Expand Down
Loading