diff --git a/Cargo.lock b/Cargo.lock index 3353a17f262..d2c91267d60 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1058,6 +1058,7 @@ dependencies = [ "icu_timezone", "libc_alloc", "log", + "potential_utf", "serde", "simple_logger", "tinystr", @@ -1080,6 +1081,7 @@ dependencies = [ "icu_normalizer", "icu_properties", "icu_provider", + "potential_utf", "serde", "writeable", "zerovec", @@ -1176,6 +1178,7 @@ dependencies = [ "icu_provider_blob", "icu_timezone", "litemap", + "potential_utf", "serde", "serde_json", "smallvec", @@ -1250,6 +1253,7 @@ dependencies = [ "num-bigint", "num-rational", "num-traits", + "potential_utf", "serde", "smallvec", "tinystr", @@ -1327,6 +1331,7 @@ dependencies = [ "icu_locale_core", "icu_locale_data", "icu_provider", + "potential_utf", "serde", "serde_json", "tinystr", @@ -1346,6 +1351,7 @@ dependencies = [ "icu_benchmark_macros", "litemap", "postcard", + "potential_utf", "serde", "serde_json", "tinystr", @@ -1454,6 +1460,7 @@ dependencies = [ "icu_collections", "icu_properties_data", "icu_provider", + "potential_utf", "serde", "tinystr", "unicode-bidi", @@ -1626,6 +1633,7 @@ dependencies = [ "num-rational", "num-traits", "postcard", + "potential_utf", "serde", "serde-aux", "serde_json", @@ -1655,6 +1663,7 @@ dependencies = [ "icu_provider", "icu_segmenter_data", "itertools", + "potential_utf", "serde", "serde_json", "utf8_iter", @@ -2123,6 +2132,18 @@ dependencies = [ "serde", ] +[[package]] +name = "potential_utf" +version = "0.0.0" +dependencies = [ + "bincode", + "databake", + "serde", + "serde_json", + "writeable", + "zerovec", +] + [[package]] name = "powerfmt" version = "0.2.0" @@ -3413,6 +3434,7 @@ dependencies = [ "iai", "icu_benchmark_macros", "postcard", + "potential_utf", "rand", "rand_distr", "rand_pcg", diff --git a/Cargo.toml b/Cargo.toml index 1fe118a2fcd..e9f2e036f93 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -74,6 +74,7 @@ members = [ "utils/env_preferences", "utils/tinystr", "utils/tzif", + "utils/potential_utf", "utils/writeable", "utils/yoke", "utils/yoke/derive", @@ -159,8 +160,8 @@ icu_provider_macros = { version = "~1.5.0", path = "provider/core/macros", defau icu_provider_adapters = { version = "~1.5.0", path = "provider/adapters", default-features = false } icu_provider_baked = { version = "~1.5.0", path = "provider/baked", default-features = false } icu_provider_blob = { version = "~1.5.0", path = "provider/blob", default-features = false } -icu_provider_fs = { version = "~1.5.0", path = "provider/fs/", default-features = false } -icu_provider_registry = { version = "~1.5.0", path = "provider/registry/", default-features = false } +icu_provider_fs = { version = "~1.5.0", path = "provider/fs", default-features = false } +icu_provider_registry = { version = "~1.5.0", path = "provider/registry", default-features = false } # Baked data icu_calendar_data = { version = "~1.5.0", path = "provider/data/calendar", default-features = false } @@ -190,7 +191,8 @@ ixdtf = { version = "0.2.0", path = "utils/ixdtf", default-features = false } litemap = { version = "0.7.3", path = "utils/litemap", default-features = false } tinystr = { version = "0.7.5", path = "utils/tinystr", default-features = false } tzif = { version = "0.2.3", path = "utils/tzif", default-features = false } -writeable = { version = "0.5.5", path = "utils/writeable/", default-features = false } +potential_utf = { version = "0.0.0", path = "utils/potential_utf", default-features = false } +writeable = { version = "0.5.5", path = "utils/writeable", default-features = false } yoke = { version = "0.7.4", path = "utils/yoke", default-features = false } yoke-derive = { version = "0.7.4", path = "utils/yoke/derive", default-features = false } zerofrom = { version = "0.1.3", path = "utils/zerofrom", default-features = false } diff --git a/components/casemap/Cargo.toml b/components/casemap/Cargo.toml index c86225517e2..945f0dcc3bb 100644 --- a/components/casemap/Cargo.toml +++ b/components/casemap/Cargo.toml @@ -25,8 +25,9 @@ icu_collections = { workspace = true } icu_locale_core = { workspace = true } icu_properties = { workspace = true } icu_provider = { workspace = true, features = ["macros"] } -zerovec = { workspace = true, features = ["yoke"] } +potential_utf = { workspace = true, features = ["zerovec"] } writeable = { workspace = true } +zerovec = { workspace = true, features = ["yoke"] } databake = { workspace = true, features = ["derive"], optional = true} serde = { workspace = true, features = ["derive", "alloc"], optional = true } @@ -46,7 +47,7 @@ criterion = { workspace = true } default = ["compiled_data"] std = ["icu_collections/std", "icu_provider/std"] bench = [] -serde = ["dep:serde", "zerovec/serde", "icu_collections/serde", "icu_provider/serde", "icu_properties/serde"] +serde = ["dep:serde", "zerovec/serde", "icu_collections/serde", "icu_provider/serde", "icu_properties/serde", "potential_utf/serde"] datagen = ["serde", "dep:databake", "zerovec/databake", "icu_collections/databake"] compiled_data = ["dep:icu_casemap_data", "icu_properties/compiled_data"] diff --git a/components/casemap/src/provider/unfold.rs b/components/casemap/src/provider/unfold.rs index c8808159a1c..3d23e2412c1 100644 --- a/components/casemap/src/provider/unfold.rs +++ b/components/casemap/src/provider/unfold.rs @@ -7,7 +7,7 @@ #[cfg(feature = "datagen")] use alloc::string::String; use icu_provider::prelude::*; -use zerovec::ule::UnvalidatedStr; +use potential_utf::PotentialUtf8; use zerovec::ZeroMap; /// Reverse case folding data. Maps from multi-character strings back @@ -30,7 +30,7 @@ use zerovec::ZeroMap; pub struct CaseMapUnfoldV1<'data> { #[cfg_attr(feature = "serde", serde(borrow))] /// The actual map. Maps from strings to a list of codepoints, stored as a contiguous UTF-8 string - pub map: ZeroMap<'data, UnvalidatedStr, str>, + pub map: ZeroMap<'data, PotentialUtf8, str>, } impl<'data> CaseMapUnfoldV1<'data> { @@ -80,7 +80,7 @@ impl<'data> CaseMapUnfoldV1<'data> { let val = Self::decode_string(&row[string_width..]) .ok_or(DataError::custom("Unfold: unpaired surrogate in value"))?; if map - .try_append(UnvalidatedStr::from_str(&key), val.as_ref()) + .try_append(PotentialUtf8::from_str(&key), val.as_ref()) .is_some() { return Err(DataError::custom("Unfold: keys not sorted/unique")); @@ -99,6 +99,6 @@ impl<'data> CaseMapUnfoldV1<'data> { // Given a string, returns another string representing the set of characters // that case fold to that string. pub(crate) fn get(&self, key: &str) -> Option<&str> { - self.map.get(UnvalidatedStr::from_str(key)) + self.map.get(PotentialUtf8::from_str(key)) } } diff --git a/components/datetime/Cargo.toml b/components/datetime/Cargo.toml index 1d26c29f40e..3f5e0fc5f1d 100644 --- a/components/datetime/Cargo.toml +++ b/components/datetime/Cargo.toml @@ -31,6 +31,7 @@ icu_provider = { workspace = true, features = ["macros"] } icu_timezone = { workspace = true } smallvec = { workspace = true } tinystr = { workspace = true, features = ["alloc", "zerovec"] } +potential_utf = { workspace = true, features = ["zerovec"] } writeable = { workspace = true } zerovec = { workspace = true, features = ["yoke"] } @@ -78,6 +79,7 @@ serde = [ "litemap?/serde", "smallvec/serde", "tinystr/serde", + "potential_utf/serde", "zerovec/serde", ] datagen = [ diff --git a/components/datetime/src/provider/calendar/symbols.rs b/components/datetime/src/provider/calendar/symbols.rs index 535c00b3dd6..4e8dc118b45 100644 --- a/components/datetime/src/provider/calendar/symbols.rs +++ b/components/datetime/src/provider/calendar/symbols.rs @@ -8,8 +8,9 @@ use alloc::borrow::Cow; use icu_calendar::types::MonthCode; use icu_provider::prelude::*; +use potential_utf::PotentialUtf8; use tinystr::{tinystr, TinyStr4}; -use zerovec::{ule::UnvalidatedStr, ZeroMap}; +use zerovec::ZeroMap; size_test!(DateSymbolsV1, date_symbols_v1_size, 3792); @@ -123,17 +124,17 @@ pub struct Eras<'data> { /// /// Keys are era codes, and values are display names. See [`Eras`]. #[cfg_attr(feature = "serde", serde(borrow))] - pub names: ZeroMap<'data, UnvalidatedStr, str>, + pub names: ZeroMap<'data, PotentialUtf8, str>, /// Symbol data for era abbreviations. /// /// Keys are era codes, and values are display names. See [`Eras`]. #[cfg_attr(feature = "serde", serde(borrow))] - pub abbr: ZeroMap<'data, UnvalidatedStr, str>, + pub abbr: ZeroMap<'data, PotentialUtf8, str>, /// Symbol data for era narrow forms. /// /// Keys are era codes, and values are display names. See [`Eras`]. #[cfg_attr(feature = "serde", serde(borrow))] - pub narrow: ZeroMap<'data, UnvalidatedStr, str>, + pub narrow: ZeroMap<'data, PotentialUtf8, str>, } // Note: the SymbolsV* struct doc strings metadata are attached to `$name` in the macro invocation to diff --git a/components/datetime/src/provider/neo.rs b/components/datetime/src/provider/neo.rs index d29290d4ba1..4549a28c7c4 100644 --- a/components/datetime/src/provider/neo.rs +++ b/components/datetime/src/provider/neo.rs @@ -7,8 +7,11 @@ mod adapter; use crate::pattern::runtime::{self, PatternULE}; use alloc::borrow::Cow; use icu_provider::prelude::*; -use zerovec::ule::{AsULE, UnvalidatedStr, ULE}; -use zerovec::{VarZeroVec, ZeroMap}; +use potential_utf::PotentialUtf8; +use zerovec::{ + ule::{AsULE, ULE}, + VarZeroVec, ZeroMap, +}; #[cfg(feature = "experimental")] use crate::neo_skeleton::NeoSkeletonLength; @@ -356,7 +359,7 @@ size_test!(YearNamesV1, year_names_v1_size, 48); pub enum YearNamesV1<'data> { /// This calendar uses eras with numeric years, this stores the era names mapped from /// era code to the name - Eras(#[cfg_attr(feature = "serde", serde(borrow))] ZeroMap<'data, UnvalidatedStr, str>), + Eras(#[cfg_attr(feature = "serde", serde(borrow))] ZeroMap<'data, PotentialUtf8, str>), /// This calendar is cyclic (Chinese, Dangi), so it uses cyclic year names without any eras Cyclic(#[cfg_attr(feature = "serde", serde(borrow))] VarZeroVec<'data, str>), } diff --git a/components/experimental/Cargo.toml b/components/experimental/Cargo.toml index 2b0c3241e19..15837b20907 100644 --- a/components/experimental/Cargo.toml +++ b/components/experimental/Cargo.toml @@ -40,6 +40,7 @@ fixed_decimal = { workspace = true } icu_pattern = { workspace = true , features = ["alloc", "yoke", "zerofrom"]} litemap = { workspace = true } tinystr = { workspace = true, features = ["alloc", "zerovec"] } +potential_utf = { workspace = true, features = ["zerovec"] } writeable = { workspace = true } zerotrie = { workspace = true, features = ["yoke", "zerofrom"] } zerovec = { workspace = true, features = ["derive", "yoke"] } @@ -70,7 +71,7 @@ default = ["compiled_data"] compiled_data = ["dep:icu_experimental_data", "icu_decimal/compiled_data", "icu_list/compiled_data", "icu_plurals/compiled_data", "icu_properties/compiled_data", "icu_normalizer/compiled_data"] datagen = ["serde", "std", "dep:databake", "zerovec/databake", "zerotrie/databake", "tinystr/databake", "icu_collections/databake", "std", "log", "icu_pattern/databake"] ryu = ["fixed_decimal/ryu"] -serde = ["dep:serde", "zerovec/serde", "tinystr/serde", "icu_collections/serde", "icu_decimal/serde", "icu_list/serde", "icu_pattern/serde", "icu_plurals/serde", "icu_provider/serde", "zerotrie/serde"] +serde = ["dep:serde", "zerovec/serde", "potential_utf/serde", "tinystr/serde", "icu_collections/serde", "icu_decimal/serde", "icu_list/serde", "icu_pattern/serde", "icu_plurals/serde", "icu_provider/serde", "zerotrie/serde"] std = ["fixed_decimal/std", "icu_decimal/std", "icu_pattern/std", "icu_plurals/std", "icu_provider/std", "icu_locale_core/std"] bench = [] diff --git a/components/experimental/src/displaynames/displaynames.rs b/components/experimental/src/displaynames/displaynames.rs index 85cb862286d..7f37f0274fd 100644 --- a/components/experimental/src/displaynames/displaynames.rs +++ b/components/experimental/src/displaynames/displaynames.rs @@ -13,7 +13,7 @@ use icu_locale_core::{ Locale, }; use icu_provider::prelude::*; -use zerovec::ule::UnvalidatedStr; +use potential_utf::PotentialUtf8; /// Lookup of the locale-specific display names by region code. /// @@ -411,7 +411,7 @@ impl LocaleDisplayNamesFormatter { if let Some(script) = locale.id.script { let data = self.locale_data.get(); let id = LanguageIdentifier::from((locale.id.language, Some(script), None)); - let cmp = |uvstr: &UnvalidatedStr| id.strict_cmp(uvstr).reverse(); + let cmp = |uvstr: &PotentialUtf8| id.strict_cmp(uvstr).reverse(); if let Some(x) = match self.options.style { Some(Style::Short) => data.short_names.get_by(cmp), Some(Style::Long) => data.long_names.get_by(cmp), @@ -429,7 +429,7 @@ impl LocaleDisplayNamesFormatter { if let Some(region) = locale.id.region { let data = self.locale_data.get(); let id = LanguageIdentifier::from((locale.id.language, None, Some(region))); - let cmp = |uvstr: &UnvalidatedStr| id.strict_cmp(uvstr).reverse(); + let cmp = |uvstr: &PotentialUtf8| id.strict_cmp(uvstr).reverse(); if let Some(x) = match self.options.style { Some(Style::Short) => data.short_names.get_by(cmp), Some(Style::Long) => data.long_names.get_by(cmp), diff --git a/components/experimental/src/displaynames/provider.rs b/components/experimental/src/displaynames/provider.rs index 3bcf2238504..5c6623be13a 100644 --- a/components/experimental/src/displaynames/provider.rs +++ b/components/experimental/src/displaynames/provider.rs @@ -10,8 +10,8 @@ //! Read more about data providers: [`icu_provider`] use icu_provider::prelude::*; +use potential_utf::PotentialUtf8; use tinystr::UnvalidatedTinyAsciiStr; -use zerovec::ule::UnvalidatedStr; use zerovec::ZeroMap; // We use raw TinyAsciiStrs for map keys, as we then don't have to @@ -21,7 +21,7 @@ use zerovec::ZeroMap; type UnvalidatedRegion = UnvalidatedTinyAsciiStr<3>; type UnvalidatedLanguage = UnvalidatedTinyAsciiStr<3>; type UnvalidatedScript = UnvalidatedTinyAsciiStr<4>; -type UnvalidatedLocale = UnvalidatedStr; +type UnvalidatedLocale = PotentialUtf8; type UnvalidatedVariant = UnvalidatedTinyAsciiStr<8>; #[cfg(feature = "compiled_data")] diff --git a/components/locale/Cargo.toml b/components/locale/Cargo.toml index ee88605e01d..31f238609e4 100644 --- a/components/locale/Cargo.toml +++ b/components/locale/Cargo.toml @@ -25,13 +25,14 @@ denylist = ["bench"] all-features = true [dependencies] +databake = { workspace = true, optional = true, features = ["derive"] } +displaydoc = { workspace = true } icu_locale_core = { workspace = true, features = ["zerovec"] } icu_provider = { workspace = true, features = ["macros"] } serde = { workspace = true, features = ["derive", "alloc"], optional = true } tinystr = { workspace = true, features = ["alloc", "zerovec"] } +potential_utf = { workspace = true, features = ["zerovec"] } zerovec = { workspace = true, features = ["yoke"] } -databake = { workspace = true, optional = true, features = ["derive"] } -displaydoc = { workspace = true } icu_locale_data = { workspace = true, optional = true } @@ -51,7 +52,7 @@ bench = false # This option is required for Benchmark CI default = ["compiled_data"] std = [] bench = ["serde"] -serde = ["dep:serde", "icu_locale_core/serde", "tinystr/serde", "zerovec/serde", "icu_provider/serde"] +serde = ["dep:serde", "icu_locale_core/serde", "tinystr/serde", "zerovec/serde", "icu_provider/serde", "potential_utf/serde"] datagen = ["serde", "dep:databake", "zerovec/databake", "icu_locale_core/databake", "tinystr/databake"] compiled_data = ["dep:icu_locale_data"] diff --git a/components/locale/src/provider.rs b/components/locale/src/provider.rs index b99740178f3..69344f5c833 100644 --- a/components/locale/src/provider.rs +++ b/components/locale/src/provider.rs @@ -56,8 +56,9 @@ pub const MARKERS: &[DataMarkerInfo] = &[ use alloc::borrow::Cow; use icu_locale_core::subtags::{Language, Region, Script, Variant}; use icu_provider::prelude::*; +use potential_utf::PotentialUtf8; use tinystr::{TinyAsciiStr, UnvalidatedTinyAsciiStr}; -use zerovec::{ule::UnvalidatedStr, VarZeroVec, ZeroMap, ZeroSlice, ZeroVec}; +use zerovec::{VarZeroVec, ZeroMap, ZeroSlice, ZeroVec}; // We use raw TinyAsciiStrs for map keys, as we then don't have to // validate them as subtags on deserialization. Map lookup can be @@ -347,7 +348,7 @@ pub struct ParentsV1<'data> { /// Map from language identifier to language identifier, indicating that the language on the /// left should inherit from the language on the right. #[cfg_attr(feature = "serde", serde(borrow))] - pub parents: ZeroMap<'data, UnvalidatedStr, (Language, Option