diff --git a/utils/zerotrie/src/options.rs b/utils/zerotrie/src/options.rs
index af9e90fd87a..d0c1bc101d7 100644
--- a/utils/zerotrie/src/options.rs
+++ b/utils/zerotrie/src/options.rs
@@ -73,6 +73,15 @@ pub(crate) enum CaseSensitivity {
     IgnoreCase,
 }
 
+/// How to handle lookup for strings with mixed ASCII case. Only used in ignore-case tries
+#[derive(Copy, Clone)]
+pub(crate) enum LookupStrictness {
+    /// Select strings that differ in case so long as their `to_ascii_lowercase` matches
+    Normal,
+    /// Select strings only if they match exactly
+    Strict,
+}
+
 impl CaseSensitivity {
     #[cfg(feature = "serde")]
     const fn to_u8_flag(self) -> u8 {
@@ -89,6 +98,7 @@ pub(crate) struct ZeroTrieBuilderOptions {
     pub ascii_mode: AsciiMode,
     pub capacity_mode: CapacityMode,
     pub case_sensitivity: CaseSensitivity,
+    pub lookup_strictness: LookupStrictness,
 }
 
 impl ZeroTrieBuilderOptions {
@@ -113,6 +123,7 @@ impl ZeroTrieWithOptions for crate::ZeroTrieSimpleAscii {
         ascii_mode: AsciiMode::AsciiOnly,
         capacity_mode: CapacityMode::Normal,
         case_sensitivity: CaseSensitivity::Sensitive,
+        lookup_strictness: LookupStrictness::Normal,
     };
 }
 
@@ -129,6 +140,7 @@ impl ZeroTrieWithOptions for crate::ZeroAsciiIgnoreCaseTrie {
         ascii_mode: AsciiMode::AsciiOnly,
         capacity_mode: CapacityMode::Normal,
         case_sensitivity: CaseSensitivity::IgnoreCase,
+        lookup_strictness: LookupStrictness::Normal,
     };
 }
 
@@ -137,6 +149,19 @@ impl crate::ZeroAsciiIgnoreCaseTrie {
     pub(crate) const FLAGS: u8 = Self::OPTIONS.to_u8_flags();
 }
 
+/// Internal struct to power `get_strict`
+pub(crate) struct ZeroAsciiIgnoreCaseStrictTrie;
+
+impl ZeroTrieWithOptions for ZeroAsciiIgnoreCaseStrictTrie {
+    const OPTIONS: ZeroTrieBuilderOptions = ZeroTrieBuilderOptions {
+        phf_mode: PhfMode::BinaryOnly,
+        ascii_mode: AsciiMode::AsciiOnly,
+        capacity_mode: CapacityMode::Normal,
+        case_sensitivity: CaseSensitivity::IgnoreCase,
+        lookup_strictness: LookupStrictness::Strict,
+    };
+}
+
 /// Branch nodes could be either binary search or PHF.
 impl ZeroTrieWithOptions for crate::ZeroTriePerfectHash {
     const OPTIONS: ZeroTrieBuilderOptions = ZeroTrieBuilderOptions {
@@ -144,6 +169,7 @@ impl ZeroTrieWithOptions for crate::ZeroTriePerfectHash {
         ascii_mode: AsciiMode::BinarySpans,
         capacity_mode: CapacityMode::Normal,
         case_sensitivity: CaseSensitivity::Sensitive,
+        lookup_strictness: LookupStrictness::Normal,
     };
 }
 
@@ -159,6 +185,7 @@ impl ZeroTrieWithOptions for crate::ZeroTrieExtendedCapacity {
         ascii_mode: AsciiMode::BinarySpans,
         capacity_mode: CapacityMode::Extended,
         case_sensitivity: CaseSensitivity::Sensitive,
+        lookup_strictness: LookupStrictness::Normal,
     };
 }
 
diff --git a/utils/zerotrie/src/reader.rs b/utils/zerotrie/src/reader.rs
index eed1c80aaad..54805969118 100644
--- a/utils/zerotrie/src/reader.rs
+++ b/utils/zerotrie/src/reader.rs
@@ -321,7 +321,7 @@ pub(crate) fn get_parameterized(
         };
         if let Some((c, temp)) = ascii.split_first() {
             if matches!(byte_type, NodeType::Ascii) {
-                let is_match = if matches!(T::OPTIONS.case_sensitivity, CaseSensitivity::IgnoreCase)
+                let is_match = if matches!(T::OPTIONS.case_sensitivity, CaseSensitivity::IgnoreCase) && matches!(T::OPTIONS.lookup_strictness, LookupStrictness::Normal)
                 {
                     b.to_ascii_lowercase() == c.to_ascii_lowercase()
                 } else {
@@ -369,10 +369,25 @@ pub(crate) fn get_parameterized(
                     (search, trie) = trie.debug_split_at(x);
                     let bsearch_result =
                         if matches!(T::OPTIONS.case_sensitivity, CaseSensitivity::IgnoreCase) {
-                            search.binary_search_by_key(&c.to_ascii_lowercase(), |x| {
-                                x.to_ascii_lowercase()
-                            })
+                            if matches!(T::OPTIONS.lookup_strictness, LookupStrictness::Normal) {
+                                // Ordering: (A=a), (B=b), (C=c), ..., (Z=z)
+                                search.binary_search_by_key(&c.to_ascii_lowercase(), |x| {
+                                    x.to_ascii_lowercase()
+                                })
+                            } else {
+                                // Ordering: A, a, B, b, C, c, ..., Z, z
+                                let c_lowercase = c.to_ascii_lowercase();
+                                search.binary_search_by(move |p| {
+                                    let p_lowercase = p.to_ascii_lowercase();
+                                    if c_lowercase == p_lowercase {
+                                        p.cmp(c)
+                                    } else {
+                                        p_lowercase.cmp(&c_lowercase)
+                                    }
+                                })
+                            }
                         } else {
+                            // Ordering: A, B, C, ..., Z, a, b, c, ..., z
                             search.binary_search(c)
                         };
                     i = bsearch_result.ok()?;
diff --git a/utils/zerotrie/src/zerotrie.rs b/utils/zerotrie/src/zerotrie.rs
index 21d6b430de2..a76b70b823f 100644
--- a/utils/zerotrie/src/zerotrie.rs
+++ b/utils/zerotrie/src/zerotrie.rs
@@ -665,6 +665,39 @@ impl_zerotrie_subtype!(
     Vec::into_boxed_slice
 );
 
+impl<Store> ZeroAsciiIgnoreCaseTrie<Store>
+where
+    Store: AsRef<[u8]> + ?Sized,
+{
+    /// Queries the trie for a string, requiring that it matches case.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use litemap::LiteMap;
+    /// use zerotrie::ZeroAsciiIgnoreCaseTrie;
+    ///
+    /// let mut map = LiteMap::new_vec();
+    /// map.insert(&b"foo"[..], 1);
+    /// map.insert(b"Bar", 2);
+    /// map.insert(b"Bingo", 3);
+    ///
+    /// let trie = ZeroAsciiIgnoreCaseTrie::try_from(&map)?;
+    ///
+    /// assert_eq!(trie.get(b"foo"), Some(1));
+    /// assert_eq!(trie.get(b"bar"), Some(2));
+    /// assert_eq!(trie.get(b"BaR"), Some(2));
+    /// assert_eq!(trie.get_strict(b"bar"), None);
+    /// assert_eq!(trie.get_strict(b"BaR"), None);
+    /// assert_eq!(trie.get_strict(b"Bar"), Some(2));
+    ///
+    /// # Ok::<_, zerotrie::ZeroTrieBuildError>(())
+    /// ```
+    pub fn get_strict<K>(&self, key: K) -> Option<usize> where K: AsRef<[u8]> {
+        reader::get_parameterized::<crate::options::ZeroAsciiIgnoreCaseStrictTrie>(self.store.as_ref(), key.as_ref())
+    }
+}
+
 macro_rules! impl_dispatch {
     ($self:ident, $inner_fn:ident()) => {
         match $self.0 {