unicode-org · sffc · Mar 17, 2022 · Mar 16, 2022 · Mar 16, 2022 · Mar 16, 2022
@@ -0,0 +1,72 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+use crate::LanguageIdentifier;
+use core::cmp::Ordering;
+
+/// Position of a [`LanguageIdentifierSubtagIterator`] within a language identifier.
+///
+/// Each variant names the subtag that was most recently considered, which
+/// tells the iterator which subtag to look at next.
+#[derive(PartialEq, Eq)]
+enum State {
+    /// Nothing has been yielded yet; the language subtag comes next.
+    Start,
+    /// The language subtag has been yielded; the script is considered next.
+    AfterLanguage,
+    /// The script has been considered; the region is considered next.
+    AfterScript,
+    /// The region has been considered; the first variant is considered next.
+    AfterRegion,
+    /// The variant at the contained index has been considered; the variant at
+    /// the following index is considered next.
+    AfterVariant(usize),
+}
+
+/// Iterator over the subtags of a [`LanguageIdentifier`], each yielded as a
+/// byte slice.
+///
+/// Subtags are produced in the order language, script, region, variants; a
+/// script or region that is not set is skipped.
+pub struct LanguageIdentifierSubtagIterator<'a> {
+    /// The language identifier whose subtags are being iterated.
+    langid: &'a LanguageIdentifier,
+    /// Which subtag was considered most recently.
+    state: State,
+}
+
+impl<'a> LanguageIdentifierSubtagIterator<'a> {
+    /// Creates an iterator over the subtags of `langid`, positioned before the
+    /// language subtag.
+    pub fn new(langid: &'a LanguageIdentifier) -> Self {
+        let state = State::Start;
+        Self { langid, state }
+    }
+}
+
+impl<'a> Iterator for LanguageIdentifierSubtagIterator<'a> {
+    type Item = &'a [u8];
+
+    /// Yields the next subtag that is present, as bytes, or `None` once the
+    /// language, script, region and all variants have been visited.
+    fn next(&mut self) -> Option<Self::Item> {
+        let langid = self.langid;
+        loop {
+            // Advance the state machine by one step. `candidate` is the subtag
+            // belonging to that step, or `None` when the identifier lacks it.
+            let candidate: Option<&'a str> = match self.state {
+                State::Start => {
+                    self.state = State::AfterLanguage;
+                    Some(langid.language.as_str())
+                }
+                State::AfterLanguage => {
+                    self.state = State::AfterScript;
+                    langid.script.as_ref().map(|script| script.as_str())
+                }
+                State::AfterScript => {
+                    self.state = State::AfterRegion;
+                    langid.region.as_ref().map(|region| region.as_str())
+                }
+                State::AfterRegion => {
+                    self.state = State::AfterVariant(0);
+                    langid.variants.get(0).map(|variant| variant.as_str())
+                }
+                State::AfterVariant(seen) => {
+                    // Last stage: there is nothing left to fall through to, so
+                    // the lookup result is the answer either way.
+                    self.state = State::AfterVariant(seen + 1);
+                    return langid
+                        .variants
+                        .get(seen + 1)
+                        .map(|variant| variant.as_str().as_bytes());
+                }
+            };
+            if let Some(subtag) = candidate {
+                return Some(subtag.as_bytes());
+            }
+            // The subtag for this step is absent; move on to the next step.
+        }
+    }
+}
+
+/// Compares `base` with the byte string `other`.
+///
+/// The result is the byte-wise lexicographic ordering between the subtags of
+/// `base` joined with `-` and `other`. `other` is never parsed, so it may be
+/// any byte string; nothing is allocated.
+pub fn cmp(base: &LanguageIdentifier, other: &[u8]) -> Ordering {
+    // Lazily produce the bytes of the hyphen-joined subtags: every subtag except
+    // the first is preceded by a `-` separator.
+    //
+    // Note: This deliberately compares bytes rather than whole subtags (and does
+    // not use get_subtag_iterator) because we want to guarantee perfect
+    // lexicographic ordering of the strings. Splitting `other` on `-` and
+    // comparing subtag by subtag gives a different answer whenever `other`
+    // contains a byte that sorts below `-`, such as a space: "en-US" is greater
+    // than "en US" as a string, yet the subtag "en" is less than "en US".
+    let base_bytes = LanguageIdentifierSubtagIterator::new(base)
+        .enumerate()
+        .flat_map(|(index, subtag)| {
+            let separator: &[u8] = if index == 0 { b"" } else { b"-" };
+            separator.iter().chain(subtag).copied()
+        });
+    base_bytes.cmp(other.iter().copied())
+}
@@ -0,0 +1,5 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+pub mod langid;
@@ -2,6 +2,7 @@
 // called LICENSE at the top level of the ICU4X source tree
 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
 
+use core::cmp::Ordering;
 use core::str::FromStr;
 
 use crate::parser::{get_subtag_iterator, parse_language_identifier, ParserError, ParserMode};
@@ -142,6 +143,39 @@ impl LanguageIdentifier {
         let lang_id = Self::from_bytes(input.as_ref())?;
         Ok(lang_id.to_string())
     }
+
+    /// Compare this `LanguageIdentifier` with a BCP-47 string.
+    ///
+    /// The return value is equivalent to what would happen if you first converted this
+    /// `LanguageIdentifier` to a BCP-47 string and then performed a byte comparison.
+    ///
+    /// `other` is compared as raw bytes and is never parsed into a
+    /// `LanguageIdentifier`.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use icu::locid::LanguageIdentifier;
+    /// use std::cmp::Ordering;
+    ///
+    /// let bcp47_strings: &[&[u8]] = &[
+    ///     b"pl-Latn-PL",
+    ///     b"und",
+    ///     b"und-Adlm",
+    ///     b"und-GB",
+    ///     b"und-ZA",
+    ///     b"und-fonipa",
+    ///     b"zh",
+    /// ];
+    ///
+    /// for ab in bcp47_strings.windows(2) {
+    ///     let a = ab[0];
+    ///     let b = ab[1];
+    ///     assert!(a.cmp(b) == Ordering::Less);
+    ///     let a_langid = LanguageIdentifier::from_bytes(a).expect(&String::from_utf8_lossy(a));
+    ///     assert!(a_langid.cmp_bytes(b) == Ordering::Less);
+    /// }
+    /// ```
+    pub fn cmp_bytes(&self, other: &[u8]) -> Ordering {
+        // All of the comparison logic lives in the crate-internal `cmp::langid` module.
+        crate::cmp::langid::cmp(self, other)
+    }
 }
 
 impl AsRef<LanguageIdentifier> for LanguageIdentifier {

@@ -58,6 +58,7 @@ extern crate alloc;
 #[macro_use]
 mod helpers;
 
+mod cmp;
 pub mod extensions;
 mod langid;
 mod locale;

@@ -132,3 +132,27 @@ fn test_langid_partialeq_str() {
     let lang: LanguageIdentifier = "en".parse().expect("Parsing failed.");
     assert_ne!(lang, "en-US");
 }
+
+/// Checks, for every ordered pair of fixture strings, that `cmp_bytes` agrees
+/// with a plain byte comparison of the normalized left-hand string against the
+/// right-hand string.
+#[test]
+fn test_langid_cmp_bytes() {
+    use std::str::FromStr;
+
+    let path = "./tests/fixtures/langid.json";
+    let tests: Vec<fixtures::LocaleTest> =
+        helpers::read_fixture(path).expect("Failed to read a fixture");
+    let bcp47_strings = tests
+        .iter()
+        .map(|t| match t.input {
+            fixtures::LocaleInfo::String(ref s) => s.as_str(),
+            _ => panic!("Invalid fixture"),
+        })
+        .collect::<Vec<&str>>();
+    for a in bcp47_strings.iter() {
+        // Parsing and normalizing `a` does not depend on `b`, so do it once per
+        // left-hand side instead of once per pair.
+        let a_langid = LanguageIdentifier::from_str(a).expect("Invalid BCP-47 in fixture");
+        let a_normalized = a_langid.to_string();
+        for b in bcp47_strings.iter() {
+            let string_cmp = a_normalized.as_bytes().cmp(b.as_bytes());
+            let test_cmp = a_langid.cmp_bytes(b.as_bytes());
+            assert_eq!(string_cmp, test_cmp, "{:?}/{:?}", a, b);
+        }
+    }
+}