Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add LanguageIdentifier::cmp_bytes #1704

Merged
merged 7 commits into from
Mar 17, 2022
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 72 additions & 0 deletions components/locid/src/cmp/langid.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use crate::LanguageIdentifier;
use core::cmp::Ordering;

/// Progress marker for `LanguageIdentifierSubtagIterator`.
///
/// Each variant names the component that was most recently visited, so the
/// iterator knows which component to look at next.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum State {
    /// Nothing has been visited yet; the language subtag comes next.
    Start,
    /// The language subtag has been yielded; the optional script comes next.
    AfterLanguage,
    /// The script slot has been visited; the optional region comes next.
    AfterScript,
    /// The region slot has been visited; the first variant (if any) comes next.
    AfterRegion,
    /// The variant at the contained index has been visited; the variant at
    /// the following index (if any) comes next.
    AfterVariant(usize),
}

/// Iterator over the subtags of a `LanguageIdentifier`, each yielded as a
/// byte slice.
///
/// Subtags are produced in the order language, script, region, variants;
/// a script or region that is not set is skipped.
pub struct LanguageIdentifierSubtagIterator<'a> {
    /// The language identifier whose subtags are being yielded.
    langid: &'a LanguageIdentifier,
    /// How far through `langid` the iteration has progressed.
    state: State,
}

impl<'a> LanguageIdentifierSubtagIterator<'a> {
    /// Creates an iterator positioned before the first subtag of `langid`.
    pub fn new(langid: &'a LanguageIdentifier) -> Self {
        let state = State::Start;
        Self { langid, state }
    }
}

impl<'a> Iterator for LanguageIdentifierSubtagIterator<'a> {
type Item = &'a [u8];
fn next(&mut self) -> Option<Self::Item> {
if self.state == State::Start {
self.state = State::AfterLanguage;
return Some(self.langid.language.as_str().as_bytes());
}
if self.state == State::AfterLanguage {
self.state = State::AfterScript;
if let Some(ref script) = self.langid.script {
return Some(script.as_str().as_bytes());
}
}
if self.state == State::AfterScript {
self.state = State::AfterRegion;
if let Some(ref region) = self.langid.region {
return Some(region.as_str().as_bytes());
}
}
if self.state == State::AfterRegion {
self.state = State::AfterVariant(0);
if let Some(variant) = self.langid.variants.get(0) {
return Some(variant.as_str().as_bytes());
}
}
if let State::AfterVariant(i) = self.state {
self.state = State::AfterVariant(i + 1);
if let Some(variant) = self.langid.variants.get(i + 1) {
return Some(variant.as_str().as_bytes());
}
}
None
sffc marked this conversation as resolved.
Show resolved Hide resolved
}
}

/// Compares `base` against the raw bytes in `other`, byte by byte.
///
/// The subtags of `base` are streamed in order with a `-` between adjacent
/// subtags, and that byte stream is compared lexicographically with `other`.
/// No intermediate string is allocated.
///
/// `other` is compared as-is; it is neither parsed nor validated.
pub fn cmp(base: &LanguageIdentifier, other: &[u8]) -> Ordering {
    // Note: This does not use get_subtag_iterator because we want to guarantee
    // perfect lexicographic ordering of the strings. For the same reason the
    // comparison is done on the flattened bytes rather than subtag-by-subtag:
    // splitting `other` on `-` and comparing the pieces would order a shorter
    // subtag before a longer one even when the longer one continues with a
    // byte that sorts below `-`.
    let base_bytes = LanguageIdentifierSubtagIterator::new(base)
        .enumerate()
        .flat_map(|(position, subtag)| {
            // Every subtag except the first is preceded by a separator.
            let separator: &[u8] = if position == 0 { b"" } else { b"-" };
            separator.iter().chain(subtag)
        });
    base_bytes.cmp(other)
}
5 changes: 5 additions & 0 deletions components/locid/src/cmp/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

/// Comparison of a `LanguageIdentifier` against raw BCP-47 bytes.
pub mod langid;
34 changes: 34 additions & 0 deletions components/locid/src/langid.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use core::cmp::Ordering;
use core::str::FromStr;

use crate::parser::{get_subtag_iterator, parse_language_identifier, ParserError, ParserMode};
Expand Down Expand Up @@ -142,6 +143,39 @@ impl LanguageIdentifier {
let lang_id = Self::from_bytes(input.as_ref())?;
Ok(lang_id.to_string())
}

/// Compare this `LanguageIdentifier` with a BCP-47 string.
///
/// The return value is equivalent to what would happen if you first converted this
/// `LanguageIdentifier` to a BCP-47 string and then performed a byte comparison.
///
/// # Examples
///
/// ```
/// use icu::locid::LanguageIdentifier;
/// use std::cmp::Ordering;
///
/// let bcp47_strings: &[&[u8]] = &[
/// b"pl-Latn-PL",
/// b"und",
/// b"und-Adlm",
/// b"und-GB",
/// b"und-ZA",
/// b"und-fonipa",
/// b"zh",
/// ];
///
/// for ab in bcp47_strings.windows(2) {
/// let a = ab[0];
/// let b = ab[1];
/// assert!(a.cmp(b) == Ordering::Less);
/// let a_langid = LanguageIdentifier::from_bytes(a).expect(&String::from_utf8_lossy(a));
/// assert!(a_langid.cmp_bytes(b) == Ordering::Less);
/// }
/// ```
sffc marked this conversation as resolved.
Show resolved Hide resolved
pub fn cmp_bytes(&self, other: &[u8]) -> Ordering {
crate::cmp::langid::cmp(self, other)
}
}

impl AsRef<LanguageIdentifier> for LanguageIdentifier {
Expand Down
1 change: 1 addition & 0 deletions components/locid/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ extern crate alloc;
#[macro_use]
mod helpers;

mod cmp;
pub mod extensions;
mod langid;
mod locale;
Expand Down
24 changes: 24 additions & 0 deletions components/locid/tests/langid.rs
Original file line number Diff line number Diff line change
Expand Up @@ -132,3 +132,27 @@ fn test_langid_partialeq_str() {
let lang: LanguageIdentifier = "en".parse().expect("Parsing failed.");
assert_ne!(lang, "en-US");
}

/// Checks `LanguageIdentifier::cmp_bytes` against a plain byte comparison of
/// the normalized string, for every ordered pair of fixture inputs.
#[test]
fn test_langid_cmp_bytes() {
    use std::str::FromStr;

    let path = "./tests/fixtures/langid.json";
    let tests: Vec<fixtures::LocaleTest> =
        helpers::read_fixture(path).expect("Failed to read a fixture");
    let bcp47_strings: Vec<&str> = tests
        .iter()
        .map(|t| match t.input {
            fixtures::LocaleInfo::String(ref s) => s.as_str(),
            _ => panic!("Invalid fixture"),
        })
        .collect();
    for a in &bcp47_strings {
        // Parsing and normalizing `a` does not depend on `b`, so do it once
        // per outer iteration rather than once per pair.
        let a_langid = LanguageIdentifier::from_str(a).expect("Invalid BCP-47 in fixture");
        let a_normalized = a_langid.to_string();
        for b in &bcp47_strings {
            let string_cmp = a_normalized.as_bytes().cmp(b.as_bytes());
            let test_cmp = a_langid.cmp_bytes(b.as_bytes());
            assert_eq!(string_cmp, test_cmp, "{:?}/{:?}", a, b);
        }
    }
}