Skip to content

Commit

Permalink
Add LocaleData parameter for word/sentence segmenter (#5318)
Browse files Browse the repository at this point in the history
Add LocaleData parameter for word/sentence segmenter 

This is a part of #3284.

ICU4C has language-specific break rules for the word and sentence
segmenters, so this change adds per-locale rules to ICU4X as well.

This adds a LocaleData argument to all constructors. Because the
per-locale differences are small (only two data sets), this also adds an
override table data marker for the state machine's property table.
  • Loading branch information
makotokato authored Sep 3, 2024
1 parent 38228a7 commit 9d45c5f
Show file tree
Hide file tree
Showing 38 changed files with 1,182 additions and 13 deletions.
4 changes: 4 additions & 0 deletions components/segmenter/src/grapheme.rs
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ impl GraphemeClusterSegmenter {
data: payload,
complex: None,
boundary_property: 0,
locale_override: None,
})
}

Expand All @@ -214,6 +215,7 @@ impl GraphemeClusterSegmenter {
data: self.payload.get(),
complex: None,
boundary_property: 0,
locale_override: None,
})
}
/// Creates a grapheme cluster break iterator for a Latin-1 (8-bit) string.
Expand All @@ -231,6 +233,7 @@ impl GraphemeClusterSegmenter {
data: self.payload.get(),
complex: None,
boundary_property: 0,
locale_override: None,
})
}

Expand All @@ -257,6 +260,7 @@ impl GraphemeClusterSegmenter {
data: payload,
complex: None,
boundary_property: 0,
locale_override: None,
})
}
}
Expand Down
2 changes: 2 additions & 0 deletions components/segmenter/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,8 @@ pub use crate::word::WordSegmenter;
pub use crate::line::LineBreakOptions;
pub use crate::line::LineBreakStrictness;
pub use crate::line::LineBreakWordOption;
pub use crate::sentence::SentenceBreakOptions;
pub use crate::word::WordBreakOptions;
pub use crate::word::WordType;

// Typedefs
Expand Down
23 changes: 23 additions & 0 deletions components/segmenter/src/provider/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ const _: () = {
pub mod icu {
pub use crate as segmenter;
pub use icu_collections as collections;
pub use icu_segmenter_data::icu_locale as locale;
}
make_provider!(Baked);
impl_dictionary_for_word_only_auto_v1_marker!(Baked);
Expand All @@ -49,7 +50,9 @@ const _: () = {
impl_line_break_data_v2_marker!(Baked);
#[cfg(feature = "lstm")]
impl_lstm_for_word_line_auto_v1_marker!(Baked);
impl_sentence_break_data_override_v1_marker!(Baked);
impl_sentence_break_data_v2_marker!(Baked);
impl_word_break_data_override_v1_marker!(Baked);
impl_word_break_data_v2_marker!(Baked);
};

Expand All @@ -61,7 +64,9 @@ pub const MARKERS: &[DataMarkerInfo] = &[
GraphemeClusterBreakDataV2Marker::INFO,
LineBreakDataV2Marker::INFO,
LstmForWordLineAutoV1Marker::INFO,
SentenceBreakDataOverrideV1Marker::INFO,
SentenceBreakDataV2Marker::INFO,
WordBreakDataOverrideV1Marker::INFO,
WordBreakDataV2Marker::INFO,
];

Expand Down Expand Up @@ -148,6 +153,24 @@ impl DynamicDataMarker for UCharDictionaryBreakDataV1Marker {
type DataStruct = UCharDictionaryBreakDataV1<'static>;
}

/// Locale-specific overrides for the break property table of a rule-based segmenter.
///
/// The same struct backs both the sentence and the word override data markers.
/// The code point trie stores the break properties that differ for a specific
/// locale. When looking up a code point, a non-zero value found here takes
/// precedence; a value of 0 means "not overridden", and the base property
/// table is consulted instead.
#[icu_provider::data_struct(
marker(SentenceBreakDataOverrideV1Marker, "segmenter/sentence/override@1",),
marker(WordBreakDataOverrideV1Marker, "segmenter/word/override@1")
)]
#[derive(Debug, PartialEq, Clone)]
#[cfg_attr(
feature = "datagen",
derive(serde::Serialize,databake::Bake),
databake(path = icu_segmenter::provider),
)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct RuleBreakDataOverrideV1<'data> {
/// Break property values that replace those of the base property table for
/// a specific locale. A stored value of 0 means the code point is not overridden.
#[cfg_attr(feature = "serde", serde(borrow))]
pub property_table_override: CodePointTrie<'data, u8>,
}

#[derive(Clone, Copy, PartialEq, Debug)]
#[cfg_attr(feature = "datagen", derive(databake::Bake))]
#[cfg_attr(feature = "datagen", databake(path = icu_segmenter::provider))]
Expand Down
9 changes: 9 additions & 0 deletions components/segmenter/src/rule_segmenter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ pub struct RuleBreakIterator<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> {
pub(crate) data: &'l RuleBreakDataV2<'l>,
pub(crate) complex: Option<&'l ComplexPayloads>,
pub(crate) boundary_property: u8,
pub(crate) locale_override: Option<&'l RuleBreakDataOverrideV1<'l>>,
}

impl<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> Iterator for RuleBreakIterator<'l, 's, Y> {
Expand Down Expand Up @@ -210,6 +211,14 @@ impl<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> RuleBreakIterator<'l, 's, Y> {

fn get_break_property(&self, codepoint: Y::CharType) -> u8 {
    // A locale override wins whenever it stores a non-zero property for this
    // code point. Zero is the default value (0 == UNKNOWN) and is treated as
    // "not overridden", so the lookup falls through to the base table.
    let overridden = self
        .locale_override
        .map(|tailoring| tailoring.property_table_override.get32(codepoint.into()))
        .filter(|&property| property != 0);

    match overridden {
        Some(property) => property,
        None => self.data.property_table.get32(codepoint.into()),
    }
}

Expand Down
85 changes: 84 additions & 1 deletion components/segmenter/src/sentence.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,14 @@ use crate::provider::*;
use crate::rule_segmenter::*;
use utf8_iter::Utf8CharIndices;

/// Options to tailor sentence breaking behavior.
///
/// The default value carries no content locale, in which case no
/// locale-specific override data is loaded.
#[non_exhaustive]
#[derive(Clone, PartialEq, Eq, Debug, Default)]
pub struct SentenceBreakOptions {
/// Locale of the content to be segmented.
///
/// When set, the segmenter constructor tries to load sentence break override
/// data for this locale; when `None`, only the base rule data is used.
pub content_locale: Option<DataLocale>,
}

/// Implements the [`Iterator`] trait over the sentence boundaries of the given string.
///
/// Lifetimes:
Expand Down Expand Up @@ -100,6 +108,7 @@ pub type SentenceBreakIteratorUtf16<'l, 's> = SentenceBreakIterator<'l, 's, Rule
#[derive(Debug)]
pub struct SentenceSegmenter {
// Base sentence break rule data; every break iterator borrows from it.
payload: DataPayload<SentenceBreakDataV2Marker>,
// Optional locale-specific override data. `None` when no content locale was
// requested or when the provider has no override data for that locale.
payload_locale_override: Option<DataPayload<SentenceBreakDataOverrideV1Marker>>,
}

#[cfg(feature = "compiled_data")]
Expand All @@ -121,6 +130,7 @@ impl SentenceSegmenter {
payload: DataPayload::from_static_ref(
crate::provider::Baked::SINGLETON_SENTENCE_BREAK_DATA_V2_MARKER,
),
payload_locale_override: None,
}
}

Expand All @@ -140,13 +150,70 @@ impl SentenceSegmenter {
D: DataProvider<SentenceBreakDataV2Marker> + ?Sized,
{
let payload = provider.load(Default::default())?.payload;
Ok(Self { payload })
Ok(Self {
payload,
payload_locale_override: None,
})
}

icu_provider::gen_any_buffer_data_constructors!(
(options: SentenceBreakOptions) -> error: DataError,
/// Constructs a [`SentenceSegmenter`] with the given options, using compiled data.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
functions: [
try_new_with_options,
try_new_with_options_with_any_provider,
try_new_with_options_with_buffer_provider,
try_new_with_options_unstable,
Self
]
);

#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::try_new_with_options)]
pub fn try_new_with_options_unstable<D>(
    provider: &D,
    options: SentenceBreakOptions,
) -> Result<Self, DataError>
where
    D: DataProvider<SentenceBreakDataV2Marker>
        + DataProvider<SentenceBreakDataOverrideV1Marker>
        + ?Sized,
{
    // The base rule data is mandatory: any failure to load it is returned as-is.
    let payload = provider.load(Default::default())?.payload;

    // The override data is optional. It is only requested when a content
    // locale was supplied, and a missing identifier simply means "no
    // tailoring for this locale". Every other error is still propagated.
    let payload_locale_override = match options.content_locale {
        Some(locale) => {
            let request = DataRequest {
                id: DataIdentifierBorrowed::for_locale(&locale),
                ..Default::default()
            };
            match provider.load(request) {
                Ok(response) => Some(response.payload),
                Err(DataError {
                    kind: DataErrorKind::IdentifierNotFound,
                    ..
                }) => None,
                Err(other) => return Err(other),
            }
        }
        None => None,
    };

    Ok(Self {
        payload,
        payload_locale_override,
    })
}

/// Creates a sentence break iterator for an `str` (a UTF-8 string).
///
/// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
pub fn segment_str<'l, 's>(&'l self, input: &'s str) -> SentenceBreakIteratorUtf8<'l, 's> {
let locale_override = self
.payload_locale_override
.as_ref()
.map(|payload| payload.get());
SentenceBreakIterator(RuleBreakIterator {
iter: input.char_indices(),
len: input.len(),
Expand All @@ -155,6 +222,7 @@ impl SentenceSegmenter {
data: self.payload.get(),
complex: None,
boundary_property: 0,
locale_override,
})
}
/// Creates a sentence break iterator for a potentially ill-formed UTF8 string
Expand All @@ -166,6 +234,10 @@ impl SentenceSegmenter {
&'l self,
input: &'s [u8],
) -> SentenceBreakIteratorPotentiallyIllFormedUtf8<'l, 's> {
let locale_override = self
.payload_locale_override
.as_ref()
.map(|payload| payload.get());
SentenceBreakIterator(RuleBreakIterator {
iter: Utf8CharIndices::new(input),
len: input.len(),
Expand All @@ -174,6 +246,7 @@ impl SentenceSegmenter {
data: self.payload.get(),
complex: None,
boundary_property: 0,
locale_override,
})
}
/// Creates a sentence break iterator for a Latin-1 (8-bit) string.
Expand All @@ -183,6 +256,10 @@ impl SentenceSegmenter {
&'l self,
input: &'s [u8],
) -> SentenceBreakIteratorLatin1<'l, 's> {
let locale_override = self
.payload_locale_override
.as_ref()
.map(|payload| payload.get());
SentenceBreakIterator(RuleBreakIterator {
iter: Latin1Indices::new(input),
len: input.len(),
Expand All @@ -191,13 +268,18 @@ impl SentenceSegmenter {
data: self.payload.get(),
complex: None,
boundary_property: 0,
locale_override,
})
}

/// Creates a sentence break iterator for a UTF-16 string.
///
/// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
pub fn segment_utf16<'l, 's>(&'l self, input: &'s [u16]) -> SentenceBreakIteratorUtf16<'l, 's> {
let locale_override = self
.payload_locale_override
.as_ref()
.map(|payload| payload.get());
SentenceBreakIterator(RuleBreakIterator {
iter: Utf16Indices::new(input),
len: input.len(),
Expand All @@ -206,6 +288,7 @@ impl SentenceSegmenter {
data: self.payload.get(),
complex: None,
boundary_property: 0,
locale_override,
})
}
}
Expand Down
Loading

0 comments on commit 9d45c5f

Please sign in to comment.