Skip to content

Commit df0b85d

Browse files
committed
Strip seems to have been broken for a while on unicode strings.
- Includes a failing test and the fix for it. - This function could maybe be optimized: we're now scanning the string 3 times, plus once fully for chars.
1 parent e999a7b commit df0b85d

File tree

1 file changed

+13
-1
lines changed

1 file changed

+13
-1
lines changed

tokenizers/src/tokenizer/normalizer.rs

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -736,12 +736,13 @@ impl NormalizedString {
736736
};
737737

738738
if leading_spaces > 0 || trailing_spaces > 0 {
739+
let count = self.get().chars().count();
739740
let transformation = self
740741
.normalized
741742
.chars()
742743
.enumerate()
743744
.filter_map(|(i, c)| {
744-
if i < leading_spaces || i >= self.len() - trailing_spaces {
745+
if i < leading_spaces || i >= count - trailing_spaces {
745746
None
746747
} else if i == self.len() - trailing_spaces - 1 {
747748
Some((c, -(trailing_spaces as isize)))
@@ -1274,6 +1275,17 @@ mod tests {
12741275
);
12751276
}
12761277

1278+
#[test]
1279+
fn strip_unicode() {
1280+
let mut n = NormalizedString::from(" 你好asa \n");
1281+
n.strip();
1282+
assert_eq!(&n.normalized, "你好asa");
1283+
assert_eq!(
1284+
n.get_range_original(Range::Normalized(0..n.normalized.len())),
1285+
Some("你好asa")
1286+
);
1287+
}
1288+
12771289
#[test]
12781290
fn prepend() {
12791291
let mut n = NormalizedString::from("there");

0 commit comments

Comments
 (0)