`strip` seems to have been broken for a while on Unicode strings.

Narsil · Narsil · commit df0b85d86370 · 2021-05-19T16:29:20.000+02:00
- Includes a failing test and the fix for it.
- This function could maybe be optimized: we're scanning the string 3 times now,
  plus one more full pass to count the chars.
diff --git a/tokenizers/src/tokenizer/normalizer.rs b/tokenizers/src/tokenizer/normalizer.rs
@@ -736,12 +736,13 @@ impl NormalizedString {
         };
 
         if leading_spaces > 0 || trailing_spaces > 0 {
+            let count = self.get().chars().count();
             let transformation = self
                 .normalized
                 .chars()
                 .enumerate()
                 .filter_map(|(i, c)| {
-                    if i < leading_spaces || i >= self.len() - trailing_spaces {
+                    if i < leading_spaces || i >= count - trailing_spaces {
                         None
                     } else if i == self.len() - trailing_spaces - 1 {
                         Some((c, -(trailing_spaces as isize)))
@@ -1274,6 +1275,17 @@ mod tests {
         );
     }
 
+    #[test]
+    fn strip_unicode() {
+        let mut n = NormalizedString::from("  你好asa \n");
+        n.strip();
+        assert_eq!(&n.normalized, "你好asa");
+        assert_eq!(
+            n.get_range_original(Range::Normalized(0..n.normalized.len())),
+            Some("你好asa")
+        );
+    }
+
     #[test]
     fn prepend() {
         let mut n = NormalizedString::from("there");