Skip to content

Commit 2ef2b65

Browse files
committed
optimize str::iter::Chars::advance_by
This avoids part of the char-decoding work by never decoding UTF-8 continuation bytes: only the bytes that start a char are counted.
1 parent 844cb31 commit 2ef2b65

File tree

2 files changed

+54
-0
lines changed

2 files changed

+54
-0
lines changed

library/alloc/tests/str.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1170,6 +1170,17 @@ fn test_iterator() {
11701170
assert_eq!(s.chars().count(), v.len());
11711171
}
11721172

1173+
// Checks that `Chars::advance_by` lands on the expected char for both a short
// skip (1 char) and a longer skip (33 chars) over multi-byte text.
#[test]
fn test_iterator_advance() {
    let s = "「赤錆」と呼ばれる鉄錆は、水の存在下での鉄の自然酸化によって生じる、オキシ水酸化鉄(III) 等の(含水)酸化物粒子の疎な凝集膜であるとみなせる。";
    // Reference sequence produced by plain iteration.
    let expected: Vec<char> = s.chars().collect();
    let mut cursor = s.chars();

    // Skipping one char leaves the cursor on index 1.
    cursor.advance_by(1).unwrap();
    let after_short_skip = cursor.next();
    assert_eq!(after_short_skip, Some(expected[1]));

    // `next()` consumed index 1, so skipping 33 more chars (indices 2..=34)
    // leaves the cursor on index 35.
    cursor.advance_by(33).unwrap();
    let after_long_skip = cursor.next();
    assert_eq!(after_long_skip, Some(expected[35]));
}
1183+
11731184
#[test]
11741185
fn test_rev_iterator() {
11751186
let s = "ศไทย中华Việt Nam";

library/core/src/str/iter.rs

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ use crate::iter::{TrustedRandomAccess, TrustedRandomAccessNoCoerce};
88
use crate::ops::Try;
99
use crate::option;
1010
use crate::slice::{self, Split as SliceSplit};
11+
use core::num::NonZeroUsize;
1112

1213
use super::from_utf8_unchecked;
1314
use super::pattern::Pattern;
@@ -49,6 +50,48 @@ impl<'a> Iterator for Chars<'a> {
4950
super::count::count_chars(self.as_str())
5051
}
5152

53+
#[inline]
54+
fn advance_by(&mut self, mut remainder: usize) -> Result<(), NonZeroUsize> {
55+
const CHUNK_SIZE: usize = 32;
56+
57+
if remainder >= CHUNK_SIZE {
58+
let mut chunks = self.iter.as_slice().array_chunks::<CHUNK_SIZE>();
59+
let mut bytes_skipped: usize = 0;
60+
61+
while remainder > CHUNK_SIZE && let Some(chunk) = chunks.next() {
62+
bytes_skipped += CHUNK_SIZE;
63+
64+
let mut start_bytes = [false; CHUNK_SIZE];
65+
66+
for i in 0..CHUNK_SIZE {
67+
start_bytes[i] = !super::validations::utf8_is_cont_byte(chunk[i]);
68+
}
69+
70+
remainder -= start_bytes.into_iter().map(|i| i as u8).sum::<u8>() as usize;
71+
}
72+
73+
unsafe { self.iter.advance_by(bytes_skipped).unwrap_unchecked() };
74+
75+
// skip trailing continuation bytes
76+
while self.iter.len() > 0 {
77+
let b = self.iter.as_slice()[0];
78+
if !super::validations::utf8_is_cont_byte(b) {
79+
break
80+
}
81+
unsafe { self.iter.advance_by(1).unwrap_unchecked() };
82+
}
83+
}
84+
85+
while (remainder > 0) && (self.iter.len() > 0) {
86+
remainder -= 1;
87+
let b = self.iter.as_slice()[0];
88+
let slurp = super::validations::utf8_char_width(b);
89+
unsafe { self.iter.advance_by(slurp).unwrap_unchecked() };
90+
}
91+
92+
NonZeroUsize::new(remainder).map_or(Ok(()), Err)
93+
}
94+
5295
#[inline]
5396
fn size_hint(&self) -> (usize, Option<usize>) {
5497
let len = self.iter.len();

0 commit comments

Comments
 (0)