Skip to content

str: Implement a faster Chars iterator for &str #15638

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jul 19, 2014
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Clarify str Chars iterator implementation
Thanks to comments from @huonw, clarify decoding details and use
statics for important constants for UTF-8 decoding. Convert some magic
numbers scattered in the same file to use the statics too.
  • Loading branch information
root committed Jul 18, 2014
commit bbb299ad9840d02c52eefbd9989b5b18b51a7b8d
29 changes: 16 additions & 13 deletions src/libcore/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,11 +114,11 @@ macro_rules! utf8_first_byte(

// return the value of $ch updated with continuation byte $byte
macro_rules! utf8_acc_cont_byte(
($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as u32)
($ch:expr, $byte:expr) => (($ch << 6) | ($byte & CONT_MASK) as u32)
)

macro_rules! utf8_is_cont_byte(
($byte:expr) => (($byte & 192u8) == 128)
($byte:expr) => (($byte & !CONT_MASK) == TAG_CONT_U8)
)

#[inline]
Expand All @@ -137,20 +137,20 @@ impl<'a> Iterator<char> for Chars<'a> {
fn decode_multibyte<'a>(x: u8, it: &mut slice::Items<'a, u8>) -> char {
// NOTE: Performance is very sensitive to the exact formulation here
// Decode from a byte combination out of: [[[x y] z] w]
let cont_mask = 0x3F; // continuation byte mask
let init = utf8_first_byte!(x, 2);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You say above that with 3 only needs the bottom 5 bits, but I don't see where that happens. It seems that it's used directly as is, with all 5 bits below (init << 12), and then used with only 3 bits in the 4 case (init & 7).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It will have to be commented. Initial byte in range 0xE0 .. 0xEF is the three byte case, and in that range the 5th bit is clear.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, that's cool.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe I should add my development testcase that tests the decoding of every valid char.

let y = unwrap_or_0(it.next());
let mut ch = utf8_acc_cont_byte!(init, y);
if x >= 0xE0 {
/* [[x y z] w] case */
/* [[x y z] w] case
* 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid */
let z = unwrap_or_0(it.next());

let y_z = (((y & cont_mask) as u32) << 6) | (z & cont_mask) as u32;
let y_z = utf8_acc_cont_byte!((y & CONT_MASK) as u32, z);
ch = init << 12 | y_z;
if x >= 0xF0 {
/* [x y z w] case */
/* [x y z w] case
* use only the lower 3 bits of `init` */
let w = unwrap_or_0(it.next());
ch = (init & 7) << 18 | y_z << 6 | (w & cont_mask) as u32;
ch = (init & 7) << 18 | utf8_acc_cont_byte!(y_z, w);
}
}
unsafe {
Expand Down Expand Up @@ -754,9 +754,9 @@ fn run_utf8_validation_iterator(iter: &mut slice::Items<u8>) -> bool {
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
// %xF4 %x80-8F 2( UTF8-tail )
match w {
2 => if second & 192 != TAG_CONT_U8 {err!()},
2 => if second & !CONT_MASK != TAG_CONT_U8 {err!()},
3 => {
match (first, second, next!() & 192) {
match (first, second, next!() & !CONT_MASK) {
(0xE0 , 0xA0 .. 0xBF, TAG_CONT_U8) |
(0xE1 .. 0xEC, 0x80 .. 0xBF, TAG_CONT_U8) |
(0xED , 0x80 .. 0x9F, TAG_CONT_U8) |
Expand All @@ -765,7 +765,7 @@ fn run_utf8_validation_iterator(iter: &mut slice::Items<u8>) -> bool {
}
}
4 => {
match (first, second, next!() & 192, next!() & 192) {
match (first, second, next!() & !CONT_MASK, next!() & !CONT_MASK) {
(0xF0 , 0x90 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
(0xF1 .. 0xF3, 0x80 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
(0xF4 , 0x80 .. 0x8F, TAG_CONT_U8, TAG_CONT_U8) => {}
Expand Down Expand Up @@ -962,7 +962,10 @@ pub struct CharRange {
pub next: uint,
}

static TAG_CONT_U8: u8 = 128u8;
/// Mask of the value bits of a continuation byte
static CONT_MASK: u8 = 0b0011_1111u8;
/// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte
static TAG_CONT_U8: u8 = 0b1000_0000u8;

/// Unsafe operations
pub mod raw {
Expand Down Expand Up @@ -1898,7 +1901,7 @@ impl<'a> StrSlice<'a> for &'a str {
// Multibyte case is a fn to allow char_range_at_reverse to inline cleanly
fn multibyte_char_range_at_reverse(s: &str, mut i: uint) -> CharRange {
// while there is a previous byte == 10......
while i > 0 && s.as_bytes()[i] & 192u8 == TAG_CONT_U8 {
while i > 0 && s.as_bytes()[i] & !CONT_MASK == TAG_CONT_U8 {
i -= 1u;
}

Expand Down