Skip to content

Commit

Permalink
std: convert first_non_utf8_index to use the iterator.
Browse files Browse the repository at this point in the history
This makes it very slightly faster, especially when the string is valid
UTF-8, and completely removes the use of `unsafe` from the first half.

Before:

    from_utf8_lossy_100_ascii              ... bench:       151 ns/iter (+/- 17)
    from_utf8_lossy_100_invalid            ... bench:       447 ns/iter (+/- 33)
    from_utf8_lossy_100_multibyte          ... bench:       135 ns/iter (+/- 4)
    from_utf8_lossy_invalid                ... bench:       124 ns/iter (+/- 10)

After:

    from_utf8_lossy_100_ascii              ... bench:       119 ns/iter (+/- 8)
    from_utf8_lossy_100_invalid            ... bench:       454 ns/iter (+/- 16)
    from_utf8_lossy_100_multibyte          ... bench:       116 ns/iter (+/- 9)
    from_utf8_lossy_invalid                ... bench:       119 ns/iter (+/- 9)
  • Loading branch information
huonw committed Feb 18, 2014
1 parent a68d10e commit a39056e
Showing 1 changed file with 11 additions and 61 deletions.
72 changes: 11 additions & 61 deletions src/libstd/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -813,69 +813,19 @@ pub fn is_utf8(v: &[u8]) -> bool {

#[inline(always)]
fn first_non_utf8_index(v: &[u8]) -> Option<uint> {
let mut i = 0u;
let total = v.len();
fn unsafe_get(xs: &[u8], i: uint) -> u8 {
unsafe { *xs.unsafe_ref(i) }
}
while i < total {
let v_i = unsafe_get(v, i);
if v_i < 128u8 {
i += 1u;
} else {
let w = utf8_char_width(v_i);
if w == 0u { return Some(i); }

let nexti = i + w;
if nexti > total { return Some(i); }
let mut it = v.iter();

// 2-byte encoding is for codepoints \u0080 to \u07ff
// first C2 80 last DF BF
// 3-byte encoding is for codepoints \u0800 to \uffff
// first E0 A0 80 last EF BF BF
// excluding surrogates codepoints \ud800 to \udfff
// ED A0 80 to ED BF BF
// 4-byte encoding is for codepoints \u10000 to \u10ffff
// first F0 90 80 80 last F4 8F BF BF
//
// Use the UTF-8 syntax from the RFC
//
// https://tools.ietf.org/html/rfc3629
// UTF8-1 = %x00-7F
// UTF8-2 = %xC2-DF UTF8-tail
// UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
// %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
// %xF4 %x80-8F 2( UTF8-tail )
// UTF8-tail = %x80-BF
match w {
2 => if unsafe_get(v, i + 1) & 192u8 != TAG_CONT_U8 {
return Some(i)
},
3 => match (v_i,
unsafe_get(v, i + 1),
unsafe_get(v, i + 2) & 192u8) {
(0xE0 , 0xA0 .. 0xBF, TAG_CONT_U8) => (),
(0xE1 .. 0xEC, 0x80 .. 0xBF, TAG_CONT_U8) => (),
(0xED , 0x80 .. 0x9F, TAG_CONT_U8) => (),
(0xEE .. 0xEF, 0x80 .. 0xBF, TAG_CONT_U8) => (),
_ => return Some(i),
},
_ => match (v_i,
unsafe_get(v, i + 1),
unsafe_get(v, i + 2) & 192u8,
unsafe_get(v, i + 3) & 192u8) {
(0xF0 , 0x90 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) => (),
(0xF1 .. 0xF3, 0x80 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) => (),
(0xF4 , 0x80 .. 0x8F, TAG_CONT_U8, TAG_CONT_U8) => (),
_ => return Some(i)
},
}

i = nexti;
}
let ok = run_utf8_validation_iterator(&mut it);
if ok {
None
} else {
// work out how many valid bytes we've consumed
// (run_utf8_validation_iterator resets the iterator to just
// after the last good byte), which we can do because the
// vector iterator size_hint is exact.
let (remaining, _) = it.size_hint();
Some(v.len() - remaining)
}
None
}

/// Determines if a vector of `u16` contains valid UTF-16
Expand Down

0 comments on commit a39056e

Please sign in to comment.