From a68d10e6adc048b3a5f90e376f232a39223d1db8 Mon Sep 17 00:00:00 2001 From: Huon Wilson Date: Sun, 16 Feb 2014 16:11:47 +1100 Subject: [PATCH 1/2] std::str: safen and optimize is_utf8. This uses a vector iterator to avoid the necessity for unsafe indexing, and makes this function slightly faster. Unfortunately #11751 means that the iterator comes with repeated `null` checks which means the pure-ASCII case still has room for significant improvement (and the other cases too, but it's most significant for just ASCII). Before: is_utf8_100_ascii ... bench: 143 ns/iter (+/- 6) is_utf8_100_multibyte ... bench: 134 ns/iter (+/- 4) After: is_utf8_100_ascii ... bench: 123 ns/iter (+/- 4) is_utf8_100_multibyte ... bench: 115 ns/iter (+/- 5) --- src/libstd/str.rs | 79 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 77 insertions(+), 2 deletions(-) diff --git a/src/libstd/str.rs b/src/libstd/str.rs index 0a7f513581c0d..570df45e08070 100644 --- a/src/libstd/str.rs +++ b/src/libstd/str.rs @@ -731,9 +731,84 @@ pub fn eq(a: &~str, b: &~str) -> bool { Section: Misc */ -/// Determines if a vector of bytes contains valid UTF-8 +/// Walk through `iter` checking that it's a valid UTF-8 sequence, +/// returning `true` in that case, or, if it is invalid, `false` with +/// `iter` reset such that it is pointing at the first byte in the +/// invalid sequence. +#[inline(always)] +fn run_utf8_validation_iterator(iter: &mut vec::Items) -> bool { + loop { + // save the current thing we're pointing at. + let old = *iter; + + // restore the iterator we had at the start of this codepoint. + macro_rules! err ( () => { {*iter = old; return false} }); + macro_rules! next ( () => { + match iter.next() { + Some(a) => *a, + // we needed data, but there was none: error! + None => err!() + } + }); + + let first = match iter.next() { + Some(&b) => b, + // we're at the end of the iterator and a codepoint + // boundary at the same time, so this string is valid. 
+ None => return true + }; + + // ASCII characters are always valid, so only large + // bytes need more examination. + if first >= 128 { + let w = utf8_char_width(first); + let second = next!(); + // 2-byte encoding is for codepoints \u0080 to \u07ff + // first C2 80 last DF BF + // 3-byte encoding is for codepoints \u0800 to \uffff + // first E0 A0 80 last EF BF BF + // excluding surrogates codepoints \ud800 to \udfff + // ED A0 80 to ED BF BF + // 4-byte encoding is for codepoints \u10000 to \u10ffff + // first F0 90 80 80 last F4 8F BF BF + // + // Use the UTF-8 syntax from the RFC + // + // https://tools.ietf.org/html/rfc3629 + // UTF8-1 = %x00-7F + // UTF8-2 = %xC2-DF UTF8-tail + // UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) / + // %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail ) + // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) / + // %xF4 %x80-8F 2( UTF8-tail ) + match w { + 2 => if second & 192 != TAG_CONT_U8 {err!()}, + 3 => { + match (first, second, next!() & 192) { + (0xE0 , 0xA0 .. 0xBF, TAG_CONT_U8) | + (0xE1 .. 0xEC, 0x80 .. 0xBF, TAG_CONT_U8) | + (0xED , 0x80 .. 0x9F, TAG_CONT_U8) | + (0xEE .. 0xEF, 0x80 .. 0xBF, TAG_CONT_U8) => {} + _ => err!() + } + } + 4 => { + match (first, second, next!() & 192, next!() & 192) { + (0xF0 , 0x90 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) | + (0xF1 .. 0xF3, 0x80 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) | + (0xF4 , 0x80 .. 0x8F, TAG_CONT_U8, TAG_CONT_U8) => {} + _ => err!() + } + } + _ => err!() + } + } + } +} + +/// Determines if a vector of bytes contains valid UTF-8. pub fn is_utf8(v: &[u8]) -> bool { - first_non_utf8_index(v).is_none() + run_utf8_validation_iterator(&mut v.iter()) } #[inline(always)] From a39056e614b61489a8b8afc4171586e454d4dcbd Mon Sep 17 00:00:00 2001 From: Huon Wilson Date: Sun, 16 Feb 2014 17:12:47 +1100 Subject: [PATCH 2/2] std: convert first_non_utf8_byte to use the iterator. 
This makes it very slightly faster, especially when the string is valid UTF-8, and completely removes the use of `unsafe` from the first half. Before: from_utf8_lossy_100_ascii ... bench: 151 ns/iter (+/- 17) from_utf8_lossy_100_invalid ... bench: 447 ns/iter (+/- 33) from_utf8_lossy_100_multibyte ... bench: 135 ns/iter (+/- 4) from_utf8_lossy_invalid ... bench: 124 ns/iter (+/- 10) After: from_utf8_lossy_100_ascii ... bench: 119 ns/iter (+/- 8) from_utf8_lossy_100_invalid ... bench: 454 ns/iter (+/- 16) from_utf8_lossy_100_multibyte ... bench: 116 ns/iter (+/- 9) from_utf8_lossy_invalid ... bench: 119 ns/iter (+/- 9) --- src/libstd/str.rs | 72 ++++++++--------------------------------------- 1 file changed, 11 insertions(+), 61 deletions(-) diff --git a/src/libstd/str.rs b/src/libstd/str.rs index 570df45e08070..a780a912d4df3 100644 --- a/src/libstd/str.rs +++ b/src/libstd/str.rs @@ -813,69 +813,19 @@ pub fn is_utf8(v: &[u8]) -> bool { #[inline(always)] fn first_non_utf8_index(v: &[u8]) -> Option { - let mut i = 0u; - let total = v.len(); - fn unsafe_get(xs: &[u8], i: uint) -> u8 { - unsafe { *xs.unsafe_ref(i) } - } - while i < total { - let v_i = unsafe_get(v, i); - if v_i < 128u8 { - i += 1u; - } else { - let w = utf8_char_width(v_i); - if w == 0u { return Some(i); } - - let nexti = i + w; - if nexti > total { return Some(i); } + let mut it = v.iter(); - // 2-byte encoding is for codepoints \u0080 to \u07ff - // first C2 80 last DF BF - // 3-byte encoding is for codepoints \u0800 to \uffff - // first E0 A0 80 last EF BF BF - // excluding surrogates codepoints \ud800 to \udfff - // ED A0 80 to ED BF BF - // 4-byte encoding is for codepoints \u10000 to \u10ffff - // first F0 90 80 80 last F4 8F BF BF - // - // Use the UTF-8 syntax from the RFC - // - // https://tools.ietf.org/html/rfc3629 - // UTF8-1 = %x00-7F - // UTF8-2 = %xC2-DF UTF8-tail - // UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) / - // %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail ) / - // 
UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) / - // %xF4 %x80-8F 2( UTF8-tail ) - // UTF8-tail = %x80-BF - match w { - 2 => if unsafe_get(v, i + 1) & 192u8 != TAG_CONT_U8 { - return Some(i) - }, - 3 => match (v_i, - unsafe_get(v, i + 1), - unsafe_get(v, i + 2) & 192u8) { - (0xE0 , 0xA0 .. 0xBF, TAG_CONT_U8) => (), - (0xE1 .. 0xEC, 0x80 .. 0xBF, TAG_CONT_U8) => (), - (0xED , 0x80 .. 0x9F, TAG_CONT_U8) => (), - (0xEE .. 0xEF, 0x80 .. 0xBF, TAG_CONT_U8) => (), - _ => return Some(i), - }, - _ => match (v_i, - unsafe_get(v, i + 1), - unsafe_get(v, i + 2) & 192u8, - unsafe_get(v, i + 3) & 192u8) { - (0xF0 , 0x90 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) => (), - (0xF1 .. 0xF3, 0x80 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) => (), - (0xF4 , 0x80 .. 0x8F, TAG_CONT_U8, TAG_CONT_U8) => (), - _ => return Some(i) - }, - } - - i = nexti; - } + let ok = run_utf8_validation_iterator(&mut it); + if ok { + None + } else { + // work out how many valid bytes we've consumed + // (run_utf8_validation_iterator resets the iterator to just + // after the last good byte), which we can do because the + // vector iterator size_hint is exact. + let (remaining, _) = it.size_hint(); + Some(v.len() - remaining) } - None } /// Determines if a vector of `u16` contains valid UTF-16