diff --git a/src/lib.rs b/src/lib.rs index 64ace00c8..97f3265be 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -340,6 +340,7 @@ clippy::wildcard_imports, // things are often more readable this way clippy::cast_lossless, + clippy::items_after_statements, clippy::module_name_repetitions, clippy::redundant_else, clippy::shadow_unrelated, diff --git a/src/read.rs b/src/read.rs index e03e13f28..1494ec6d2 100644 --- a/src/read.rs +++ b/src/read.rs @@ -2,6 +2,7 @@ use crate::error::{Error, ErrorCode, Result}; use alloc::vec::Vec; use core::char; use core::cmp; +use core::mem; use core::ops::Deref; use core::str; @@ -221,7 +222,7 @@ where { loop { let ch = tri!(next_or_eof(self)); - if !ESCAPE[ch as usize] { + if !is_escape(ch, true) { scratch.push(ch); continue; } @@ -342,7 +343,7 @@ where fn ignore_str(&mut self) -> Result<()> { loop { let ch = tri!(next_or_eof(self)); - if !ESCAPE[ch as usize] { + if !is_escape(ch, true) { continue; } match ch { @@ -425,6 +426,65 @@ impl<'a> SliceRead<'a> { } } + fn skip_to_escape(&mut self, forbid_control_characters: bool) { + // Immediately bail-out on empty strings and consecutive escapes (e.g. \u041b\u0435) + if self.index == self.slice.len() + || is_escape(self.slice[self.index], forbid_control_characters) + { + return; + } + self.index += 1; + + let rest = &self.slice[self.index..]; + + if !forbid_control_characters { + self.index += memchr::memchr2(b'"', b'\\', rest).unwrap_or(rest.len()); + return; + } + + // We wish to find the first byte in range 0x00..=0x1F or " or \. Ideally, we'd use + // something akin to memchr3, but the memchr crate does not support this at the moment. + // Therefore, we use a variation on Mycroft's algorithm [1] to provide performance better + // than a naive loop. It runs faster than equivalent two-pass memchr2+SWAR code on + // benchmarks and it's cross-platform, so probably the right fit. 
+ // [1]: https://groups.google.com/forum/#!original/comp.lang.c/2HtQXvg7iKc/xOJeipH6KLMJ + type Chunk = usize; + const STEP: usize = mem::size_of::(); + const ONE_BYTES: Chunk = Chunk::MAX / 255; // 0x0101...01 + + for chunk in rest.chunks_exact(STEP) { + let chars = Chunk::from_ne_bytes(chunk.try_into().unwrap()); + let contains_ctrl = chars.wrapping_sub(ONE_BYTES * 0x20) & !chars; + let chars_quote = chars ^ (ONE_BYTES * Chunk::from(b'"')); + let contains_quote = chars_quote.wrapping_sub(ONE_BYTES) & !chars_quote; + let chars_backslash = chars ^ (ONE_BYTES * Chunk::from(b'\\')); + let contains_backslash = chars_backslash.wrapping_sub(ONE_BYTES) & !chars_backslash; + let masked = (contains_ctrl | contains_quote | contains_backslash) & (ONE_BYTES << 7); + if masked != 0 { + let addresswise_first_bit = if cfg!(target_endian = "little") { + masked.trailing_zeros() + } else { + masked.leading_zeros() + }; + // SAFETY: chunk is in-bounds for slice + self.index = unsafe { chunk.as_ptr().offset_from(self.slice.as_ptr()) } as usize + + addresswise_first_bit as usize / 8; + return; + } + } + + self.index += rest.len() / STEP * STEP; + self.skip_to_escape_slow(); + } + + #[cold] + #[inline(never)] + fn skip_to_escape_slow(&mut self) { + while self.index < self.slice.len() && !is_escape(self.slice[self.index], true) { + self.index += 1; + } + } + /// The big optimization here over IoRead is that if the string contains no /// backslash escape sequences, the returned &str is a slice of the raw JSON /// data so we avoid copying into the scratch space. 
@@ -442,9 +502,7 @@ impl<'a> SliceRead<'a> { let mut start = self.index; loop { - while self.index < self.slice.len() && !ESCAPE[self.slice[self.index] as usize] { - self.index += 1; - } + self.skip_to_escape(validate); if self.index == self.slice.len() { return error(self, ErrorCode::EofWhileParsingString); } @@ -470,9 +528,7 @@ impl<'a> SliceRead<'a> { } _ => { self.index += 1; - if validate { - return error(self, ErrorCode::ControlCharacterWhileParsingString); - } + return error(self, ErrorCode::ControlCharacterWhileParsingString); } } } @@ -538,9 +594,7 @@ impl<'a> Read<'a> for SliceRead<'a> { fn ignore_str(&mut self) -> Result<()> { loop { - while self.index < self.slice.len() && !ESCAPE[self.slice[self.index] as usize] { - self.index += 1; - } + self.skip_to_escape(true); if self.index == self.slice.len() { return error(self, ErrorCode::EofWhileParsingString); } @@ -779,33 +833,9 @@ pub trait Fused: private::Sealed {} impl<'a> Fused for SliceRead<'a> {} impl<'a> Fused for StrRead<'a> {} -// Lookup table of bytes that must be escaped. A value of true at index i means -// that byte i requires an escape sequence in the input. 
/// Reports whether `ch` interrupts a run of plain string bytes.
///
/// The quote (`"`) and backslash (`\`) bytes always do. Bytes below 0x20 do
/// so only when `including_control_characters` is `true`.
fn is_escape(ch: u8, including_control_characters: bool) -> bool {
    match ch {
        b'"' | b'\\' => true,
        0x00..=0x1F => including_control_characters,
        _ => false,
    }
}
&format!("\"{}\n{}\"", ".".repeat(n), ".".repeat(m)), + "control character (\\u0000-\\u001F) found while parsing a string at line 2 column 0", + )]); + } + } + + // Multiple occurrences + test_parse_err::(&[( + &"\"\t\n\r\"", + "control character (\\u0000-\\u001F) found while parsing a string at line 1 column 2", + )]); +}