From 86c75265bd5908d31dfd7546e0ec70feb8afa312 Mon Sep 17 00:00:00 2001 From: ltdk Date: Tue, 28 May 2024 14:27:25 -0400 Subject: [PATCH] Optimize escape_ascii --- core/src/escape.rs | 114 +++++++++++++++++++++++++++++++++++--------- core/tests/ascii.rs | 22 +++++++-- 2 files changed, 110 insertions(+), 26 deletions(-) diff --git a/core/src/escape.rs b/core/src/escape.rs index b213cc2b9167c..0685f525dca83 100644 --- a/core/src/escape.rs +++ b/core/src/escape.rs @@ -18,38 +18,106 @@ const fn backslash<const N: usize>(a: ascii::Char) -> ([ascii::Char; N], Range<u8>) { (output, 0..2) } +#[inline] +const fn hex_escape<const N: usize>(byte: u8) -> ([ascii::Char; N], Range<u8>) { + const { assert!(N >= 4) }; + + let mut output = [ascii::Char::Null; N]; + + let hi = HEX_DIGITS[(byte >> 4) as usize]; + let lo = HEX_DIGITS[(byte & 0xf) as usize]; + + output[0] = ascii::Char::ReverseSolidus; + output[1] = ascii::Char::SmallX; + output[2] = hi; + output[3] = lo; + + (output, 0..4) +} + +#[inline] +const fn verbatim<const N: usize>(a: ascii::Char) -> ([ascii::Char; N], Range<u8>) { + const { assert!(N >= 1) }; + + let mut output = [ascii::Char::Null; N]; + + output[0] = a; + + (output, 0..1) +} + /// Escapes an ASCII character. /// /// Returns a buffer and the length of the escaped representation. 
const fn escape_ascii<const N: usize>(byte: u8) -> ([ascii::Char; N], Range<u8>) { const { assert!(N >= 4) }; - match byte { - b'\t' => backslash(ascii::Char::SmallT), - b'\r' => backslash(ascii::Char::SmallR), - b'\n' => backslash(ascii::Char::SmallN), - b'\\' => backslash(ascii::Char::ReverseSolidus), - b'\'' => backslash(ascii::Char::Apostrophe), - b'\"' => backslash(ascii::Char::QuotationMark), - byte => { - let mut output = [ascii::Char::Null; N]; - - if let Some(c) = byte.as_ascii() - && !byte.is_ascii_control() - { - output[0] = c; - (output, 0..1) - } else { - let hi = HEX_DIGITS[(byte >> 4) as usize]; - let lo = HEX_DIGITS[(byte & 0xf) as usize]; + #[cfg(feature = "optimize_for_size")] + { + match byte { + b'\t' => backslash(ascii::Char::SmallT), + b'\r' => backslash(ascii::Char::SmallR), + b'\n' => backslash(ascii::Char::SmallN), + b'\\' => backslash(ascii::Char::ReverseSolidus), + b'\'' => backslash(ascii::Char::Apostrophe), + b'"' => backslash(ascii::Char::QuotationMark), + 0x00..=0x1F | 0x7F => hex_escape(byte), + _ => match ascii::Char::from_u8(byte) { + Some(a) => verbatim(a), + None => hex_escape(byte), + }, + } + } + + #[cfg(not(feature = "optimize_for_size"))] + { + /// Lookup table helps us determine how to display character. + /// + /// Since ASCII characters will always be 7 bits, we can exploit this to store the 8th bit to + /// indicate whether the result is escaped or unescaped. + /// + /// We additionally use 0x80 (escaped NUL character) to indicate hex-escaped bytes, since + /// escaped NUL will not occur. 
+ const LOOKUP: [u8; 256] = { + let mut arr = [0; 256]; + let mut idx = 0; + while idx <= 255 { + arr[idx] = match idx as u8 { + // use 8th bit to indicate escaped + b'\t' => 0x80 | b't', + b'\r' => 0x80 | b'r', + b'\n' => 0x80 | b'n', + b'\\' => 0x80 | b'\\', + b'\'' => 0x80 | b'\'', + b'"' => 0x80 | b'"', + + // use NUL to indicate hex-escaped + 0x00..=0x1F | 0x7F..=0xFF => 0x80 | b'\0', + + idx => idx, + }; + idx += 1; + } + arr + }; - output[0] = ascii::Char::ReverseSolidus; - output[1] = ascii::Char::SmallX; - output[2] = hi; - output[3] = lo; + let lookup = LOOKUP[byte as usize]; - (output, 0..4) + // 8th bit indicates escape + let lookup_escaped = lookup & 0x80 != 0; + + // SAFETY: We explicitly mask out the eighth bit to get a 7-bit ASCII character. + let lookup_ascii = unsafe { ascii::Char::from_u8_unchecked(lookup & 0x7F) }; + + if lookup_escaped { + // NUL indicates hex-escaped + if matches!(lookup_ascii, ascii::Char::Null) { + hex_escape(byte) + } else { + backslash(lookup_ascii) } + } else { + verbatim(lookup_ascii) } } } diff --git a/core/tests/ascii.rs b/core/tests/ascii.rs index 3d3f8ac10c603..ce09ee507f11f 100644 --- a/core/tests/ascii.rs +++ b/core/tests/ascii.rs @@ -481,9 +481,25 @@ fn ascii_ctype_const() { } #[test] -fn test_ascii_display() { - assert_eq!(b"foo'bar".escape_ascii().to_string(), r#"foo\'bar"#); - assert_eq!(b"\0\xff".escape_ascii().to_string(), r#"\x00\xff"#); +fn test_escape_ascii() { + let mut buf = [0u8; 0x1F + 7]; // 0..=0x1F plus two quotes, slash, \x7F, \x80, \xFF + for idx in 0..=0x1F { + buf[idx] = idx as u8; + } + buf[0x20] = b'\''; + buf[0x21] = b'"'; + buf[0x22] = b'\\'; + buf[0x23] = 0x7F; + buf[0x24] = 0x80; + buf[0x25] = 0xff; + assert_eq!( + buf.escape_ascii().to_string(), + r#"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\'\"\\\x7f\x80\xff"# + ); +} + +#[test] +fn test_escape_ascii_iter() { let mut it = 
b"\0fastpath\xffremainder\xff".escape_ascii(); let _ = it.advance_by(4); let _ = it.advance_back_by(4);