Skip to content

Commit c1d7a06

Browse files
committed
Faster escape routines
1 parent 5bed370 commit c1d7a06

File tree

2 files changed

+57
-3
lines changed

2 files changed

+57
-3
lines changed

Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ serde = { version = ">=1.0.100,<1.0.181", optional = true }
2121
tokio = { version = "1.10", optional = true, default-features = false, features = ["io-util"] }
2222
memchr = "2.1"
2323
arbitrary = { version = "1", features = ["derive"], optional = true }
24+
jetscii = "0.5.2"
25+
once_cell = "1.12.0"
2426

2527
[dev-dependencies]
2628
criterion = "0.4"

src/escapei.rs

Lines changed: 55 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,18 @@ use memchr::memchr2_iter;
44
use std::borrow::Cow;
55
use std::ops::Range;
66

7+
use jetscii::bytes;
8+
use memchr;
9+
use once_cell::sync::Lazy;
10+
711
#[cfg(test)]
812
use pretty_assertions::assert_eq;
913

14+
15+
static XML_ESCAPE_BYTES: Lazy<jetscii::BytesConst> =
16+
Lazy::new(|| bytes!(b'<', b'>', b'&', b'\'', b'"'));
17+
static XML_PARTIAL_ESCAPE_BYTES: Lazy<jetscii::BytesConst> = Lazy::new(|| bytes!(b'<', b'>', b'&'));
18+
1019
/// Error for XML escape / unescape.
1120
#[derive(Clone, Debug)]
1221
pub enum EscapeError {
@@ -72,7 +81,8 @@ impl std::error::Error for EscapeError {}
7281
/// | `'` | `&apos;`
7382
/// | `"` | `&quot;`
7483
pub fn escape(raw: &str) -> Cow<str> {
75-
_escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&' | b'\'' | b'\"'))
84+
// _escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&' | b'\'' | b'\"'))
85+
simd_escape(raw, &XML_ESCAPE_BYTES)
7686
}
7787

7888
/// Escapes an `&str` and replaces xml special characters (`<`, `>`, `&`)
@@ -89,9 +99,11 @@ pub fn escape(raw: &str) -> Cow<str> {
8999
/// | `>` | `&gt;`
90100
/// | `&` | `&amp;`
91101
pub fn partial_escape(raw: &str) -> Cow<str> {
92-
_escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&'))
102+
// _escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&'))
103+
simd_escape(raw, &XML_PARTIAL_ESCAPE_BYTES)
93104
}
94105

106+
95107
/// Escapes an `&str` and replaces a subset of xml special characters (`<`, `>`,
96108
/// `&`, `'`, `"`) with their corresponding xml escaped value.
97109
pub(crate) fn _escape<F: Fn(u8) -> bool>(raw: &str, escape_chars: F) -> Cow<str> {
@@ -121,7 +133,47 @@ pub(crate) fn _escape<F: Fn(u8) -> bool>(raw: &str, escape_chars: F) -> Cow<str>
121133
b'\r' => escaped.extend_from_slice(b"&#13;"),
122134
b' ' => escaped.extend_from_slice(b"&#32;"),
123135
_ => unreachable!(
124-
"Only '<', '>','\', '&', '\"', '\\t', '\\r', '\\n', and ' ' are escaped"
136+
"Only '<', '>','\', '&', '\"', '\\t', '\\r', '\\n', and ' ' are escaped"),
137+
}
138+
pos = new_pos + 1;
139+
}
140+
141+
if let Some(mut escaped) = escaped {
142+
if let Some(raw) = bytes.get(pos..) {
143+
escaped.extend_from_slice(raw);
144+
}
145+
// SAFETY: we operate on UTF-8 input and search for one-byte chars only,
146+
// so all slices that were put into `escaped` are valid UTF-8 encoded strings
147+
// TODO: Can be replaced with `unsafe { String::from_utf8_unchecked() }`
148+
// if unsafe code will be allowed
149+
Cow::Owned(String::from_utf8(escaped).unwrap())
150+
} else {
151+
Cow::Borrowed(raw)
152+
}
153+
}
154+
155+
/// Escapes a `&str` and replaces the xml special characters (<, >, &, ', ") found by `escape_matcher` with their
156+
/// corresponding xml escaped value.
157+
pub fn simd_escape<'a>(raw: &'a str, escape_matcher: &jetscii::BytesConst) -> Cow<'a, str> {
158+
let bytes = raw.as_bytes();
159+
let mut escaped = None;
160+
let mut pos = 0;
161+
while let Some(i) = escape_matcher.find(&bytes[pos..]) {
162+
if escaped.is_none() {
163+
escaped = Some(Vec::with_capacity(raw.len()));
164+
}
165+
let escaped = escaped.as_mut().expect("initialized");
166+
let new_pos = pos + i;
167+
escaped.extend_from_slice(&bytes[pos..new_pos]);
168+
match bytes[new_pos] {
169+
b'<' => escaped.extend_from_slice(b"&lt;"),
170+
b'>' => escaped.extend_from_slice(b"&gt;"),
171+
b'\'' => escaped.extend_from_slice(b"&apos;"),
172+
b'&' => escaped.extend_from_slice(b"&amp;"),
173+
b'"' => escaped.extend_from_slice(b"&quot;"),
174+
c @ _ => unreachable!(
175+
"Found {} but only '<', '>', ', '&' and '\"' are escaped",
176+
c as char
125177
),
126178
}
127179
pos = new_pos + 1;

0 commit comments

Comments
 (0)