Skip to content

Commit 34c2f72

Browse files
committed
More correctly normalize attribute values
1 parent add7406 commit 34c2f72

File tree

7 files changed

+308
-12
lines changed

7 files changed

+308
-12
lines changed

Changelog.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,10 @@
1414

1515
- [#541]: Deserialize specially named `$text` enum variant in [externally tagged]
1616
enums from textual content
17+
- [#379]: Improved compliance with the XML attribute value normalization process by
18+
adding `Attribute::normalized_value()` and `Attribute::normalized_value_with()`,
19+
which ought to be used in place of `Attribute::unescape_value()` and
20+
`Attribute::unescape_value_with()`
1721

1822
### Bug Fixes
1923

@@ -22,10 +26,15 @@
2226

2327
### Misc Changes
2428

29+
### New Tests
30+
31+
- [#379]: Added tests for attribute value normalization
32+
2533
[externally tagged]: https://serde.rs/enum-representations.html#externally-tagged
2634
[#490]: https://github.com/tafia/quick-xml/pull/490
2735
[#537]: https://github.com/tafia/quick-xml/issues/537
2836
[#541]: https://github.com/tafia/quick-xml/pull/541
37+
[#379]: https://github.com/tafia/quick-xml/pull/379
2938

3039
## 0.27.1 -- 2022-12-28
3140

@@ -120,6 +129,8 @@
120129
- [#489]: Reduced the size of the package uploaded into the crates.io by excluding
121130
tests, examples, and benchmarks.
122131

132+
### New Tests
133+
123134
[#481]: https://github.com/tafia/quick-xml/pull/481
124135
[#489]: https://github.com/tafia/quick-xml/pull/489
125136

benches/macrobenches.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,14 +43,13 @@ static INPUTS: &[(&str, &str)] = &[
4343
("players.xml", PLAYERS),
4444
];
4545

46-
// TODO: use fully normalized attribute values
4746
fn parse_document_from_str(doc: &str) -> XmlResult<()> {
4847
let mut r = Reader::from_str(doc);
4948
loop {
5049
match criterion::black_box(r.read_event()?) {
5150
Event::Start(e) | Event::Empty(e) => {
5251
for attr in e.attributes() {
53-
criterion::black_box(attr?.decode_and_unescape_value(&r)?);
52+
criterion::black_box(attr?.decode_and_normalize_value(&r)?);
5453
}
5554
}
5655
Event::Text(e) => {

benches/microbenches.rs

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,50 @@ fn attributes(c: &mut Criterion) {
242242
assert_eq!(count, 150);
243243
})
244244
});
245+
246+
group.finish();
247+
}
248+
249+
/// Benchmarks normalizing attribute values
250+
fn attribute_value_normalization(c: &mut Criterion) {
251+
let mut group = c.benchmark_group("attribute_value_normalization");
252+
253+
group.bench_function("noop_short", |b| {
254+
b.iter(|| {
255+
criterion::black_box(unescape("foobar")).unwrap();
256+
})
257+
});
258+
259+
group.bench_function("noop_long", |b| {
260+
b.iter(|| {
261+
criterion::black_box(unescape("just a bit of text without any entities")).unwrap();
262+
})
263+
});
264+
265+
group.bench_function("replacement_chars", |b| {
266+
b.iter(|| {
267+
criterion::black_box(unescape("just a bit\n of text without\tany entities")).unwrap();
268+
})
269+
});
270+
271+
group.bench_function("char_reference", |b| {
272+
b.iter(|| {
273+
let text = "prefix &#34;some stuff&#34;,&#x22;more stuff&#x22;";
274+
criterion::black_box(unescape(text)).unwrap();
275+
let text = "&#38;&#60;";
276+
criterion::black_box(unescape(text)).unwrap();
277+
})
278+
});
279+
280+
group.bench_function("entity_reference", |b| {
281+
b.iter(|| {
282+
let text = "age &gt; 72 &amp;&amp; age &lt; 21";
283+
criterion::black_box(unescape(text)).unwrap();
284+
let text = "&quot;what&apos;s that?&quot;";
285+
criterion::black_box(unescape(text)).unwrap();
286+
})
287+
});
288+
245289
group.finish();
246290
}
247291

@@ -354,6 +398,7 @@ criterion_group!(
354398
read_resolved_event_into,
355399
one_event,
356400
attributes,
401+
attribute_value_normalization,
357402
escaping,
358403
unescaping,
359404
);

src/errors.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ impl From<EscapeError> for Error {
7777
}
7878

7979
impl From<AttrError> for Error {
80+
/// Creates a new `Error::InvalidAttr` from the given error
8081
#[inline]
8182
fn from(error: AttrError) -> Self {
8283
Error::InvalidAttr(error)

src/escapei.rs

Lines changed: 64 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ use std::ops::Range;
88
use pretty_assertions::assert_eq;
99

1010
/// Error for XML escape / unescape.
11-
#[derive(Clone, Debug)]
11+
#[derive(Clone, Debug, PartialEq)]
1212
pub enum EscapeError {
1313
/// Entity with Null character
1414
EntityWithNull(Range<usize>),
@@ -212,8 +212,66 @@ where
212212
}
213213
}
214214

215+
/// Returns the attribute value normalized as per the XML specification, using a custom
216+
/// entity resolver.
217+
///
218+
/// https://www.w3.org/TR/xml/#AVNormalize
219+
///
220+
/// Do not use this method with HTML attributes.
221+
///
222+
/// Escape sequences such as `&gt;` are replaced with their unescaped equivalents such as `>`
223+
/// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters. A function
224+
/// for resolving entities can be provided as `resolve_entity`. Builtin entities will still
225+
/// take precedence.
226+
///
227+
/// This will allocate unless the raw attribute value does not require normalization.
228+
pub(crate) fn normalize_attribute_value_with<'a, 'entity>(
229+
value: &'a str,
230+
resolve_entity: impl Fn(&str) -> Option<&'entity str>,
231+
) -> Result<Cow<'a, str>, EscapeError> {
232+
// TODO: avoid allocation when not needed
233+
let mut normalized = String::with_capacity(value.len());
234+
235+
let attr = value.as_bytes();
236+
let mut attr_iter = attr.iter().enumerate();
237+
238+
while let Some((idx, ch)) = attr_iter.next() {
239+
match ch {
240+
b' ' | b'\n' | b'\r' | b'\t' => normalized.push(' '),
241+
b'&' => {
242+
let end = idx
243+
+ 1
244+
+ attr_iter
245+
.position(|(_, c)| *c == b';')
246+
.ok_or_else(|| EscapeError::UnterminatedEntity(idx..attr.len()))?;
247+
let entity = &attr[idx + 1..end]; // starts after the &
248+
let entity_str = std::str::from_utf8(entity).expect("failed UTF-8 check");
249+
250+
if entity.starts_with(b"#") {
251+
let entity = &entity_str[1..]; // starts after the #
252+
let codepoint = parse_number(entity, idx..end)?;
253+
normalized.push_str(codepoint.encode_utf8(&mut [0u8; 4]));
254+
} else if let Some(s) = named_entity(entity_str) {
255+
normalized.push_str(s);
256+
} else if let Some(value) = resolve_entity(entity_str) {
257+
// TODO: recursively apply entity substitution
258+
normalized.push_str(&value);
259+
} else {
260+
return Err(EscapeError::UnrecognizedSymbol(
261+
idx + 1..end,
262+
String::from_utf8(entity.to_vec()).expect("failed UTF-8 check"),
263+
));
264+
}
265+
}
266+
_ => normalized.push(*ch as char),
267+
}
268+
}
269+
270+
Ok(Cow::Owned(normalized))
271+
}
272+
215273
#[cfg(not(feature = "escape-html"))]
216-
fn named_entity(name: &str) -> Option<&str> {
274+
pub(crate) fn named_entity(name: &str) -> Option<&str> {
217275
// match over strings are not allowed in const functions
218276
let s = match name.as_bytes() {
219277
b"lt" => "<",
@@ -226,9 +284,8 @@ fn named_entity(name: &str) -> Option<&str> {
226284
Some(s)
227285
}
228286
#[cfg(feature = "escape-html")]
229-
fn named_entity(name: &str) -> Option<&str> {
287+
pub(crate) fn named_entity(name: &str) -> Option<&str> {
230288
// imported from https://dev.w3.org/html5/html-author/charref
231-
// match over strings are not allowed in const functions
232289
//TODO: automate up-to-dating using https://html.spec.whatwg.org/entities.json
233290
let s = match name.as_bytes() {
234291
b"Tab" => "\u{09}",
@@ -1690,7 +1747,7 @@ fn named_entity(name: &str) -> Option<&str> {
16901747
Some(s)
16911748
}
16921749

1693-
fn parse_number(bytes: &str, range: Range<usize>) -> Result<char, EscapeError> {
1750+
pub fn parse_number(bytes: &str, range: Range<usize>) -> Result<char, EscapeError> {
16941751
let code = if bytes.starts_with('x') {
16951752
parse_hexadecimal(&bytes[1..])
16961753
} else {
@@ -1705,7 +1762,7 @@ fn parse_number(bytes: &str, range: Range<usize>) -> Result<char, EscapeError> {
17051762
}
17061763
}
17071764

1708-
fn parse_hexadecimal(bytes: &str) -> Result<u32, EscapeError> {
1765+
pub fn parse_hexadecimal(bytes: &str) -> Result<u32, EscapeError> {
17091766
// maximum code is 0x10FFFF => 6 characters
17101767
if bytes.len() > 6 {
17111768
return Err(EscapeError::TooLongHexadecimal);
@@ -1723,7 +1780,7 @@ fn parse_hexadecimal(bytes: &str) -> Result<u32, EscapeError> {
17231780
Ok(code)
17241781
}
17251782

1726-
fn parse_decimal(bytes: &str) -> Result<u32, EscapeError> {
1783+
pub fn parse_decimal(bytes: &str) -> Result<u32, EscapeError> {
17271784
// maximum code is 0x10FFFF = 1114111 => 7 characters
17281785
if bytes.len() > 7 {
17291786
return Err(EscapeError::TooLongDecimal);

0 commit comments

Comments
 (0)