Skip to content

Commit 9a72c7b

Browse files
authored
Merge pull request #771 from Mingun/escape-fixes
Fixes in unescape routine
2 parents 80f0e7c + 0315ed0 commit 9a72c7b

File tree

8 files changed

+348
-196
lines changed

8 files changed

+348
-196
lines changed

Changelog.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,24 @@
2222
- [#773]: Fixed reporting incorrect end position in `Reader::read_to_end` family
2323
of methods and trimming of the trailing spaces in `Reader::read_text` when
2424
`trim_text_start` is set and the last event is not a `Text` event.
25+
- [#771]: Character references now allow any number of leading zeroes, as they should.
26+
As a result, the following variants of `quick_xml::escape::EscapeError` are removed:
27+
- `TooLongDecimal`
28+
- `TooLongHexadecimal`
29+
- [#771]: Fixed `Attribute::unescape_value`, which has not unescaped predefined values since 0.32.0.
2530

2631
### Misc Changes
2732

33+
- [#771]: `EscapeError::UnrecognizedSymbol` renamed to `EscapeError::UnrecognizedEntity`.
34+
- [#771]: Implemented `PartialEq` for `EscapeError`.
35+
- [#771]: Replaced the following variants of `EscapeError` by the `InvalidCharRef` variant
36+
with a new `ParseCharRefError` inside:
37+
- `EntityWithNull`
38+
- `InvalidDecimal`
39+
- `InvalidHexadecimal`
40+
- `InvalidCodepoint`
41+
42+
[#771]: https://github.com/tafia/quick-xml/pull/771
2843
[#772]: https://github.com/tafia/quick-xml/pull/772
2944
[#773]: https://github.com/tafia/quick-xml/pull/773
3045

src/de/mod.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2135,9 +2135,9 @@ struct XmlReader<'i, R: XmlRead<'i>, E: EntityResolver = PredefinedEntityResolve
21352135
lookahead: Result<PayloadEvent<'i>, DeError>,
21362136

21372137
/// Used to resolve unknown entities that would otherwise cause the parser
2138-
/// to return an [`EscapeError::UnrecognizedSymbol`] error.
2138+
/// to return an [`EscapeError::UnrecognizedEntity`] error.
21392139
///
2140-
/// [`EscapeError::UnrecognizedSymbol`]: crate::escape::EscapeError::UnrecognizedSymbol
2140+
/// [`EscapeError::UnrecognizedEntity`]: crate::escape::EscapeError::UnrecognizedEntity
21412141
entity_resolver: E,
21422142
}
21432143

src/de/resolver.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,10 +81,10 @@ pub trait EntityResolver {
8181
/// Called when an entity needs to be resolved.
8282
///
8383
/// `None` is returned if a suitable value can not be found.
84-
/// In that case an [`EscapeError::UnrecognizedSymbol`] will be returned by
84+
/// In that case an [`EscapeError::UnrecognizedEntity`] will be returned by
8585
/// a deserializer.
8686
///
87-
/// [`EscapeError::UnrecognizedSymbol`]: crate::escape::EscapeError::UnrecognizedSymbol
87+
/// [`EscapeError::UnrecognizedEntity`]: crate::escape::EscapeError::UnrecognizedEntity
8888
fn resolve(&self, entity: &str) -> Option<&str>;
8989
}
9090

src/encoding.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,15 @@ impl Decoder {
9999

100100
Ok(())
101101
}
102+
103+
/// Decodes the `Cow` buffer, preserving the lifetime
104+
pub(crate) fn decode_cow<'b>(&self, bytes: &Cow<'b, [u8]>) -> Result<Cow<'b, str>> {
105+
match bytes {
106+
Cow::Borrowed(bytes) => self.decode(bytes),
107+
// Convert to owned, because otherwise Cow will be bound with wrong lifetime
108+
Cow::Owned(bytes) => Ok(self.decode(bytes)?.into_owned().into()),
109+
}
110+
}
102111
}
103112

104113
/// Decodes the provided bytes using the specified encoding.

src/escape.rs

Lines changed: 70 additions & 164 deletions
Original file line numberDiff line numberDiff line change
@@ -2,62 +2,82 @@
22
33
use memchr::memchr2_iter;
44
use std::borrow::Cow;
5+
use std::num::ParseIntError;
56
use std::ops::Range;
67

7-
#[cfg(test)]
8-
use pretty_assertions::assert_eq;
8+
/// Error of parsing character reference (`&#<dec-number>;` or `&#x<hex-number>;`).
9+
#[derive(Clone, Debug, PartialEq)]
10+
pub enum ParseCharRefError {
11+
/// Number contains sign character (`+` or `-`) which is not allowed.
12+
UnexpectedSign,
13+
/// Number cannot be parsed due to non-number characters or a numeric overflow.
14+
InvalidNumber(ParseIntError),
15+
/// Character reference does not represent a valid Unicode codepoint.
16+
InvalidCodepoint(u32),
17+
/// Character reference expanded to a character that is not permitted in XML.
18+
///
19+
/// Currently, only `0x0` character produces this error.
20+
IllegalCharacter(u32),
21+
}
22+
23+
impl std::fmt::Display for ParseCharRefError {
24+
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
25+
match self {
26+
Self::UnexpectedSign => f.write_str("unexpected number sign"),
27+
Self::InvalidNumber(e) => e.fmt(f),
28+
Self::InvalidCodepoint(n) => write!(f, "`{}` is not a valid codepoint", n),
29+
Self::IllegalCharacter(n) => write!(f, "0x{:x} character is not permitted in XML", n),
30+
}
31+
}
32+
}
33+
34+
impl std::error::Error for ParseCharRefError {
35+
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
36+
match self {
37+
Self::InvalidNumber(e) => Some(e),
38+
_ => None,
39+
}
40+
}
41+
}
942

1043
/// Error for XML escape / unescape.
11-
#[derive(Clone, Debug)]
44+
#[derive(Clone, Debug, PartialEq)]
1245
pub enum EscapeError {
13-
/// Entity with Null character
14-
EntityWithNull(Range<usize>),
15-
/// Unrecognized escape symbol
16-
UnrecognizedSymbol(Range<usize>, String),
46+
/// Referenced entity is unknown to the parser.
47+
UnrecognizedEntity(Range<usize>, String),
1748
/// Cannot find `;` after `&`
1849
UnterminatedEntity(Range<usize>),
19-
/// Cannot convert Hexa to utf8
20-
TooLongHexadecimal,
21-
/// Character is not a valid hexadecimal value
22-
InvalidHexadecimal(char),
23-
/// Cannot convert decimal to hexa
24-
TooLongDecimal,
25-
/// Character is not a valid decimal value
26-
InvalidDecimal(char),
27-
/// Not a valid unicode codepoint
28-
InvalidCodepoint(u32),
50+
/// Attempt to parse character reference (`&#<dec-number>;` or `&#x<hex-number>;`)
51+
/// was unsuccessful, not all characters are decimal or hexadecimal numbers.
52+
InvalidCharRef(ParseCharRefError),
2953
}
3054

3155
impl std::fmt::Display for EscapeError {
3256
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
3357
match self {
34-
EscapeError::EntityWithNull(e) => write!(
35-
f,
36-
"Error while escaping character at range {:?}: Null character entity not allowed",
37-
e
38-
),
39-
EscapeError::UnrecognizedSymbol(rge, res) => write!(
40-
f,
41-
"Error while escaping character at range {:?}: Unrecognized escape symbol: {:?}",
42-
rge, res
43-
),
58+
EscapeError::UnrecognizedEntity(rge, res) => {
59+
write!(f, "at {:?}: unrecognized entity `{}`", rge, res)
60+
}
4461
EscapeError::UnterminatedEntity(e) => write!(
4562
f,
4663
"Error while escaping character at range {:?}: Cannot find ';' after '&'",
4764
e
4865
),
49-
EscapeError::TooLongHexadecimal => write!(f, "Cannot convert hexadecimal to utf8"),
50-
EscapeError::InvalidHexadecimal(e) => {
51-
write!(f, "'{}' is not a valid hexadecimal character", e)
66+
EscapeError::InvalidCharRef(e) => {
67+
write!(f, "invalid character reference: {}", e)
5268
}
53-
EscapeError::TooLongDecimal => write!(f, "Cannot convert decimal to utf8"),
54-
EscapeError::InvalidDecimal(e) => write!(f, "'{}' is not a valid decimal character", e),
55-
EscapeError::InvalidCodepoint(n) => write!(f, "'{}' is not a valid codepoint", n),
5669
}
5770
}
5871
}
5972

60-
impl std::error::Error for EscapeError {}
73+
impl std::error::Error for EscapeError {
74+
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
75+
match self {
76+
Self::InvalidCharRef(e) => Some(e),
77+
_ => None,
78+
}
79+
}
80+
}
6181

6282
/// Escapes an `&str` and replaces all xml special characters (`<`, `>`, `&`, `'`, `"`)
6383
/// with their corresponding xml escaped value.
@@ -251,12 +271,12 @@ where
251271
// search for character correctness
252272
let pat = &raw[start + 1..end];
253273
if let Some(entity) = pat.strip_prefix('#') {
254-
let codepoint = parse_number(entity, start..end)?;
274+
let codepoint = parse_number(entity).map_err(EscapeError::InvalidCharRef)?;
255275
unescaped.push_str(codepoint.encode_utf8(&mut [0u8; 4]));
256276
} else if let Some(value) = resolve_entity(pat) {
257277
unescaped.push_str(value);
258278
} else {
259-
return Err(EscapeError::UnrecognizedSymbol(
279+
return Err(EscapeError::UnrecognizedEntity(
260280
start + 1..end,
261281
pat.to_string(),
262282
));
@@ -1796,141 +1816,27 @@ pub const fn resolve_html5_entity(entity: &str) -> Option<&'static str> {
17961816
Some(s)
17971817
}
17981818

1799-
fn parse_number(bytes: &str, range: Range<usize>) -> Result<char, EscapeError> {
1800-
let code = if let Some(hex_digits) = bytes.strip_prefix('x') {
1801-
parse_hexadecimal(hex_digits)
1819+
fn parse_number(num: &str) -> Result<char, ParseCharRefError> {
1820+
let code = if let Some(hex) = num.strip_prefix('x') {
1821+
from_str_radix(hex, 16)?
18021822
} else {
1803-
parse_decimal(bytes)
1804-
}?;
1823+
from_str_radix(num, 10)?
1824+
};
18051825
if code == 0 {
1806-
return Err(EscapeError::EntityWithNull(range));
1826+
return Err(ParseCharRefError::IllegalCharacter(code));
18071827
}
18081828
match std::char::from_u32(code) {
18091829
Some(c) => Ok(c),
1810-
None => Err(EscapeError::InvalidCodepoint(code)),
1811-
}
1812-
}
1813-
1814-
fn parse_hexadecimal(bytes: &str) -> Result<u32, EscapeError> {
1815-
// maximum code is 0x10FFFF => 6 characters
1816-
if bytes.len() > 6 {
1817-
return Err(EscapeError::TooLongHexadecimal);
1818-
}
1819-
let mut code = 0;
1820-
for b in bytes.bytes() {
1821-
code <<= 4;
1822-
code += match b {
1823-
b'0'..=b'9' => b - b'0',
1824-
b'a'..=b'f' => b - b'a' + 10,
1825-
b'A'..=b'F' => b - b'A' + 10,
1826-
b => return Err(EscapeError::InvalidHexadecimal(b as char)),
1827-
} as u32;
1830+
None => Err(ParseCharRefError::InvalidCodepoint(code)),
18281831
}
1829-
Ok(code)
18301832
}
18311833

1832-
fn parse_decimal(bytes: &str) -> Result<u32, EscapeError> {
1833-
// maximum code is 0x10FFFF = 1114111 => 7 characters
1834-
if bytes.len() > 7 {
1835-
return Err(EscapeError::TooLongDecimal);
1836-
}
1837-
let mut code = 0;
1838-
for b in bytes.bytes() {
1839-
code *= 10;
1840-
code += match b {
1841-
b'0'..=b'9' => b - b'0',
1842-
b => return Err(EscapeError::InvalidDecimal(b as char)),
1843-
} as u32;
1834+
#[inline]
1835+
fn from_str_radix(src: &str, radix: u32) -> Result<u32, ParseCharRefError> {
1836+
match src.as_bytes().first().copied() {
1837+
// We should not allow signed numbers, but u32::from_str_radix will accept `+`.
1838+
// We also handle `-` to be consistent in returned errors
1839+
Some(b'+') | Some(b'-') => Err(ParseCharRefError::UnexpectedSign),
1840+
_ => u32::from_str_radix(src, radix).map_err(ParseCharRefError::InvalidNumber),
18441841
}
1845-
Ok(code)
1846-
}
1847-
1848-
#[test]
1849-
fn test_unescape() {
1850-
let unchanged = unescape("test").unwrap();
1851-
// assert_eq does not check that Cow is borrowed, but we explicitly use Cow
1852-
// because it influences diff
1853-
// TODO: use assert_matches! when stabilized and other features will bump MSRV
1854-
assert_eq!(unchanged, Cow::Borrowed("test"));
1855-
assert!(matches!(unchanged, Cow::Borrowed(_)));
1856-
1857-
assert_eq!(
1858-
unescape("&lt;&amp;test&apos;&quot;&gt;").unwrap(),
1859-
"<&test'\">"
1860-
);
1861-
assert_eq!(unescape("&#x30;").unwrap(), "0");
1862-
assert_eq!(unescape("&#48;").unwrap(), "0");
1863-
assert!(unescape("&foo;").is_err());
1864-
}
1865-
1866-
#[test]
1867-
fn test_unescape_with() {
1868-
let custom_entities = |ent: &str| match ent {
1869-
"foo" => Some("BAR"),
1870-
_ => None,
1871-
};
1872-
1873-
let unchanged = unescape_with("test", custom_entities).unwrap();
1874-
// assert_eq does not check that Cow is borrowed, but we explicitly use Cow
1875-
// because it influences diff
1876-
// TODO: use assert_matches! when stabilized and other features will bump MSRV
1877-
assert_eq!(unchanged, Cow::Borrowed("test"));
1878-
assert!(matches!(unchanged, Cow::Borrowed(_)));
1879-
1880-
assert!(unescape_with("&lt;", custom_entities).is_err());
1881-
assert_eq!(unescape_with("&#x30;", custom_entities).unwrap(), "0");
1882-
assert_eq!(unescape_with("&#48;", custom_entities).unwrap(), "0");
1883-
assert_eq!(unescape_with("&foo;", custom_entities).unwrap(), "BAR");
1884-
assert!(unescape_with("&fop;", custom_entities).is_err());
1885-
}
1886-
1887-
#[test]
1888-
fn test_escape() {
1889-
let unchanged = escape("test");
1890-
// assert_eq does not check that Cow is borrowed, but we explicitly use Cow
1891-
// because it influences diff
1892-
// TODO: use assert_matches! when stabilized and other features will bump MSRV
1893-
assert_eq!(unchanged, Cow::Borrowed("test"));
1894-
assert!(matches!(unchanged, Cow::Borrowed(_)));
1895-
1896-
assert_eq!(escape("<&\"'>"), "&lt;&amp;&quot;&apos;&gt;");
1897-
assert_eq!(escape("<test>"), "&lt;test&gt;");
1898-
assert_eq!(escape("\"a\"bc"), "&quot;a&quot;bc");
1899-
assert_eq!(escape("\"a\"b&c"), "&quot;a&quot;b&amp;c");
1900-
assert_eq!(
1901-
escape("prefix_\"a\"b&<>c"),
1902-
"prefix_&quot;a&quot;b&amp;&lt;&gt;c"
1903-
);
1904-
}
1905-
1906-
#[test]
1907-
fn test_partial_escape() {
1908-
let unchanged = partial_escape("test");
1909-
// assert_eq does not check that Cow is borrowed, but we explicitly use Cow
1910-
// because it influences diff
1911-
// TODO: use assert_matches! when stabilized and other features will bump MSRV
1912-
assert_eq!(unchanged, Cow::Borrowed("test"));
1913-
assert!(matches!(unchanged, Cow::Borrowed(_)));
1914-
1915-
assert_eq!(partial_escape("<&\"'>"), "&lt;&amp;\"'&gt;");
1916-
assert_eq!(partial_escape("<test>"), "&lt;test&gt;");
1917-
assert_eq!(partial_escape("\"a\"bc"), "\"a\"bc");
1918-
assert_eq!(partial_escape("\"a\"b&c"), "\"a\"b&amp;c");
1919-
assert_eq!(
1920-
partial_escape("prefix_\"a\"b&<>c"),
1921-
"prefix_\"a\"b&amp;&lt;&gt;c"
1922-
);
1923-
}
1924-
1925-
#[test]
1926-
fn test_minimal_escape() {
1927-
assert_eq!(minimal_escape("test"), Cow::Borrowed("test"));
1928-
assert_eq!(minimal_escape("<&\"'>"), "&lt;&amp;\"'>");
1929-
assert_eq!(minimal_escape("<test>"), "&lt;test>");
1930-
assert_eq!(minimal_escape("\"a\"bc"), "\"a\"bc");
1931-
assert_eq!(minimal_escape("\"a\"b&c"), "\"a\"b&amp;c");
1932-
assert_eq!(
1933-
minimal_escape("prefix_\"a\"b&<>c"),
1934-
"prefix_\"a\"b&amp;&lt;>c"
1935-
);
19361842
}

0 commit comments

Comments
 (0)