|
2 | 2 |
|
3 | 3 | use memchr::memchr2_iter; |
4 | 4 | use std::borrow::Cow; |
| 5 | +use std::num::ParseIntError; |
5 | 6 | use std::ops::Range; |
6 | 7 |
|
7 | | -#[cfg(test)] |
8 | | -use pretty_assertions::assert_eq; |
| 8 | +/// Error returned when parsing a character reference (`&#<dec-number>;` or `&#x<hex-number>;`) fails. |
| 9 | +#[derive(Clone, Debug, PartialEq)] |
| 10 | +pub enum ParseCharRefError { |
| 11 | + /// Number contains sign character (`+` or `-`) which is not allowed. |
| 12 | + UnexpectedSign, |
| 13 | + /// Number cannot be parsed due to non-number characters or a numeric overflow. |
| 14 | + InvalidNumber(ParseIntError), |
| 15 | + /// Character reference does not represent a valid Unicode codepoint. |
| 16 | + InvalidCodepoint(u32), |
| 17 | + /// Character reference expands to a character that is not permitted in XML. |
| 18 | + /// |
| 19 | + /// Currently, only the `0x0` character produces this error. |
| 20 | + IllegalCharacter(u32), |
| 21 | +} |
| 22 | + |
| 23 | +impl std::fmt::Display for ParseCharRefError { |
| 24 | + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { |
| 25 | + match self { |
| 26 | + Self::UnexpectedSign => f.write_str("unexpected number sign"), |
| 27 | + Self::InvalidNumber(e) => e.fmt(f), |
| 28 | + Self::InvalidCodepoint(n) => write!(f, "`{}` is not a valid codepoint", n), |
| 29 | + Self::IllegalCharacter(n) => write!(f, "0x{:x} character is not permitted in XML", n), |
| 30 | + } |
| 31 | + } |
| 32 | +} |
| 33 | + |
| 34 | +impl std::error::Error for ParseCharRefError { |
| 35 | + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { |
| 36 | + match self { |
| 37 | + Self::InvalidNumber(e) => Some(e), |
| 38 | + _ => None, |
| 39 | + } |
| 40 | + } |
| 41 | +} |
9 | 42 |
|
10 | 43 | /// Error for XML escape / unescape. |
11 | | -#[derive(Clone, Debug)] |
| 44 | +#[derive(Clone, Debug, PartialEq)] |
12 | 45 | pub enum EscapeError { |
13 | | - /// Entity with Null character |
14 | | - EntityWithNull(Range<usize>), |
15 | | - /// Unrecognized escape symbol |
16 | | - UnrecognizedSymbol(Range<usize>, String), |
| 46 | + /// Referenced entity is unknown to the parser. |
| 47 | + UnrecognizedEntity(Range<usize>, String), |
17 | 48 | /// Cannot find `;` after `&` |
18 | 49 | UnterminatedEntity(Range<usize>), |
19 | | - /// Cannot convert Hexa to utf8 |
20 | | - TooLongHexadecimal, |
21 | | - /// Character is not a valid hexadecimal value |
22 | | - InvalidHexadecimal(char), |
23 | | - /// Cannot convert decimal to hexa |
24 | | - TooLongDecimal, |
25 | | - /// Character is not a valid decimal value |
26 | | - InvalidDecimal(char), |
27 | | - /// Not a valid unicode codepoint |
28 | | - InvalidCodepoint(u32), |
| 50 | + /// Attempt to parse character reference (`&#<dec-number>;` or `&#x<hex-number>;`) |
| 51 | + /// was unsuccessful: the number is malformed (e.g. not all characters are decimal or hexadecimal digits) or it denotes a disallowed character. |
| 52 | + InvalidCharRef(ParseCharRefError), |
29 | 53 | } |
30 | 54 |
|
31 | 55 | impl std::fmt::Display for EscapeError { |
32 | 56 | fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { |
33 | 57 | match self { |
34 | | - EscapeError::EntityWithNull(e) => write!( |
35 | | - f, |
36 | | - "Error while escaping character at range {:?}: Null character entity not allowed", |
37 | | - e |
38 | | - ), |
39 | | - EscapeError::UnrecognizedSymbol(rge, res) => write!( |
40 | | - f, |
41 | | - "Error while escaping character at range {:?}: Unrecognized escape symbol: {:?}", |
42 | | - rge, res |
43 | | - ), |
| 58 | + EscapeError::UnrecognizedEntity(rge, res) => { |
| 59 | + write!(f, "at {:?}: unrecognized entity `{}`", rge, res) |
| 60 | + } |
44 | 61 | EscapeError::UnterminatedEntity(e) => write!( |
45 | 62 | f, |
46 | 63 | "Error while escaping character at range {:?}: Cannot find ';' after '&'", |
47 | 64 | e |
48 | 65 | ), |
49 | | - EscapeError::TooLongHexadecimal => write!(f, "Cannot convert hexadecimal to utf8"), |
50 | | - EscapeError::InvalidHexadecimal(e) => { |
51 | | - write!(f, "'{}' is not a valid hexadecimal character", e) |
| 66 | + EscapeError::InvalidCharRef(e) => { |
| 67 | + write!(f, "invalid character reference: {}", e) |
52 | 68 | } |
53 | | - EscapeError::TooLongDecimal => write!(f, "Cannot convert decimal to utf8"), |
54 | | - EscapeError::InvalidDecimal(e) => write!(f, "'{}' is not a valid decimal character", e), |
55 | | - EscapeError::InvalidCodepoint(n) => write!(f, "'{}' is not a valid codepoint", n), |
56 | 69 | } |
57 | 70 | } |
58 | 71 | } |
59 | 72 |
|
60 | | -impl std::error::Error for EscapeError {} |
| 73 | +impl std::error::Error for EscapeError { |
| 74 | + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { |
| 75 | + match self { |
| 76 | + Self::InvalidCharRef(e) => Some(e), |
| 77 | + _ => None, |
| 78 | + } |
| 79 | + } |
| 80 | +} |
61 | 81 |
|
62 | 82 | /// Escapes an `&str` and replaces all xml special characters (`<`, `>`, `&`, `'`, `"`) |
63 | 83 | /// with their corresponding xml escaped value. |
@@ -251,12 +271,12 @@ where |
251 | 271 | // search for character correctness |
252 | 272 | let pat = &raw[start + 1..end]; |
253 | 273 | if let Some(entity) = pat.strip_prefix('#') { |
254 | | - let codepoint = parse_number(entity, start..end)?; |
| 274 | + let codepoint = parse_number(entity).map_err(EscapeError::InvalidCharRef)?; |
255 | 275 | unescaped.push_str(codepoint.encode_utf8(&mut [0u8; 4])); |
256 | 276 | } else if let Some(value) = resolve_entity(pat) { |
257 | 277 | unescaped.push_str(value); |
258 | 278 | } else { |
259 | | - return Err(EscapeError::UnrecognizedSymbol( |
| 279 | + return Err(EscapeError::UnrecognizedEntity( |
260 | 280 | start + 1..end, |
261 | 281 | pat.to_string(), |
262 | 282 | )); |
@@ -1796,141 +1816,27 @@ pub const fn resolve_html5_entity(entity: &str) -> Option<&'static str> { |
1796 | 1816 | Some(s) |
1797 | 1817 | } |
1798 | 1818 |
|
1799 | | -fn parse_number(bytes: &str, range: Range<usize>) -> Result<char, EscapeError> { |
1800 | | - let code = if let Some(hex_digits) = bytes.strip_prefix('x') { |
1801 | | - parse_hexadecimal(hex_digits) |
| 1819 | +fn parse_number(num: &str) -> Result<char, ParseCharRefError> { |
| 1820 | + let code = if let Some(hex) = num.strip_prefix('x') { |
| 1821 | + from_str_radix(hex, 16)? |
1802 | 1822 | } else { |
1803 | | - parse_decimal(bytes) |
1804 | | - }?; |
| 1823 | + from_str_radix(num, 10)? |
| 1824 | + }; |
1805 | 1825 | if code == 0 { |
1806 | | - return Err(EscapeError::EntityWithNull(range)); |
| 1826 | + return Err(ParseCharRefError::IllegalCharacter(code)); |
1807 | 1827 | } |
1808 | 1828 | match std::char::from_u32(code) { |
1809 | 1829 | Some(c) => Ok(c), |
1810 | | - None => Err(EscapeError::InvalidCodepoint(code)), |
1811 | | - } |
1812 | | -} |
1813 | | - |
1814 | | -fn parse_hexadecimal(bytes: &str) -> Result<u32, EscapeError> { |
1815 | | - // maximum code is 0x10FFFF => 6 characters |
1816 | | - if bytes.len() > 6 { |
1817 | | - return Err(EscapeError::TooLongHexadecimal); |
1818 | | - } |
1819 | | - let mut code = 0; |
1820 | | - for b in bytes.bytes() { |
1821 | | - code <<= 4; |
1822 | | - code += match b { |
1823 | | - b'0'..=b'9' => b - b'0', |
1824 | | - b'a'..=b'f' => b - b'a' + 10, |
1825 | | - b'A'..=b'F' => b - b'A' + 10, |
1826 | | - b => return Err(EscapeError::InvalidHexadecimal(b as char)), |
1827 | | - } as u32; |
| 1830 | + None => Err(ParseCharRefError::InvalidCodepoint(code)), |
1828 | 1831 | } |
1829 | | - Ok(code) |
1830 | 1832 | } |
1831 | 1833 |
|
1832 | | -fn parse_decimal(bytes: &str) -> Result<u32, EscapeError> { |
1833 | | - // maximum code is 0x10FFFF = 1114111 => 7 characters |
1834 | | - if bytes.len() > 7 { |
1835 | | - return Err(EscapeError::TooLongDecimal); |
1836 | | - } |
1837 | | - let mut code = 0; |
1838 | | - for b in bytes.bytes() { |
1839 | | - code *= 10; |
1840 | | - code += match b { |
1841 | | - b'0'..=b'9' => b - b'0', |
1842 | | - b => return Err(EscapeError::InvalidDecimal(b as char)), |
1843 | | - } as u32; |
| 1834 | +#[inline] |
| 1835 | +fn from_str_radix(src: &str, radix: u32) -> Result<u32, ParseCharRefError> { |
| 1836 | + match src.as_bytes().first().copied() { |
| 1837 | + // We should not allow signed numbers, but u32::from_str_radix will accept `+`. |
| 1838 | + // We also handle `-` to be consistent in returned errors |
| 1839 | + Some(b'+') | Some(b'-') => Err(ParseCharRefError::UnexpectedSign), |
| 1840 | + _ => u32::from_str_radix(src, radix).map_err(ParseCharRefError::InvalidNumber), |
1844 | 1841 | } |
1845 | | - Ok(code) |
1846 | | -} |
1847 | | - |
1848 | | -#[test] |
1849 | | -fn test_unescape() { |
1850 | | - let unchanged = unescape("test").unwrap(); |
1851 | | - // assert_eq does not check that Cow is borrowed, but we explicitly use Cow |
1852 | | - // because it influences diff |
1853 | | - // TODO: use assert_matches! when stabilized and other features will bump MSRV |
1854 | | - assert_eq!(unchanged, Cow::Borrowed("test")); |
1855 | | - assert!(matches!(unchanged, Cow::Borrowed(_))); |
1856 | | - |
1857 | | - assert_eq!( |
1858 | | - unescape("<&test'">").unwrap(), |
1859 | | - "<&test'\">" |
1860 | | - ); |
1861 | | - assert_eq!(unescape("0").unwrap(), "0"); |
1862 | | - assert_eq!(unescape("0").unwrap(), "0"); |
1863 | | - assert!(unescape("&foo;").is_err()); |
1864 | | -} |
1865 | | - |
1866 | | -#[test] |
1867 | | -fn test_unescape_with() { |
1868 | | - let custom_entities = |ent: &str| match ent { |
1869 | | - "foo" => Some("BAR"), |
1870 | | - _ => None, |
1871 | | - }; |
1872 | | - |
1873 | | - let unchanged = unescape_with("test", custom_entities).unwrap(); |
1874 | | - // assert_eq does not check that Cow is borrowed, but we explicitly use Cow |
1875 | | - // because it influences diff |
1876 | | - // TODO: use assert_matches! when stabilized and other features will bump MSRV |
1877 | | - assert_eq!(unchanged, Cow::Borrowed("test")); |
1878 | | - assert!(matches!(unchanged, Cow::Borrowed(_))); |
1879 | | - |
1880 | | - assert!(unescape_with("<", custom_entities).is_err()); |
1881 | | - assert_eq!(unescape_with("0", custom_entities).unwrap(), "0"); |
1882 | | - assert_eq!(unescape_with("0", custom_entities).unwrap(), "0"); |
1883 | | - assert_eq!(unescape_with("&foo;", custom_entities).unwrap(), "BAR"); |
1884 | | - assert!(unescape_with("&fop;", custom_entities).is_err()); |
1885 | | -} |
1886 | | - |
1887 | | -#[test] |
1888 | | -fn test_escape() { |
1889 | | - let unchanged = escape("test"); |
1890 | | - // assert_eq does not check that Cow is borrowed, but we explicitly use Cow |
1891 | | - // because it influences diff |
1892 | | - // TODO: use assert_matches! when stabilized and other features will bump MSRV |
1893 | | - assert_eq!(unchanged, Cow::Borrowed("test")); |
1894 | | - assert!(matches!(unchanged, Cow::Borrowed(_))); |
1895 | | - |
1896 | | - assert_eq!(escape("<&\"'>"), "<&"'>"); |
1897 | | - assert_eq!(escape("<test>"), "<test>"); |
1898 | | - assert_eq!(escape("\"a\"bc"), ""a"bc"); |
1899 | | - assert_eq!(escape("\"a\"b&c"), ""a"b&c"); |
1900 | | - assert_eq!( |
1901 | | - escape("prefix_\"a\"b&<>c"), |
1902 | | - "prefix_"a"b&<>c" |
1903 | | - ); |
1904 | | -} |
1905 | | - |
1906 | | -#[test] |
1907 | | -fn test_partial_escape() { |
1908 | | - let unchanged = partial_escape("test"); |
1909 | | - // assert_eq does not check that Cow is borrowed, but we explicitly use Cow |
1910 | | - // because it influences diff |
1911 | | - // TODO: use assert_matches! when stabilized and other features will bump MSRV |
1912 | | - assert_eq!(unchanged, Cow::Borrowed("test")); |
1913 | | - assert!(matches!(unchanged, Cow::Borrowed(_))); |
1914 | | - |
1915 | | - assert_eq!(partial_escape("<&\"'>"), "<&\"'>"); |
1916 | | - assert_eq!(partial_escape("<test>"), "<test>"); |
1917 | | - assert_eq!(partial_escape("\"a\"bc"), "\"a\"bc"); |
1918 | | - assert_eq!(partial_escape("\"a\"b&c"), "\"a\"b&c"); |
1919 | | - assert_eq!( |
1920 | | - partial_escape("prefix_\"a\"b&<>c"), |
1921 | | - "prefix_\"a\"b&<>c" |
1922 | | - ); |
1923 | | -} |
1924 | | - |
1925 | | -#[test] |
1926 | | -fn test_minimal_escape() { |
1927 | | - assert_eq!(minimal_escape("test"), Cow::Borrowed("test")); |
1928 | | - assert_eq!(minimal_escape("<&\"'>"), "<&\"'>"); |
1929 | | - assert_eq!(minimal_escape("<test>"), "<test>"); |
1930 | | - assert_eq!(minimal_escape("\"a\"bc"), "\"a\"bc"); |
1931 | | - assert_eq!(minimal_escape("\"a\"b&c"), "\"a\"b&c"); |
1932 | | - assert_eq!( |
1933 | | - minimal_escape("prefix_\"a\"b&<>c"), |
1934 | | - "prefix_\"a\"b&<>c" |
1935 | | - ); |
1936 | 1842 | } |
0 commit comments