@@ -8,7 +8,7 @@ use std::ops::Range;
8
8
use pretty_assertions:: assert_eq;
9
9
10
10
/// Error for XML escape / unescape.
11
- #[ derive( Clone , Debug ) ]
11
+ #[ derive( Clone , Debug , PartialEq ) ]
12
12
pub enum EscapeError {
13
13
/// Entity with Null character
14
14
EntityWithNull ( Range < usize > ) ,
@@ -212,8 +212,66 @@ where
212
212
}
213
213
}
214
214
215
+ /// Returns the attribute value normalized as per the XML specification, using a custom
216
+ /// entity resolver.
217
+ ///
218
+ /// https://www.w3.org/TR/xml/#AVNormalize
219
+ ///
220
+ /// Do not use this method with HTML attributes.
221
+ ///
222
+ /// Escape sequences such as `>` are replaced with their unescaped equivalents such as `>`
223
+ /// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters. A function
224
+ /// for resolving entities can be provided as `resolve_entity`. Builtin entities will still
225
+ /// take precedence.
226
+ ///
227
+ /// This will allocate unless the raw attribute value does not require normalization.
228
+ pub ( crate ) fn normalize_attribute_value_with < ' a , ' entity > (
229
+ value : & ' a str ,
230
+ resolve_entity : impl Fn ( & str ) -> Option < & ' entity str > ,
231
+ ) -> Result < Cow < ' a , str > , EscapeError > {
232
+ // TODO: avoid allocation when not needed
233
+ let mut normalized = String :: with_capacity ( value. len ( ) ) ;
234
+
235
+ let attr = value. as_bytes ( ) ;
236
+ let mut attr_iter = attr. iter ( ) . enumerate ( ) ;
237
+
238
+ while let Some ( ( idx, ch) ) = attr_iter. next ( ) {
239
+ match ch {
240
+ b' ' | b'\n' | b'\r' | b'\t' => normalized. push ( ' ' ) ,
241
+ b'&' => {
242
+ let end = idx
243
+ + 1
244
+ + attr_iter
245
+ . position ( |( _, c) | * c == b';' )
246
+ . ok_or_else ( || EscapeError :: UnterminatedEntity ( idx..attr. len ( ) ) ) ?;
247
+ let entity = & attr[ idx + 1 ..end] ; // starts after the &
248
+ let entity_str = std:: str:: from_utf8 ( entity) . expect ( "failed UTF-8 check" ) ;
249
+
250
+ if entity. starts_with ( b"#" ) {
251
+ let entity = & entity_str[ 1 ..] ; // starts after the #
252
+ let codepoint = parse_number ( entity, idx..end) ?;
253
+ normalized. push_str ( codepoint. encode_utf8 ( & mut [ 0u8 ; 4 ] ) ) ;
254
+ } else if let Some ( s) = named_entity ( entity_str) {
255
+ normalized. push_str ( s) ;
256
+ } else if let Some ( value) = resolve_entity ( entity_str) {
257
+ // TODO: recursively apply entity substitution
258
+ normalized. push_str ( & value) ;
259
+ } else {
260
+ return Err ( EscapeError :: UnrecognizedSymbol (
261
+ idx + 1 ..end,
262
+ String :: from_utf8 ( entity. to_vec ( ) ) . expect ( "failed UTF-8 check" ) ,
263
+ ) ) ;
264
+ }
265
+ }
266
+ _ => normalized. push ( * ch as char ) ,
267
+ }
268
+ }
269
+
270
+ Ok ( Cow :: Owned ( normalized) )
271
+ }
272
+
215
273
#[ cfg( not( feature = "escape-html" ) ) ]
216
- fn named_entity ( name : & str ) -> Option < & str > {
274
+ pub ( crate ) fn named_entity ( name : & str ) -> Option < & str > {
217
275
// match over strings are not allowed in const functions
218
276
let s = match name. as_bytes ( ) {
219
277
b"lt" => "<" ,
@@ -226,9 +284,8 @@ fn named_entity(name: &str) -> Option<&str> {
226
284
Some ( s)
227
285
}
228
286
#[ cfg( feature = "escape-html" ) ]
229
- fn named_entity ( name : & str ) -> Option < & str > {
287
+ pub ( crate ) fn named_entity ( name : & str ) -> Option < & str > {
230
288
// imported from https://dev.w3.org/html5/html-author/charref
231
- // match over strings are not allowed in const functions
232
289
//TODO: automate up-to-dating using https://html.spec.whatwg.org/entities.json
233
290
let s = match name. as_bytes ( ) {
234
291
b"Tab" => "\u{09} " ,
@@ -1690,7 +1747,7 @@ fn named_entity(name: &str) -> Option<&str> {
1690
1747
Some ( s)
1691
1748
}
1692
1749
1693
- fn parse_number ( bytes : & str , range : Range < usize > ) -> Result < char , EscapeError > {
1750
+ pub fn parse_number ( bytes : & str , range : Range < usize > ) -> Result < char , EscapeError > {
1694
1751
let code = if bytes. starts_with ( 'x' ) {
1695
1752
parse_hexadecimal ( & bytes[ 1 ..] )
1696
1753
} else {
@@ -1705,7 +1762,7 @@ fn parse_number(bytes: &str, range: Range<usize>) -> Result<char, EscapeError> {
1705
1762
}
1706
1763
}
1707
1764
1708
- fn parse_hexadecimal ( bytes : & str ) -> Result < u32 , EscapeError > {
1765
+ pub fn parse_hexadecimal ( bytes : & str ) -> Result < u32 , EscapeError > {
1709
1766
// maximum code is 0x10FFFF => 6 characters
1710
1767
if bytes. len ( ) > 6 {
1711
1768
return Err ( EscapeError :: TooLongHexadecimal ) ;
@@ -1723,7 +1780,7 @@ fn parse_hexadecimal(bytes: &str) -> Result<u32, EscapeError> {
1723
1780
Ok ( code)
1724
1781
}
1725
1782
1726
- fn parse_decimal ( bytes : & str ) -> Result < u32 , EscapeError > {
1783
+ pub fn parse_decimal ( bytes : & str ) -> Result < u32 , EscapeError > {
1727
1784
// maximum code is 0x10FFFF = 1114111 => 7 characters
1728
1785
if bytes. len ( ) > 7 {
1729
1786
return Err ( EscapeError :: TooLongDecimal ) ;
0 commit comments