Skip to content

Commit 953a500

Browse files
committed
Merge consequent text events
1 parent 97b90eb commit 953a500

File tree

3 files changed

+85
-72
lines changed

3 files changed

+85
-72
lines changed

Changelog.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,11 @@
2323
mappings (`xmlns:xxx`) that was broken since [#490]
2424
- [#510]: Fix an error of deserialization of `Option<T>` fields where `T` is some
2525
sequence type (for example, `Vec` or tuple)
26+
- [#520]: Merge consecutive (delimited only by comments and processing instructions)
27+
texts and CDATA when deserializing with the serde deserializer. `DeEvent::Text` and
28+
`DeEvent::CData` events were replaced by `DeEvent::Text` with merged content.
29+
The same behavior for the `Reader` is not implemented (yet?) and should be
30+
implemented manually
2631

2732
### Misc Changes
2833

src/de/mod.rs

Lines changed: 72 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -77,11 +77,7 @@
7777
//! ```xml
7878
//! <...>text<![CDATA[cdata]]>text</...>
7979
//! ```
80-
//! <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
81-
//!
82-
//! Merging of the text / CDATA content is tracked in the issue [#474] and
83-
//! will be available in the next release.
84-
//! </div>
80+
//! Mixed text / CDATA content represents one logical string, `"textcdatatext"` in that case.
8581
//! </td>
8682
//! <td>
8783
//!
@@ -90,9 +86,7 @@
9086
//! - [`Cow<str>`]
9187
//! - [`u32`], [`f32`] and other numeric types
9288
//! - `enum`s, like
93-
//! ```ignore
94-
//! // FIXME: #474, merging mixed text / CDATA
95-
//! // content does not work yet
89+
//! ```
9690
//! # use pretty_assertions::assert_eq;
9791
//! # use serde::Deserialize;
9892
//! # #[derive(Debug, PartialEq)]
@@ -149,11 +143,6 @@
149143
//! ...
150144
//! ]]></...>
151145
//! ```
152-
//! <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
153-
//!
154-
//! Merging of the text / CDATA content is tracked in the issue [#474] and
155-
//! will be available in the next release.
156-
//! </div>
157146
//!
158147
//! [`xs:list`]: https://www.w3.org/TR/xmlschema11-2/#list-datatypes
159148
//! </td>
@@ -162,8 +151,6 @@
162151
//! Use any type that is deserialized using a [`deserialize_seq()`] call, for example:
163152
//!
164153
//! ```
165-
//! // FIXME: #474, merging mixed text / CDATA
166-
//! // content does not work yet
167154
//! type List = Vec<u32>;
168155
//! ```
169156
//!
@@ -520,8 +507,7 @@
520507
//! }
521508
//! # assert_eq!(AnyName::One { field1: () }, quick_xml::de::from_str(r#"<one field1="...">...</one>"#).unwrap());
522509
//! # assert_eq!(AnyName::Two { field2: () }, quick_xml::de::from_str(r#"<two><field2>...</field2></two>"#).unwrap());
523-
//! # assert_eq!(AnyName::Text("text".into()), quick_xml::de::from_str(r#"text"#).unwrap());
524-
//! # // TODO: After #474 parse mixed content
510+
//! # assert_eq!(AnyName::Text("text cdata ".into()), quick_xml::de::from_str(r#"text <![CDATA[ cdata ]]>"#).unwrap());
525511
//! ```
526512
//! ```
527513
//! # use pretty_assertions::assert_eq;
@@ -544,8 +530,7 @@
544530
//! }
545531
//! # assert_eq!(AnyName::One, quick_xml::de::from_str(r#"<one field1="...">...</one>"#).unwrap());
546532
//! # assert_eq!(AnyName::Two(Two { field2: () }), quick_xml::de::from_str(r#"<two><field2>...</field2></two>"#).unwrap());
547-
//! # assert_eq!(AnyName::Text, quick_xml::de::from_str(r#"text"#).unwrap());
548-
//! # // TODO: After #474 parse mixed content
533+
//! # assert_eq!(AnyName::Text, quick_xml::de::from_str(r#"text <![CDATA[ cdata ]]>"#).unwrap());
549534
//! ```
550535
//! ```
551536
//! # use pretty_assertions::assert_eq;
@@ -561,8 +546,7 @@
561546
//! }
562547
//! # assert_eq!(AnyName::One, quick_xml::de::from_str(r#"<one field1="...">...</one>"#).unwrap());
563548
//! # assert_eq!(AnyName::Other, quick_xml::de::from_str(r#"<two><field2>...</field2></two>"#).unwrap());
564-
//! # assert_eq!(AnyName::Other, quick_xml::de::from_str(r#"text"#).unwrap());
565-
//! # // TODO: After #474 parse mixed content
549+
//! # assert_eq!(AnyName::Other, quick_xml::de::from_str(r#"text <![CDATA[ cdata ]]>"#).unwrap());
566550
//! ```
567551
//! <div style="background:rgba(120,145,255,0.45);padding:0.75em;">
568552
//!
@@ -643,9 +627,8 @@
643627
//! # quick_xml::de::from_str(r#"<any-tag field="..."><two>...</two></any-tag>"#).unwrap(),
644628
//! # );
645629
//! # assert_eq!(
646-
//! # AnyName { field: (), any_name: Choice::Text("text".into()) },
647-
//! # // TODO: After #474 parse mixed content
648-
//! # quick_xml::de::from_str(r#"<any-tag field="...">text</any-tag>"#).unwrap(),
630+
//! # AnyName { field: (), any_name: Choice::Text("text cdata ".into()) },
631+
//! # quick_xml::de::from_str(r#"<any-tag field="...">text <![CDATA[ cdata ]]></any-tag>"#).unwrap(),
649632
//! # );
650633
//! ```
651634
//! </td>
@@ -967,8 +950,7 @@
967950
//! from the full element (`<one>...</one>`), so they could use the element name
968951
//! to choose the right variant:
969952
//!
970-
//! ```ignore
971-
//! // FIXME: #474
953+
//! ```
972954
//! # use pretty_assertions::assert_eq;
973955
//! # use serde::Deserialize;
974956
//! # type One = ();
@@ -985,9 +967,7 @@
985967
//! # quick_xml::de::from_str(r#"<one>...</one>text <![CDATA[cdata]]><two>...</two><one>...</one>"#).unwrap(),
986968
//! # );
987969
//! ```
988-
//! ```ignore
989-
//! // FIXME: #474, Custom("unknown variant `two`,
990-
//! // expected `one`")
970+
//! ```
991971
//! # use pretty_assertions::assert_eq;
992972
//! # use serde::Deserialize;
993973
//! # #[derive(Debug, PartialEq)]
@@ -1011,11 +991,6 @@
1011991
//! NOTE: consecutive text and CDATA nodes are merged into one text node,
1012992
//! so you cannot have two adjacent string types in your sequence.
1013993
//! </div>
1014-
//! <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
1015-
//!
1016-
//! Merging of the text / CDATA content is tracked in the issue [#474] and
1017-
//! will be available in the next release.
1018-
//! </div>
1019994
//! </td>
1020995
//! </tr>
1021996
//! <!-- 15 ==================================================================================== -->
@@ -1040,8 +1015,7 @@
10401015
//! <td>
10411016
//! A homogeneous sequence of elements with a fixed or dynamic size:
10421017
//!
1043-
//! ```ignore
1044-
//! // FIXME: #474
1018+
//! ```
10451019
//! # use pretty_assertions::assert_eq;
10461020
//! # use serde::Deserialize;
10471021
//! # #[derive(Debug, PartialEq)]
@@ -1059,8 +1033,7 @@
10591033
//! # quick_xml::de::from_str::<AnyName>(r#"<one>...</one>text <![CDATA[cdata]]><two>...</two><one>...</one>"#).unwrap(),
10601034
//! # );
10611035
//! ```
1062-
//! ```ignore
1063-
//! // FIXME: #474
1036+
//! ```
10641037
//! # use pretty_assertions::assert_eq;
10651038
//! # use serde::Deserialize;
10661039
//! # #[derive(Debug, PartialEq)]
@@ -1088,11 +1061,6 @@
10881061
//! NOTE: consecutive text and CDATA nodes are merged into one text node,
10891062
//! so you cannot have two adjacent string types in your sequence.
10901063
//! </div>
1091-
//! <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
1092-
//!
1093-
//! Merging of the text / CDATA content is tracked in the issue [#474] and
1094-
//! will be available in the next release.
1095-
//! </div>
10961064
//! </td>
10971065
//! </tr>
10981066
//! <!-- 16 ==================================================================================== -->
@@ -1119,8 +1087,7 @@
11191087
//!
11201088
//! You MUST specify `#[serde(rename = "$value")]` on that field:
11211089
//!
1122-
//! ```ignore
1123-
//! // FIXME: #474, Custom("duplicate field `$value`")
1090+
//! ```
11241091
//! # use pretty_assertions::assert_eq;
11251092
//! # use serde::Deserialize;
11261093
//! # type One = ();
@@ -1157,8 +1124,7 @@
11571124
//! # ).unwrap(),
11581125
//! # );
11591126
//! ```
1160-
//! ```ignore
1161-
//! // FIXME: #474, Custom("duplicate field `$value`")
1127+
//! ```
11621128
//! # use pretty_assertions::assert_eq;
11631129
//! # use serde::Deserialize;
11641130
//! # type One = ();
@@ -1204,11 +1170,6 @@
12041170
//! NOTE: consecutive text and CDATA nodes are merged into one text node,
12051171
//! so you cannot have two adjacent string types in your sequence.
12061172
//! </div>
1207-
//! <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
1208-
//!
1209-
//! Merging of the text / CDATA content is tracked in the issue [#474] and
1210-
//! will be available in the next release.
1211-
//! </div>
12121173
//! </td>
12131174
//! </tr>
12141175
//! <!-- 17 ==================================================================================== -->
@@ -1237,8 +1198,7 @@
12371198
//!
12381199
//! You MUST specify `#[serde(rename = "$value")]` on that field:
12391200
//!
1240-
//! ```ignore
1241-
//! // FIXME: #474
1201+
//! ```
12421202
//! # use pretty_assertions::assert_eq;
12431203
//! # use serde::Deserialize;
12441204
//! # #[derive(Debug, PartialEq)]
@@ -1282,8 +1242,7 @@
12821242
//! # ).unwrap(),
12831243
//! # );
12841244
//! ```
1285-
//! ```ignore
1286-
//! // FIXME: #474
1245+
//! ```
12871246
//! # use pretty_assertions::assert_eq;
12881247
//! # use serde::Deserialize;
12891248
//! # #[derive(Debug, PartialEq)]
@@ -1332,11 +1291,6 @@
13321291
//! NOTE: consecutive text and CDATA nodes are merged into one text node,
13331292
//! so you cannot have two adjacent string types in your sequence.
13341293
//! </div>
1335-
//! <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
1336-
//!
1337-
//! Merging of the text / CDATA content is tracked in the issue [#474] and
1338-
//! will be available in the next release.
1339-
//! </div>
13401294
//! </td>
13411295
//! </tr>
13421296
//! </tbody>
@@ -1720,7 +1674,6 @@
17201674
//!
17211675
//! [specification]: https://www.w3.org/TR/xmlschema11-1/#Simple_Type_Definition
17221676
//! [`deserialize_with`]: https://serde.rs/field-attrs.html#deserialize_with
1723-
//! [#474]: https://github.com/tafia/quick-xml/issues/474
17241677
//! [#497]: https://github.com/tafia/quick-xml/issues/497
17251678
17261679
// Macros should be defined before the modules that use them
@@ -2004,6 +1957,53 @@ impl<'i, R: XmlRead<'i>> XmlReader<'i, R> {
20041957
)
20051958
}
20061959

1960+
/// Reads all consecutive [`Text`] and [`CData`] events until a non-text event
1961+
/// occurs. The content of each such event is appended to `result`, which is returned
1962+
/// as [`DeEvent::Text`].
1963+
///
1964+
/// [`Text`]: PayloadEvent::Text
1965+
/// [`CData`]: PayloadEvent::CData
1966+
fn drain_text(&mut self, mut result: Cow<'i, str>) -> Result<DeEvent<'i>, DeError> {
1967+
loop {
1968+
match self.lookahead {
1969+
Ok(PayloadEvent::Text(_) | PayloadEvent::CData(_)) => {
1970+
let text = self.next_text()?;
1971+
1972+
let mut s = result.into_owned();
1973+
s += &text;
1974+
result = Cow::Owned(s);
1975+
}
1976+
_ => break,
1977+
}
1978+
}
1979+
Ok(DeEvent::Text(result))
1980+
}
1981+
1982+
/// Reads one text event; panics if the event read is neither `Text` nor `CData`.
1983+
///
1984+
/// |Event |XML |Handling
1985+
/// |-----------------------|---------------------------|----------------------------------------
1986+
/// |[`PayloadEvent::Start`]|`<tag>...</tag>` |Possible panic (unreachable)
1987+
/// |[`PayloadEvent::End`] |`</any-tag>` |Possible panic (unreachable)
1988+
/// |[`PayloadEvent::Text`] |`text content` |Unescapes `text content` and returns it
1989+
/// |[`PayloadEvent::CData`]|`<![CDATA[cdata content]]>`|Returns `cdata content` unchanged
1990+
/// |[`PayloadEvent::Eof`] | |Possible panic (unreachable)
1991+
#[inline(always)]
1992+
fn next_text(&mut self) -> Result<Cow<'i, str>, DeError> {
1993+
match self.next_impl()? {
1994+
PayloadEvent::Text(mut e) => {
1995+
if self.need_trim_end() {
1996+
e.inplace_trim_end();
1997+
}
1998+
Ok(e.unescape()?)
1999+
}
2000+
PayloadEvent::CData(e) => Ok(e.decode()?),
2001+
2002+
// SAFETY: this method is called only when we peeked Text or CData
2003+
_ => unreachable!("Only `Text` and `CData` events can come here"),
2004+
}
2005+
}
2006+
20072007
/// Return an input-borrowing event.
20082008
fn next(&mut self) -> Result<DeEvent<'i>, DeError> {
20092009
loop {
@@ -2014,9 +2014,9 @@ impl<'i, R: XmlRead<'i>> XmlReader<'i, R> {
20142014
if self.need_trim_end() && e.inplace_trim_end() {
20152015
continue;
20162016
}
2017-
Ok(DeEvent::Text(e.unescape()?))
2017+
self.drain_text(e.unescape()?)
20182018
}
2019-
PayloadEvent::CData(e) => Ok(DeEvent::Text(e.decode()?)),
2019+
PayloadEvent::CData(e) => self.drain_text(e.decode()?),
20202020
PayloadEvent::Eof => Ok(DeEvent::Eof),
20212021
};
20222022
}
@@ -2386,11 +2386,12 @@ where
23862386
self.read_string_impl(true)
23872387
}
23882388

2389-
/// Consumes a one XML element or an XML tree, returns associated text or
2389+
/// Consumes consecutive [`Text`] and [`CData`] events (both are referred to below as _text_)
2390+
/// and merges them into one string. If there are no such events, returns
23902391
/// an empty string.
23912392
///
2392-
/// If `allow_start` is `false`, then only one event is consumed. If that
2393-
/// event is [`DeEvent::Start`], then [`DeError::UnexpectedStart`] is returned.
2393+
/// If `allow_start` is `false`, then only text events are consumed; for other
2394+
/// events an error is returned (see table below).
23942395
///
23952396
/// If `allow_start` is `true`, then the first [`DeEvent::Text`] event is returned
23962397
/// and all other content is skipped until the corresponding end tag is consumed.
@@ -2415,6 +2416,9 @@ where
24152416
/// |[`DeEvent::End`] |`</any-tag>` |Emits [`UnexpectedEnd("any-tag")`](DeError::UnexpectedEnd)
24162417
/// |[`DeEvent::Text`] |`text content` or `<![CDATA[cdata content]]>` (probably mixed)|Returns event content unchanged, consumes events up to `</tag>`
24172418
/// |[`DeEvent::Eof`] | |Emits [`UnexpectedEof`](DeError::UnexpectedEof)
2419+
///
2420+
/// [`Text`]: Event::Text
2421+
/// [`CData`]: Event::CData
24182422
fn read_string_impl(&mut self, allow_start: bool) -> Result<Cow<'de, str>, DeError> {
24192423
match self.next()? {
24202424
DeEvent::Text(e) => Ok(e),
@@ -3002,7 +3006,7 @@ mod tests {
30023006
]
30033007
);
30043008

3005-
// Drop all events thet represents <target> tree. Now unconsumed XML looks like:
3009+
// Drop all events that represent the <target> tree. Now unconsumed XML looks like:
30063010
//
30073011
// <skip>
30083012
// text

tests/serde-de.rs

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -530,7 +530,8 @@ mod seq {
530530

531531
#[test]
532532
fn mixed_content() {
533-
from_str::<[(); 3]>(
533+
// Text and CDATA represent one logical text item
534+
from_str::<[(); 2]>(
534535
r#"
535536
<element/>
536537
text
@@ -547,7 +548,8 @@ mod seq {
547548
"#,
548549
)
549550
.unwrap();
550-
assert_eq!(data, vec![(), (), ()]);
551+
// Text and CDATA represent one logical text item
552+
assert_eq!(data, vec![(), ()]);
551553
}
552554

553555
/// This test ensures that composition of deserializer building blocks plays well
@@ -2432,8 +2434,9 @@ mod seq {
24322434
fn mixed_content() {
24332435
#[derive(Debug, PartialEq, Deserialize)]
24342436
struct List {
2437+
/// Text and CDATA represent one logical text item
24352438
#[serde(rename = "$value")]
2436-
item: [(); 3],
2439+
item: [(); 2],
24372440
}
24382441

24392442
from_str::<List>(
@@ -3540,7 +3543,8 @@ mod seq {
35403543
assert_eq!(
35413544
data,
35423545
List {
3543-
item: vec![(), (), ()],
3546+
// Text and CDATA represent one logical text item
3547+
item: vec![(), ()],
35443548
}
35453549
);
35463550
}

0 commit comments

Comments
 (0)