Commit 8049e7d

Merge PR '#37', Stream Parser Support - pt4
2 parents fa5754e + 81676ed commit 8049e7d

File tree: 3 files changed, +338 −22 lines


README.md

Lines changed: 43 additions & 8 deletions
@@ -15,6 +15,9 @@ Overview
 Change Logs
 ===
 
+2020-08-??
+- Add XML stream loading and parsing support.
+
 2019-11-11
 - Add XPath query caching.
 
@@ -48,26 +51,58 @@ if err != nil {
 }
 ```
 
-#### Parse a XML from URL.
+#### Parse an XML from URL.
 
 ```go
 doc, err := xmlquery.LoadURL("http://www.example.com/sitemap.xml")
 ```
 
-#### Parse a XML from string.
+#### Parse an XML from string.
 
 ```go
 s := `<?xml version="1.0" encoding="utf-8"?><rss version="2.0"></rss>`
 doc, err := xmlquery.Parse(strings.NewReader(s))
 ```
 
-#### Parse a XML from io.Reader.
+#### Parse an XML from io.Reader.
 
 ```go
 f, err := os.Open("../books.xml")
 doc, err := xmlquery.Parse(f)
 ```
 
+#### Parse an XML in a stream fashion (simple case without element filtering).
+
+```go
+f, err := os.Open("../books.xml")
+p, err := xmlquery.CreateStreamParser(f, "/bookstore/book")
+for {
+    n, err := p.Read()
+    if err == io.EOF {
+        break
+    }
+    if err != nil {
+        ...
+    }
+}
+```
+
+#### Parse an XML in a stream fashion (simple case, with advanced element filtering).
+
+```go
+f, err := os.Open("../books.xml")
+p, err := xmlquery.CreateStreamParser(f, "/bookstore/book", "/bookstore/book[price>=10]")
+for {
+    n, err := p.Read()
+    if err == io.EOF {
+        break
+    }
+    if err != nil {
+        ...
+    }
+}
+```
+
 #### Find authors of all books in the bookstore.
 
 ```go
@@ -210,11 +245,11 @@ func main(){
 
 List of supported XPath query packages
 ===
-|Name |Description |
-|--------------------------|----------------|
-|[htmlquery](https://github.com/antchfx/htmlquery) | XPath query package for the HTML document|
-|[xmlquery](https://github.com/antchfx/xmlquery) | XPath query package for the XML document|
-|[jsonquery](https://github.com/antchfx/jsonquery) | XPath query package for the JSON document|
+| Name                                               | Description                                |
+| -------------------------------------------------- | ------------------------------------------ |
+| [htmlquery](https://github.com/antchfx/htmlquery)  | XPath query package for the HTML document  |
+| [xmlquery](https://github.com/antchfx/xmlquery)    | XPath query package for the XML document   |
+| [jsonquery](https://github.com/antchfx/jsonquery)  | XPath query package for the JSON document  |
 
 Questions
 ===
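
A note on the new stream-parsing examples above: each node returned by `Read()` can be queried like any other node before the next `Read()` call releases it from the in-memory tree. A minimal sketch in the README's style, assuming the `books.xml` sample has a `title` child element under each `book` (that element name is an assumption, not shown in this diff):

```go
f, err := os.Open("../books.xml")
p, err := xmlquery.CreateStreamParser(f, "/bookstore/book")
for {
    n, err := p.Read()
    if err == io.EOF {
        break
    }
    if err != nil {
        ...
    }
    // Query inside the streamed <book> element before the next Read()
    // call removes it from the document tree.
    if title := xmlquery.FindOne(n, "title"); title != nil {
        fmt.Println(title.InnerText())
    }
}
```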

parse.go

Lines changed: 151 additions & 14 deletions
@@ -3,10 +3,12 @@ package xmlquery
 import (
     "encoding/xml"
     "errors"
+    "fmt"
     "io"
     "net/http"
     "strings"
 
+    "github.com/antchfx/xpath"
     "golang.org/x/net/html/charset"
 )
 
@@ -20,12 +22,30 @@ func LoadURL(url string) (*Node, error) {
     return Parse(resp.Body)
 }
 
+// Parse returns the parse tree for the XML from the given Reader.
+func Parse(r io.Reader) (*Node, error) {
+    p := createParser(r)
+    for {
+        _, err := p.parse()
+        if err == io.EOF {
+            return p.doc, nil
+        }
+        if err != nil {
+            return nil, err
+        }
+    }
+}
+
 type parser struct {
-    decoder      *xml.Decoder
-    doc          *Node
-    space2prefix map[string]string
-    level        int
-    prev         *Node
+    decoder             *xml.Decoder
+    doc                 *Node
+    space2prefix        map[string]string
+    level               int
+    prev                *Node
+    streamElementXPath  *xpath.Expr // Under streaming mode, this specifies the xpath to the target element node(s).
+    streamElementFilter *xpath.Expr // If specified, it provides further filtering on the target element.
+    streamNode          *Node       // Need to remember the last target node so we can clean it up upon the next Read() call.
+    streamNodePrev      *Node       // Need to remember the target node's prev so upon target node removal, we can restore the correct prev.
 }
 
 func createParser(r io.Reader) *parser {
@@ -43,6 +63,8 @@ func createParser(r io.Reader) *parser {
 }
 
 func (p *parser) parse() (*Node, error) {
+    var streamElementNodeCounter int
+
     for {
         tok, err := p.decoder.Token()
         if err != nil {
@@ -99,10 +121,54 @@ func (p *parser) parse() (*Node, error) {
                 }
                 addSibling(p.prev.Parent, node)
             }
+            // If we're in streaming mode, we need to remember the node if it is the target node
+            // so that when we finish processing the node's EndElement, we know how/what to return to
+            // the caller. Also we need to remove the target node from the tree upon the next Read()
+            // call so memory doesn't grow unbounded.
+            if p.streamElementXPath != nil {
+                if p.streamNode == nil {
+                    if QuerySelector(p.doc, p.streamElementXPath) != nil {
+                        p.streamNode = node
+                        p.streamNodePrev = p.prev
+                        streamElementNodeCounter = 1
+                    }
+                } else {
+                    streamElementNodeCounter++
+                }
+            }
             p.prev = node
             p.level++
         case xml.EndElement:
             p.level--
+            // If we're in streaming mode and we already have a potential streaming
+            // target node identified (p.streamNode != nil), then we need to check if
+            // this is the real one we want to return to the caller.
+            if p.streamNode != nil {
+                streamElementNodeCounter--
+                if streamElementNodeCounter == 0 {
+                    // Now we know this element node at least passes the initial
+                    // p.streamElementXPath check and is a potential target node candidate.
+                    // We need one more check with p.streamElementFilter (if given) to
+                    // ensure it is really the element node we want.
+                    // The reason we need a two-step check process is the following
+                    // situation:
+                    //   <AAA><BBB>b1</BBB></AAA>
+                    // Say p.streamElementXPath = "/AAA/BBB[. != 'b1']". During
+                    // xml.StartElement time, the <BBB> node is still empty, so it will pass
+                    // the p.streamElementXPath check. However, eventually we know this <BBB>
+                    // shouldn't be returned to the caller. Having a second, more fine-grained
+                    // filter check ensures that. So in this case, the caller should really
+                    // set up the stream parser with:
+                    //   streamElementXPath = "/AAA/BBB"
+                    //   streamElementFilter = "/AAA/BBB[. != 'b1']"
+                    if p.streamElementFilter == nil || QuerySelector(p.doc, p.streamElementFilter) != nil {
+                        return p.streamNode, nil
+                    }
+                    // Otherwise, this isn't our target node; clean things up.
+                    p.streamNode = nil
+                    p.streamNodePrev = nil
+                }
+            }
         case xml.CharData:
             node := &Node{Type: CharDataNode, Data: string(tok), level: p.level}
             if p.level == p.prev.level {
@@ -150,16 +216,87 @@ func (p *parser) parse() (*Node, error) {
     }
 }
 
-// Parse returns the parse tree for the XML from the given Reader.
-func Parse(r io.Reader) (*Node, error) {
-    p := createParser(r)
-    for {
-        _, err := p.parse()
-        if err == io.EOF {
-            return p.doc, nil
-        }
+// StreamParser enables loading and parsing an XML document in a streaming fashion.
+type StreamParser struct {
+    p *parser
+}
+
+// CreateStreamParser creates a StreamParser. Argument streamElementXPath is required.
+// Argument streamElementFilter is optional and should only be used in advanced scenarios.
+//
+// Scenario 1: simple case:
+//   xml := `<AAA><BBB>b1</BBB><BBB>b2</BBB></AAA>`
+//   sp, err := CreateStreamParser(strings.NewReader(xml), "/AAA/BBB")
+//   if err != nil {
+//       panic(err)
+//   }
+//   for {
+//       n, err := sp.Read()
+//       if err != nil {
+//           break
+//       }
+//       fmt.Println(n.OutputXML(true))
+//   }
+// Output will be:
+//   <BBB>b1</BBB>
+//   <BBB>b2</BBB>
+//
+// Scenario 2: advanced case:
+//   xml := `<AAA><BBB>b1</BBB><BBB>b2</BBB></AAA>`
+//   sp, err := CreateStreamParser(strings.NewReader(xml), "/AAA/BBB", "/AAA/BBB[. != 'b1']")
+//   if err != nil {
+//       panic(err)
+//   }
+//   for {
+//       n, err := sp.Read()
+//       if err != nil {
+//           break
+//       }
+//       fmt.Println(n.OutputXML(true))
+//   }
+// Output will be:
+//   <BBB>b2</BBB>
+//
+// As the argument names indicate, streamElementXPath should be used to provide the xpath query pointing
+// to the target element node only, with no extra filtering on the element itself or its children; while
+// streamElementFilter, if needed, can provide additional filtering on the target element and its children.
+//
+// CreateStreamParser returns an error if streamElementXPath, or streamElementFilter if provided, cannot
+// be successfully parsed and compiled into a valid xpath query.
+func CreateStreamParser(r io.Reader, streamElementXPath string, streamElementFilter ...string) (*StreamParser, error) {
+    elemXPath, err := getQuery(streamElementXPath)
+    if err != nil {
+        return nil, fmt.Errorf("invalid streamElementXPath '%s', err: %s", streamElementXPath, err.Error())
+    }
+    elemFilter := (*xpath.Expr)(nil)
+    if len(streamElementFilter) > 0 {
+        elemFilter, err = getQuery(streamElementFilter[0])
         if err != nil {
-            return nil, err
+            return nil, fmt.Errorf("invalid streamElementFilter '%s', err: %s", streamElementFilter[0], err.Error())
         }
     }
+    sp := &StreamParser{
+        p: createParser(r),
+    }
+    sp.p.streamElementXPath = elemXPath
+    sp.p.streamElementFilter = elemFilter
+    return sp, nil
+}
+
+// Read returns a target node that satisfies the XPath specified by the caller at StreamParser creation
+// time. If there is no more satisfying target node after reading the rest of the XML document, io.EOF
+// will be returned. If at any time an XML parsing error is encountered, the error will be returned and
+// stream parsing is stopped. Calling Read() after an error is returned (including io.EOF) is not
+// allowed; the behavior will be undefined. Also note, due to the streaming nature, calling Read() will
+// automatically remove any previous target node(s) from the document tree.
+func (sp *StreamParser) Read() (*Node, error) {
+    // Because this is a streaming read, we need to release/remove the last
+    // target node from the node tree to free up memory.
+    if sp.p.streamNode != nil {
+        removeFromTree(sp.p.streamNode)
+        sp.p.prev = sp.p.streamNodePrev
+        sp.p.streamNode = nil
+        sp.p.streamNodePrev = nil
+    }
+    return sp.p.parse()
 }
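
Because `Read()` releases the previously returned target node on the next call, callers that need values across iterations should copy them out before reading again. A minimal, self-contained consumption sketch of the new API under that assumption; the sample XML and the use of `InnerText()` are illustrative and not part of this commit:

```go
package main

import (
    "fmt"
    "io"
    "strings"

    "github.com/antchfx/xmlquery"
)

func main() {
    xml := `<AAA><BBB>b1</BBB><BBB>b2</BBB></AAA>`
    // Target every /AAA/BBB element, but only keep the ones whose text is not "b1".
    sp, err := xmlquery.CreateStreamParser(strings.NewReader(xml), "/AAA/BBB", "/AAA/BBB[. != 'b1']")
    if err != nil {
        panic(err)
    }
    var values []string
    for {
        n, err := sp.Read()
        if err == io.EOF {
            break
        }
        if err != nil {
            panic(err)
        }
        // Copy the text out now; the node is removed from the tree on the next Read().
        values = append(values, n.InnerText())
    }
    fmt.Println(values) // [b2]
}
```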
