@@ -3,10 +3,12 @@ package xmlquery
3
3
import (
4
4
"encoding/xml"
5
5
"errors"
6
+ "fmt"
6
7
"io"
7
8
"net/http"
8
9
"strings"
9
10
11
+ "github.com/antchfx/xpath"
10
12
"golang.org/x/net/html/charset"
11
13
)
12
14
@@ -20,12 +22,30 @@ func LoadURL(url string) (*Node, error) {
20
22
return Parse (resp .Body )
21
23
}
22
24
25
+ // Parse returns the parse tree for the XML from the given Reader.
26
+ func Parse (r io.Reader ) (* Node , error ) {
27
+ p := createParser (r )
28
+ for {
29
+ _ , err := p .parse ()
30
+ if err == io .EOF {
31
+ return p .doc , nil
32
+ }
33
+ if err != nil {
34
+ return nil , err
35
+ }
36
+ }
37
+ }
38
+
23
39
type parser struct {
24
- decoder * xml.Decoder
25
- doc * Node
26
- space2prefix map [string ]string
27
- level int
28
- prev * Node
40
+ decoder * xml.Decoder
41
+ doc * Node
42
+ space2prefix map [string ]string
43
+ level int
44
+ prev * Node
45
+ streamElementXPath * xpath.Expr // Under streaming mode, this specifies the xpath to the target element node(s).
46
+ streamElementFilter * xpath.Expr // If specified, it provides a futher filtering on the target element.
47
+ streamNode * Node // Need to remmeber the last target node So we can clean it up upon next Read() call.
48
+ streamNodePrev * Node // Need to remember target node's prev so upon target node removal, we can restore correct prev.
29
49
}
30
50
31
51
func createParser (r io.Reader ) * parser {
@@ -43,6 +63,8 @@ func createParser(r io.Reader) *parser {
43
63
}
44
64
45
65
func (p * parser ) parse () (* Node , error ) {
66
+ var streamElementNodeCounter int
67
+
46
68
for {
47
69
tok , err := p .decoder .Token ()
48
70
if err != nil {
@@ -99,10 +121,54 @@ func (p *parser) parse() (*Node, error) {
99
121
}
100
122
addSibling (p .prev .Parent , node )
101
123
}
124
+ // If we're in the streaming mode, we need to remember the node if it is the target node
125
+ // so that when we finish processing the node's EndElement, we know how/what to return to
126
+ // caller. Also we need to remove the target node from the tree upon next Read() call so
127
+ // memory doesn't grow unbounded.
128
+ if p .streamElementXPath != nil {
129
+ if p .streamNode == nil {
130
+ if QuerySelector (p .doc , p .streamElementXPath ) != nil {
131
+ p .streamNode = node
132
+ p .streamNodePrev = p .prev
133
+ streamElementNodeCounter = 1
134
+ }
135
+ } else {
136
+ streamElementNodeCounter ++
137
+ }
138
+ }
102
139
p .prev = node
103
140
p .level ++
104
141
case xml.EndElement :
105
142
p .level --
143
+ // If we're in streaming mode, and we already have a potential streaming
144
+ // target node identified (p.streamNode != nil) then we need to check if
145
+ // this is the real one we want to return to caller.
146
+ if p .streamNode != nil {
147
+ streamElementNodeCounter --
148
+ if streamElementNodeCounter == 0 {
149
+ // Now we know this element node at least passes the initial
150
+ // p.streamElementXPath check and is a potential target node candidate.
151
+ // We need to have 1 more check with p.streamElementFilter (if given) to
152
+ // ensure it is really the element node we want.
153
+ // The reason we need a two-step check process is because the following
154
+ // situation:
155
+ // <AAA><BBB>b1</BBB></AAA>
156
+ // And say the p.streamElementXPath = "/AAA/BBB[. != 'b1']". Now during
157
+ // xml.StartElement time, the <BBB> node is still empty, so it will pass
158
+ // the p.streamElementXPath check. However, eventually we know this <BBB>
159
+ // shouldn't be returned to the caller. Having a second more fine-grained
160
+ // filter check ensures that. So in this case, the caller should really
161
+ // setup the stream parser with:
162
+ // streamElementXPath = "/AAA/BBB"
163
+ // streamElementFilter = "/AAA/BBB[. != 'b1']"
164
+ if p .streamElementFilter == nil || QuerySelector (p .doc , p .streamElementFilter ) != nil {
165
+ return p .streamNode , nil
166
+ }
167
+ // otherwise, this isn't our target node. clean things up.
168
+ p .streamNode = nil
169
+ p .streamNodePrev = nil
170
+ }
171
+ }
106
172
case xml.CharData :
107
173
node := & Node {Type : CharDataNode , Data : string (tok ), level : p .level }
108
174
if p .level == p .prev .level {
@@ -150,16 +216,87 @@ func (p *parser) parse() (*Node, error) {
150
216
}
151
217
}
152
218
153
- // Parse returns the parse tree for the XML from the given Reader.
154
- func Parse (r io.Reader ) (* Node , error ) {
155
- p := createParser (r )
156
- for {
157
- _ , err := p .parse ()
158
- if err == io .EOF {
159
- return p .doc , nil
160
- }
219
+ // StreamParser enables loading and parsing an XML document in a streaming fashion.
220
+ type StreamParser struct {
221
+ p * parser
222
+ }
223
+
224
+ // CreateStreamParser creates a StreamParser. Argument streamElementXPath is required.
225
+ // Argument streamElementFilter is optional and should only be used in advanced scenarios.
226
+ //
227
+ // Scenario 1: simple case:
228
+ // xml := `<AAA><BBB>b1</BBB><BBB>b2</BBB></AAA>`
229
+ // sp, err := CreateStreamParser(strings.NewReader(xml), "/AAA/BBB")
230
+ // if err != nil {
231
+ // panic(err)
232
+ // }
233
+ // for {
234
+ // n, err := sp.Read()
235
+ // if err != nil {
236
+ // break
237
+ // }
238
+ // fmt.Println(n.OutputXML(true))
239
+ // }
240
+ // Output will be:
241
+ // <BBB>b1</BBB>
242
+ // <BBB>b2</BBB>
243
+ //
244
+ // Scenario 2: advanced case:
245
+ // xml := `<AAA><BBB>b1</BBB><BBB>b2</BBB></AAA>`
246
+ // sp, err := CreateStreamParser(strings.NewReader(xml), "/AAA/BBB", "/AAA/BBB[. != 'b1']")
247
+ // if err != nil {
248
+ // panic(err)
249
+ // }
250
+ // for {
251
+ // n, err := sp.Read()
252
+ // if err != nil {
253
+ // break
254
+ // }
255
+ // fmt.Println(n.OutputXML(true))
256
+ // }
257
+ // Output will be:
258
+ // <BBB>b2</BBB>
259
+ //
260
+ // As the argument names indicate, streamElementXPath should be used for providing xpath query pointing
261
+ // to the target element node only, no extra filtering on the element itself or its children; while
262
+ // streamElementFilter, if needed, can provide additional filtering on the target element and its children.
263
+ //
264
+ // CreateStreamParser returns error if either streamElementXPath or streamElementFilter, if provided, cannot
265
+ // be successfully parsed and compiled into a valid xpath query.
266
+ func CreateStreamParser (r io.Reader , streamElementXPath string , streamElementFilter ... string ) (* StreamParser , error ) {
267
+ elemXPath , err := getQuery (streamElementXPath )
268
+ if err != nil {
269
+ return nil , fmt .Errorf ("invalid streamElementXPath '%s', err: %s" , streamElementXPath , err .Error ())
270
+ }
271
+ elemFilter := (* xpath .Expr )(nil )
272
+ if len (streamElementFilter ) > 0 {
273
+ elemFilter , err = getQuery (streamElementFilter [0 ])
161
274
if err != nil {
162
- return nil , err
275
+ return nil , fmt . Errorf ( "invalid streamElementFilter '%s', err: %s" , streamElementFilter [ 0 ], err . Error ())
163
276
}
164
277
}
278
+ sp := & StreamParser {
279
+ p : createParser (r ),
280
+ }
281
+ sp .p .streamElementXPath = elemXPath
282
+ sp .p .streamElementFilter = elemFilter
283
+ return sp , nil
284
+ }
285
+
286
+ // Read returns a target node that satisifies the XPath specified by caller at StreamParser creation
287
+ // time. If there is no more satisifying target node after reading the rest of the XML document, io.EOF
288
+ // will be returned. At any time, any XML parsing error encountered, the error will be returned and
289
+ // the stream parsing is stopped. Calling Read() after an error is returned (including io.EOF) is not
290
+ // allowed the behavior will be undefined. Also note, due to the streaming nature, calling Read() will
291
+ // automatically remove any previous target node(s) from the document tree.
292
+ func (sp * StreamParser ) Read () (* Node , error ) {
293
+ // Because this is a streaming read, we need to release/remove last
294
+ // target node from the node tree to free up memory.
295
+ if sp .p .streamNode != nil {
296
+ removeFromTree (sp .p .streamNode )
297
+ sp .p .prev = sp .p .streamNodePrev
298
+ sp .p .streamNode = nil
299
+ sp .p .streamNodePrev = nil
300
+ }
301
+ return sp .p .parse ()
165
302
}
0 commit comments