Skip to content

Commit 4558cdd

Browse files
committed
fix #133. supports xml.ProcInst
1 parent deb27cf commit 4558cdd

File tree

4 files changed

+97
-44
lines changed

4 files changed

+97
-44
lines changed

node.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ const (
3131
AttributeNode
3232
// NotationNode represents a directive in the document (for example, <!text...>).
3333
NotationNode
34+
// ProcessingInstruction represents an XML processing instruction (e.g., <?target instruction?>).
35+
ProcessingInstruction
3436
)
3537

3638
type Attr struct {
@@ -39,6 +41,12 @@ type Attr struct {
3941
NamespaceURI string
4042
}
4143

44+
// ProcInstData holds the target and instruction text of an XML processing instruction.
45+
type ProcInstData struct {
46+
Target string
47+
Inst string
48+
}
49+
4250
// A Node consists of a NodeType and some Data (tag name for
4351
// element nodes, content for text) and is part of a tree of Nodes.
4452
type Node struct {
@@ -49,6 +57,7 @@ type Node struct {
4957
Prefix string
5058
NamespaceURI string
5159
Attr []Attr
60+
ProcInst *ProcInstData
5261

5362
level int // node level in the tree
5463
LineNumber int // line number where this node appears in the source XML
@@ -261,6 +270,13 @@ func outputXML(w io.Writer, n *Node, preserveSpaces bool, config *outputConfigur
261270
if err != nil {
262271
return
263272
}
273+
case ProcessingInstruction:
274+
if len(n.ProcInst.Inst) > 0 {
275+
_, err = fmt.Fprintf(w, "<?%s %s?>", n.ProcInst.Target, n.ProcInst.Inst)
276+
} else {
277+
_, err = fmt.Fprintf(w, "<?%s?>", n.ProcInst.Target)
278+
}
279+
return
264280
default:
265281
if err = indent.Open(); err != nil {
266282
return

node_test.go

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -325,7 +325,7 @@ func TestRemoveFromTree(t *testing.T) {
325325
t.Run("remove decl node works", func(t *testing.T) {
326326
doc := parseXML()
327327
procInst := doc.FirstChild
328-
testValue(t, procInst.Type, DeclarationNode)
328+
testValue(t, procInst.Type, ProcessingInstruction)
329329
RemoveFromTree(procInst)
330330
verifyNodePointers(t, doc)
331331
testValue(t, doc.OutputXMLWithOptions(WithoutPreserveSpace()),
@@ -812,3 +812,39 @@ func TestOutputXMLWithSingleQuotes(t *testing.T) {
812812
t.Errorf(`expected "%s", obtained "%s"`, expected, output)
813813
}
814814
}
815+
816+
func TestProcessingInstruction(t *testing.T) {
817+
xml := `<?ProcInstTag random string ?><?AnotherProcInst a="b"?><a/>`
818+
819+
doc, err := Parse(strings.NewReader(xml))
820+
if err != nil {
821+
t.Fatalf("Parse failed: %v", err)
822+
}
823+
824+
firstNode := doc.FirstChild
825+
if firstNode.Type != ProcessingInstruction {
826+
t.Errorf("First node type should be ProcessingInstruction, got %d", firstNode.Type)
827+
}
828+
829+
piNode := FindOne(doc, "//ProcInstTag")
830+
if piNode == nil {
831+
t.Fatal("FindOne should be find one, but got nil")
832+
}
833+
if piNode.ProcInst == nil {
834+
t.Fatal("ProcInstData is nil")
835+
}
836+
837+
if piNode.ProcInst.Target != "ProcInstTag" {
838+
t.Errorf("ProcInstData.Target mismatch: expected 'ProcInstTag', got '%s'", piNode.ProcInst.Target)
839+
}
840+
841+
if piNode.ProcInst.Inst != "random string" {
842+
t.Errorf("ProcInstData.Inst mismatch: expected 'random string', got '%s'", piNode.ProcInst.Inst)
843+
}
844+
845+
output := doc.OutputXML(true)
846+
expected := `<?ProcInstTag random string?><?AnotherProcInst a="b"?><a></a>`
847+
if output != expected {
848+
t.Errorf("Output mismatch:\nExpected: %s\nGot: %s", expected, output)
849+
}
850+
}

parse.go

Lines changed: 42 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ import (
1717

1818
var xmlMIMERegex = regexp.MustCompile(`(?i)((application|image|message|model)/((\w|\.|-)+\+?)?|text/)(wb)?xml`)
1919

20-
2120
// LoadURL loads the XML document from the specified URL.
2221
func LoadURL(url string) (*Node, error) {
2322
resp, err := http.Get(url)
@@ -41,7 +40,7 @@ func Parse(r io.Reader) (*Node, error) {
4140
func ParseWithOptions(r io.Reader, options ParserOptions) (*Node, error) {
4241
var data []byte
4342
var lineStarts []int
44-
43+
4544
// If line numbers are requested, read all data for position tracking
4645
if options.WithLineNumbers {
4746
var err error
@@ -50,7 +49,7 @@ func ParseWithOptions(r io.Reader, options ParserOptions) (*Node, error) {
5049
return nil, err
5150
}
5251
r = bytes.NewReader(data)
53-
52+
5453
// Pre-calculate line starts
5554
lineStarts = []int{0}
5655
for i, b := range data {
@@ -90,7 +89,7 @@ func ParseWithOptions(r io.Reader, options ParserOptions) (*Node, error) {
9089
data: data,
9190
lineStarts: lineStarts,
9291
}
93-
92+
9493
err = annotator.annotateLineNumbers(p.doc)
9594
if err != nil {
9695
return nil, err
@@ -115,8 +114,8 @@ type parser struct {
115114
reader *cachedReader // Need to maintain a reference to the reader, so we can determine whether a node contains CDATA.
116115
once sync.Once
117116
space2prefix map[string]*xmlnsPrefix
118-
currentLine int // Track current line number during parsing
119-
lastProcessedPos int // Track how much cached data we've already processed for line counting
117+
currentLine int // Track current line number during parsing
118+
lastProcessedPos int // Track how much cached data we've already processed for line counting
120119
}
121120

122121
type xmlnsPrefix struct {
@@ -144,14 +143,14 @@ func createParser(r io.Reader) *parser {
144143
// updateLineNumber scans only new cached data for newlines to update current line position
145144
func (p *parser) updateLineNumber() {
146145
cached := p.reader.CacheWithLimit(-1) // Get all cached data
147-
146+
148147
// Only process data we haven't seen before
149148
for i := p.lastProcessedPos; i < len(cached); i++ {
150149
if cached[i] == '\n' {
151150
p.currentLine++
152151
}
153152
}
154-
153+
155154
// Update our position to avoid reprocessing this data
156155
p.lastProcessedPos = len(cached)
157156
}
@@ -166,10 +165,10 @@ func (p *parser) parse() (*Node, error) {
166165
p.reader.StartCaching()
167166
tok, err := p.decoder.Token()
168167
p.reader.StopCaching()
169-
168+
170169
// Update line number based on processed content
171170
p.updateLineNumber()
172-
171+
173172
if err != nil {
174173
return nil, err
175174
}
@@ -335,7 +334,7 @@ func (p *parser) parse() (*Node, error) {
335334
AddSibling(p.prev.Parent, node)
336335
}
337336
case xml.ProcInst: // Processing Instruction
338-
if p.prev.Type != DeclarationNode {
337+
if !(p.prev.Type == DeclarationNode || p.prev.Type == ProcessingInstruction) {
339338
p.level++
340339
}
341340
node := &Node{Type: DeclarationNode, Data: tok.Target, level: p.level, LineNumber: p.currentLine}
@@ -346,6 +345,10 @@ func (p *parser) parse() (*Node, error) {
346345
AddAttr(node, pair[:i], strings.Trim(pair[i+1:], `"'`))
347346
}
348347
}
348+
if tok.Target != "xml" {
349+
node.Type = ProcessingInstruction
350+
node.ProcInst = &ProcInstData{Target: tok.Target, Inst: strings.TrimSpace(string(tok.Inst))}
351+
}
349352
if p.level == p.prev.level {
350353
AddSibling(p.prev, node)
351354
} else if p.level > p.prev.level {
@@ -502,11 +505,11 @@ func (p *lineNumberAnnotator) getLineForPosition(pos int) int {
502505
if pos < 0 {
503506
return 1
504507
}
505-
508+
506509
line := 1
507510
for i, start := range p.lineStarts {
508511
if pos < start {
509-
return i // i is the line number (1-based because lineStarts[0] = 0 for line 1)
512+
return i // i is the line number (1-based because lineStarts[0] = 0 for line 1)
510513
}
511514
line = i + 1
512515
}
@@ -538,7 +541,7 @@ func (p *lineNumberAnnotator) annotateNodesByPosition(node *Node) {
538541
if node == nil {
539542
return
540543
}
541-
544+
542545
// Annotate current node if not already done
543546
if node.LineNumber == 0 {
544547
switch node.Type {
@@ -547,19 +550,17 @@ func (p *lineNumberAnnotator) annotateNodesByPosition(node *Node) {
547550
case CommentNode:
548551
node.LineNumber = p.findCommentPosition(node.Data)
549552
case DeclarationNode:
550-
if node.Data == "xml" {
551-
node.LineNumber = p.findDeclarationLine()
552-
} else {
553-
node.LineNumber = p.findProcessingInstructionPosition(node.Data)
554-
}
553+
node.LineNumber = p.findDeclarationLine()
554+
case ProcessingInstruction:
555+
node.LineNumber = p.findProcessingInstructionPosition(node.Data)
555556
case TextNode, CharDataNode:
556557
text := strings.TrimSpace(node.Data)
557558
if text != "" {
558559
node.LineNumber = p.findTextPosition(text)
559560
}
560561
}
561562
}
562-
563+
563564
// Recursively annotate children
564565
for child := node.FirstChild; child != nil; child = child.NextSibling {
565566
p.annotateNodesByPosition(child)
@@ -568,10 +569,10 @@ func (p *lineNumberAnnotator) annotateNodesByPosition(node *Node) {
568569

569570
// State to track positions as we traverse the document
570571
type positionTracker struct {
571-
currentPos int
572-
elementCounts map[string]int
573-
commentCounts map[string]int
574-
textCounts map[string]int
572+
currentPos int
573+
elementCounts map[string]int
574+
commentCounts map[string]int
575+
textCounts map[string]int
575576
}
576577

577578
// findElementPosition finds the line number for the next occurrence of an element
@@ -583,7 +584,7 @@ func (p *lineNumberAnnotator) findElementPosition(name string) int {
583584
textCounts: make(map[string]int),
584585
}
585586
}
586-
587+
587588
p.tracker.elementCounts[name]++
588589
return p.findNthElementOccurrence(name, p.tracker.elementCounts[name])
589590
}
@@ -593,17 +594,17 @@ func (p *lineNumberAnnotator) findNthElementOccurrence(name string, n int) int {
593594
count := 0
594595
pos := 0
595596
dataStr := string(p.data)
596-
597+
597598
// Look for both prefixed and non-prefixed versions
598599
patterns := []string{
599-
fmt.Sprintf("<%s", name), // <name
600-
fmt.Sprintf(":%s", name), // prefix:name
600+
fmt.Sprintf("<%s", name), // <name
601+
fmt.Sprintf(":%s", name), // prefix:name
601602
}
602-
603+
603604
for {
604605
earliestPos := len(p.data)
605606
foundPattern := ""
606-
607+
607608
// Find the earliest occurrence of any pattern
608609
for _, pattern := range patterns {
609610
foundPos := strings.Index(dataStr[pos:], pattern)
@@ -615,15 +616,15 @@ func (p *lineNumberAnnotator) findNthElementOccurrence(name string, n int) int {
615616
}
616617
}
617618
}
618-
619+
619620
if earliestPos == len(p.data) {
620621
break // No more occurrences found
621622
}
622-
623+
623624
// Validate the match
624625
nextCharPos := earliestPos + len(foundPattern)
625626
isValidMatch := false
626-
627+
627628
if foundPattern[0] == '<' {
628629
// Direct element match like <name
629630
if nextCharPos < len(p.data) {
@@ -654,7 +655,7 @@ func (p *lineNumberAnnotator) findNthElementOccurrence(name string, n int) int {
654655
}
655656
}
656657
}
657-
658+
658659
if isValidMatch {
659660
count++
660661
if count == n {
@@ -669,10 +670,10 @@ func (p *lineNumberAnnotator) findNthElementOccurrence(name string, n int) int {
669670
return p.getLineForPosition(earliestPos)
670671
}
671672
}
672-
673+
673674
pos = earliestPos + 1
674675
}
675-
676+
676677
return 1
677678
}
678679

@@ -685,7 +686,7 @@ func (p *lineNumberAnnotator) findCommentPosition(content string) int {
685686
textCounts: make(map[string]int),
686687
}
687688
}
688-
689+
689690
p.tracker.commentCounts[content]++
690691
return p.findNthCommentOccurrence(content, p.tracker.commentCounts[content])
691692
}
@@ -695,7 +696,7 @@ func (p *lineNumberAnnotator) findNthCommentOccurrence(content string, n int) in
695696
pattern := fmt.Sprintf("<!--%s-->", content)
696697
count := 0
697698
pos := 0
698-
699+
699700
for {
700701
foundPos := strings.Index(string(p.data[pos:]), pattern)
701702
if foundPos < 0 {
@@ -730,7 +731,7 @@ func (p *lineNumberAnnotator) findTextPosition(text string) int {
730731
textCounts: make(map[string]int),
731732
}
732733
}
733-
734+
734735
p.tracker.textCounts[text]++
735736
return p.findNthTextOccurrence(text, p.tracker.textCounts[text])
736737
}
@@ -739,7 +740,7 @@ func (p *lineNumberAnnotator) findTextPosition(text string) int {
739740
func (p *lineNumberAnnotator) findNthTextOccurrence(text string, n int) int {
740741
count := 0
741742
pos := 0
742-
743+
743744
for {
744745
foundPos := strings.Index(string(p.data[pos:]), text)
745746
if foundPos < 0 {
@@ -765,16 +766,14 @@ func (p *lineNumberAnnotator) findProcessingInstructionPosition(target string) i
765766
return 1
766767
}
767768

768-
769-
770769
// LoadURLWithLineNumbers loads the XML document from the specified URL with line number annotations.
771770
func LoadURLWithLineNumbers(url string) (*Node, error) {
772771
resp, err := http.Get(url)
773772
if err != nil {
774773
return nil, err
775774
}
776775
defer resp.Body.Close()
777-
776+
778777
if xmlMIMERegex.MatchString(resp.Header.Get("Content-Type")) {
779778
return ParseWithOptions(resp.Body, ParserOptions{WithLineNumbers: true})
780779
}

query.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,8 @@ func (x *NodeNavigator) NodeType() xpath.NodeType {
165165
return xpath.AttributeNode
166166
}
167167
return xpath.ElementNode
168+
case ProcessingInstruction: // Keep backward compatibility
169+
return xpath.ElementNode
168170
}
169171
panic(fmt.Sprintf("unknown XML node type: %v", x.curr.Type))
170172
}

0 commit comments

Comments
 (0)