Fix #133: support xml.ProcInst (XML processing instructions)

zhengchun · zhengchun · commit 4558cdd81f55 · 2025-09-27T19:30:35.000+08:00
diff --git a/node.go b/node.go
@@ -31,6 +31,8 @@ const (
 	AttributeNode
 	// NotationNode is a directive represents in document (for example, <!text...>).
 	NotationNode
+	// ProcessingInstruction represents an XML processing instruction (e.g., <?target instruction?>).
+	ProcessingInstruction
 )
 
 type Attr struct {
@@ -39,6 +41,12 @@ type Attr struct {
 	NamespaceURI string
 }
 
+// ProcInstData holds the two parts of an XML processing instruction, written as <?Target Inst?>.
+type ProcInstData struct {
+	Target string // name right after "<?", e.g. "ProcInstTag"
+	Inst   string // text following the target; the parser stores it with surrounding whitespace trimmed
+}
+
 // A Node consists of a NodeType and some Data (tag name for
 // element nodes, content for text) and are part of a tree of Nodes.
 type Node struct {
@@ -49,6 +57,7 @@ type Node struct {
 	Prefix       string
 	NamespaceURI string
 	Attr         []Attr
+	ProcInst     *ProcInstData
 
 	level      int // node level in the tree
 	LineNumber int // line number where this node appears in the source XML
@@ -261,6 +270,13 @@ func outputXML(w io.Writer, n *Node, preserveSpaces bool, config *outputConfigur
 		if err != nil {
 			return
 		}
+	case ProcessingInstruction:
+		if len(n.ProcInst.Inst) > 0 {
+			_, err = fmt.Fprintf(w, "<?%s %s?>", n.ProcInst.Target, n.ProcInst.Inst)
+		} else {
+			_, err = fmt.Fprintf(w, "<?%s?>", n.ProcInst.Target)
+		}
+		return
 	default:
 		if err = indent.Open(); err != nil {
 			return
diff --git a/node_test.go b/node_test.go
@@ -325,7 +325,7 @@ func TestRemoveFromTree(t *testing.T) {
 	t.Run("remove decl node works", func(t *testing.T) {
 		doc := parseXML()
 		procInst := doc.FirstChild
-		testValue(t, procInst.Type, DeclarationNode)
+		testValue(t, procInst.Type, ProcessingInstruction)
 		RemoveFromTree(procInst)
 		verifyNodePointers(t, doc)
 		testValue(t, doc.OutputXMLWithOptions(WithoutPreserveSpace()),
@@ -812,3 +812,39 @@ func TestOutputXMLWithSingleQuotes(t *testing.T) {
 		t.Errorf(`expected "%s", obtained "%s"`, expected, output)
 	}
 }
+
+func TestProcessingInstruction(t *testing.T) {
+	xml := `<?ProcInstTag random string ?><?AnotherProcInst a="b"?><a/>` // two PIs ahead of the root element; the first has a space before "?>"
+
+	doc, err := Parse(strings.NewReader(xml))
+	if err != nil {
+		t.Fatalf("Parse failed: %v", err)
+	}
+
+	firstNode := doc.FirstChild // the first PI is expected to be the document's first child
+	if firstNode.Type != ProcessingInstruction {
+		t.Errorf("First node type should be ProcessingInstruction, got %d", firstNode.Type)
+	}
+
+	piNode := FindOne(doc, "//ProcInstTag") // PI nodes are reported to XPath as elements, so a name test selects them
+	if piNode == nil {
+		t.Fatal("FindOne should be find one, but got nil")
+	}
+	if piNode.ProcInst == nil {
+		t.Fatal("ProcInstData is nil")
+	}
+
+	if piNode.ProcInst.Target != "ProcInstTag" {
+		t.Errorf("ProcInstData.Target mismatch: expected 'ProcInstTag', got '%s'", piNode.ProcInst.Target)
+	}
+
+	if piNode.ProcInst.Inst != "random string" { // the parser trims surrounding whitespace from the instruction text
+		t.Errorf("ProcInstData.Inst mismatch: expected 'random string', got '%s'", piNode.ProcInst.Inst)
+	}
+
+	output := doc.OutputXML(true)
+	expected := `<?ProcInstTag random string?><?AnotherProcInst a="b"?><a></a>` // trimmed space is not restored; <a/> is written as <a></a>
+	if output != expected {
+		t.Errorf("Output mismatch:\nExpected: %s\nGot: %s", expected, output)
+	}
+}
diff --git a/parse.go b/parse.go
@@ -17,7 +17,6 @@ import (
 
 var xmlMIMERegex = regexp.MustCompile(`(?i)((application|image|message|model)/((\w|\.|-)+\+?)?|text/)(wb)?xml`)
 
-
 // LoadURL loads the XML document from the specified URL.
 func LoadURL(url string) (*Node, error) {
 	resp, err := http.Get(url)
@@ -41,7 +40,7 @@ func Parse(r io.Reader) (*Node, error) {
 func ParseWithOptions(r io.Reader, options ParserOptions) (*Node, error) {
 	var data []byte
 	var lineStarts []int
-	
+
 	// If line numbers are requested, read all data for position tracking
 	if options.WithLineNumbers {
 		var err error
@@ -50,7 +49,7 @@ func ParseWithOptions(r io.Reader, options ParserOptions) (*Node, error) {
 			return nil, err
 		}
 		r = bytes.NewReader(data)
-		
+
 		// Pre-calculate line starts
 		lineStarts = []int{0}
 		for i, b := range data {
@@ -90,7 +89,7 @@ func ParseWithOptions(r io.Reader, options ParserOptions) (*Node, error) {
 				data:       data,
 				lineStarts: lineStarts,
 			}
-			
+
 			err = annotator.annotateLineNumbers(p.doc)
 			if err != nil {
 				return nil, err
@@ -115,8 +114,8 @@ type parser struct {
 	reader              *cachedReader // Need to maintain a reference to the reader, so we can determine whether a node contains CDATA.
 	once                sync.Once
 	space2prefix        map[string]*xmlnsPrefix
-	currentLine         int           // Track current line number during parsing
-	lastProcessedPos    int           // Track how much cached data we've already processed for line counting
+	currentLine         int // Track current line number during parsing
+	lastProcessedPos    int // Track how much cached data we've already processed for line counting
 }
 
 type xmlnsPrefix struct {
@@ -144,14 +143,14 @@ func createParser(r io.Reader) *parser {
 // updateLineNumber scans only new cached data for newlines to update current line position
 func (p *parser) updateLineNumber() {
 	cached := p.reader.CacheWithLimit(-1) // Get all cached data
-	
+
 	// Only process data we haven't seen before
 	for i := p.lastProcessedPos; i < len(cached); i++ {
 		if cached[i] == '\n' {
 			p.currentLine++
 		}
 	}
-	
+
 	// Update our position to avoid reprocessing this data
 	p.lastProcessedPos = len(cached)
 }
@@ -166,10 +165,10 @@ func (p *parser) parse() (*Node, error) {
 		p.reader.StartCaching()
 		tok, err := p.decoder.Token()
 		p.reader.StopCaching()
-		
+
 		// Update line number based on processed content
 		p.updateLineNumber()
-		
+
 		if err != nil {
 			return nil, err
 		}
@@ -335,7 +334,7 @@ func (p *parser) parse() (*Node, error) {
 				AddSibling(p.prev.Parent, node)
 			}
 		case xml.ProcInst: // Processing Instruction
-			if p.prev.Type != DeclarationNode {
+			if !(p.prev.Type == DeclarationNode || p.prev.Type == ProcessingInstruction) {
 				p.level++
 			}
 			node := &Node{Type: DeclarationNode, Data: tok.Target, level: p.level, LineNumber: p.currentLine}
@@ -346,6 +345,10 @@ func (p *parser) parse() (*Node, error) {
 					AddAttr(node, pair[:i], strings.Trim(pair[i+1:], `"'`))
 				}
 			}
+			if tok.Target != "xml" {
+				node.Type = ProcessingInstruction
+				node.ProcInst = &ProcInstData{Target: tok.Target, Inst: strings.TrimSpace(string(tok.Inst))}
+			}
 			if p.level == p.prev.level {
 				AddSibling(p.prev, node)
 			} else if p.level > p.prev.level {
@@ -502,11 +505,11 @@ func (p *lineNumberAnnotator) getLineForPosition(pos int) int {
 	if pos < 0 {
 		return 1
 	}
-	
+
 	line := 1
 	for i, start := range p.lineStarts {
 		if pos < start {
-			return i  // i is the line number (1-based because lineStarts[0] = 0 for line 1)
+			return i // i is the line number (1-based because lineStarts[0] = 0 for line 1)
 		}
 		line = i + 1
 	}
@@ -538,7 +541,7 @@ func (p *lineNumberAnnotator) annotateNodesByPosition(node *Node) {
 	if node == nil {
 		return
 	}
-	
+
 	// Annotate current node if not already done
 	if node.LineNumber == 0 {
 		switch node.Type {
@@ -547,19 +550,17 @@ func (p *lineNumberAnnotator) annotateNodesByPosition(node *Node) {
 		case CommentNode:
 			node.LineNumber = p.findCommentPosition(node.Data)
 		case DeclarationNode:
-			if node.Data == "xml" {
-				node.LineNumber = p.findDeclarationLine()
-			} else {
-				node.LineNumber = p.findProcessingInstructionPosition(node.Data)
-			}
+			node.LineNumber = p.findDeclarationLine()
+		case ProcessingInstruction:
+			node.LineNumber = p.findProcessingInstructionPosition(node.Data)
 		case TextNode, CharDataNode:
 			text := strings.TrimSpace(node.Data)
 			if text != "" {
 				node.LineNumber = p.findTextPosition(text)
 			}
 		}
 	}
-	
+
 	// Recursively annotate children
 	for child := node.FirstChild; child != nil; child = child.NextSibling {
 		p.annotateNodesByPosition(child)
@@ -568,10 +569,10 @@ func (p *lineNumberAnnotator) annotateNodesByPosition(node *Node) {
 
 // State to track positions as we traverse the document
 type positionTracker struct {
-	currentPos     int
-	elementCounts  map[string]int
-	commentCounts  map[string]int
-	textCounts     map[string]int
+	currentPos    int
+	elementCounts map[string]int
+	commentCounts map[string]int
+	textCounts    map[string]int
 }
 
 // findElementPosition finds the line number for the next occurrence of an element
@@ -583,7 +584,7 @@ func (p *lineNumberAnnotator) findElementPosition(name string) int {
 			textCounts:    make(map[string]int),
 		}
 	}
-	
+
 	p.tracker.elementCounts[name]++
 	return p.findNthElementOccurrence(name, p.tracker.elementCounts[name])
 }
@@ -593,17 +594,17 @@ func (p *lineNumberAnnotator) findNthElementOccurrence(name string, n int) int {
 	count := 0
 	pos := 0
 	dataStr := string(p.data)
-	
+
 	// Look for both prefixed and non-prefixed versions
 	patterns := []string{
-		fmt.Sprintf("<%s", name),     // <name
-		fmt.Sprintf(":%s", name),     // prefix:name
+		fmt.Sprintf("<%s", name), // <name
+		fmt.Sprintf(":%s", name), // prefix:name
 	}
-	
+
 	for {
 		earliestPos := len(p.data)
 		foundPattern := ""
-		
+
 		// Find the earliest occurrence of any pattern
 		for _, pattern := range patterns {
 			foundPos := strings.Index(dataStr[pos:], pattern)
@@ -615,15 +616,15 @@ func (p *lineNumberAnnotator) findNthElementOccurrence(name string, n int) int {
 				}
 			}
 		}
-		
+
 		if earliestPos == len(p.data) {
 			break // No more occurrences found
 		}
-		
+
 		// Validate the match
 		nextCharPos := earliestPos + len(foundPattern)
 		isValidMatch := false
-		
+
 		if foundPattern[0] == '<' {
 			// Direct element match like <name
 			if nextCharPos < len(p.data) {
@@ -654,7 +655,7 @@ func (p *lineNumberAnnotator) findNthElementOccurrence(name string, n int) int {
 				}
 			}
 		}
-		
+
 		if isValidMatch {
 			count++
 			if count == n {
@@ -669,10 +670,10 @@ func (p *lineNumberAnnotator) findNthElementOccurrence(name string, n int) int {
 				return p.getLineForPosition(earliestPos)
 			}
 		}
-		
+
 		pos = earliestPos + 1
 	}
-	
+
 	return 1
 }
 
@@ -685,7 +686,7 @@ func (p *lineNumberAnnotator) findCommentPosition(content string) int {
 			textCounts:    make(map[string]int),
 		}
 	}
-	
+
 	p.tracker.commentCounts[content]++
 	return p.findNthCommentOccurrence(content, p.tracker.commentCounts[content])
 }
@@ -695,7 +696,7 @@ func (p *lineNumberAnnotator) findNthCommentOccurrence(content string, n int) in
 	pattern := fmt.Sprintf("<!--%s-->", content)
 	count := 0
 	pos := 0
-	
+
 	for {
 		foundPos := strings.Index(string(p.data[pos:]), pattern)
 		if foundPos < 0 {
@@ -730,7 +731,7 @@ func (p *lineNumberAnnotator) findTextPosition(text string) int {
 			textCounts:    make(map[string]int),
 		}
 	}
-	
+
 	p.tracker.textCounts[text]++
 	return p.findNthTextOccurrence(text, p.tracker.textCounts[text])
 }
@@ -739,7 +740,7 @@ func (p *lineNumberAnnotator) findTextPosition(text string) int {
 func (p *lineNumberAnnotator) findNthTextOccurrence(text string, n int) int {
 	count := 0
 	pos := 0
-	
+
 	for {
 		foundPos := strings.Index(string(p.data[pos:]), text)
 		if foundPos < 0 {
@@ -765,16 +766,14 @@ func (p *lineNumberAnnotator) findProcessingInstructionPosition(target string) i
 	return 1
 }
 
-
-
 // LoadURLWithLineNumbers loads the XML document from the specified URL with line number annotations.
 func LoadURLWithLineNumbers(url string) (*Node, error) {
 	resp, err := http.Get(url)
 	if err != nil {
 		return nil, err
 	}
 	defer resp.Body.Close()
-	
+
 	if xmlMIMERegex.MatchString(resp.Header.Get("Content-Type")) {
 		return ParseWithOptions(resp.Body, ParserOptions{WithLineNumbers: true})
 	}
diff --git a/query.go b/query.go
@@ -165,6 +165,8 @@ func (x *NodeNavigator) NodeType() xpath.NodeType {
 			return xpath.AttributeNode
 		}
 		return xpath.ElementNode
+	case ProcessingInstruction: // Keep backward compatibility
+		return xpath.ElementNode
 	}
 	panic(fmt.Sprintf("unknown XML node type: %v", x.curr.Type))
 }

Original file line number	Diff line number	Diff line change
`@@ -165,6 +165,8 @@ func (x *NodeNavigator) NodeType() xpath.NodeType {`
`165`	`165`	`return xpath.AttributeNode`
`166`	`166`	`}`
`167`	`167`	`return xpath.ElementNode`
	`168`	`+ case ProcessingInstruction: // Keep backward compatibility`
	`169`	`+ return xpath.ElementNode`
`168`	`170`	`}`
`169`	`171`	`panic(fmt.Sprintf("unknown XML node type: %v", x.curr.Type))`
`170`	`172`	`}`