Skip to content

Commit

Permalink
XML allows for comments and processing instructions to be present before the start and after the end of the root element.
Browse files Browse the repository at this point in the history
Currently, `FactoryAdapter` does not capture those nodes, and `XMLLoader.loadXML` does not provide access to anything other than the root element anyway.

This pull request addresses the issue.

Note: at least with the JDK's Xerces, whitespace in the prolog and epilogue gets lost in parsing: the parser does not fire any whitespace-related events.
  • Loading branch information
dubinsky committed Oct 1, 2021
1 parent df9759b commit 156f2ea
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 24 deletions.
31 changes: 22 additions & 9 deletions jvm/src/test/scala/scala/xml/XMLTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -586,7 +586,7 @@ class XMLTestJVM {
def issue508commentParsing: Unit = {
// confirm that comments are processed correctly now
roundtrip("<a><!-- comment --> suffix</a>")
roundtrip("<a>prefix <!-- comment --> suffix</a>")
roundtrip("<a>prefix <!-- comment --> <!-- comment2 --> suffix</a>")
roundtrip("<a>prefix <b><!-- comment --></b> suffix</a>")
roundtrip("<a>prefix <b><!-- multi-\nline\n comment --></b> suffix</a>")
roundtrip("""<a>prefix <b><!-- multi-
Expand All @@ -596,13 +596,7 @@ class XMLTestJVM {
// confirm that processing instructions were always processed correctly
roundtrip("<a><?target content ?> suffix</a>")
roundtrip("<a>prefix <?target content ?> suffix</a>")
roundtrip("<a>prefix <b><?target content?></b> suffix</a>")

// TODO since XMLLoader retrieves FactoryAdapter.rootNode,
// capturing comments before and after the root element is not currently possible
// (by the way, the same applies to processing instructions).
//check("<!-- prologue --><a>text</a>")
//check("<a>text</a><!-- epilogue -->")
roundtrip("<a>prefix <b><?target content?> </b> suffix</a>")
}

@UnitTest
Expand All @@ -613,7 +607,26 @@ class XMLTestJVM {
roundtrip("""<a>prefix <b><![CDATA[
| multi-
| line cdata
| section]]></b> suffix</a>""".stripMargin)
| section]]> </b> suffix</a>""".stripMargin)
}

/** Parses `xml` into all of its top-level nodes and asserts that re-serializing them reproduces the input exactly. */
def roundtripNodes(xml: String): Unit = {
  val serialized: String = XML.loadStringNodes(xml).map(_.toString).mkString
  assertEquals(xml, serialized)
}

@UnitTest
def xmlLoaderLoadNodes: Unit = {
  // Comments and processing instructions placed before the root element.
  val withProlog: Seq[String] = Seq(
    "<!-- prolog --><a>text</a>",
    "<!-- prolog --><?target content ?><!-- comment2 --><a>text</a>",
    """<!-- prolog
      | --><?target content ?><!--
      | comment2 --><a>text</a>""".stripMargin
  )

  // Comments and processing instructions placed after the root element.
  val withEpilogue: Seq[String] = Seq(
    "<a>text</a><!-- epilogue -->",
    "<a>text</a><!-- epilogue --><?target content ?><!-- comment2 -->"
  )

  (withProlog ++ withEpilogue).foreach(roundtripNodes)

  // Known limitation: at least with the JDK's Xerces, whitespace in the prolog and epilogue
  // is lost in parsing, because the parser does not fire any whitespace-related events there.
  // Consequently these inputs do not round-trip:
  //   roundtripNodes("<!-- c --> <a/>")
  //   roundtripNodes("<a/> <!-- epilogue -->")
}

@UnitTest
Expand Down
35 changes: 28 additions & 7 deletions shared/src/main/scala/scala/xml/factory/XMLLoader.scala
Original file line number Diff line number Diff line change
Expand Up @@ -51,19 +51,29 @@ trait XMLLoader[T <: Node] {
* The methods available in scala.xml.XML use the XML parser in the JDK.
*/
def loadXML(source: InputSource, parser: SAXParser): T = {
  // All parsing work is delegated to `parse`, which obtains its own adapter,
  // so no separate adapter needs to be requested here.
  val result: FactoryAdapter = parse(source, parser)
  // `rootElem` is declared as a plain Node on FactoryAdapter; narrow it to this loader's element type.
  result.rootElem.asInstanceOf[T]
}

/**
 * Parses `source` with `parser` and returns every top-level node in document order:
 * the nodes captured before the root element, the root element itself,
 * and the nodes captured after it.
 */
def loadXMLNodes(source: InputSource, parser: SAXParser): Seq[Node] = {
  val handler: FactoryAdapter = parse(source, parser)
  val rootAndAfter: List[Node] = handler.rootElem :: handler.epilogue
  handler.prolog ++ rootAndAfter
}

/**
 * Runs `parser` over `source` with a handler obtained from `adapter` and returns that handler,
 * so that callers can read whatever it captured (root element, prolog, epilogue).
 */
private def parse(source: InputSource, parser: SAXParser): FactoryAdapter = {
  val result: FactoryAdapter = adapter

  try {
    // Also register the handler for lexical events on the parser.
    parser.setProperty("http://xml.org/sax/properties/lexical-handler", result)
  } catch {
    // Best effort: a parser that does not recognize the property is still usable,
    // it just will not deliver lexical events to the handler.
    case _: SAXNotRecognizedException =>
  }

  // TopScope is pushed for the duration of the parse and popped again afterwards.
  result.scopeStack = TopScope :: result.scopeStack
  parser.parse(source, result)
  result.scopeStack = result.scopeStack.tail

  result
}

/** Loads XML from the given file, file descriptor, or filename. */
Expand All @@ -80,4 +90,15 @@ trait XMLLoader[T <: Node] {

/** Loads XML from the given String. */
def loadString(string: String): T = {
  val source: InputSource = fromString(string)
  loadXML(source, parser)
}
}

/** Load XML nodes, including comments and processing instructions that precede and follow the root element. */
def loadFileNodes(file: File): Seq[Node] = loadXMLNodes(fromFile(file), parser)

/** Same as `loadFileNodes(file)`, reading from a file descriptor. */
def loadFileNodes(fd: FileDescriptor): Seq[Node] = loadXMLNodes(fromFile(fd), parser)

/** Same as `loadFileNodes(file)`, reading from the file with the given name. */
def loadFileNodes(name: String): Seq[Node] = loadXMLNodes(fromFile(name), parser)

/** Load all top-level XML nodes from the given input stream. */
def loadNodes(is: InputStream): Seq[Node] = loadXMLNodes(fromInputStream(is), parser)

/** Load all top-level XML nodes from the given reader. */
def loadNodes(reader: Reader): Seq[Node] = loadXMLNodes(fromReader(reader), parser)

/** Load all top-level XML nodes from the given system identifier. */
def loadNodes(sysID: String): Seq[Node] = loadXMLNodes(fromSysId(sysID), parser)

/** Load all top-level XML nodes from the given input source. */
def loadNodes(source: InputSource): Seq[Node] = loadXMLNodes(source, parser)

/**
 * Load all top-level XML nodes from the given URL.
 *
 * The stream is opened by this method, so it is also closed here, even when
 * obtaining the parser or parsing itself fails.
 */
def loadNodes(url: URL): Seq[Node] = {
  val stream: InputStream = url.openStream()
  try loadXMLNodes(fromInputStream(stream), parser)
  finally stream.close()
}

/** Load all top-level XML nodes from the given String. */
def loadStringNodes(string: String): Seq[Node] = loadXMLNodes(fromString(string), parser)
}
31 changes: 23 additions & 8 deletions shared/src/main/scala/scala/xml/parsing/FactoryAdapter.scala
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,11 @@ trait ConsoleErrorHandler extends DefaultHandler2 {
* underlying SAX parser.
*/
abstract class FactoryAdapter extends DefaultHandler2 with factory.XMLLoader[Node] {
var prolog: List[Node] = List.empty
var rootElem: Node = _
var epilogue: List[Node] = List.empty

val buffer = new StringBuilder()
val buffer: StringBuilder = new StringBuilder()
private var inCDATA: Boolean = false

/** List of attributes
Expand All @@ -51,28 +53,28 @@ abstract class FactoryAdapter extends DefaultHandler2 with factory.XMLLoader[Nod
*
* @since 2.0.0
*/
var attribStack = List.empty[MetaData]
var attribStack: List[MetaData] = List.empty
/** List of elements
*
* Previously was a mutable [[scala.collection.mutable.Stack Stack]], but is now a mutable reference to an immutable [[scala.collection.immutable.List List]].
*
* @since 2.0.0
*/
var hStack = List.empty[Node] // [ element ] contains siblings
var hStack: List[Node] = List.empty // [ element ] contains siblings
/** List of element names
*
* Previously was a mutable [[scala.collection.mutable.Stack Stack]], but is now a mutable reference to an immutable [[scala.collection.immutable.List List]].
*
* @since 2.0.0
*/
var tagStack = List.empty[String]
var tagStack: List[String] = List.empty
/** List of namespaces
*
* Previously was a mutable [[scala.collection.mutable.Stack Stack]], but is now a mutable reference to an immutable [[scala.collection.immutable.List List]].
*
* @since 2.0.0
*/
var scopeStack = List.empty[NamespaceBinding]
var scopeStack: List[NamespaceBinding] = List.empty

var curTag: String = _
var capture: Boolean = false
Expand Down Expand Up @@ -123,7 +125,7 @@ abstract class FactoryAdapter extends DefaultHandler2 with factory.XMLLoader[Nod
// ContentHandler methods
//

val normalizeWhitespace = false
val normalizeWhitespace: Boolean = false

/**
* Capture characters, possibly normalizing whitespace.
Expand Down Expand Up @@ -177,13 +179,20 @@ abstract class FactoryAdapter extends DefaultHandler2 with factory.XMLLoader[Nod
attributes: Attributes): Unit =
{
captureText()

// capture the prolog at the start of the root element
if (tagStack.isEmpty) {
prolog = hStack.reverse
hStack = List.empty
}

tagStack = curTag :: tagStack
curTag = qname

val localName = splitName(qname)._2
capture = nodeContainsText(localName)

hStack = null :: hStack
hStack = null :: hStack
var m: MetaData = Null
var scpe: NamespaceBinding =
if (scopeStack.isEmpty) TopScope
Expand All @@ -193,7 +202,7 @@ abstract class FactoryAdapter extends DefaultHandler2 with factory.XMLLoader[Nod
val qname = attributes getQName i
val value = attributes getValue i
val (pre, key) = splitName(qname)
def nullIfEmpty(s: String) = if (s == "") null else s
def nullIfEmpty(s: String): String = if (s == "") null else s

if (pre == "xmlns" || (pre == null && qname == "xmlns")) {
val arg = if (pre == null) null else key
Expand Down Expand Up @@ -250,6 +259,12 @@ abstract class FactoryAdapter extends DefaultHandler2 with factory.XMLLoader[Nod
capture = curTag != null && nodeContainsText(curTag) // root level
}

/**
 * End of the document: separates the nodes that follow the root element (the epilogue)
 * from the root element itself.
 */
override def endDocument(): Unit = {
  // Nodes are prepended to hStack, so reversing it yields document order:
  // the oldest entry (expected to be the root element) first, then whatever followed it.
  hStack.reverse match {
    case root :: afterRoot =>
      epilogue = afterRoot
      hStack = root :: Nil
    case Nil =>
      // Nothing was captured (for example, the parser reports the end of the document
      // after a fatal error); leave the state untouched instead of failing on an empty stack.
      ()
  }
}

/**
* Processing instruction.
*/
Expand Down
2 changes: 2 additions & 0 deletions shared/src/main/scala/scala/xml/parsing/MarkupParser.scala
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ trait MarkupParser extends MarkupParserCommon with TokenTests {
var extIndex = -1

/** holds temporary values of pos */
// Note: this is clearly an override, but marking it as such causes a "...cannot override a mutable variable"
// error with Scala 3; whether it works with Scala 3 when not explicitly marked as an override remains to be seen.
var tmppos: Int = _

/** holds the next character */
Expand Down

0 comments on commit 156f2ea

Please sign in to comment.