Preserve whitespace in nodes before <head>

jhy · Mar 1, 2020 · 9675a92
1 parent 328f2e4
commit 9675a92
Show file tree

Hide file tree

Showing 7 changed files with 51 additions and 19 deletions.
diff --git a/CHANGES b/CHANGES
@@ -25,6 +25,8 @@ jsoup changelog
   * Improvement: added Elements#forms(), Elements#textNodes(), Elements#dataNodes(), and Elements#comments(), as a
     convenient way to get access to these node types directly from an element selection.
 
+  * Improvement: preserve whitespace before the html and head tags, if pretty-printing is off.
+
   * Bugfix: in a <select> tag, a second <optgroup> would not automatically close an earlier open <optgroup>
     <https://github.com/jhy/jsoup/issues/1313>
 

diff --git a/src/main/java/org/jsoup/nodes/TextNode.java b/src/main/java/org/jsoup/nodes/TextNode.java
@@ -81,11 +81,13 @@ public TextNode splitText(int offset) {
     }
 
 	void outerHtmlHead(Appendable accum, int depth, Document.OutputSettings out) throws IOException {
-        if (out.prettyPrint() && ((siblingIndex() == 0 && parentNode instanceof Element && ((Element) parentNode).tag().formatAsBlock() && !isBlank()) || (out.outline() && siblingNodes().size()>0 && !isBlank()) ))
+        final boolean prettyPrint = out.prettyPrint();
+        if (prettyPrint && ((siblingIndex() == 0 && parentNode instanceof Element && ((Element) parentNode).tag().formatAsBlock() && !isBlank()) || (out.outline() && siblingNodes().size()>0 && !isBlank()) ))
             indent(accum, depth, out);
 
-        boolean normaliseWhite = out.prettyPrint() && !Element.preserveWhitespace(parent());
-        Entities.escape(accum, coreValue(), out, false, normaliseWhite, false);
+        final boolean normaliseWhite = prettyPrint && !Element.preserveWhitespace(parentNode);
+        final boolean stripWhite = prettyPrint && parentNode instanceof Document;
+        Entities.escape(accum, coreValue(), out, false, normaliseWhite, stripWhite);
     }
 
 	void outerHtmlTail(Appendable accum, int depth, Document.OutputSettings out) {}

diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java
@@ -261,7 +261,9 @@ void insert(Token.Comment commentToken) {
 
     void insert(Token.Character characterToken) {
         final Node node;
-        final Element el = currentElement();
+        Element el = currentElement();
+        if (el == null)
+            el = doc; // allows for whitespace to be inserted into the doc root object (not on the stack)
         final String tagName = el.normalName();
         final String data = characterToken.getData();
 
@@ -338,13 +340,14 @@ boolean removeFromStack(Element el) {
         return false;
     }
 
-    void popStackToClose(String elName) {
+    Element popStackToClose(String elName) {
         for (int pos = stack.size() -1; pos >= 0; pos--) {
-            Element next = stack.get(pos);
+            Element el = stack.get(pos);
             stack.remove(pos);
-            if (next.normalName().equals(elName))
-                break;
+            if (el.normalName().equals(elName))
+                return el;
         }
+        return null;
     }
 
     // elnames is sorted, comes from Constants

diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java
@@ -20,7 +20,7 @@ enum HtmlTreeBuilderState {
     Initial {
         boolean process(Token t, HtmlTreeBuilder tb) {
             if (isWhitespace(t)) {
-                return true; // ignore whitespace
+                return true; // ignore whitespace until we get the first content
             } else if (t.isComment()) {
                 tb.insert(t.asComment());
             } else if (t.isDoctype()) {
@@ -50,7 +50,7 @@ boolean process(Token t, HtmlTreeBuilder tb) {
             } else if (t.isComment()) {
                 tb.insert(t.asComment());
             } else if (isWhitespace(t)) {
-                return true; // ignore whitespace
+                tb.insert(t.asCharacter()); // out of spec - include whitespace
             } else if (t.isStartTag() && t.asStartTag().normalName().equals("html")) {
                 tb.insert(t.asStartTag());
                 tb.transition(BeforeHead);
@@ -74,7 +74,7 @@ private boolean anythingElse(Token t, HtmlTreeBuilder tb) {
     BeforeHead {
         boolean process(Token t, HtmlTreeBuilder tb) {
             if (isWhitespace(t)) {
-                return true;
+                tb.insert(t.asCharacter()); // out of spec - include whitespace
             } else if (t.isComment()) {
                 tb.insert(t.asComment());
             } else if (t.isDoctype()) {
@@ -102,7 +102,7 @@ boolean process(Token t, HtmlTreeBuilder tb) {
     InHead {
         boolean process(Token t, HtmlTreeBuilder tb) {
             if (isWhitespace(t)) {
-                tb.insert(t.asCharacter());
+                tb.insert(t.asCharacter()); // out of spec - include whitespace
                 return true;
             }
             switch (t.type) {
@@ -1406,7 +1406,7 @@ boolean process(Token t, HtmlTreeBuilder tb) {
     AfterBody {
         boolean process(Token t, HtmlTreeBuilder tb) {
             if (isWhitespace(t)) {
-                return tb.process(t, InBody);
+                tb.insert(t.asCharacter()); // out of spec - include whitespace. spec would move into body
             } else if (t.isComment()) {
                 tb.insert(t.asComment()); // into html node
             } else if (t.isDoctype()) {
@@ -1507,9 +1507,17 @@ boolean process(Token t, HtmlTreeBuilder tb) {
         boolean process(Token t, HtmlTreeBuilder tb) {
             if (t.isComment()) {
                 tb.insert(t.asComment());
-            } else if (t.isDoctype() || isWhitespace(t) || (t.isStartTag() && t.asStartTag().normalName().equals("html"))) {
+            } else if (t.isDoctype() || (t.isStartTag() && t.asStartTag().normalName().equals("html"))) {
                 return tb.process(t, InBody);
-            } else if (t.isEOF()) {
+            } else if (isWhitespace(t)) {
+                // allows space after </html>, and put the body back on stack to allow subsequent tags if any
+                // todo - might be better for </body> and </html> to close them, allow trailing space, and then reparent
+                //  that space into body if other tags get re-added. but that's overkill for now
+                Element html = tb.popStackToClose("html");
+                tb.insert(t.asCharacter());
+                tb.stack.add(html);
+                tb.stack.add(html.selectFirst("body"));
+            }else if (t.isEOF()) {
                 // nice work chuck
             } else {
                 tb.error(this);
@@ -1550,7 +1558,7 @@ boolean process(Token t, HtmlTreeBuilder tb) {
     private static boolean isWhitespace(Token t) {
         if (t.isCharacter()) {
             String data = t.asCharacter().getData();
-            return isWhitespace(data);
+            return StringUtil.isBlank(data);
         }
         return false;
     }

diff --git a/src/test/java/org/jsoup/helper/W3CDomTest.java b/src/test/java/org/jsoup/helper/W3CDomTest.java
@@ -134,12 +134,12 @@ public void namespacePreservation() throws IOException {
         assertEquals("html", htmlEl.getNodeName());
 
         // inherits default namespace
-        Node head = htmlEl.getFirstChild();
+        Node head = htmlEl.getFirstChild().getNextSibling();
         assertEquals("http://www.w3.org/1999/xhtml", head.getNamespaceURI());
         assertEquals("head", head.getLocalName());
         assertEquals("head", head.getNodeName());
 
-        Node epubTitle = htmlEl.getChildNodes().item(2).getChildNodes().item(3);
+        Node epubTitle = htmlEl.getChildNodes().item(3).getChildNodes().item(3);
         assertEquals("Check", epubTitle.getTextContent());
         assertEquals("http://www.idpf.org/2007/ops", epubTitle.getNamespaceURI());
         assertEquals("title", epubTitle.getLocalName());

diff --git a/src/test/java/org/jsoup/parser/HtmlParserTest.java b/src/test/java/org/jsoup/parser/HtmlParserTest.java
@@ -1355,4 +1355,21 @@ public void testUNewlines() {
         doc = Jsoup.parse(html, "", Parser.htmlParser().settings(preserveCase));
         assertEquals("YES YES", doc.selectFirst("textarea").val());
     }
+
+    @Test public void preserveWhitespaceInHead() {
+        String html = "\n<!doctype html>\n<html>\n<head>\n<title>Hello</title>\n</head>\n<body>\n<p>One</p>\n</body>\n</html>\n";
+        Document doc = Jsoup.parse(html);
+        doc.outputSettings().prettyPrint(false);
+        System.out.println(doc.outerHtml());
+        assertEquals("<!doctype html>\n<html>\n<head>\n<title>Hello</title>\n</head>\n<body>\n<p>One</p>\n\n</body></html>\n", doc.outerHtml());
+    }
+
+    @Test public void handleContentAfterBody() {
+        String html = "<body>One</body>  <p>Hello!</p></html> <p>There</p>";
+        // todo - ideally would move that space after /html to the body when the There <p> is seen
+        Document doc = Jsoup.parse(html);
+        doc.outputSettings().prettyPrint(false);
+        System.out.println(doc.outerHtml());
+        assertEquals("<html><head></head><body>One  <p>Hello!</p><p>There</p></body></html> ", doc.outerHtml());
+    }
 }
diff --git a/src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java b/src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java
@@ -143,7 +143,7 @@ public void testDetectCharsetEncodingDeclaration() throws IOException, URISyntax
         InputStream inStream = new FileInputStream(xmlFile);
         Document doc = Jsoup.parse(inStream, null, "http://example.com/", Parser.xmlParser());
         assertEquals("ISO-8859-1", doc.charset().name());
-        assertEquals("<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?> <data>äöåéü</data>",
+        assertEquals("<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?><data>äöåéü</data>",
             TextUtil.stripNewlines(doc.html()));
     }