Skip to content

Commit

Permalink
Preserve whitespace in nodes before <head>
Browse files Browse the repository at this point in the history
  • Loading branch information
jhy committed Mar 1, 2020
1 parent 328f2e4 commit 9675a92
Show file tree
Hide file tree
Showing 7 changed files with 51 additions and 19 deletions.
2 changes: 2 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ jsoup changelog
* Improvement: added Elements#forms(), Elements#textNodes(), Elements#dataNodes(), and Elements#comments(), as a
convenient way to get access to these node types directly from an element selection.

* Improvement: preserve whitespace before html and head tag, if pretty-printing is off.

* Bugfix: in a <select> tag, a second <optgroup> would not automatically close an earlier open <optgroup>
<https://github.com/jhy/jsoup/issues/1313>

Expand Down
8 changes: 5 additions & 3 deletions src/main/java/org/jsoup/nodes/TextNode.java
Original file line number Diff line number Diff line change
Expand Up @@ -81,11 +81,13 @@ public TextNode splitText(int offset) {
}

void outerHtmlHead(Appendable accum, int depth, Document.OutputSettings out) throws IOException {
if (out.prettyPrint() && ((siblingIndex() == 0 && parentNode instanceof Element && ((Element) parentNode).tag().formatAsBlock() && !isBlank()) || (out.outline() && siblingNodes().size()>0 && !isBlank()) ))
final boolean prettyPrint = out.prettyPrint();
if (prettyPrint && ((siblingIndex() == 0 && parentNode instanceof Element && ((Element) parentNode).tag().formatAsBlock() && !isBlank()) || (out.outline() && siblingNodes().size()>0 && !isBlank()) ))
indent(accum, depth, out);

boolean normaliseWhite = out.prettyPrint() && !Element.preserveWhitespace(parent());
Entities.escape(accum, coreValue(), out, false, normaliseWhite, false);
final boolean normaliseWhite = prettyPrint && !Element.preserveWhitespace(parentNode);
final boolean stripWhite = prettyPrint && parentNode instanceof Document;
Entities.escape(accum, coreValue(), out, false, normaliseWhite, stripWhite);
}

void outerHtmlTail(Appendable accum, int depth, Document.OutputSettings out) {}
Expand Down
13 changes: 8 additions & 5 deletions src/main/java/org/jsoup/parser/HtmlTreeBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,9 @@ void insert(Token.Comment commentToken) {

void insert(Token.Character characterToken) {
final Node node;
final Element el = currentElement();
Element el = currentElement();
if (el == null)
el = doc; // allows for whitespace to be inserted into the doc root object (not on the stack)
final String tagName = el.normalName();
final String data = characterToken.getData();

Expand Down Expand Up @@ -338,13 +340,14 @@ boolean removeFromStack(Element el) {
return false;
}

void popStackToClose(String elName) {
Element popStackToClose(String elName) {
for (int pos = stack.size() -1; pos >= 0; pos--) {
Element next = stack.get(pos);
Element el = stack.get(pos);
stack.remove(pos);
if (next.normalName().equals(elName))
break;
if (el.normalName().equals(elName))
return el;
}
return null;
}

// elnames is sorted, comes from Constants
Expand Down
24 changes: 16 additions & 8 deletions src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ enum HtmlTreeBuilderState {
Initial {
boolean process(Token t, HtmlTreeBuilder tb) {
if (isWhitespace(t)) {
return true; // ignore whitespace
return true; // ignore whitespace until we get the first content
} else if (t.isComment()) {
tb.insert(t.asComment());
} else if (t.isDoctype()) {
Expand Down Expand Up @@ -50,7 +50,7 @@ boolean process(Token t, HtmlTreeBuilder tb) {
} else if (t.isComment()) {
tb.insert(t.asComment());
} else if (isWhitespace(t)) {
return true; // ignore whitespace
tb.insert(t.asCharacter()); // out of spec - include whitespace
} else if (t.isStartTag() && t.asStartTag().normalName().equals("html")) {
tb.insert(t.asStartTag());
tb.transition(BeforeHead);
Expand All @@ -74,7 +74,7 @@ private boolean anythingElse(Token t, HtmlTreeBuilder tb) {
BeforeHead {
boolean process(Token t, HtmlTreeBuilder tb) {
if (isWhitespace(t)) {
return true;
tb.insert(t.asCharacter()); // out of spec - include whitespace
} else if (t.isComment()) {
tb.insert(t.asComment());
} else if (t.isDoctype()) {
Expand Down Expand Up @@ -102,7 +102,7 @@ boolean process(Token t, HtmlTreeBuilder tb) {
InHead {
boolean process(Token t, HtmlTreeBuilder tb) {
if (isWhitespace(t)) {
tb.insert(t.asCharacter());
tb.insert(t.asCharacter()); // out of spec - include whitespace
return true;
}
switch (t.type) {
Expand Down Expand Up @@ -1406,7 +1406,7 @@ boolean process(Token t, HtmlTreeBuilder tb) {
AfterBody {
boolean process(Token t, HtmlTreeBuilder tb) {
if (isWhitespace(t)) {
return tb.process(t, InBody);
tb.insert(t.asCharacter()); // out of spec - include whitespace. spec would move into body
} else if (t.isComment()) {
tb.insert(t.asComment()); // into html node
} else if (t.isDoctype()) {
Expand Down Expand Up @@ -1507,9 +1507,17 @@ boolean process(Token t, HtmlTreeBuilder tb) {
boolean process(Token t, HtmlTreeBuilder tb) {
if (t.isComment()) {
tb.insert(t.asComment());
} else if (t.isDoctype() || isWhitespace(t) || (t.isStartTag() && t.asStartTag().normalName().equals("html"))) {
} else if (t.isDoctype() || (t.isStartTag() && t.asStartTag().normalName().equals("html"))) {
return tb.process(t, InBody);
} else if (t.isEOF()) {
} else if (isWhitespace(t)) {
// allows space after </html>, and put the body back on stack to allow subsequent tags if any
// todo - might be better for </body> and </html> to close them, allow trailing space, and then reparent
// that space into body if other tags get re-added. but that's overkill for now
Element html = tb.popStackToClose("html");
tb.insert(t.asCharacter());
tb.stack.add(html);
tb.stack.add(html.selectFirst("body"));
}else if (t.isEOF()) {
// nice work chuck
} else {
tb.error(this);
Expand Down Expand Up @@ -1550,7 +1558,7 @@ boolean process(Token t, HtmlTreeBuilder tb) {
private static boolean isWhitespace(Token t) {
if (t.isCharacter()) {
String data = t.asCharacter().getData();
return isWhitespace(data);
return StringUtil.isBlank(data);
}
return false;
}
Expand Down
4 changes: 2 additions & 2 deletions src/test/java/org/jsoup/helper/W3CDomTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -134,12 +134,12 @@ public void namespacePreservation() throws IOException {
assertEquals("html", htmlEl.getNodeName());

// inherits default namespace
Node head = htmlEl.getFirstChild();
Node head = htmlEl.getFirstChild().getNextSibling();
assertEquals("http://www.w3.org/1999/xhtml", head.getNamespaceURI());
assertEquals("head", head.getLocalName());
assertEquals("head", head.getNodeName());

Node epubTitle = htmlEl.getChildNodes().item(2).getChildNodes().item(3);
Node epubTitle = htmlEl.getChildNodes().item(3).getChildNodes().item(3);
assertEquals("Check", epubTitle.getTextContent());
assertEquals("http://www.idpf.org/2007/ops", epubTitle.getNamespaceURI());
assertEquals("title", epubTitle.getLocalName());
Expand Down
17 changes: 17 additions & 0 deletions src/test/java/org/jsoup/parser/HtmlParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -1355,4 +1355,21 @@ public void testUNewlines() {
doc = Jsoup.parse(html, "", Parser.htmlParser().settings(preserveCase));
assertEquals("YES YES", doc.selectFirst("textarea").val());
}

/**
 * With pretty-printing off, whitespace around the html, head and body tags is kept in the output.
 * Per the expected string below: the newline before the doctype is dropped, the newline between
 * &lt;/body&gt; and &lt;/html&gt; ends up inside the body, and the newline after &lt;/html&gt; is retained.
 */
@Test public void preserveWhitespaceInHead() {
    String html = "\n<!doctype html>\n<html>\n<head>\n<title>Hello</title>\n</head>\n<body>\n<p>One</p>\n</body>\n</html>\n";
    Document doc = Jsoup.parse(html);
    doc.outputSettings().prettyPrint(false);
    assertEquals("<!doctype html>\n<html>\n<head>\n<title>Hello</title>\n</head>\n<body>\n<p>One</p>\n\n</body></html>\n", doc.outerHtml());
}

/**
 * Content that follows &lt;/body&gt; and &lt;/html&gt; is still parsed into the body, while the whitespace
 * seen directly after &lt;/html&gt; stays after the html element in the serialized output.
 */
@Test public void handleContentAfterBody() {
    String html = "<body>One</body> <p>Hello!</p></html> <p>There</p>";
    // todo - ideally would move that space after /html to the body when the There <p> is seen
    Document doc = Jsoup.parse(html);
    doc.outputSettings().prettyPrint(false);
    assertEquals("<html><head></head><body>One <p>Hello!</p><p>There</p></body></html> ", doc.outerHtml());
}
}
2 changes: 1 addition & 1 deletion src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ public void testDetectCharsetEncodingDeclaration() throws IOException, URISyntax
InputStream inStream = new FileInputStream(xmlFile);
Document doc = Jsoup.parse(inStream, null, "http://example.com/", Parser.xmlParser());
assertEquals("ISO-8859-1", doc.charset().name());
assertEquals("<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?> <data>äöåéü</data>",
assertEquals("<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?><data>äöåéü</data>",
TextUtil.stripNewlines(doc.html()));
}

Expand Down

0 comments on commit 9675a92

Please sign in to comment.