Skip to content

Commit dea4969

Browse files
committed
Updated AfterBody and AfterAfterBody to current spec
Don't pop the stack to close elements on </body> or </html>; leave them on the stack instead. Had to deviate from the spec slightly to allow whitespace to be added to the html or doc elements. (The goal of that is so that when pretty-printing is off, the output more closely resembles the input, by tracking newlines after </body> etc.) Fixes #1851
1 parent 21aac91 commit dea4969

File tree

6 files changed

+81
-14
lines changed

6 files changed

+81
-14
lines changed

CHANGES

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,11 @@ Release 1.16.1 [PENDING]
88
* Improvement: Calling Node.remove() on a node with no parent is now a no-op, vs a validation error.
99
<https://github.com/jhy/jsoup/issues/1898>
1010

11+
* Bugfix: aligned the HTML Tree Builder processing steps for AfterBody and AfterAfterBody to the updated WHATWG
12+
standard, to not pop the stack to close <body> or <html> elements. This prevents an errant </html> from closing preceding
13+
structure. Also added appropriate error message outputs in this case.
14+
<https://github.com/jhy/jsoup/issues/1851>
15+
1116
* Bugfix: Corrected support for ruby elements (<ruby>, <rp>, <rt>, and <rtc>) to current spec.
1217
<https://github.com/jhy/jsoup/issues/1294>
1318

src/main/java/org/jsoup/parser/HtmlTreeBuilder.java

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -301,9 +301,14 @@ void insert(Token.Comment commentToken) {
301301
insertNode(comment, commentToken);
302302
}
303303

304+
/** Inserts the provided character token into the current element. */
304305
void insert(Token.Character characterToken) {
306+
final Element el = currentElement(); // will be doc if no current element; allows for whitespace to be inserted into the doc root object (not on the stack)
307+
insert(characterToken, el);
308+
}
309+
310+
void insert(Token.Character characterToken, Element el) {
305311
final Node node;
306-
Element el = currentElement(); // will be doc if no current element; allows for whitespace to be inserted into the doc root object (not on the stack)
307312
final String tagName = el.normalName();
308313
final String data = characterToken.getData();
309314

@@ -317,6 +322,7 @@ else if (isContentForTagData(tagName))
317322
onNodeInserted(node, characterToken);
318323
}
319324

325+
/** Inserts the provided node; if the stack has not been set up yet, the node goes into the document. */
320326
private void insertNode(Node node, @Nullable Token token) {
321327
// if the stack hasn't been set up yet, elements (doctype, comments) go into the doc
322328
if (stack.isEmpty())
@@ -632,6 +638,20 @@ boolean inSelectScope(String targetName) {
632638
return false;
633639
}
634640

641+
/** Tests if there is some element on the stack that is not in the provided set. */
642+
boolean onStackNot(String[] allowedTags) {
643+
final int bottom = stack.size() -1;
644+
final int top = bottom > MaxScopeSearchDepth ? bottom - MaxScopeSearchDepth : 0;
645+
// don't walk too far up the tree
646+
647+
for (int pos = bottom; pos >= top; pos--) {
648+
final String elName = stack.get(pos).normalName();
649+
if (!inSorted(elName, allowedTags))
650+
return true;
651+
}
652+
return false;
653+
}
654+
635655
void setHeadElement(Element headElement) {
636656
this.headElement = headElement;
637657
}

src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java

Lines changed: 24 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -310,7 +310,8 @@ boolean process(Token t, HtmlTreeBuilder tb) {
310310
case EOF:
311311
if (tb.templateModeSize() > 0)
312312
return tb.process(t, InTemplate);
313-
// todo: error if stack contains something not dd, dt, li, p, tbody, td, tfoot, th, thead, tr, body, html
313+
if (tb.onStackNot(InBodyEndOtherErrors))
314+
tb.error(this);
314315
// stop parsing
315316
break;
316317
}
@@ -726,16 +727,22 @@ private boolean inBodyEndTag(Token t, HtmlTreeBuilder tb) {
726727
tb.error(this);
727728
return false;
728729
} else {
729-
// todo: error if stack contains something not dd, dt, li, optgroup, option, p, rp, rt, tbody, td, tfoot, th, thead, tr, body, html
730-
anyOtherEndTag(t, tb);
730+
if (tb.onStackNot(InBodyEndOtherErrors))
731+
tb.error(this);
731732
tb.transition(AfterBody);
732733
}
733734
break;
734735
case "html":
735-
boolean notIgnored = tb.processEndTag("body");
736-
if (notIgnored)
737-
return tb.process(endTag);
738-
break;
736+
if (!tb.onStack("body")) {
737+
tb.error(this);
738+
return false; // ignore
739+
} else {
740+
if (tb.onStackNot(InBodyEndOtherErrors))
741+
tb.error(this);
742+
tb.transition(AfterBody);
743+
return tb.process(t); // re-process
744+
}
745+
739746
case "form":
740747
if (!tb.onStack("template")) {
741748
Element currentForm = tb.getFormElement();
@@ -1594,7 +1601,12 @@ else if (name.equals("col")) {
15941601
AfterBody {
15951602
boolean process(Token t, HtmlTreeBuilder tb) {
15961603
if (isWhitespace(t)) {
1597-
tb.insert(t.asCharacter()); // out of spec - include whitespace. spec would move into body
1604+
// spec deviation - currently body is still on stack, but we want this to go to the html node
1605+
Element html = tb.getFromStack("html");
1606+
if (html != null)
1607+
tb.insert(t.asCharacter(), html);
1608+
else
1609+
tb.process(t, InBody); // will get into body
15981610
} else if (t.isComment()) {
15991611
tb.insert(t.asComment()); // into html node
16001612
} else if (t.isDoctype()) {
@@ -1607,7 +1619,6 @@ boolean process(Token t, HtmlTreeBuilder tb) {
16071619
tb.error(this);
16081620
return false;
16091621
} else {
1610-
if (tb.onStack("html")) tb.popStackToClose("html");
16111622
tb.transition(AfterAfterBody);
16121623
}
16131624
} else if (t.isEOF()) {
@@ -1699,7 +1710,9 @@ boolean process(Token t, HtmlTreeBuilder tb) {
16991710
} else if (t.isDoctype() || (t.isStartTag() && t.asStartTag().normalName().equals("html"))) {
17001711
return tb.process(t, InBody);
17011712
} else if (isWhitespace(t)) {
1702-
tb.insert(t.asCharacter());
1713+
// spec deviation - body and html still on stack, but want this space to go after </html>
1714+
Element doc = tb.getDocument();
1715+
tb.insert(t.asCharacter(), doc);
17031716
}else if (t.isEOF()) {
17041717
// nice work chuck
17051718
} else {
@@ -1786,6 +1799,7 @@ static final class Constants {
17861799
static final String[] InBodyEndClosers = new String[]{"address", "article", "aside", "blockquote", "button", "center", "details", "dir", "div",
17871800
"dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "listing", "menu",
17881801
"nav", "ol", "pre", "section", "summary", "ul"};
1802+
static final String[] InBodyEndOtherErrors = new String[] {"body", "dd", "dt", "html", "li", "optgroup", "option", "p", "rb", "rp", "rt", "rtc", "tbody", "td", "tfoot", "th", "thead", "tr"};
17891803
static final String[] InBodyEndAdoptionFormatters = new String[]{"a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u"};
17901804
static final String[] InBodyEndTableFosters = new String[]{"table", "tbody", "tfoot", "thead", "tr"};
17911805
static final String[] InTableToBody = new String[]{"tbody", "tfoot", "thead"};

src/main/java/org/jsoup/parser/Tag.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,8 @@ protected Tag clone() {
246246
"sub", "sup", "bdo", "iframe", "embed", "span", "input", "select", "textarea", "label", "button", "optgroup",
247247
"option", "legend", "datalist", "keygen", "output", "progress", "meter", "area", "param", "source", "track",
248248
"summary", "command", "device", "area", "basefont", "bgsound", "menuitem", "param", "source", "track",
249-
"data", "bdi", "s", "strike", "nobr"
249+
"data", "bdi", "s", "strike", "nobr",
250+
"rb" // deprecated but still known / special handling
250251
};
251252
private static final String[] emptyTags = {
252253
"meta", "link", "base", "frame", "img", "br", "wbr", "embed", "hr", "input", "keygen", "col", "command",

src/test/java/org/jsoup/parser/HtmlParserTest.java

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1698,11 +1698,38 @@ private boolean didAddElements(String input) {
16981698
parser.setTrackErrors(10);
16991699
Document doc = Jsoup.parse(html, parser);
17001700
ParseErrorList errors = parser.getErrors();
1701-
assertEquals(1, errors.size());
1701+
assertEquals(2, errors.size());
17021702
Element ruby = doc.expectFirst("ruby");
17031703
assertEquals(
17041704
"<ruby><div><rp>Hello</rp></div></ruby>",
17051705
TextUtil.stripNewlines(ruby.outerHtml()));
17061706
assertEquals("<1:16>: Unexpected StartTag token [<rp>] when in state [InBody]", errors.get(0).toString());
17071707
}
1708+
1709+
@Test void errorOnEofIfOpen() {
1710+
String html = "<div>";
1711+
Parser parser = Parser.htmlParser();
1712+
parser.setTrackErrors(10);
1713+
Document doc = Jsoup.parse(html, parser);
1714+
ParseErrorList errors = parser.getErrors();
1715+
assertEquals(1, errors.size());
1716+
assertEquals("Unexpected EOF token [] when in state [InBody]", errors.get(0).getErrorMessage());
1717+
}
1718+
1719+
@Test void NoErrorOnEofIfBodyOpen() {
1720+
String html = "<body>";
1721+
Parser parser = Parser.htmlParser();
1722+
parser.setTrackErrors(10);
1723+
Document doc = Jsoup.parse(html, parser);
1724+
ParseErrorList errors = parser.getErrors();
1725+
assertEquals(0, errors.size());
1726+
}
1727+
1728+
@Test void htmlClose() {
1729+
// https://github.com/jhy/jsoup/issues/1851
1730+
String html = "<body><div>One</html>Two</div></body>";
1731+
Document doc = Jsoup.parse(html);
1732+
//assertEquals("OneTwo", doc.expectFirst("body > div").text());
1733+
System.out.println(doc.html());
1734+
}
17081735
}

src/test/java/org/jsoup/parser/HtmlTreeBuilderStateTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ static void ensureSorted(List<Object[]> constants) {
4545
public void ensureArraysAreSorted() {
4646
List<Object[]> constants = findConstantArrays(Constants.class);
4747
ensureSorted(constants);
48-
assertEquals(38, constants.size());
48+
assertEquals(39, constants.size());
4949
}
5050

5151
@Test public void ensureTagSearchesAreKnownTags() {

0 commit comments

Comments
 (0)