Skip to content

Commit 3ddc80a

Browse files
Make consecutive hyphens in comments a non-error
Also allow `<!-->` at the end of an (IE conditional) comment. See whatwg/html#1356 and whatwg/html#1456.
1 parent ab28833 commit 3ddc80a

File tree

2 files changed

+190
-19
lines changed

2 files changed

+190
-19
lines changed

src/nu/validator/htmlparser/impl/ErrorReportingTokenizer.java

Lines changed: 3 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2009-2013 Mozilla Foundation
2+
* Copyright (c) 2009-2017 Mozilla Foundation
33
*
44
* Permission is hereby granted, free of charge, to any person obtaining a
55
* copy of this software and associated documentation files (the "Software"),
@@ -395,8 +395,8 @@ private boolean isAstralPrivateUse(int c) {
395395
err("Nameless doctype.");
396396
}
397397

398-
@Override protected void errConsecutiveHyphens() throws SAXException {
399-
err("Consecutive hyphens did not terminate a comment. \u201C--\u201D is not permitted inside a comment, but e.g. \u201C- -\u201D is.");
398+
@Override protected void errNestedComment() throws SAXException {
399+
err("Saw \u201C<!--\u201D within a comment. Probable cause: Nested comment (not allowed).");
400400
}
401401

402402
@Override protected void errPrematureEndOfComment() throws SAXException {
@@ -678,10 +678,6 @@ private boolean isAstralPrivateUse(int c) {
678678
err("Missing space before doctype name.");
679679
}
680680

681-
@Override protected void errHyphenHyphenBang() throws SAXException {
682-
err("\u201C--!\u201D found in comment.");
683-
}
684-
685681
@Override protected void errNcrControlChar() throws SAXException {
686682
err("Character reference expands to a control character ("
687683
+ toUPlusString((char) value) + ").");

src/nu/validator/htmlparser/impl/Tokenizer.java

Lines changed: 187 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -223,6 +223,14 @@ public class Tokenizer implements Locator, Locator2 {
223223

224224
public static final int AMBIGUOUS_AMPERSAND = 75;
225225

226+
public static final int COMMENT_LESSTHAN = 76;
227+
228+
public static final int COMMENT_LESSTHAN_BANG = 77;
229+
230+
public static final int COMMENT_LESSTHAN_BANG_DASH = 78;
231+
232+
public static final int COMMENT_LESSTHAN_BANG_DASH_DASH = 79;
233+
226234
/**
227235
* Magic value for UTF-16 operations.
228236
*/
@@ -1031,9 +1039,8 @@ private void maybeAppendSpaceToBogusComment() throws SAXException {
10311039

10321040
// ]NOCPP]
10331041

1034-
@Inline private void adjustDoubleHyphenAndAppendToStrBufAndErr(char c)
1042+
@Inline private void adjustDoubleHyphenAndAppendToStrBufAndErr(char c, boolean reportedConsecutiveHyphens)
10351043
throws SAXException {
1036-
errConsecutiveHyphens();
10371044
// [NOCPP[
10381045
switch (commentPolicy) {
10391046
case ALTER_INFOSET:
@@ -1044,7 +1051,9 @@ private void maybeAppendSpaceToBogusComment() throws SAXException {
10441051
appendStrBuf('-');
10451052
// CPPONLY: MOZ_FALLTHROUGH;
10461053
case ALLOW:
1047-
warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
1054+
if (!reportedConsecutiveHyphens) {
1055+
warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
1056+
}
10481057
// ]NOCPP]
10491058
appendStrBuf(c);
10501059
// [NOCPP[
@@ -1466,6 +1475,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
14661475
@SuppressWarnings("unused") private int stateLoop(int state, char c,
14671476
int pos, @NoLength char[] buf, boolean reconsume, int returnState,
14681477
int endPos) throws SAXException {
1478+
boolean reportedConsecutiveHyphens = false;
14691479
/*
14701480
* Idioms used in this code:
14711481
*
@@ -2542,6 +2552,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
25422552
}
25432553
// CPPONLY: MOZ_FALLTHROUGH;
25442554
case COMMENT_START:
2555+
reportedConsecutiveHyphens = false;
25452556
commentstartloop: for (;;) {
25462557
if (++pos == endPos) {
25472558
break stateloop;
@@ -2574,6 +2585,10 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
25742585
*/
25752586
state = transition(state, Tokenizer.DATA, reconsume, pos);
25762587
continue stateloop;
2588+
case '<':
2589+
appendStrBuf(c);
2590+
state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos);
2591+
continue stateloop;
25772592
case '\r':
25782593
appendStrBufCarriageReturn();
25792594
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
@@ -2619,6 +2634,10 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
26192634
state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos);
26202635
break commentloop;
26212636
// continue stateloop;
2637+
case '<':
2638+
appendStrBuf(c);
2639+
state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos);
2640+
continue stateloop;
26222641
case '\r':
26232642
appendStrBufCarriageReturn();
26242643
break stateloop;
@@ -2661,6 +2680,10 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
26612680
state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);
26622681
break commentenddashloop;
26632682
// continue stateloop;
2683+
case '<':
2684+
appendStrBuf(c);
2685+
state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos);
2686+
continue stateloop;
26642687
case '\r':
26652688
appendStrBufCarriageReturn();
26662689
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
@@ -2715,11 +2738,16 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
27152738
* Append a U+002D HYPHEN-MINUS (-) character to
27162739
* the comment token's data.
27172740
*/
2718-
adjustDoubleHyphenAndAppendToStrBufAndErr(c);
2741+
adjustDoubleHyphenAndAppendToStrBufAndErr(c, reportedConsecutiveHyphens);
2742+
reportedConsecutiveHyphens = true;
27192743
/*
27202744
* Stay in the comment end state.
27212745
*/
27222746
continue;
2747+
case '<':
2748+
appendStrBuf(c);
2749+
state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos);
2750+
continue stateloop;
27232751
case '\r':
27242752
adjustDoubleHyphenAndAppendToStrBufCarriageReturn();
27252753
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
@@ -2729,7 +2757,6 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
27292757
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
27302758
continue stateloop;
27312759
case '!':
2732-
errHyphenHyphenBang();
27332760
appendStrBuf(c);
27342761
state = transition(state, Tokenizer.COMMENT_END_BANG, reconsume, pos);
27352762
continue stateloop;
@@ -2742,7 +2769,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
27422769
* and the input character to the comment
27432770
* token's data.
27442771
*/
2745-
adjustDoubleHyphenAndAppendToStrBufAndErr(c);
2772+
adjustDoubleHyphenAndAppendToStrBufAndErr(c, reportedConsecutiveHyphens);
2773+
reportedConsecutiveHyphens = true;
27462774
/*
27472775
* Switch to the comment state.
27482776
*/
@@ -2812,6 +2840,148 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
28122840
continue stateloop;
28132841
}
28142842
}
2843+
case COMMENT_LESSTHAN:
2844+
for (;;) {
2845+
if (++pos == endPos) {
2846+
break stateloop;
2847+
}
2848+
c = checkChar(buf, pos);
2849+
switch (c) {
2850+
case '!':
2851+
appendStrBuf(c);
2852+
state = transition(state, Tokenizer.COMMENT_LESSTHAN_BANG, reconsume, pos);
2853+
continue stateloop;
2854+
case '<':
2855+
appendStrBuf(c);
2856+
state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos);
2857+
continue stateloop;
2858+
case '-':
2859+
appendStrBuf(c);
2860+
state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos);
2861+
continue stateloop;
2862+
case '\r':
2863+
appendStrBufCarriageReturn();
2864+
break stateloop;
2865+
case '\n':
2866+
appendStrBufLineFeed();
2867+
continue;
2868+
case '\u0000':
2869+
c = '\uFFFD';
2870+
// fall thru
2871+
default:
2872+
appendStrBuf(c);
2873+
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2874+
continue stateloop;
2875+
}
2876+
}
2877+
case COMMENT_LESSTHAN_BANG:
2878+
for (;;) {
2879+
if (++pos == endPos) {
2880+
break stateloop;
2881+
}
2882+
c = checkChar(buf, pos);
2883+
switch (c) {
2884+
case '-':
2885+
appendStrBuf(c);
2886+
state = transition(state, Tokenizer.COMMENT_LESSTHAN_BANG_DASH, reconsume, pos);
2887+
continue stateloop;
2888+
case '<':
2889+
appendStrBuf(c);
2890+
state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos);
2891+
continue stateloop;
2892+
case '\r':
2893+
appendStrBufCarriageReturn();
2894+
break stateloop;
2895+
case '\n':
2896+
appendStrBufLineFeed();
2897+
continue;
2898+
case '\u0000':
2899+
c = '\uFFFD';
2900+
// fall thru
2901+
default:
2902+
appendStrBuf(c);
2903+
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2904+
continue stateloop;
2905+
}
2906+
}
2907+
case COMMENT_LESSTHAN_BANG_DASH:
2908+
for (;;) {
2909+
if (++pos == endPos) {
2910+
break stateloop;
2911+
}
2912+
c = checkChar(buf, pos);
2913+
switch (c) {
2914+
case '-':
2915+
appendStrBuf(c);
2916+
state = transition(state, Tokenizer.COMMENT_LESSTHAN_BANG_DASH_DASH, reconsume, pos);
2917+
continue stateloop;
2918+
case '<':
2919+
appendStrBuf(c);
2920+
state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos);
2921+
continue stateloop;
2922+
case '\r':
2923+
appendStrBufCarriageReturn();
2924+
break stateloop;
2925+
case '\n':
2926+
appendStrBufLineFeed();
2927+
continue;
2928+
case '\u0000':
2929+
c = '\uFFFD';
2930+
// fall thru
2931+
default:
2932+
appendStrBuf(c);
2933+
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2934+
continue stateloop;
2935+
}
2936+
}
2937+
case COMMENT_LESSTHAN_BANG_DASH_DASH:
2938+
for (;;) {
2939+
if (++pos == endPos) {
2940+
break stateloop;
2941+
}
2942+
c = checkChar(buf, pos);
2943+
switch (c) {
2944+
case '>':
2945+
appendStrBuf(c);
2946+
emitComment(3, pos);
2947+
state = transition(state, Tokenizer.DATA, reconsume, pos);
2948+
continue stateloop;
2949+
case '-':
2950+
errNestedComment();
2951+
adjustDoubleHyphenAndAppendToStrBufAndErr(c, reportedConsecutiveHyphens);
2952+
reportedConsecutiveHyphens = true;
2953+
state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);
2954+
continue stateloop;
2955+
case '\r':
2956+
errNestedComment();
2957+
adjustDoubleHyphenAndAppendToStrBufAndErr(c, reportedConsecutiveHyphens);
2958+
reportedConsecutiveHyphens = true;
2959+
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2960+
break stateloop;
2961+
case '\n':
2962+
errNestedComment();
2963+
adjustDoubleHyphenAndAppendToStrBufAndErr(c, reportedConsecutiveHyphens);
2964+
reportedConsecutiveHyphens = true;
2965+
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2966+
continue;
2967+
case '\u0000':
2968+
c = '\uFFFD';
2969+
// fall thru
2970+
case '!':
2971+
errNestedComment();
2972+
adjustDoubleHyphenAndAppendToStrBufAndErr(c, reportedConsecutiveHyphens);
2973+
reportedConsecutiveHyphens = true;
2974+
state = transition(state, Tokenizer.COMMENT_END_BANG, reconsume, pos);
2975+
continue stateloop;
2976+
default:
2977+
errNestedComment();
2978+
adjustDoubleHyphenAndAppendToStrBufAndErr(c, reportedConsecutiveHyphens);
2979+
reportedConsecutiveHyphens = true;
2980+
state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);
2981+
continue stateloop;
2982+
}
2983+
}
2984+
// XXX reorder point
28152985
case COMMENT_START_DASH:
28162986
if (++pos == endPos) {
28172987
break stateloop;
@@ -2840,6 +3010,10 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
28403010
*/
28413011
state = transition(state, Tokenizer.DATA, reconsume, pos);
28423012
continue stateloop;
3013+
case '<':
3014+
appendStrBuf(c);
3015+
state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos);
3016+
continue stateloop;
28433017
case '\r':
28443018
appendStrBufCarriageReturn();
28453019
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
@@ -5967,13 +6141,13 @@ private void initDoctypeFields() {
59676141
@Inline private void adjustDoubleHyphenAndAppendToStrBufCarriageReturn()
59686142
throws SAXException {
59696143
silentCarriageReturn();
5970-
adjustDoubleHyphenAndAppendToStrBufAndErr('\n');
6144+
adjustDoubleHyphenAndAppendToStrBufAndErr('\n', false);
59716145
}
59726146

59736147
@Inline private void adjustDoubleHyphenAndAppendToStrBufLineFeed()
59746148
throws SAXException {
59756149
silentLineFeed();
5976-
adjustDoubleHyphenAndAppendToStrBufAndErr('\n');
6150+
adjustDoubleHyphenAndAppendToStrBufAndErr('\n', false);
59776151
}
59786152

59796153
@Inline private void appendStrBufLineFeed() {
@@ -6278,6 +6452,8 @@ public void eof() throws SAXException {
62786452
break eofloop;
62796453
case COMMENT_START:
62806454
case COMMENT:
6455+
case COMMENT_LESSTHAN:
6456+
case COMMENT_LESSTHAN_BANG:
62816457
/*
62826458
* EOF Parse error.
62836459
*/
@@ -6289,6 +6465,7 @@ public void eof() throws SAXException {
62896465
*/
62906466
break eofloop;
62916467
case COMMENT_END:
6468+
case COMMENT_LESSTHAN_BANG_DASH_DASH:
62926469
errEofInComment();
62936470
/* Emit the comment token. */
62946471
emitComment(2, 0);
@@ -6298,6 +6475,7 @@ public void eof() throws SAXException {
62986475
break eofloop;
62996476
case COMMENT_END_DASH:
63006477
case COMMENT_START_DASH:
6478+
case COMMENT_LESSTHAN_BANG_DASH:
63016479
errEofInComment();
63026480
/* Emit the comment token. */
63036481
emitComment(1, 0);
@@ -6921,7 +7099,7 @@ protected void errGtInPublicId() throws SAXException {
69217099
protected void errNamelessDoctype() throws SAXException {
69227100
}
69237101

6924-
protected void errConsecutiveHyphens() throws SAXException {
7102+
protected void errNestedComment() throws SAXException {
69257103
}
69267104

69277105
protected void errPrematureEndOfComment() throws SAXException {
@@ -7064,9 +7242,6 @@ protected void errExpectedSystemId() throws SAXException {
70647242
protected void errMissingSpaceBeforeDoctypeName() throws SAXException {
70657243
}
70667244

7067-
protected void errHyphenHyphenBang() throws SAXException {
7068-
}
7069-
70707245
protected void errNcrControlChar() throws SAXException {
70717246
}
70727247

0 commit comments

Comments
 (0)