Skip to content

Commit 6c06ef8

Browse files
sideshowbarker authored and hsivonen committed
Make consecutive hyphens in comments a non-error
Also allow `<!-->` at (IE conditional) comment end. See whatwg/html#1356 and whatwg/html#1456.
1 parent 46aca0c commit 6c06ef8

File tree

2 files changed

+190
-19
lines changed

2 files changed

+190
-19
lines changed

src/nu/validator/htmlparser/impl/ErrorReportingTokenizer.java

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2009-2013 Mozilla Foundation
2+
* Copyright (c) 2009-2017 Mozilla Foundation
33
*
44
* Permission is hereby granted, free of charge, to any person obtaining a
55
* copy of this software and associated documentation files (the "Software"),
@@ -395,8 +395,8 @@ private boolean isAstralPrivateUse(int c) {
395395
err("Nameless doctype.");
396396
}
397397

398-
@Override protected void errConsecutiveHyphens() throws SAXException {
399-
err("Consecutive hyphens did not terminate a comment. \u201C--\u201D is not permitted inside a comment, but e.g. \u201C- -\u201D is.");
398+
@Override protected void errNestedComment() throws SAXException {
399+
err("Saw \u201C<!--\u201D within a comment. Probable cause: Nested comment (not allowed).");
400400
}
401401

402402
@Override protected void errPrematureEndOfComment() throws SAXException {
@@ -678,10 +678,6 @@ private boolean isAstralPrivateUse(int c) {
678678
err("Missing space before doctype name.");
679679
}
680680

681-
@Override protected void errHyphenHyphenBang() throws SAXException {
682-
err("\u201C--!\u201D found in comment.");
683-
}
684-
685681
@Override protected void errNcrControlChar() throws SAXException {
686682
err("Character reference expands to a control character ("
687683
+ toUPlusString((char) value) + ").");

src/nu/validator/htmlparser/impl/Tokenizer.java

Lines changed: 187 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,14 @@ public class Tokenizer implements Locator, Locator2 {
221221

222222
public static final int PROCESSING_INSTRUCTION_QUESTION_MARK = 74;
223223

224+
public static final int COMMENT_LESSTHAN = 76;
225+
226+
public static final int COMMENT_LESSTHAN_BANG = 77;
227+
228+
public static final int COMMENT_LESSTHAN_BANG_DASH = 78;
229+
230+
public static final int COMMENT_LESSTHAN_BANG_DASH_DASH = 79;
231+
224232
/**
225233
* Magic value for UTF-16 operations.
226234
*/
@@ -1029,9 +1037,8 @@ private void maybeAppendSpaceToBogusComment() throws SAXException {
10291037

10301038
// ]NOCPP]
10311039

1032-
@Inline private void adjustDoubleHyphenAndAppendToStrBufAndErr(char c)
1040+
@Inline private void adjustDoubleHyphenAndAppendToStrBufAndErr(char c, boolean reportedConsecutiveHyphens)
10331041
throws SAXException {
1034-
errConsecutiveHyphens();
10351042
// [NOCPP[
10361043
switch (commentPolicy) {
10371044
case ALTER_INFOSET:
@@ -1042,7 +1049,9 @@ private void maybeAppendSpaceToBogusComment() throws SAXException {
10421049
appendStrBuf('-');
10431050
// CPPONLY: MOZ_FALLTHROUGH;
10441051
case ALLOW:
1045-
warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
1052+
if (!reportedConsecutiveHyphens) {
1053+
warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
1054+
}
10461055
// ]NOCPP]
10471056
appendStrBuf(c);
10481057
// [NOCPP[
@@ -1464,6 +1473,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
14641473
@SuppressWarnings("unused") private int stateLoop(int state, char c,
14651474
int pos, @NoLength char[] buf, boolean reconsume, int returnState,
14661475
int endPos) throws SAXException {
1476+
boolean reportedConsecutiveHyphens = false;
14671477
/*
14681478
* Idioms used in this code:
14691479
*
@@ -2540,6 +2550,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
25402550
}
25412551
// CPPONLY: MOZ_FALLTHROUGH;
25422552
case COMMENT_START:
2553+
reportedConsecutiveHyphens = false;
25432554
commentstartloop: for (;;) {
25442555
if (++pos == endPos) {
25452556
break stateloop;
@@ -2572,6 +2583,10 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
25722583
*/
25732584
state = transition(state, Tokenizer.DATA, reconsume, pos);
25742585
continue stateloop;
2586+
case '<':
2587+
appendStrBuf(c);
2588+
state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos);
2589+
continue stateloop;
25752590
case '\r':
25762591
appendStrBufCarriageReturn();
25772592
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
@@ -2617,6 +2632,10 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
26172632
state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos);
26182633
break commentloop;
26192634
// continue stateloop;
2635+
case '<':
2636+
appendStrBuf(c);
2637+
state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos);
2638+
continue stateloop;
26202639
case '\r':
26212640
appendStrBufCarriageReturn();
26222641
break stateloop;
@@ -2659,6 +2678,10 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
26592678
state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);
26602679
break commentenddashloop;
26612680
// continue stateloop;
2681+
case '<':
2682+
appendStrBuf(c);
2683+
state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos);
2684+
continue stateloop;
26622685
case '\r':
26632686
appendStrBufCarriageReturn();
26642687
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
@@ -2713,11 +2736,16 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
27132736
* Append a U+002D HYPHEN-MINUS (-) character to
27142737
* the comment token's data.
27152738
*/
2716-
adjustDoubleHyphenAndAppendToStrBufAndErr(c);
2739+
adjustDoubleHyphenAndAppendToStrBufAndErr(c, reportedConsecutiveHyphens);
2740+
reportedConsecutiveHyphens = true;
27172741
/*
27182742
* Stay in the comment end state.
27192743
*/
27202744
continue;
2745+
case '<':
2746+
appendStrBuf(c);
2747+
state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos);
2748+
continue stateloop;
27212749
case '\r':
27222750
adjustDoubleHyphenAndAppendToStrBufCarriageReturn();
27232751
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
@@ -2727,7 +2755,6 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
27272755
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
27282756
continue stateloop;
27292757
case '!':
2730-
errHyphenHyphenBang();
27312758
appendStrBuf(c);
27322759
state = transition(state, Tokenizer.COMMENT_END_BANG, reconsume, pos);
27332760
continue stateloop;
@@ -2740,7 +2767,8 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
27402767
* and the input character to the comment
27412768
* token's data.
27422769
*/
2743-
adjustDoubleHyphenAndAppendToStrBufAndErr(c);
2770+
adjustDoubleHyphenAndAppendToStrBufAndErr(c, reportedConsecutiveHyphens);
2771+
reportedConsecutiveHyphens = true;
27442772
/*
27452773
* Switch to the comment state.
27462774
*/
@@ -2810,6 +2838,148 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
28102838
continue stateloop;
28112839
}
28122840
}
2841+
case COMMENT_LESSTHAN:
2842+
for (;;) {
2843+
if (++pos == endPos) {
2844+
break stateloop;
2845+
}
2846+
c = checkChar(buf, pos);
2847+
switch (c) {
2848+
case '!':
2849+
appendStrBuf(c);
2850+
state = transition(state, Tokenizer.COMMENT_LESSTHAN_BANG, reconsume, pos);
2851+
continue stateloop;
2852+
case '<':
2853+
appendStrBuf(c);
2854+
state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos);
2855+
continue stateloop;
2856+
case '-':
2857+
appendStrBuf(c);
2858+
state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos);
2859+
continue stateloop;
2860+
case '\r':
2861+
appendStrBufCarriageReturn();
2862+
break stateloop;
2863+
case '\n':
2864+
appendStrBufLineFeed();
2865+
continue;
2866+
case '\u0000':
2867+
c = '\uFFFD';
2868+
// fall thru
2869+
default:
2870+
appendStrBuf(c);
2871+
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2872+
continue stateloop;
2873+
}
2874+
}
2875+
case COMMENT_LESSTHAN_BANG:
2876+
for (;;) {
2877+
if (++pos == endPos) {
2878+
break stateloop;
2879+
}
2880+
c = checkChar(buf, pos);
2881+
switch (c) {
2882+
case '-':
2883+
appendStrBuf(c);
2884+
state = transition(state, Tokenizer.COMMENT_LESSTHAN_BANG_DASH, reconsume, pos);
2885+
continue stateloop;
2886+
case '<':
2887+
appendStrBuf(c);
2888+
state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos);
2889+
continue stateloop;
2890+
case '\r':
2891+
appendStrBufCarriageReturn();
2892+
break stateloop;
2893+
case '\n':
2894+
appendStrBufLineFeed();
2895+
continue;
2896+
case '\u0000':
2897+
c = '\uFFFD';
2898+
// fall thru
2899+
default:
2900+
appendStrBuf(c);
2901+
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2902+
continue stateloop;
2903+
}
2904+
}
2905+
case COMMENT_LESSTHAN_BANG_DASH:
2906+
for (;;) {
2907+
if (++pos == endPos) {
2908+
break stateloop;
2909+
}
2910+
c = checkChar(buf, pos);
2911+
switch (c) {
2912+
case '-':
2913+
appendStrBuf(c);
2914+
state = transition(state, Tokenizer.COMMENT_LESSTHAN_BANG_DASH_DASH, reconsume, pos);
2915+
continue stateloop;
2916+
case '<':
2917+
appendStrBuf(c);
2918+
state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos);
2919+
continue stateloop;
2920+
case '\r':
2921+
appendStrBufCarriageReturn();
2922+
break stateloop;
2923+
case '\n':
2924+
appendStrBufLineFeed();
2925+
continue;
2926+
case '\u0000':
2927+
c = '\uFFFD';
2928+
// fall thru
2929+
default:
2930+
appendStrBuf(c);
2931+
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2932+
continue stateloop;
2933+
}
2934+
}
2935+
case COMMENT_LESSTHAN_BANG_DASH_DASH:
2936+
for (;;) {
2937+
if (++pos == endPos) {
2938+
break stateloop;
2939+
}
2940+
c = checkChar(buf, pos);
2941+
switch (c) {
2942+
case '>':
2943+
appendStrBuf(c);
2944+
emitComment(3, pos);
2945+
state = transition(state, Tokenizer.DATA, reconsume, pos);
2946+
continue stateloop;
2947+
case '-':
2948+
errNestedComment();
2949+
adjustDoubleHyphenAndAppendToStrBufAndErr(c, reportedConsecutiveHyphens);
2950+
reportedConsecutiveHyphens = true;
2951+
state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);
2952+
continue stateloop;
2953+
case '\r':
2954+
errNestedComment();
2955+
adjustDoubleHyphenAndAppendToStrBufAndErr(c, reportedConsecutiveHyphens);
2956+
reportedConsecutiveHyphens = true;
2957+
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2958+
break stateloop;
2959+
case '\n':
2960+
errNestedComment();
2961+
adjustDoubleHyphenAndAppendToStrBufAndErr(c, reportedConsecutiveHyphens);
2962+
reportedConsecutiveHyphens = true;
2963+
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
2964+
continue;
2965+
case '\u0000':
2966+
c = '\uFFFD';
2967+
// fall thru
2968+
case '!':
2969+
errNestedComment();
2970+
adjustDoubleHyphenAndAppendToStrBufAndErr(c, reportedConsecutiveHyphens);
2971+
reportedConsecutiveHyphens = true;
2972+
state = transition(state, Tokenizer.COMMENT_END_BANG, reconsume, pos);
2973+
continue stateloop;
2974+
default:
2975+
errNestedComment();
2976+
adjustDoubleHyphenAndAppendToStrBufAndErr(c, reportedConsecutiveHyphens);
2977+
reportedConsecutiveHyphens = true;
2978+
state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);
2979+
continue stateloop;
2980+
}
2981+
}
2982+
// XXX reorder point
28132983
case COMMENT_START_DASH:
28142984
if (++pos == endPos) {
28152985
break stateloop;
@@ -2838,6 +3008,10 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
28383008
*/
28393009
state = transition(state, Tokenizer.DATA, reconsume, pos);
28403010
continue stateloop;
3011+
case '<':
3012+
appendStrBuf(c);
3013+
state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos);
3014+
continue stateloop;
28413015
case '\r':
28423016
appendStrBufCarriageReturn();
28433017
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
@@ -5957,13 +6131,13 @@ private void initDoctypeFields() {
59576131
@Inline private void adjustDoubleHyphenAndAppendToStrBufCarriageReturn()
59586132
throws SAXException {
59596133
silentCarriageReturn();
5960-
adjustDoubleHyphenAndAppendToStrBufAndErr('\n');
6134+
adjustDoubleHyphenAndAppendToStrBufAndErr('\n', false);
59616135
}
59626136

59636137
@Inline private void adjustDoubleHyphenAndAppendToStrBufLineFeed()
59646138
throws SAXException {
59656139
silentLineFeed();
5966-
adjustDoubleHyphenAndAppendToStrBufAndErr('\n');
6140+
adjustDoubleHyphenAndAppendToStrBufAndErr('\n', false);
59676141
}
59686142

59696143
@Inline private void appendStrBufLineFeed() {
@@ -6268,6 +6442,8 @@ public void eof() throws SAXException {
62686442
break eofloop;
62696443
case COMMENT_START:
62706444
case COMMENT:
6445+
case COMMENT_LESSTHAN:
6446+
case COMMENT_LESSTHAN_BANG:
62716447
/*
62726448
* EOF Parse error.
62736449
*/
@@ -6279,6 +6455,7 @@ public void eof() throws SAXException {
62796455
*/
62806456
break eofloop;
62816457
case COMMENT_END:
6458+
case COMMENT_LESSTHAN_BANG_DASH_DASH:
62826459
errEofInComment();
62836460
/* Emit the comment token. */
62846461
emitComment(2, 0);
@@ -6288,6 +6465,7 @@ public void eof() throws SAXException {
62886465
break eofloop;
62896466
case COMMENT_END_DASH:
62906467
case COMMENT_START_DASH:
6468+
case COMMENT_LESSTHAN_BANG_DASH:
62916469
errEofInComment();
62926470
/* Emit the comment token. */
62936471
emitComment(1, 0);
@@ -6917,7 +7095,7 @@ protected void errGtInPublicId() throws SAXException {
69177095
protected void errNamelessDoctype() throws SAXException {
69187096
}
69197097

6920-
protected void errConsecutiveHyphens() throws SAXException {
7098+
protected void errNestedComment() throws SAXException {
69217099
}
69227100

69237101
protected void errPrematureEndOfComment() throws SAXException {
@@ -7060,9 +7238,6 @@ protected void errExpectedSystemId() throws SAXException {
70607238
protected void errMissingSpaceBeforeDoctypeName() throws SAXException {
70617239
}
70627240

7063-
protected void errHyphenHyphenBang() throws SAXException {
7064-
}
7065-
70667241
protected void errNcrControlChar() throws SAXException {
70677242
}
70687243

0 commit comments

Comments
 (0)