Skip to content

Commit ab28833

Browse files
Conform ampersand-error reporting to HTML spec
1 parent 2f61c94 commit ab28833

File tree

1 file changed

+29
-25
lines changed

1 file changed

+29
-25
lines changed

src/nu/validator/htmlparser/impl/Tokenizer.java

Lines changed: 29 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -221,6 +221,8 @@ public class Tokenizer implements Locator, Locator2 {
221221

222222
public static final int PROCESSING_INSTRUCTION_QUESTION_MARK = 74;
223223

224+
public static final int AMBIGUOUS_AMPERSAND = 75;
225+
224226
/**
225227
* Magic value for UTF-16 operations.
226228
*/
@@ -3054,6 +3056,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
30543056
case '<':
30553057
case '&':
30563058
case '\u0000':
3059+
case ';':
30573060
emitOrAppendCharRefBuf(returnState);
30583061
if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
30593062
cstart = pos;
@@ -3082,17 +3085,12 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
30823085
firstCharKey = c - 'A';
30833086
} else {
30843087
// No match
3085-
/*
3086-
* If no match can be made, then this is a parse
3087-
* error.
3088-
*/
3089-
errNoNamedCharacterMatch();
30903088
emitOrAppendCharRefBuf(returnState);
30913089
if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
30923090
cstart = pos;
30933091
}
30943092
reconsume = true;
3095-
state = transition(state, returnState, reconsume, pos);
3093+
state = transition(state, Tokenizer.AMBIGUOUS_AMPERSAND, reconsume, pos);
30963094
continue stateloop;
30973095
}
30983096
// Didn't fail yet
@@ -3153,17 +3151,12 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
31533151
}
31543152
}
31553153
if (hilo == 0) {
3156-
/*
3157-
* If no match can be made, then this is a parse
3158-
* error.
3159-
*/
3160-
errNoNamedCharacterMatch();
31613154
emitOrAppendCharRefBuf(returnState);
31623155
if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
31633156
cstart = pos;
31643157
}
31653158
reconsume = true;
3166-
state = transition(state, returnState, reconsume, pos);
3159+
state = transition(state, Tokenizer.AMBIGUOUS_AMPERSAND, reconsume, pos);
31673160
continue stateloop;
31683161
}
31693162
// Didn't fail yet
@@ -3246,16 +3239,12 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
32463239

32473240
if (candidate == -1) {
32483241
// reconsume deals with CR, LF or nul
3249-
/*
3250-
* If no match can be made, then this is a parse error.
3251-
*/
3252-
errNoNamedCharacterMatch();
32533242
emitOrAppendCharRefBuf(returnState);
32543243
if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
32553244
cstart = pos;
32563245
}
32573246
reconsume = true;
3258-
state = transition(state, returnState, reconsume, pos);
3247+
state = transition(state, Tokenizer.AMBIGUOUS_AMPERSAND, reconsume, pos);
32593248
continue stateloop;
32603249
} else {
32613250
// c can't be CR, LF or nul if we got here
@@ -3293,10 +3282,9 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
32933282
* after the U+0026 AMPERSAND (&) must be
32943283
* unconsumed, and nothing is returned.
32953284
*/
3296-
errNoNamedCharacterMatch();
32973285
appendCharRefBufToStrBuf();
32983286
reconsume = true;
3299-
state = transition(state, returnState, reconsume, pos);
3287+
state = transition(state, Tokenizer.AMBIGUOUS_AMPERSAND, reconsume, pos);
33003288
continue stateloop;
33013289
}
33023290
}
@@ -3359,6 +3347,28 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
33593347
* I'm ∉ I tell you.
33603348
*/
33613349
}
3350+
// XXX reorder point
3351+
case AMBIGUOUS_AMPERSAND:
3352+
ampersandloop: for (;;) {
3353+
if (reconsume) {
3354+
if (++pos == endPos) {
3355+
break stateloop;
3356+
}
3357+
pos--;
3358+
c = checkChar(buf, pos);
3359+
}
3360+
if (c == ';') {
3361+
errNoNamedCharacterMatch();
3362+
} else if ((c >= '0' && c <= '9')
3363+
|| (c >= 'A' && c <= 'Z')
3364+
|| (c >= 'a' && c <= 'z')) {
3365+
appendStrBuf(c);
3366+
pos++;
3367+
continue;
3368+
}
3369+
state = transition(state, returnState, reconsume, pos);
3370+
continue stateloop;
3371+
}
33623372
case CONSUME_NCR:
33633373
if (++pos == endPos) {
33643374
break stateloop;
@@ -6449,7 +6459,6 @@ public void eof() throws SAXException {
64496459
state = returnState;
64506460
continue;
64516461
case CHARACTER_REFERENCE_HILO_LOOKUP:
6452-
errNoNamedCharacterMatch();
64536462
emitOrAppendCharRefBuf(returnState);
64546463
state = returnState;
64556464
continue;
@@ -6503,10 +6512,6 @@ public void eof() throws SAXException {
65036512
}
65046513

65056514
if (candidate == -1) {
6506-
/*
6507-
* If no match can be made, then this is a parse error.
6508-
*/
6509-
errNoNamedCharacterMatch();
65106515
emitOrAppendCharRefBuf(returnState);
65116516
state = returnState;
65126517
continue eofloop;
@@ -6544,7 +6549,6 @@ public void eof() throws SAXException {
65446549
* after the U+0026 AMPERSAND (&) must be
65456550
* unconsumed, and nothing is returned.
65466551
*/
6547-
errNoNamedCharacterMatch();
65486552
appendCharRefBufToStrBuf();
65496553
state = returnState;
65506554
continue eofloop;

0 commit comments

Comments
 (0)