Skip to content

Commit 9ce4bd4

Browse files
Conform ampersand-error reporting to HTML spec
1 parent c5d11f9 commit 9ce4bd4

File tree

1 file changed

+29
-25
lines changed

1 file changed

+29
-25
lines changed

src/nu/validator/htmlparser/impl/Tokenizer.java

Lines changed: 29 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -220,6 +220,8 @@ public class Tokenizer implements Locator {
220220

221221
public static final int PROCESSING_INSTRUCTION_QUESTION_MARK = 74;
222222

223+
public static final int AMBIGUOUS_AMPERSAND = 75;
224+
223225
/**
224226
* Magic value for UTF-16 operations.
225227
*/
@@ -3106,6 +3108,7 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
31063108
case '<':
31073109
case '&':
31083110
case '\u0000':
3111+
case ';':
31093112
emitOrAppendCharRefBuf(returnState);
31103113
if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
31113114
cstart = pos;
@@ -3134,17 +3137,12 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
31343137
firstCharKey = c - 'A';
31353138
} else {
31363139
// No match
3137-
/*
3138-
* If no match can be made, then this is a parse
3139-
* error.
3140-
*/
3141-
errNoNamedCharacterMatch();
31423140
emitOrAppendCharRefBuf(returnState);
31433141
if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
31443142
cstart = pos;
31453143
}
31463144
reconsume = true;
3147-
state = transition(state, returnState, reconsume, pos);
3145+
state = transition(state, Tokenizer.AMBIGUOUS_AMPERSAND, reconsume, pos);
31483146
continue stateloop;
31493147
}
31503148
// Didn't fail yet
@@ -3205,17 +3203,12 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
32053203
}
32063204
}
32073205
if (hilo == 0) {
3208-
/*
3209-
* If no match can be made, then this is a parse
3210-
* error.
3211-
*/
3212-
errNoNamedCharacterMatch();
32133206
emitOrAppendCharRefBuf(returnState);
32143207
if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
32153208
cstart = pos;
32163209
}
32173210
reconsume = true;
3218-
state = transition(state, returnState, reconsume, pos);
3211+
state = transition(state, Tokenizer.AMBIGUOUS_AMPERSAND, reconsume, pos);
32193212
continue stateloop;
32203213
}
32213214
// Didn't fail yet
@@ -3298,16 +3291,12 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
32983291

32993292
if (candidate == -1) {
33003293
// reconsume deals with CR, LF or nul
3301-
/*
3302-
* If no match can be made, then this is a parse error.
3303-
*/
3304-
errNoNamedCharacterMatch();
33053294
emitOrAppendCharRefBuf(returnState);
33063295
if ((returnState & DATA_AND_RCDATA_MASK) == 0) {
33073296
cstart = pos;
33083297
}
33093298
reconsume = true;
3310-
state = transition(state, returnState, reconsume, pos);
3299+
state = transition(state, Tokenizer.AMBIGUOUS_AMPERSAND, reconsume, pos);
33113300
continue stateloop;
33123301
} else {
33133302
// c can't be CR, LF or nul if we got here
@@ -3345,10 +3334,9 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
33453334
* after the U+0026 AMPERSAND (&) must be
33463335
* unconsumed, and nothing is returned.
33473336
*/
3348-
errNoNamedCharacterMatch();
33493337
appendCharRefBufToStrBuf();
33503338
reconsume = true;
3351-
state = transition(state, returnState, reconsume, pos);
3339+
state = transition(state, Tokenizer.AMBIGUOUS_AMPERSAND, reconsume, pos);
33523340
continue stateloop;
33533341
}
33543342
}
@@ -3411,6 +3399,28 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
34113399
* I'm ∉ I tell you.
34123400
*/
34133401
}
3402+
// XXX reorder point
3403+
case AMBIGUOUS_AMPERSAND:
3404+
ampersandloop: for (;;) {
3405+
if (reconsume) {
3406+
if (++pos == endPos) {
3407+
break stateloop;
3408+
}
3409+
pos--;
3410+
c = checkChar(buf, pos);
3411+
}
3412+
if (c == ';') {
3413+
errNoNamedCharacterMatch();
3414+
} else if ((c >= '0' && c <= '9')
3415+
|| (c >= 'A' && c <= 'Z')
3416+
|| (c >= 'a' && c <= 'z')) {
3417+
appendStrBuf(c);
3418+
pos++;
3419+
continue;
3420+
}
3421+
state = transition(state, returnState, reconsume, pos);
3422+
continue stateloop;
3423+
}
34143424
case CONSUME_NCR:
34153425
if (++pos == endPos) {
34163426
break stateloop;
@@ -6501,7 +6511,6 @@ public void eof() throws SAXException {
65016511
state = returnState;
65026512
continue;
65036513
case CHARACTER_REFERENCE_HILO_LOOKUP:
6504-
errNoNamedCharacterMatch();
65056514
emitOrAppendCharRefBuf(returnState);
65066515
state = returnState;
65076516
continue;
@@ -6555,10 +6564,6 @@ public void eof() throws SAXException {
65556564
}
65566565

65576566
if (candidate == -1) {
6558-
/*
6559-
* If no match can be made, then this is a parse error.
6560-
*/
6561-
errNoNamedCharacterMatch();
65626567
emitOrAppendCharRefBuf(returnState);
65636568
state = returnState;
65646569
continue eofloop;
@@ -6596,7 +6601,6 @@ public void eof() throws SAXException {
65966601
* after the U+0026 AMPERSAND (&) must be
65976602
* unconsumed, and nothing is returned.
65986603
*/
6599-
errNoNamedCharacterMatch();
66006604
appendCharRefBufToStrBuf();
66016605
state = returnState;
66026606
continue eofloop;

0 commit comments

Comments
 (0)