Skip to content

Commit 633e548

Browse files
Improve HTML parsing a lil' bit
1 parent 7578fb3 commit 633e548

File tree

13 files changed

+351
-35
lines changed

13 files changed

+351
-35
lines changed

Spec/src/main/java/com/github/webicitybrowser/spec/html/parse/ParseError.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,11 @@ public enum ParseError {
77
NON_VOID_HTML_ELEMENT_START_TAG_WITH_TRAILING_SOLIDUS,
88
UNEXPECTED_SOLIDUS_IN_TAG,
99
MISSING_WHITESPACE_BETWEEN_ATTRIBUTES,
10+
EOF_IN_DOCTYPE,
1011
ABRUPT_CLOSING_OF_EMPTY_COMMENT,
1112
EOF_IN_COMMENT,
1213
NESTED_COMMENT,
14+
INCORRECTLY_OPENED_COMMENT,
1315
INCORRECTLY_CLOSED_COMMENT,
1416
MISSING_ATTRIBUTE_VALUE,
1517
UNEXPECTED_CHARACTER_IN_UNQUOTED_ATTRIBUTE,

SpiderHTML/src/main/java/com/github/webicitybrowser/spiderhtml/SpiderHTMLParserImp.java

Lines changed: 0 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -59,33 +59,6 @@ private void continueParsing() throws IOException {
5959
}
6060
}
6161

62-
/*@Override
63-
public void parse(InputStream inputStream, HTMLTreeBuilder treeBuilder, ParserSettings settings) throws IOException {
64-
Reader inputReader = EncodingUtil.decode(inputStream, StandardCharsets.UTF_8);
65-
parse(inputReader, treeBuilder, settings);
66-
}
67-
68-
@Override
69-
public void parse(Reader inputReader, HTMLTreeBuilder treeBuilder, ParserSettings settings) throws IOException {
70-
PushbackReader reader = new PushbackReader(inputReader, 32);
71-
ParsingContext parsingContext = new ParsingContext(reader);
72-
73-
while (true) {
74-
TokenizeState tokenizeState = sharedContext.getTokenizeState();
75-
if (tokenizeState == null) {
76-
break;
77-
}
78-
79-
int ch = reader.read();
80-
if (ch == '\r') {
81-
ch = '\n';
82-
} else if (ch == '\n' && parsingContext.readerHandle().peek() == '\r') {
83-
reader.read();
84-
}
85-
tokenizeState.process(sharedContext, parsingContext, ch);
86-
}
87-
}*/
88-
8962
private void initializeContext(ParserSettings settings, SharedContext sharedContext) {
9063
ParsingInitializer parsingInitializer = new ParsingInitializer(settings);
9164

SpiderHTML/src/main/java/com/github/webicitybrowser/spiderhtml/tokenize/DoctypeState.java

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,14 @@
11
package com.github.webicitybrowser.spiderhtml.tokenize;
22

3+
import java.io.IOException;
34
import java.util.function.Consumer;
45

6+
import com.github.webicitybrowser.spec.html.parse.ParseError;
57
import com.github.webicitybrowser.spiderhtml.context.ParsingContext;
68
import com.github.webicitybrowser.spiderhtml.context.ParsingInitializer;
79
import com.github.webicitybrowser.spiderhtml.context.SharedContext;
10+
import com.github.webicitybrowser.spiderhtml.token.DoctypeToken;
11+
import com.github.webicitybrowser.spiderhtml.token.EOFToken;
812

913
public class DoctypeState implements TokenizeState {
1014

@@ -16,17 +20,23 @@ public DoctypeState(ParsingInitializer initializer, Consumer<TokenizeState> call
1620
}
1721

1822
@Override
19-
public void process(SharedContext context, ParsingContext parsingContext, int ch) {
23+
public void process(SharedContext context, ParsingContext parsingContext, int ch) throws IOException {
2024
switch (ch) {
2125
case '\t':
2226
case '\n':
2327
case '\f':
2428
case ' ':
2529
context.setTokenizeState(beforeDoctypeNameState);
2630
break;
27-
default:
28-
// TODO
29-
throw new UnsupportedOperationException();
31+
case '>':
32+
parsingContext.readerHandle().unread(ch);
33+
context.setTokenizeState(beforeDoctypeNameState);
34+
break;
35+
case -1:
36+
context.recordError(ParseError.EOF_IN_DOCTYPE);
37+
// TODO: Force quirks
38+
context.emit(new DoctypeToken());
39+
context.emit(new EOFToken());
3040
}
3141
}
3242

SpiderHTML/src/main/java/com/github/webicitybrowser/spiderhtml/tokenize/MarkupDeclarationOpenState.java

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import java.io.IOException;
44
import java.util.function.Consumer;
55

6+
import com.github.webicitybrowser.spec.html.parse.ParseError;
67
import com.github.webicitybrowser.spiderhtml.context.ParsingContext;
78
import com.github.webicitybrowser.spiderhtml.context.ParsingInitializer;
89
import com.github.webicitybrowser.spiderhtml.context.ReaderHandle;
@@ -13,11 +14,13 @@ public class MarkupDeclarationOpenState implements TokenizeState {
1314

1415
private final DoctypeState doctypeState;
1516
private final CommentStartState commentStartState;
17+
private final BogusCommentState bogusCommentState;
1618

1719
/**
 * Creates the state, hands it to {@code callback}, and then looks up the
 * states this one can switch to.
 *
 * @param initializer source of the other tokenizer state instances
 * @param callback receives this instance before any other state is looked up
 */
public MarkupDeclarationOpenState(ParsingInitializer initializer, Consumer<TokenizeState> callback) {
1820
// NOTE(review): this is passed to the callback before the fields are assigned,
// presumably so that states referring to each other can be resolved - confirm
// against ParsingInitializer.
callback.accept(this);
1921
this.doctypeState = initializer.getTokenizeState(DoctypeState.class);
2022
this.commentStartState = initializer.getTokenizeState(CommentStartState.class);
23+
this.bogusCommentState = initializer.getTokenizeState(BogusCommentState.class);
2124
}
2225

2326
@Override
@@ -32,8 +35,10 @@ public void process(SharedContext context, ParsingContext parsingContext, int ch
3235
reader.eat(7);
3336
context.setTokenizeState(doctypeState);
3437
} else {
35-
// TODO
36-
throw new UnsupportedOperationException();
38+
context.recordError(ParseError.INCORRECTLY_OPENED_COMMENT);
39+
CommentToken token = new CommentToken("");
40+
parsingContext.setCurrentToken(token);
41+
context.setTokenizeState(bogusCommentState);
3742
}
3843
}
3944

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
package com.github.webicitybrowser.spiderhtml.tokenize;
2+
3+
import java.io.IOException;
4+
import java.util.function.Consumer;
5+
6+
import com.github.webicitybrowser.spec.infra.util.ASCIIUtil;
7+
import com.github.webicitybrowser.spiderhtml.context.ParsingContext;
8+
import com.github.webicitybrowser.spiderhtml.context.ParsingInitializer;
9+
import com.github.webicitybrowser.spiderhtml.context.SharedContext;
10+
import com.github.webicitybrowser.spiderhtml.token.CharacterToken;
11+
12+
public class ScriptDataDoubleEscapeEndState implements TokenizeState {
13+
14+
private final ScriptDataDoubleEscapedState scriptDataDoubleEscapedState;
15+
private final ScriptDataEscapedState scriptDataEscapedState;
16+
17+
public ScriptDataDoubleEscapeEndState(ParsingInitializer initializer, Consumer<TokenizeState> callback) {
18+
callback.accept(this);
19+
this.scriptDataDoubleEscapedState = initializer.getTokenizeState(ScriptDataDoubleEscapedState.class);
20+
this.scriptDataEscapedState = initializer.getTokenizeState(ScriptDataEscapedState.class);
21+
}
22+
23+
@Override
24+
public void process(SharedContext context, ParsingContext parsingContext, int ch) throws IOException {
25+
switch(ch) {
26+
case '\t':
27+
case '\n':
28+
case '\f':
29+
case ' ':
30+
case '/':
31+
case '>':
32+
if (parsingContext.getTemporaryBuffer().equals("script")) {
33+
context.setTokenizeState(scriptDataEscapedState);
34+
} else {
35+
context.setTokenizeState(scriptDataDoubleEscapedState);
36+
}
37+
context.emit(new CharacterToken(ch));
38+
default:
39+
if (ASCIIUtil.isASCIIAlpha(ch)) {
40+
parsingContext.appendToTemporaryBuffer(ASCIIUtil.toASCIILowerCase(ch));
41+
context.emit(new CharacterToken(ch));
42+
} else {
43+
parsingContext.readerHandle().unread(ch);
44+
context.setTokenizeState(scriptDataDoubleEscapedState);
45+
}
46+
}
47+
}
48+
49+
}
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
package com.github.webicitybrowser.spiderhtml.tokenize;
2+
3+
import java.io.IOException;
4+
import java.util.function.Consumer;
5+
6+
import com.github.webicitybrowser.spec.infra.util.ASCIIUtil;
7+
import com.github.webicitybrowser.spiderhtml.context.ParsingContext;
8+
import com.github.webicitybrowser.spiderhtml.context.ParsingInitializer;
9+
import com.github.webicitybrowser.spiderhtml.context.SharedContext;
10+
import com.github.webicitybrowser.spiderhtml.token.CharacterToken;
11+
12+
public class ScriptDataDoubleEscapeStartState implements TokenizeState {
13+
14+
private final ScriptDataDoubleEscapedState scriptDataDoubleEscapedState;
15+
private final ScriptDataEscapedState scriptDataEscapedState;
16+
17+
public ScriptDataDoubleEscapeStartState(ParsingInitializer initializer, Consumer<TokenizeState> callback) {
18+
callback.accept(this);
19+
this.scriptDataDoubleEscapedState = initializer.getTokenizeState(ScriptDataDoubleEscapedState.class);
20+
this.scriptDataEscapedState = initializer.getTokenizeState(ScriptDataEscapedState.class);
21+
}
22+
23+
@Override
24+
public void process(SharedContext context, ParsingContext parsingContext, int ch) throws IOException {
25+
switch(ch) {
26+
case '\t':
27+
case '\n':
28+
case '\f':
29+
case ' ':
30+
case '/':
31+
case '>':
32+
if (parsingContext.getTemporaryBuffer().equals("script")) {
33+
context.setTokenizeState(scriptDataDoubleEscapedState);
34+
} else {
35+
context.setTokenizeState(scriptDataEscapedState);
36+
}
37+
context.emit(new CharacterToken(ch));
38+
default:
39+
if (ASCIIUtil.isASCIIAlpha(ch)) {
40+
parsingContext.appendToTemporaryBuffer(ASCIIUtil.toASCIILowerCase(ch));
41+
context.emit(new CharacterToken(ch));
42+
} else {
43+
parsingContext.readerHandle().unread(ch);
44+
context.setTokenizeState(scriptDataEscapedState);
45+
}
46+
}
47+
}
48+
49+
}
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
package com.github.webicitybrowser.spiderhtml.tokenize;
2+
3+
import java.io.IOException;
4+
import java.util.function.Consumer;
5+
6+
import com.github.webicitybrowser.spec.html.parse.ParseError;
7+
import com.github.webicitybrowser.spiderhtml.context.ParsingContext;
8+
import com.github.webicitybrowser.spiderhtml.context.ParsingInitializer;
9+
import com.github.webicitybrowser.spiderhtml.context.SharedContext;
10+
import com.github.webicitybrowser.spiderhtml.token.CharacterToken;
11+
import com.github.webicitybrowser.spiderhtml.token.EOFToken;
12+
13+
public class ScriptDataDoubleEscapedDashDashState implements TokenizeState {
14+
15+
private final ScriptDataDoubleEscapedLessThanSignState scriptDataDoubleEscapedLessThanSignState;
16+
private final ScriptDataState scriptDataState;
17+
private final ScriptDataDoubleEscapedState scriptDataDoubleEscapedState;
18+
19+
public ScriptDataDoubleEscapedDashDashState(ParsingInitializer initializer, Consumer<TokenizeState> callback) {
20+
callback.accept(this);
21+
this.scriptDataDoubleEscapedLessThanSignState = initializer.getTokenizeState(ScriptDataDoubleEscapedLessThanSignState.class);
22+
this.scriptDataState = initializer.getTokenizeState(ScriptDataState.class);
23+
this.scriptDataDoubleEscapedState = initializer.getTokenizeState(ScriptDataDoubleEscapedState.class);
24+
}
25+
26+
@Override
27+
public void process(SharedContext context, ParsingContext parsingContext, int ch) throws IOException {
28+
switch (ch) {
29+
case '-':
30+
context.emit(new CharacterToken('-'));
31+
break;
32+
case '<':
33+
context.setTokenizeState(scriptDataDoubleEscapedLessThanSignState);
34+
context.emit(new CharacterToken('<'));
35+
break;
36+
case '>':
37+
context.setTokenizeState(scriptDataState);
38+
context.emit(new CharacterToken('<'));
39+
break;
40+
case 0:
41+
context.recordError(ParseError.UNEXPECTED_NULL_CHARACTER);
42+
context.setReturnState(scriptDataDoubleEscapedState);
43+
context.emit(new CharacterToken('\uFFFD'));
44+
break;
45+
case -1:
46+
context.recordError(ParseError.EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
47+
context.emit(new EOFToken());
48+
break;
49+
default:
50+
context.setReturnState(scriptDataDoubleEscapedState);
51+
context.emit(new CharacterToken(ch));
52+
}
53+
}
54+
55+
}
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
package com.github.webicitybrowser.spiderhtml.tokenize;
2+
3+
import java.io.IOException;
4+
import java.util.function.Consumer;
5+
6+
import com.github.webicitybrowser.spec.html.parse.ParseError;
7+
import com.github.webicitybrowser.spiderhtml.context.ParsingContext;
8+
import com.github.webicitybrowser.spiderhtml.context.ParsingInitializer;
9+
import com.github.webicitybrowser.spiderhtml.context.SharedContext;
10+
import com.github.webicitybrowser.spiderhtml.token.CharacterToken;
11+
import com.github.webicitybrowser.spiderhtml.token.EOFToken;
12+
13+
public class ScriptDataDoubleEscapedDashState implements TokenizeState {
14+
15+
private final ScriptDataDoubleEscapedDashDashState scriptDataDoubleEscapedDashDashState;
16+
private final ScriptDataDoubleEscapedLessThanSignState scriptDataDoubleEscapedLessThanSignState;
17+
private final ScriptDataDoubleEscapedState scriptDataDoubleEscapedState;
18+
19+
public ScriptDataDoubleEscapedDashState(ParsingInitializer initializer, Consumer<TokenizeState> callback) {
20+
callback.accept(this);
21+
this.scriptDataDoubleEscapedDashDashState = initializer.getTokenizeState(ScriptDataDoubleEscapedDashDashState.class);
22+
this.scriptDataDoubleEscapedLessThanSignState = initializer.getTokenizeState(ScriptDataDoubleEscapedLessThanSignState.class);
23+
this.scriptDataDoubleEscapedState = initializer.getTokenizeState(ScriptDataDoubleEscapedState.class);
24+
}
25+
26+
@Override
27+
public void process(SharedContext context, ParsingContext parsingContext, int ch) throws IOException {
28+
switch (ch) {
29+
case '-':
30+
context.setTokenizeState(scriptDataDoubleEscapedDashDashState);
31+
context.emit(new CharacterToken('-'));
32+
break;
33+
case '<':
34+
context.setTokenizeState(scriptDataDoubleEscapedLessThanSignState);
35+
context.emit(new CharacterToken('<'));
36+
break;
37+
case 0:
38+
context.recordError(ParseError.UNEXPECTED_NULL_CHARACTER);
39+
context.setReturnState(scriptDataDoubleEscapedState);
40+
context.emit(new CharacterToken('\uFFFD'));
41+
break;
42+
case -1:
43+
context.recordError(ParseError.EOF_IN_SCRIPT_HTML_COMMENT_LIKE_TEXT);
44+
context.emit(new EOFToken());
45+
break;
46+
default:
47+
context.setReturnState(scriptDataDoubleEscapedState);
48+
context.emit(new CharacterToken(ch));
49+
}
50+
}
51+
52+
}
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
package com.github.webicitybrowser.spiderhtml.tokenize;
2+
3+
import java.io.IOException;
4+
import java.util.function.Consumer;
5+
6+
import com.github.webicitybrowser.spiderhtml.context.ParsingContext;
7+
import com.github.webicitybrowser.spiderhtml.context.ParsingInitializer;
8+
import com.github.webicitybrowser.spiderhtml.context.SharedContext;
9+
import com.github.webicitybrowser.spiderhtml.token.CharacterToken;
10+
11+
public class ScriptDataDoubleEscapedLessThanSignState implements TokenizeState {
12+
13+
private final ScriptDataDoubleEscapeEndState scriptDataDoubleEscapeEndState;
14+
private final ScriptDataDoubleEscapedState scriptDataDoubleEscapedState;
15+
16+
public ScriptDataDoubleEscapedLessThanSignState(ParsingInitializer initializer, Consumer<TokenizeState> callback) {
17+
callback.accept(this);
18+
this.scriptDataDoubleEscapeEndState = initializer.getTokenizeState(ScriptDataDoubleEscapeEndState.class);
19+
this.scriptDataDoubleEscapedState = initializer.getTokenizeState(ScriptDataDoubleEscapedState.class);
20+
}
21+
22+
@Override
23+
public void process(SharedContext context, ParsingContext parsingContext, int ch) throws IOException {
24+
if (ch == '/') {
25+
parsingContext.resetTemporaryBuffer();
26+
context.setTokenizeState(scriptDataDoubleEscapeEndState);
27+
context.emit(new CharacterToken('/'));
28+
} else {
29+
parsingContext.readerHandle().unread(ch);
30+
context.setTokenizeState(scriptDataDoubleEscapedState);
31+
}
32+
}
33+
34+
}

0 commit comments

Comments
 (0)