Skip to content

Commit

Permalink
Update html5lib-tests (#460)
Browse files Browse the repository at this point in the history
* wip on updating html5lib-tests

* fix up parse error parsing

* add better debug output

* wip

* wip

* wip

* wip

* adjust all switches to BogusComment (according to html5gum)

* wip

* wip

* wip

* wip

* wip

* wip

* wip (test3 done)

* fix test1

* wip on entities.test

* get rid of addnl_allowed in charref tokenizer

* remove bogusname???

* fix escapeFlag.test: End tag surrounded by bogus comment in RCDATA or RAWTEXT (in state RawData(Rawtext))

* update html5lib tests

* Revert "remove bogusname???"

This reverts commit 575b077.

* wip restore bogusname

* more bugfixes

* Revert "wip restore bogusname"

This reverts commit eb28165.

* fix a bug when peeking characters in BeforeAttributeValue

* make eat() pre-process input characters

input where it matters (JSON-escaped):

"<!DOCTYPE0\r\nPUBLIC'"

* update charref states

* add regression tests, skip broken test

* fix hang

* fix bug where ignore_lf was not reset during unconsuming

* fix webkit02.dat-26 test

* fix wbekit02.dat-22

* fix ack self-closing

* fix tests26.dat-19

* fix foreign-fragment.dat-65

corresponding changes in HTML spec are: whatwg/html@f690ad9

and follow-up discussion at whatwg/html#6439

* fix search-element.dat-0

* fix search-element.dat-1

* fix bug in charref tokenizer wrt newline normalization
  • Loading branch information
untitaker authored Aug 11, 2023
1 parent d31791f commit c736fec
Show file tree
Hide file tree
Showing 11 changed files with 266 additions and 133 deletions.
63 changes: 30 additions & 33 deletions html5ever/src/tokenizer/char_ref/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@ enum State {

pub struct CharRefTokenizer {
state: State,
addnl_allowed: Option<char>,
result: Option<CharRef>,
is_consumed_in_attribute: bool,

num: u32,
num_too_big: bool,
Expand All @@ -61,12 +61,10 @@ pub struct CharRefTokenizer {
}

impl CharRefTokenizer {
// NB: We assume that we have an additional allowed character iff we're
// tokenizing in an attribute value.
pub fn new(addnl_allowed: Option<char>) -> CharRefTokenizer {
pub fn new(is_consumed_in_attribute: bool) -> CharRefTokenizer {
CharRefTokenizer {
is_consumed_in_attribute,
state: Begin,
addnl_allowed,
result: None,
num: 0,
num_too_big: false,
Expand Down Expand Up @@ -140,20 +138,18 @@ impl CharRefTokenizer {
input: &mut BufferQueue,
) -> Status {
match unwrap_or_return!(tokenizer.peek(input), Stuck) {
'\t' | '\n' | '\x0C' | ' ' | '<' | '&' => self.finish_none(),
c if Some(c) == self.addnl_allowed => self.finish_none(),
'a'..='z' | 'A'..='Z' | '0'..='9' => {
self.state = Named;
self.name_buf_opt = Some(StrTendril::new());
Progress
},

'#' => {
tokenizer.discard_char(input);
self.state = Octothorpe;
Progress
},

_ => {
self.state = Named;
self.name_buf_opt = Some(StrTendril::new());
Progress
},
_ => self.finish_none(),
}
}

Expand Down Expand Up @@ -277,7 +273,10 @@ impl CharRefTokenizer {
tokenizer: &mut Tokenizer<Sink>,
input: &mut BufferQueue,
) -> Status {
let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
// peek + discard skips over newline normalization, therefore making it easier to
// un-consume
let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
tokenizer.discard_char(input);
self.name_buf_mut().push_char(c);
match data::NAMED_ENTITIES.get(&self.name_buf()[..]) {
// We have either a full match or a prefix of one.
Expand Down Expand Up @@ -356,26 +355,20 @@ impl CharRefTokenizer {
Some(self.name_buf()[name_len..].chars().next().unwrap())
};

// "If the character reference is being consumed as part of an
// attribute, and the last character matched is not a U+003B
// SEMICOLON character (;), and the next character is either a
// U+003D EQUALS SIGN character (=) or an alphanumeric ASCII
// character, then, for historical reasons, all the characters
// that were matched after the U+0026 AMPERSAND character (&)
// must be unconsumed, and nothing is returned. However, if
// this next character is in fact a U+003D EQUALS SIGN
// character (=), then this is a parse error"

let unconsume_all = match (self.addnl_allowed, last_matched, next_after) {
// If the character reference was consumed as part of an attribute, and the last
// character matched is not a U+003B SEMICOLON character (;), and the next input
// character is either a U+003D EQUALS SIGN character (=) or an ASCII alphanumeric,
// then, for historical reasons, flush code points consumed as a character
// reference and switch to the return state.

let unconsume_all = match (self.is_consumed_in_attribute, last_matched, next_after) {
(_, ';', _) => false,
(Some(_), _, Some('=')) => {
tokenizer.emit_error(Borrowed(
"Equals sign after character reference in attribute",
));
true
},
(Some(_), _, Some(c)) if c.is_ascii_alphanumeric() => true,
(true, _, Some('=')) => true,
(true, _, Some(c)) if c.is_ascii_alphanumeric() => true,
_ => {
// 1. If the last character matched is not a U+003B SEMICOLON character
// (;), then this is a missing-semicolon-after-character-reference parse
// error.
tokenizer.emit_error(Borrowed(
"Character reference does not end with semicolon",
));
Expand All @@ -388,6 +381,7 @@ impl CharRefTokenizer {
self.finish_none()
} else {
input.push_front(StrTendril::from_slice(&self.name_buf()[name_len..]));
tokenizer.ignore_lf = false;
self.result = Some(CharRef {
chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
num_chars: if c2 == 0 { 1 } else { 2 },
Expand All @@ -403,7 +397,10 @@ impl CharRefTokenizer {
tokenizer: &mut Tokenizer<Sink>,
input: &mut BufferQueue,
) -> Status {
let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
// peek + discard skips over newline normalization, therefore making it easier to
// un-consume
let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
tokenizer.discard_char(input);
self.name_buf_mut().push_char(c);
match c {
_ if c.is_ascii_alphanumeric() => return Progress,
Expand Down
Loading

0 comments on commit c736fec

Please sign in to comment.