From c736fec7dadc4f2c0f7580d0ba13373d2616a85e Mon Sep 17 00:00:00 2001 From: Markus Unterwaditzer Date: Fri, 11 Aug 2023 10:59:23 +0200 Subject: [PATCH] Update html5lib-tests (#460) * wip on updating html5lib-tests * fix up parse error parsing * add better debug output * wip * wip * wip * wip * adjust all switches to BogusComment (according to html5gum) * wip * wip * wip * wip * wip * wip * wip (test3 done) * fix test1 * wip on entities.test * get rid of addnl_allowed in charref tokenizer * remove bogusname??? * fix escapeFlag.test: End tag surrounded by bogus comment in RCDATA or RAWTEXT (in state RawData(Rawtext)) * update html5lib tests * Revert "remove bogusname???" This reverts commit 575b07719ca860cf710839cf082ed875a29b3236. * wip restore bogusname * more bugfixes * Revert "wip restore bogusname" This reverts commit eb281656da577d40ab506d75de4b722b49ed3d86. * fix a bug when peeking characters in BeforeAttributeValue * make eat() pre-process input characters input where it matters (JSON-escaped): ", result: Option, + is_consumed_in_attribute: bool, num: u32, num_too_big: bool, @@ -61,12 +61,10 @@ pub struct CharRefTokenizer { } impl CharRefTokenizer { - // NB: We assume that we have an additional allowed character iff we're - // tokenizing in an attribute value. - pub fn new(addnl_allowed: Option) -> CharRefTokenizer { + pub fn new(is_consumed_in_attribute: bool) -> CharRefTokenizer { CharRefTokenizer { + is_consumed_in_attribute, state: Begin, - addnl_allowed, result: None, num: 0, num_too_big: false, @@ -140,20 +138,18 @@ impl CharRefTokenizer { input: &mut BufferQueue, ) -> Status { match unwrap_or_return!(tokenizer.peek(input), Stuck) { - '\t' | '\n' | '\x0C' | ' ' | '<' | '&' => self.finish_none(), - c if Some(c) == self.addnl_allowed => self.finish_none(), + 'a'..='z' | 'A'..='Z' | '0'..='9' => { + self.state = Named; + self.name_buf_opt = Some(StrTendril::new()); + Progress + }, '#' => { tokenizer.discard_char(input); self.state = Octothorpe; Progress }, - - _ => { - self.state = Named; - self.name_buf_opt = Some(StrTendril::new()); - Progress - }, + _ => self.finish_none(), } } @@ -277,7 +273,10 @@ impl CharRefTokenizer { tokenizer: &mut Tokenizer, input: &mut BufferQueue, ) -> Status { - let c = unwrap_or_return!(tokenizer.get_char(input), Stuck); + // peek + discard skips over newline normalization, therefore making it easier to + // un-consume + let c = unwrap_or_return!(tokenizer.peek(input), Stuck); + tokenizer.discard_char(input); self.name_buf_mut().push_char(c); match data::NAMED_ENTITIES.get(&self.name_buf()[..]) { // We have either a full match or a prefix of one. @@ -356,26 +355,20 @@ impl CharRefTokenizer { Some(self.name_buf()[name_len..].chars().next().unwrap()) }; - // "If the character reference is being consumed as part of an - // attribute, and the last character matched is not a U+003B - // SEMICOLON character (;), and the next character is either a - // U+003D EQUALS SIGN character (=) or an alphanumeric ASCII - // character, then, for historical reasons, all the characters - // that were matched after the U+0026 AMPERSAND character (&) - // must be unconsumed, and nothing is returned. 
However, if - // this next character is in fact a U+003D EQUALS SIGN - // character (=), then this is a parse error" - - let unconsume_all = match (self.addnl_allowed, last_matched, next_after) { + // If the character reference was consumed as part of an attribute, and the last + // character matched is not a U+003B SEMICOLON character (;), and the next input + // character is either a U+003D EQUALS SIGN character (=) or an ASCII alphanumeric, + // then, for historical reasons, flush code points consumed as a character + // reference and switch to the return state. + + let unconsume_all = match (self.is_consumed_in_attribute, last_matched, next_after) { (_, ';', _) => false, - (Some(_), _, Some('=')) => { - tokenizer.emit_error(Borrowed( - "Equals sign after character reference in attribute", - )); - true - }, - (Some(_), _, Some(c)) if c.is_ascii_alphanumeric() => true, + (true, _, Some('=')) => true, + (true, _, Some(c)) if c.is_ascii_alphanumeric() => true, _ => { + // 1. If the last character matched is not a U+003B SEMICOLON character + // (;), then this is a missing-semicolon-after-character-reference parse + // error. tokenizer.emit_error(Borrowed( "Character reference does not end with semicolon", )); @@ -388,6 +381,7 @@ impl CharRefTokenizer { self.finish_none() } else { input.push_front(StrTendril::from_slice(&self.name_buf()[name_len..])); + tokenizer.ignore_lf = false; self.result = Some(CharRef { chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()], num_chars: if c2 == 0 { 1 } else { 2 }, @@ -403,7 +397,10 @@ impl CharRefTokenizer { tokenizer: &mut Tokenizer, input: &mut BufferQueue, ) -> Status { - let c = unwrap_or_return!(tokenizer.get_char(input), Stuck); + // peek + discard skips over newline normalization, therefore making it easier to + // un-consume + let c = unwrap_or_return!(tokenizer.peek(input), Stuck); + tokenizer.discard_char(input); self.name_buf_mut().push_char(c); match c { _ if c.is_ascii_alphanumeric() => return Progress, diff --git a/html5ever/src/tokenizer/mod.rs b/html5ever/src/tokenizer/mod.rs index 20a96204..0fb0d014 100644 --- a/html5ever/src/tokenizer/mod.rs +++ b/html5ever/src/tokenizer/mod.rs @@ -46,6 +46,7 @@ pub enum ProcessResult { } #[must_use] +#[derive(Debug)] pub enum TokenizerResult { Done, Script(Handle), @@ -318,14 +319,20 @@ impl Tokenizer { // Check if the next characters are an ASCII case-insensitive match. See // BufferQueue::eat. // - // NB: this doesn't do input stream preprocessing or set the current input - // character. + // NB: this doesn't set the current input character. fn eat( &mut self, input: &mut BufferQueue, pat: &str, eq: fn(&u8, &u8) -> bool, ) -> Option { + if self.ignore_lf { + self.ignore_lf = false; + if self.peek(input) == Some('\n') { + self.discard_char(input); + } + } + input.push_front(replace(&mut self.temp_buf, StrTendril::new())); match input.eat(pat, eq) { None if self.at_eof => Some(false), @@ -545,10 +552,10 @@ impl Tokenizer { } } - fn consume_char_ref(&mut self, addnl_allowed: Option) { - // NB: The char ref tokenizer assumes we have an additional allowed - // character iff we're tokenizing in an attribute value. 
- self.char_ref_tokenizer = Some(Box::new(CharRefTokenizer::new(addnl_allowed))); + fn consume_char_ref(&mut self) { + self.char_ref_tokenizer = Some( + Box::new(CharRefTokenizer::new(matches!(self.state, states::AttributeValue(_)))) + ); } fn emit_eof(&mut self) { @@ -564,7 +571,16 @@ impl Tokenizer { } fn discard_char(&mut self, input: &mut BufferQueue) { - self.get_char(input); + // peek() deals in un-processed characters (no newline normalization), while get_char() + // does. + // + // since discard_char is supposed to be used in combination with peek(), discard_char must + // discard a single raw input character, not a normalized newline. + if self.reconsume { + self.reconsume = false; + } else { + input.next(); + } } fn emit_error(&mut self, error: Cow<'static, str>) { @@ -632,8 +648,7 @@ macro_rules! go ( ( $me:ident : reconsume $s:ident $k1:expr ) => ({ $me.reconsume = true; go!($me: to $s $k1); }); ( $me:ident : reconsume $s:ident $k1:ident $k2:expr ) => ({ $me.reconsume = true; go!($me: to $s $k1 $k2); }); - ( $me:ident : consume_char_ref ) => ({ $me.consume_char_ref(None); return ProcessResult::Continue; }); - ( $me:ident : consume_char_ref $addnl:expr ) => ({ $me.consume_char_ref(Some($addnl)); return ProcessResult::Continue; }); + ( $me:ident : consume_char_ref ) => ({ $me.consume_char_ref(); return ProcessResult::Continue; }); // We have a default next state after emitting a tag, but the sink can override. ( $me:ident : emit_tag $s:ident ) => ({ @@ -769,9 +784,9 @@ impl Tokenizer { //§ tag-open-state states::TagOpen => loop { match get_char!(self, input) { - '!' => go!(self: clear_temp; to MarkupDeclarationOpen), + '!' => go!(self: to MarkupDeclarationOpen), '/' => go!(self: to EndTagOpen), - '?' => go!(self: error; clear_comment; push_comment '?'; to BogusComment), + '?' 
=> go!(self: error; clear_comment; reconsume BogusComment), c => match lower_ascii_letter(c) { Some(cl) => go!(self: create_tag StartTag cl; to TagName), None => go!(self: error; emit '<'; reconsume Data), @@ -783,12 +798,9 @@ impl Tokenizer { states::EndTagOpen => loop { match get_char!(self, input) { '>' => go!(self: error; to Data), - '\0' => { - go!(self: error; clear_comment; push_comment '\u{fffd}'; to BogusComment) - }, c => match lower_ascii_letter(c) { Some(cl) => go!(self: create_tag EndTag cl; to TagName), - None => go!(self: error; clear_comment; push_comment c; to BogusComment), + None => go!(self: error; clear_comment; reconsume BogusComment), }, } }, @@ -852,7 +864,7 @@ impl Tokenizer { match c { '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeAttributeName), '/' => go!(self: to SelfClosingStartTag), - '>' => go!(self: emit_tag Data), + '>' => go!(self: clear_temp; emit_tag Data), _ => (), } } @@ -1014,9 +1026,6 @@ impl Tokenizer { '\t' | '\n' | '\r' | '\x0C' | ' ' => go!(self: discard_char input), '"' => go!(self: discard_char input; to AttributeValue DoubleQuoted), '\'' => go!(self: discard_char input; to AttributeValue SingleQuoted), - '\0' => { - go!(self: discard_char input; error; push_value '\u{fffd}'; to AttributeValue Unquoted) - }, '>' => go!(self: discard_char input; error; emit_tag Data), _ => go!(self: to AttributeValue Unquoted), } @@ -1026,7 +1035,7 @@ impl Tokenizer { states::AttributeValue(DoubleQuoted) => loop { match pop_except_from!(self, input, small_char_set!('\r' '"' '&' '\0' '\n')) { FromSet('"') => go!(self: to AfterAttributeValueQuoted), - FromSet('&') => go!(self: consume_char_ref '"'), + FromSet('&') => go!(self: consume_char_ref), FromSet('\0') => go!(self: error; push_value '\u{fffd}'), FromSet(c) => go!(self: push_value c), NotFromSet(ref b) => go!(self: append_value b), @@ -1037,7 +1046,7 @@ impl Tokenizer { states::AttributeValue(SingleQuoted) => loop { match pop_except_from!(self, input, small_char_set!('\r' '\'' '&' '\0' '\n')) { FromSet('\'') => go!(self: to AfterAttributeValueQuoted), - FromSet('&') => go!(self: consume_char_ref '\''), + FromSet('&') => go!(self: consume_char_ref), FromSet('\0') => go!(self: error; push_value '\u{fffd}'), FromSet(c) => go!(self: push_value c), NotFromSet(ref b) => go!(self: append_value b), @@ -1054,7 +1063,7 @@ impl Tokenizer { FromSet('\t') | FromSet('\n') | FromSet('\x0C') | FromSet(' ') => { go!(self: to BeforeAttributeName) }, - FromSet('&') => go!(self: consume_char_ref '>'), + FromSet('&') => go!(self: consume_char_ref), FromSet('>') => go!(self: emit_tag Data), FromSet('\0') => go!(self: error; push_value '\u{fffd}'), FromSet(c) => { @@ -1110,12 +1119,46 @@ impl Tokenizer { //§ comment-state states::Comment => loop { match get_char!(self, input) { + c @ '<' => go!(self: push_comment c; to CommentLessThanSign), '-' => go!(self: to CommentEndDash), '\0' => go!(self: error; push_comment '\u{fffd}'), c => go!(self: push_comment c), } }, + //§ comment-less-than-sign-state + states::CommentLessThanSign => loop { + match get_char!(self, input) { + c @ '!' 
=> go!(self: push_comment c; to CommentLessThanSignBang), + c @ '<' => go!(self: push_comment c), + _ => go!(self: reconsume Comment), + } + }, + + //§ comment-less-than-sign-bang + states::CommentLessThanSignBang => loop { + match get_char!(self, input) { + '-' => go!(self: to CommentLessThanSignBangDash), + _ => go!(self: reconsume Comment), + } + }, + + //§ comment-less-than-sign-bang-dash + states::CommentLessThanSignBangDash => loop { + match get_char!(self, input) { + '-' => go!(self: to CommentLessThanSignBangDashDash), + _ => go!(self: reconsume CommentEndDash), + } + }, + + //§ comment-less-than-sign-bang-dash-dash + states::CommentLessThanSignBangDashDash => loop { + match get_char!(self, input) { + '>' => go!(self: reconsume CommentEnd), + _ => go!(self: error; reconsume CommentEnd), + } + }, + //§ comment-end-dash-state states::CommentEndDash => loop { match get_char!(self, input) { @@ -1129,10 +1172,9 @@ impl Tokenizer { states::CommentEnd => loop { match get_char!(self, input) { '>' => go!(self: emit_comment; to Data), - '\0' => go!(self: error; append_comment "--\u{fffd}"; to Comment), - '!' => go!(self: error; to CommentEndBang), - '-' => go!(self: error; push_comment '-'), - c => go!(self: error; append_comment "--"; push_comment c; to Comment), + '!' => go!(self: to CommentEndBang), + '-' => go!(self: push_comment '-'), + _ => go!(self: append_comment "--"; reconsume Comment), } }, @@ -1140,7 +1182,7 @@ impl Tokenizer { states::CommentEndBang => loop { match get_char!(self, input) { '-' => go!(self: append_comment "--!"; to CommentEndDash), - '>' => go!(self: emit_comment; to Data), + '>' => go!(self: error; emit_comment; to Data), '\0' => go!(self: error; append_comment "--!\u{fffd}"; to Comment), c => go!(self: append_comment "--!"; push_comment c; to Comment), } @@ -1150,6 +1192,7 @@ impl Tokenizer { states::Doctype => loop { match get_char!(self, input) { '\t' | '\n' | '\x0C' | ' ' => go!(self: to BeforeDoctypeName), + '>' => go!(self: reconsume BeforeDoctypeName), _ => go!(self: error; reconsume BeforeDoctypeName), } }, @@ -1187,7 +1230,7 @@ impl Tokenizer { match get_char!(self, input) { '\t' | '\n' | '\x0C' | ' ' => (), '>' => go!(self: emit_doctype; to Data), - _ => go!(self: error; force_quirks; to BogusDoctype), + _ => go!(self: error; force_quirks; reconsume BogusDoctype), } } }, @@ -1203,7 +1246,7 @@ impl Tokenizer { go!(self: error; clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind) }, '>' => go!(self: error; force_quirks; emit_doctype; to Data), - _ => go!(self: error; force_quirks; to BogusDoctype), + _ => go!(self: error; force_quirks; reconsume BogusDoctype), } }, @@ -1214,7 +1257,7 @@ impl Tokenizer { '"' => go!(self: clear_doctype_id kind; to DoctypeIdentifierDoubleQuoted kind), '\'' => go!(self: clear_doctype_id kind; to DoctypeIdentifierSingleQuoted kind), '>' => go!(self: error; force_quirks; emit_doctype; to Data), - _ => go!(self: error; force_quirks; to BogusDoctype), + _ => go!(self: error; force_quirks; reconsume BogusDoctype), } }, @@ -1251,7 +1294,7 @@ impl Tokenizer { '\'' => { go!(self: error; clear_doctype_id System; to DoctypeIdentifierSingleQuoted System) }, - _ => go!(self: error; force_quirks; to BogusDoctype), + _ => go!(self: error; force_quirks; reconsume BogusDoctype), } }, @@ -1260,7 +1303,7 @@ impl Tokenizer { match get_char!(self, input) { '\t' | '\n' | '\x0C' | ' ' => (), '>' => go!(self: emit_doctype; to Data), - _ => go!(self: error; to BogusDoctype), + _ => go!(self: error; reconsume BogusDoctype), } }, @@ -1275,7 
+1318,7 @@ impl Tokenizer { '\'' => { go!(self: clear_doctype_id System; to DoctypeIdentifierSingleQuoted System) }, - _ => go!(self: error; force_quirks; to BogusDoctype), + _ => go!(self: error; force_quirks; reconsume BogusDoctype), } }, @@ -1283,6 +1326,7 @@ impl Tokenizer { states::BogusDoctype => loop { match get_char!(self, input) { '>' => go!(self: emit_doctype; to Data), + '\0' => go!(self: error), _ => (), } }, @@ -1291,7 +1335,7 @@ impl Tokenizer { states::BogusComment => loop { match get_char!(self, input) { '>' => go!(self: emit_comment; to Data), - '\0' => go!(self: push_comment '\u{fffd}'), + '\0' => go!(self: error; push_comment '\u{fffd}'), c => go!(self: push_comment c), } }, @@ -1311,7 +1355,7 @@ impl Tokenizer { go!(self: clear_temp; to CdataSection); } } - go!(self: error; to BogusComment); + go!(self: error; clear_comment; to BogusComment); } }, @@ -1455,13 +1499,14 @@ impl Tokenizer { states::BeforeAttributeName | states::AttributeName | states::AfterAttributeName | - states::BeforeAttributeValue | states::AttributeValue(_) | states::AfterAttributeValueQuoted | states::SelfClosingStartTag | states::ScriptDataEscapedDash(_) | states::ScriptDataEscapedDashDash(_) => go!(self: error_eof; to Data), + states::BeforeAttributeValue => go!(self: reconsume AttributeValue Unquoted), + states::TagOpen => go!(self: error_eof; emit '<'; to Data), states::EndTagOpen => go!(self: error_eof; emit '<'; emit '/'; to Data), @@ -1493,6 +1538,12 @@ impl Tokenizer { states::CommentEnd | states::CommentEndBang => go!(self: error_eof; emit_comment; to Data), + states::CommentLessThanSign | states::CommentLessThanSignBang => go!(self: reconsume Comment), + + states::CommentLessThanSignBangDash => go!(self: reconsume CommentEndDash), + + states::CommentLessThanSignBangDashDash => go!(self: reconsume CommentEnd), + states::Doctype | states::BeforeDoctypeName => { go!(self: error_eof; create_doctype; force_quirks; emit_doctype; to Data) }, diff --git a/html5ever/src/tokenizer/states.rs b/html5ever/src/tokenizer/states.rs index d455e9a8..3c320188 100644 --- a/html5ever/src/tokenizer/states.rs +++ b/html5ever/src/tokenizer/states.rs @@ -73,6 +73,10 @@ pub enum State { CommentStart, CommentStartDash, Comment, + CommentLessThanSign, + CommentLessThanSignBang, + CommentLessThanSignBangDash, + CommentLessThanSignBangDashDash, CommentEndDash, CommentEnd, CommentEndBang, diff --git a/html5ever/src/tree_builder/mod.rs b/html5ever/src/tree_builder/mod.rs index 39bd55c1..98b209fb 100644 --- a/html5ever/src/tree_builder/mod.rs +++ b/html5ever/src/tree_builder/mod.rs @@ -1630,7 +1630,6 @@ where local_name!("xlink:show") => Some(qualname!("xlink" xlink "show")), local_name!("xlink:title") => Some(qualname!("xlink" xlink "title")), local_name!("xlink:type") => Some(qualname!("xlink" xlink "type")), - local_name!("xml:base") => Some(qualname!("xml" xml "base")), local_name!("xml:lang") => Some(qualname!("xml" xml "lang")), local_name!("xml:space") => Some(qualname!("xml" xml "space")), local_name!("xmlns") => Some(qualname!("" xmlns "xmlns")), @@ -1662,18 +1661,13 @@ where fn unexpected_start_tag_in_foreign_content(&mut self, tag: Tag) -> ProcessResult { self.unexpected(&tag); - if self.is_fragment() { - self.foreign_start_tag(tag) - } else { + while !self.current_node_in(|n| { + *n.ns == ns!(html) || + mathml_text_integration_point(n) || + svg_html_integration_point(n) + }) { self.pop(); - while !self.current_node_in(|n| { - *n.ns == ns!(html) || - mathml_text_integration_point(n) || - 
svg_html_integration_point(n) - }) { - self.pop(); - } - ReprocessForeign(TagToken(tag)) } + self.step(self.mode, TagToken(tag)) } } diff --git a/html5ever/src/tree_builder/rules.rs b/html5ever/src/tree_builder/rules.rs index d9a4ba1f..521ce1cc 100644 --- a/html5ever/src/tree_builder/rules.rs +++ b/html5ever/src/tree_builder/rules.rs @@ -337,7 +337,7 @@ where tag @
- => { + => { if !self.in_scope_named(default_scope, tag.name.clone()) { self.unexpected(&tag); } else { @@ -1115,6 +1115,18 @@ where Done } + tag @
=> { + if self.current_node_named(local_name!("option")) { + self.pop(); + } + if self.current_node_named(local_name!("optgroup")) { + self.pop(); + } + self.insert_element_for(tag); + self.pop(); + DoneAckSelfClosing + } + => { if self.open_elems.len() >= 2 && self.current_node_named(local_name!("option")) @@ -1388,7 +1400,7 @@ where
-                <…> => self.unexpected_start_tag_in_foreign_content(tag),
+                <…>
          => self.unexpected_start_tag_in_foreign_content(tag), tag @ => { let unexpected = tag.attrs.iter().any(|attr| { diff --git a/markup5ever/local_names.txt b/markup5ever/local_names.txt index fdd57f82..47c635c8 100644 --- a/markup5ever/local_names.txt +++ b/markup5ever/local_names.txt @@ -810,6 +810,7 @@ scrolldelay scrolling sdev seamless +search sec sech section diff --git a/rcdom/custom-html5lib-tokenizer-tests/regression.test b/rcdom/custom-html5lib-tokenizer-tests/regression.test new file mode 100644 index 00000000..1a7c5e6e --- /dev/null +++ b/rcdom/custom-html5lib-tokenizer-tests/regression.test @@ -0,0 +1,44 @@ +{"tests": [ + +{"description": "Nested HTML comment", +"input": "", +"output": [ + ["StartTag", "j", {"0": ""}] +], +"errors": [ + {"code": "missing-attribute-value"} +]}, + +{"description": "Windows newline in docstring", +"input": "", +"output": [], +"errors": [ + {"code": "eof-in-tag"} +]}, + +{"description": "Windows newline between unquoted attributes", +"input": "", +"output": [], +"errors": [ + {"code": "missing-semicolon-after-character-reference"}, + {"code": "eof-in-tag"} +]}, + +{"description": "Windows newline after bogusname", +"input": "&0\r\n", +"output": [["Character", "&0\n"]], +"errors": []} + +]} diff --git a/rcdom/html5lib-tests b/rcdom/html5lib-tests index c75a9f56..c67f90ea 160000 --- a/rcdom/html5lib-tests +++ b/rcdom/html5lib-tests @@ -1 +1 @@ -Subproject commit c75a9f566fb18aa9746ca45769763cbaf1430ef1 +Subproject commit c67f90eacac14e022b1f2c2e5ac559879581e9ff diff --git a/rcdom/tests/foreach_html5lib_test/mod.rs b/rcdom/tests/foreach_html5lib_test/mod.rs index 6138c98c..f996c28b 100644 --- a/rcdom/tests/foreach_html5lib_test/mod.rs +++ b/rcdom/tests/foreach_html5lib_test/mod.rs @@ -21,7 +21,6 @@ pub fn foreach_html5lib_test( Mk: FnMut(&Path, fs::File), { let mut test_dir_path = src_dir.to_path_buf(); - test_dir_path.push("html5lib-tests"); test_dir_path.push(subdir); let maybe_test_files = fs::read_dir(&test_dir_path); diff --git a/rcdom/tests/html-tokenizer.rs b/rcdom/tests/html-tokenizer.rs index 78b7ca09..520a8301 100644 --- a/rcdom/tests/html-tokenizer.rs +++ b/rcdom/tests/html-tokenizer.rs @@ -11,7 +11,7 @@ mod foreach_html5lib_test; use foreach_html5lib_test::foreach_html5lib_test; use html5ever::tendril::*; -use html5ever::tokenizer::states::{Plaintext, RawData, Rawtext, Rcdata}; +use html5ever::tokenizer::states::{Plaintext, RawData, Rawtext, Rcdata, ScriptData, CdataSection, Data}; use html5ever::tokenizer::BufferQueue; use html5ever::tokenizer::{CharacterTokens, EOFToken, NullCharacterToken, ParseError}; use html5ever::tokenizer::{CommentToken, DoctypeToken, TagToken, Token}; @@ -20,14 +20,29 @@ use html5ever::tokenizer::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts} use html5ever::{namespace_url, ns, Attribute, LocalName, QualName}; use rustc_test::{DynTestFn, DynTestName, TestDesc, TestDescAndFn}; use serde_json::{Map, Value}; -use std::borrow::Cow::Borrowed; +use std::borrow::Cow; use std::default::Default; use std::ffi::OsStr; use std::io::Read; +use std::fs::File; use std::mem::replace; use std::path::Path; use std::{char, env}; + +#[derive(Debug)] +struct TestError(Cow<'static, str>); + +impl PartialEq for TestError { + fn eq(&self, _: &TestError) -> bool { + // TODO: actually match exact error messages + true + } +} + +// some large testcases hang forever without an upper-bound of splits to generate +const MAX_SPLITS: usize = 1000; + // Return all ways of splitting the string into at most n // possibly-empty pieces. 
fn splits(s: &str, n: usize) -> Vec> { @@ -35,12 +50,8 @@ fn splits(s: &str, n: usize) -> Vec> { return vec![vec![s.to_tendril()]]; } - let mut points: Vec = s.char_indices().map(|(n, _)| n).collect(); - points.push(s.len()); - - // do this with iterators? let mut out = vec![]; - for p in points.into_iter() { + for p in s.char_indices().map(|(n, _)| n).chain(Some(s.len())) { let y = &s[p..]; for mut x in splits(&s[..p], n - 1).into_iter() { x.push(y.to_tendril()); @@ -49,11 +60,13 @@ fn splits(s: &str, n: usize) -> Vec> { } out.extend(splits(s, n - 1).into_iter()); + out.truncate(MAX_SPLITS); out } struct TokenLogger { tokens: Vec, + errors: Vec, current_str: StrTendril, exact_errors: bool, } @@ -62,6 +75,7 @@ impl TokenLogger { fn new(exact_errors: bool) -> TokenLogger { TokenLogger { tokens: vec![], + errors: vec![], current_str: StrTendril::new(), exact_errors: exact_errors, } @@ -80,9 +94,9 @@ impl TokenLogger { } } - fn get_tokens(mut self) -> Vec { + fn get_tokens(mut self) -> (Vec, Vec){ self.finish_str(); - self.tokens + (self.tokens, self.errors) } } @@ -99,9 +113,9 @@ impl TokenSink for TokenLogger { self.current_str.push_char('\0'); }, - ParseError(_) => { + ParseError(e) => { if self.exact_errors { - self.push(ParseError(Borrowed(""))); + self.errors.push(TestError(e)); } }, @@ -127,7 +141,7 @@ impl TokenSink for TokenLogger { } } -fn tokenize(input: Vec, opts: TokenizerOpts) -> Vec { +fn tokenize(input: Vec, opts: TokenizerOpts) -> (Vec, Vec) { let sink = TokenLogger::new(opts.exact_errors); let mut tok = Tokenizer::new(sink, opts); let mut buffer = BufferQueue::new(); @@ -247,21 +261,24 @@ fn json_to_token(js: &Value) -> Token { } // Parse the "output" field of the test case into a vector of tokens. -fn json_to_tokens(js: &Value, exact_errors: bool) -> Vec { +fn json_to_tokens(js_tokens: &Value, js_errors: &[Value], exact_errors: bool) -> (Vec, Vec) { // Use a TokenLogger so that we combine character tokens separated // by an ignored error. let mut sink = TokenLogger::new(exact_errors); - for tok in js.get_list().iter() { + for tok in js_tokens.get_list().iter() { assert_eq!( - match *tok { - Value::String(ref s) if &s[..] == "ParseError" => { - sink.process_token(ParseError(Borrowed("")), 0) - }, - _ => sink.process_token(json_to_token(tok), 0), - }, + sink.process_token(json_to_token(tok), 0), + TokenSinkResult::Continue + ); + } + + for err in js_errors { + assert_eq!( + sink.process_token(ParseError(err.find("code").get_str().into()), 0), TokenSinkResult::Continue ); } + sink.get_tokens() } @@ -276,7 +293,7 @@ fn unescape(s: &str) -> Option { if it.peek() != Some(&'u') { panic!("can't understand escape"); } - drop(it.next()); + let _ = it.next(); let hex: String = it.by_ref().take(4).collect(); match u32::from_str_radix(&hex, 16).ok().and_then(char::from_u32) { // Some of the tests use lone surrogates, but we have no @@ -309,7 +326,7 @@ fn unescape_json(js: &Value) -> Value { } } -fn mk_test(desc: String, input: String, expect: Value, opts: TokenizerOpts) -> TestDescAndFn { +fn mk_test(desc: String, input: String, expect: Value, expect_errors: Vec, opts: TokenizerOpts) -> TestDescAndFn { TestDescAndFn { desc: TestDesc::new(DynTestName(desc)), testfn: DynTestFn(Box::new(move || { @@ -321,11 +338,11 @@ fn mk_test(desc: String, input: String, expect: Value, opts: TokenizerOpts) -> T // result but the compiler doesn't catch it! // Possibly mozilla/rust#12223. 
let output = tokenize(input.clone(), opts.clone()); - let expect_toks = json_to_tokens(&expect, opts.exact_errors); + let expect_toks = json_to_tokens(&expect, &expect_errors, opts.exact_errors); if output != expect_toks { panic!( "\ninput: {:?}\ngot: {:?}\nexpected: {:?}", - input, output, expect + input, output, expect_toks ); } } @@ -337,6 +354,7 @@ fn mk_tests(tests: &mut Vec, filename: &str, js: &Value) { let obj = js.get_obj(); let mut input = js.find("input").get_str(); let mut expect = js.find("output").clone(); + let expect_errors = js.get("errors").map(JsonExt::get_list).map(Vec::as_slice).unwrap_or_default(); let desc = format!("tok: {}: {}", filename, js.find("description").get_str()); // "Double-escaped" tests require additional processing of @@ -364,6 +382,9 @@ fn mk_tests(tests: &mut Vec, filename: &str, js: &Value) { "PLAINTEXT state" => Plaintext, "RAWTEXT state" => RawData(Rawtext), "RCDATA state" => RawData(Rcdata), + "Script data state" => RawData(ScriptData), + "CDATA section state" => CdataSection, + "Data state" => Data, s => panic!("don't know state {}", s), }) }) @@ -388,6 +409,7 @@ fn mk_tests(tests: &mut Vec, filename: &str, js: &Value) { newdesc, input.clone(), expect.clone(), + expect_errors.to_owned(), TokenizerOpts { exact_errors: exact_errors, initial_state: state, @@ -407,32 +429,41 @@ fn mk_tests(tests: &mut Vec, filename: &str, js: &Value) { fn tests(src_dir: &Path) -> Vec { let mut tests = vec![]; + let mut add_test = |path: &Path, mut file: File| { + let mut s = String::new(); + file.read_to_string(&mut s) + .ok() + .expect("file reading error"); + let js: Value = serde_json::from_str(&s).ok().expect("json parse error"); + + match js.get_obj().get(&"tests".to_string()) { + Some(&Value::Array(ref lst)) => { + for test in lst.iter() { + mk_tests( + &mut tests, + path.file_name().unwrap().to_str().unwrap(), + test, + ) + } + }, + + // xmlViolation.test doesn't follow this format. + _ => (), + } + }; + foreach_html5lib_test( src_dir, - "tokenizer", + "html5lib-tests/tokenizer", OsStr::new("test"), - |path, mut file| { - let mut s = String::new(); - file.read_to_string(&mut s) - .ok() - .expect("file reading error"); - let js: Value = serde_json::from_str(&s).ok().expect("json parse error"); - - match js.get_obj().get(&"tests".to_string()) { - Some(&Value::Array(ref lst)) => { - for test in lst.iter() { - mk_tests( - &mut tests, - path.file_name().unwrap().to_str().unwrap(), - test, - ); - } - }, + &mut add_test + ); - // xmlViolation.test doesn't follow this format. - _ => (), - } - }, + foreach_html5lib_test( + src_dir, + "custom-html5lib-tokenizer-tests", + OsStr::new("test"), + &mut add_test ); tests diff --git a/rcdom/tests/html-tree-builder.rs b/rcdom/tests/html-tree-builder.rs index 9d882484..e82116f4 100644 --- a/rcdom/tests/html-tree-builder.rs +++ b/rcdom/tests/html-tree-builder.rs @@ -266,7 +266,7 @@ fn tests(src_dir: &Path, ignores: &HashSet) -> Vec { foreach_html5lib_test( src_dir, - "tree-construction", + "html5lib-tests/tree-construction", OsStr::new("dat"), |path, file| { let buf = io::BufReader::new(file);
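For reference, the rule that the new `is_consumed_in_attribute` flag encodes — a named character reference inside an attribute value that does not end in `;` is left as literal text when the next input character is `=` or ASCII alphanumeric — can be sketched as a free-standing predicate. This is an illustrative sketch, not code from the patch; the function name and parameters are hypothetical and simply mirror the `(is_consumed_in_attribute, last_matched, next_after)` tuple matched in the diff above.

```rust
// Sketch of the "unconsume named reference" decision from the char-ref
// tokenizer. `in_attribute` plays the role of `is_consumed_in_attribute`,
// `last_matched` is the last character of the longest entity-name match,
// and `next_after` is the first input character after that match.
fn unconsume_named_reference(
    in_attribute: bool,
    last_matched: char,
    next_after: Option<char>,
) -> bool {
    match (in_attribute, last_matched, next_after) {
        // A reference terminated by ';' is always interpreted.
        (_, ';', _) => false,
        // In an attribute value, "&not=1" or "&notit" keep their literal
        // text for historical reasons instead of expanding "&not".
        (true, _, Some('=')) => true,
        (true, _, Some(c)) if c.is_ascii_alphanumeric() => true,
        // Everywhere else the (prefix) match is used, together with a
        // missing-semicolon-after-character-reference parse error.
        _ => false,
    }
}

fn main() {
    // "&not=" inside an attribute value: leave the text alone.
    assert!(unconsume_named_reference(true, 't', Some('=')));
    // "&not;" anywhere: expand it.
    assert!(!unconsume_named_reference(true, ';', None));
    // "&not " in data: expand it, reporting a missing-semicolon error.
    assert!(!unconsume_named_reference(false, 't', Some(' ')));
}
```

Note that in the updated match the `=` case no longer emits its own "Equals sign after character reference in attribute" error; only the fallthrough arm reports the missing-semicolon parse error.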