xml5ever: Bubble parser blocking scripts to the caller instead of the TreeSink #591

Merged

4 changes: 2 additions & 2 deletions html5ever/src/driver.rs
@@ -10,10 +10,10 @@
//! High-level interface to the parser.

use crate::buffer_queue::BufferQueue;
use crate::tokenizer::{Tokenizer, TokenizerOpts, TokenizerResult};
use crate::tokenizer::{Tokenizer, TokenizerOpts};
use crate::tree_builder::{create_element, TreeBuilder, TreeBuilderOpts, TreeSink};
use crate::{Attribute, QualName};

use markup5ever::TokenizerResult;
use std::borrow::Cow;

use crate::tendril;
9 changes: 1 addition & 8 deletions html5ever/src/tokenizer/mod.rs
@@ -25,7 +25,7 @@ use crate::util::str::lower_ascii_letter;

use log::{debug, trace};
use mac::format_if;
use markup5ever::{namespace_url, ns, small_char_set};
use markup5ever::{namespace_url, ns, small_char_set, TokenizerResult};
use std::borrow::Cow::{self, Borrowed};
use std::cell::{Cell, RefCell, RefMut};
use std::collections::BTreeMap;
@@ -45,13 +45,6 @@ pub enum ProcessResult<Handle> {
Script(Handle),
}

#[must_use]
#[derive(Debug)]
pub enum TokenizerResult<Handle> {
Done,
Script(Handle),
}

fn option_push(opt_str: &mut Option<StrTendril>, c: char) {
match *opt_str {
Some(ref mut s) => s.push_char(c),
4 changes: 1 addition & 3 deletions html5ever/src/tree_builder/mod.rs
@@ -9,9 +9,7 @@

//! The HTML5 tree builder.

pub use crate::interface::{
create_element, ElemName, ElementFlags, NextParserState, Tracer, TreeSink,
};
pub use crate::interface::{create_element, ElemName, ElementFlags, Tracer, TreeSink};
pub use crate::interface::{AppendNode, AppendText, Attribute, NodeOrText};
pub use crate::interface::{LimitedQuirks, NoQuirks, Quirks, QuirksMode};

9 changes: 8 additions & 1 deletion markup5ever/interface/mod.rs
@@ -13,7 +13,7 @@ use std::fmt;
use tendril::StrTendril;

pub use self::tree_builder::{create_element, AppendNode, AppendText, ElementFlags, NodeOrText};
pub use self::tree_builder::{ElemName, NextParserState, Tracer, TreeSink};
pub use self::tree_builder::{ElemName, Tracer, TreeSink};
pub use self::tree_builder::{LimitedQuirks, NoQuirks, Quirks, QuirksMode};
use super::{LocalName, Namespace, Prefix};

@@ -60,6 +60,13 @@ impl fmt::Debug for ExpandedName<'_> {
}
}

#[must_use]
#[derive(Debug)]
pub enum TokenizerResult<Handle> {
Done,
Script(Handle),
}

/// Helper to quickly create an expanded name.
///
/// Can be used with no namespace as `expanded_name!("", "some_name")`
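A minimal sketch of the caller-side pattern this enables, written against stand-in types so it runs on its own: the `TokenizerResult` enum mirrors the one added above, and the `while let ... feed(...)` loop mirrors the html5ever driver code quoted in the review comment further down. `FakeTokenizer` and its `feed` are hypothetical stand-ins, not part of either crate.

```rust
#[must_use]
#[derive(Debug)]
enum TokenizerResult<Handle> {
    Done,
    Script(Handle),
}

/// Stand-in for a tokenizer front end whose `feed` reports blocking scripts.
struct FakeTokenizer {
    pending_scripts: Vec<&'static str>,
}

impl FakeTokenizer {
    fn feed(&mut self) -> TokenizerResult<&'static str> {
        // Hand any pending script back to the caller; otherwise report Done.
        match self.pending_scripts.pop() {
            Some(node) => TokenizerResult::Script(node),
            None => TokenizerResult::Done,
        }
    }
}

fn main() {
    let mut tok = FakeTokenizer {
        pending_scripts: vec!["script-b", "script-a"],
    };
    // The caller, not the TreeSink, now decides when to run each
    // parser-blocking script and when to resume feeding input.
    while let TokenizerResult::Script(node) = tok.feed() {
        println!("run {node}, then resume the parse");
    }
}
```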
16 changes: 0 additions & 16 deletions markup5ever/interface/tree_builder.rs
@@ -43,17 +43,6 @@ pub enum QuirksMode {
NoQuirks,
}

/// Whether to interrupt further parsing of the current input until
/// the next explicit resumption of the tokenizer, or continue without
/// any interruption.
#[derive(PartialEq, Eq, Copy, Clone, Hash, Debug)]
pub enum NextParserState {
/// Stop further parsing.
Suspend,
/// Continue without interruptions.
Continue,
}

/// Special properties of an element, useful for tagging elements with this information.
#[derive(Default)]
#[non_exhaustive]
@@ -256,11 +245,6 @@ pub trait TreeSink {
/// Called whenever the line number changes.
fn set_current_line(&self, _line_number: u64) {}

/// Indicate that a `script` element is complete.
fn complete_script(&self, _node: &Self::Handle) -> NextParserState {
NextParserState::Continue
}

fn allow_declarative_shadow_roots(&self, _intended_parent: &Self::Handle) -> bool {
true
}
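For the sink side, a compact sketch of what replaces the removed hook, again with stand-in types so it compiles by itself. The `ProcessResult` shape is copied from the html5ever tokenizer diff above; that the tokenizer surfaces a `ProcessResult::Script` from its sink to the caller as `TokenizerResult::Script` is an inference from this PR, not something the diff states outright.

```rust
/// Stand-in token type; the real one carries tag/character data.
struct Token;

/// Mirror of the `ProcessResult` returned by sinks in this PR.
#[allow(dead_code)]
enum ProcessResult<Handle> {
    Continue,
    Script(Handle),
}

/// New-style sink: no `complete_script` callback and no `NextParserState`;
/// the blocking script is returned from `process_token` instead.
trait TokenSink {
    type Handle;
    fn process_token(&self, token: Token) -> ProcessResult<Self::Handle>;
}

struct ScriptAwareSink;

impl TokenSink for ScriptAwareSink {
    type Handle = u32; // hypothetical node id standing in for a DOM handle

    fn process_token(&self, _token: Token) -> ProcessResult<u32> {
        // A real tree builder would only do this after closing a <script>
        // element; this mock always reports one for illustration.
        ProcessResult::Script(42)
    }
}

fn main() {
    let sink = ScriptAwareSink;
    match sink.process_token(Token) {
        ProcessResult::Script(id) => println!("bubble script node {id} to the caller"),
        ProcessResult::Continue => println!("keep tokenizing"),
    }
}
```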
2 changes: 1 addition & 1 deletion markup5ever/lib.rs
@@ -45,6 +45,6 @@ mod util {
pub mod smallcharset;
}

pub use interface::{Attribute, ExpandedName, QualName};
pub use interface::{Attribute, ExpandedName, QualName, TokenizerResult};
pub use util::smallcharset::SmallCharSet;
pub use util::*;
18 changes: 12 additions & 6 deletions rcdom/tests/xml-tokenizer.rs
@@ -14,6 +14,7 @@ use std::env;
use std::ffi::OsStr;
use std::io::Read;
use std::path::Path;
use xml5ever::tokenizer::ProcessResult;

use util::find_tests::foreach_xml5lib_test;
use util::runner::{run_all, Test};
@@ -91,7 +92,9 @@ impl TokenLogger {
}

impl TokenSink for TokenLogger {
fn process_token(&self, token: Token) {
type Handle = ();

fn process_token(&self, token: Token) -> ProcessResult<()> {
match token {
CharacterTokens(b) => {
self.current_str.borrow_mut().push_slice(&b);
Expand Down Expand Up @@ -123,7 +126,8 @@ impl TokenSink for TokenLogger {
EOFToken => (),

_ => self.push(token),
}
};
ProcessResult::Continue
}
}

@@ -134,9 +138,9 @@ fn tokenize_xml(input: Vec<StrTendril>, opts: XmlTokenizerOpts) -> Vec<Token> {

for chunk in input.into_iter() {
buf.push_back(chunk);
tok.feed(&buf);
let _ = tok.feed(&buf);
}
tok.feed(&buf);
let _ = tok.feed(&buf);
tok.end();
tok.sink.get_tokens()
}
@@ -274,9 +278,11 @@ fn json_to_tokens(js: &Value, exact_errors: bool) -> Vec<Token> {
for tok in js.as_array().unwrap().iter() {
match *tok {
Value::String(ref s) if &s[..] == "ParseError" => {
sink.process_token(ParseError(Borrowed("")))
let _ = sink.process_token(ParseError(Borrowed("")));
},
_ => {
let _ = sink.process_token(json_to_token(tok));
},
_ => sink.process_token(json_to_token(tok)),
}
}
sink.get_tokens()
11 changes: 7 additions & 4 deletions xml5ever/benches/xml5ever.rs
@@ -10,15 +10,18 @@ use criterion::{black_box, Criterion};

use markup5ever::buffer_queue::BufferQueue;
use xml5ever::tendril::*;
use xml5ever::tokenizer::{Token, TokenSink, XmlTokenizer};
use xml5ever::tokenizer::{ProcessResult, Token, TokenSink, XmlTokenizer};

struct Sink;

impl TokenSink for Sink {
fn process_token(&self, token: Token) {
type Handle = ();

fn process_token(&self, token: Token) -> ProcessResult<()> {
// Don't use the token, but make sure we don't get
// optimized out entirely.
black_box(token);
ProcessResult::Continue
}
}

@@ -58,9 +61,9 @@ fn run_bench(c: &mut Criterion, name: &str) {
// necessary since our iterator consumes the underlying buffer.
for buf in input.clone().into_iter() {
buffer.push_back(buf);
tok.feed(&buffer);
let _ = tok.feed(&buffer);
}
tok.feed(&buffer);
let _ = tok.feed(&buffer);
tok.end();
})
});
11 changes: 7 additions & 4 deletions xml5ever/examples/simple_xml_tokenizer.rs
@@ -16,15 +16,17 @@ use std::io;

use markup5ever::buffer_queue::BufferQueue;
use xml5ever::tendril::{ByteTendril, ReadExt};
use xml5ever::tokenizer::{CharacterTokens, NullCharacterToken, TagToken};
use xml5ever::tokenizer::{CharacterTokens, NullCharacterToken, ProcessResult, TagToken};
use xml5ever::tokenizer::{CommentToken, PIToken, Pi};
use xml5ever::tokenizer::{Doctype, DoctypeToken, EOFToken};
use xml5ever::tokenizer::{ParseError, Token, TokenSink, XmlTokenizer};

struct SimpleTokenPrinter;

impl TokenSink for SimpleTokenPrinter {
fn process_token(&self, token: Token) {
type Handle = ();

fn process_token(&self, token: Token) -> ProcessResult<()> {
match token {
CharacterTokens(b) => {
println!("TEXT: {}", &*b);
@@ -55,7 +57,8 @@
}) => {
println!("<!DOCTYPE {name:?} {public_id:?}>");
},
}
};
ProcessResult::Continue
}
}

@@ -76,6 +79,6 @@ fn main() {
input_buffer.push_back(input.try_reinterpret().unwrap());
// Here we create and run tokenizer
let tok = XmlTokenizer::new(sink, Default::default());
tok.feed(&input_buffer);
let _ = tok.feed(&input_buffer);
tok.end();
}
12 changes: 8 additions & 4 deletions xml5ever/examples/xml_tokenizer.rs
@@ -17,7 +17,7 @@ use std::io;

use markup5ever::buffer_queue::BufferQueue;
use xml5ever::tendril::{ByteTendril, ReadExt};
use xml5ever::tokenizer::{CharacterTokens, NullCharacterToken, TagToken};
use xml5ever::tokenizer::{CharacterTokens, NullCharacterToken, ProcessResult, TagToken};
use xml5ever::tokenizer::{EmptyTag, EndTag, ShortTag, StartTag};
use xml5ever::tokenizer::{PIToken, Pi};
use xml5ever::tokenizer::{ParseError, Token, TokenSink, XmlTokenizer, XmlTokenizerOpts};
@@ -44,7 +44,9 @@ impl TokenPrinter {
}

impl TokenSink for TokenPrinter {
fn process_token(&self, token: Token) {
type Handle = ();

fn process_token(&self, token: Token) -> ProcessResult<()> {
match token {
CharacterTokens(b) => {
for c in b.chars() {
@@ -84,7 +86,9 @@
self.is_char(false);
println!("OTHER: {token:?}");
},
}
};

ProcessResult::Continue
}
}

@@ -105,7 +109,7 @@ fn main() {
..Default::default()
},
);
tok.feed(&input_buffer);
let _ = tok.feed(&input_buffer);
tok.end();
tok.sink.is_char(false);
}
3 changes: 2 additions & 1 deletion xml5ever/src/driver.rs
@@ -63,7 +63,8 @@ impl<Sink: TreeSink> TendrilSink<tendril::fmt::UTF8> for XmlParser<Sink> {

fn process(&mut self, t: StrTendril) {
self.input_buffer.push_back(t);
self.tokenizer.feed(&self.input_buffer);
// FIXME: Properly support </script> somehow.
let _ = self.tokenizer.feed(&self.input_buffer);
Comment on lines +66 to +67 (Contributor, Author):
This is okay, because it is exactly what html5ever does too:

self.input_buffer.push_back(t);
// FIXME: Properly support </script> somehow.
while let TokenizerResult::Script(_) = self.tokenizer.feed(&self.input_buffer) {}

}

// FIXME: Is it too noisy to report every character decoding error?
15 changes: 5 additions & 10 deletions xml5ever/src/tokenizer/interface.rs
@@ -10,14 +10,13 @@
use std::borrow::Cow;

use crate::tendril::StrTendril;
use crate::tokenizer::ProcessResult;
use crate::{Attribute, QualName};

pub use self::TagKind::{EmptyTag, EndTag, ShortTag, StartTag};
pub use self::Token::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
pub use self::Token::{CommentToken, DoctypeToken, PIToken, TagToken};

use super::states;

/// Tag kind denotes which kind of tag did we encounter.
#[derive(PartialEq, Eq, Hash, Copy, Clone, Debug)]
pub enum TagKind {
@@ -108,16 +107,12 @@ pub enum Token {

/// Types which can receive tokens from the tokenizer.
pub trait TokenSink {
/// Handle to a DOM script element
type Handle;

/// Process a token.
fn process_token(&self, token: Token);
fn process_token(&self, token: Token) -> ProcessResult<Self::Handle>;

/// Signal to the sink that parsing has ended.
fn end(&self) {}

/// The tokenizer will call this after emitting any start tag.
/// This allows the tree builder to change the tokenizer's state.
/// By default no state changes occur.
fn query_state_change(&self) -> Option<states::XmlState> {
None
}
}