Skip to content

Commit

Permalink
perf(parser): support peeking over bytes
Browse files Browse the repository at this point in the history
  • Loading branch information
lucab committed Jul 16, 2024
1 parent 529dde4 commit 828202e
Show file tree
Hide file tree
Showing 9 changed files with 102 additions and 71 deletions.
10 changes: 5 additions & 5 deletions crates/oxc_parser/src/lexer/byte_handlers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -336,12 +336,12 @@ ascii_byte_handler!(PRD(lexer) {
// /
ascii_byte_handler!(SLH(lexer) {
lexer.consume_char();
match lexer.peek() {
Some('/') => {
match lexer.peek_byte() {
Some(b'/') => {
lexer.consume_char();
lexer.skip_single_line_comment()
}
Some('*') => {
Some(b'*') => {
lexer.consume_char();
lexer.skip_multi_line_comment()
}
Expand Down Expand Up @@ -418,9 +418,9 @@ ascii_byte_handler!(QST(lexer) {
} else {
Kind::Question2
}
} else if lexer.peek() == Some('.') {
} else if lexer.peek_byte() == Some(b'.') {
// parse `?.1` as `?` `.1`
if lexer.peek2().is_some_and(|c| c.is_ascii_digit()) {
if lexer.peek_char2().is_some_and(|c| c.is_ascii_digit()) {
Kind::Question
} else {
lexer.consume_char();
Expand Down
8 changes: 4 additions & 4 deletions crates/oxc_parser/src/lexer/identifier.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ impl<'a> Lexer<'a> {
/// Any number of characters can have already been consumed from `self.source` prior to it.
/// `self.source` should be positioned at start of Unicode character.
fn identifier_tail_unicode(&mut self, start_pos: SourcePosition) -> &'a str {
let c = self.peek().unwrap();
let c = self.peek_char().unwrap();
if is_identifier_part_unicode(c) {
self.consume_char();
self.identifier_tail_after_unicode(start_pos)
Expand All @@ -115,7 +115,7 @@ impl<'a> Lexer<'a> {
pub(super) fn identifier_tail_after_unicode(&mut self, start_pos: SourcePosition) -> &'a str {
// Identifier contains a Unicode char, so it probably contains more.
// So just iterate over chars now, instead of bytes.
while let Some(c) = self.peek() {
while let Some(c) = self.peek_char() {
if is_identifier_part(c) {
self.consume_char();
} else if c == '\\' {
Expand Down Expand Up @@ -177,7 +177,7 @@ impl<'a> Lexer<'a> {
// Consume chars until reach end of identifier or another escape
let chunk_start = self.source.position();
loop {
let maybe_char = self.peek();
let maybe_char = self.peek_char();
if maybe_char.is_some_and(is_identifier_part) {
self.consume_char();
continue;
Expand Down Expand Up @@ -272,7 +272,7 @@ impl<'a> Lexer<'a> {
fn private_identifier_not_ascii_id(&mut self) -> Kind {
let b = self.source.peek_byte().unwrap();
if !b.is_ascii() {
let c = self.peek().unwrap();
let c = self.peek_char().unwrap();
if is_identifier_start_unicode(c) {
let start_pos = self.source.position();
self.consume_char();
Expand Down
4 changes: 2 additions & 2 deletions crates/oxc_parser/src/lexer/jsx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ impl<'a> Lexer<'a> {
/// `JSXFragment`
/// { `JSXChildExpressionopt` }
fn read_jsx_child(&mut self) -> Kind {
match self.peek() {
match self.peek_char() {
Some('<') => {
self.consume_char();
Kind::LAngle
Expand Down Expand Up @@ -122,7 +122,7 @@ impl<'a> Lexer<'a> {
// Unicode chars are rare in identifiers, so cold branch to keep common path for ASCII
// as fast as possible
cold_branch(|| {
while let Some(c) = self.peek() {
while let Some(c) = self.peek_char() {
if c == '-' || is_identifier_part(c) {
self.consume_char();
} else {
Expand Down
18 changes: 15 additions & 3 deletions crates/oxc_parser/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -251,15 +251,27 @@ impl<'a> Lexer<'a> {
self.source.next_char().unwrap()
}

/// Peek the next byte without advancing the position.
///
/// Thin delegate: returns exactly what `self.source.peek_byte()` returns.
/// Takes `&self`, so the lexer itself is not mutated by the call.
///
/// Intended for call sites that only compare against ASCII bytes
/// (e.g. `Some(b'/')`), where decoding a full `char` is unnecessary.
// NOTE(review): `None` presumably means end of source — confirm against `Source::peek_byte`.
#[inline]
fn peek_byte(&self) -> Option<u8> {
self.source.peek_byte()
}

/// Peek the next two bytes without advancing the position.
///
/// Thin delegate: returns exactly what `self.source.peek_byte2()` returns,
/// which is `None` unless at least two bytes remain in the source.
/// Takes `&self`, so the lexer itself is not mutated by the call.
///
/// Lets a two-byte ASCII sequence be matched with a single comparison,
/// e.g. `self.peek_byte2() == Some([b'.', b'.'])`.
#[inline]
fn peek_byte2(&self) -> Option<[u8; 2]> {
self.source.peek_byte2()
}

/// Peek the next char without advancing the position
#[inline]
fn peek(&self) -> Option<char> {
fn peek_char(&self) -> Option<char> {
self.source.peek_char()
}

/// Peek the char after the next one without advancing the position
#[inline]
fn peek2(&self) -> Option<char> {
fn peek_char2(&self) -> Option<char> {
self.source.peek_char2()
}

Expand All @@ -284,7 +296,7 @@ impl<'a> Lexer<'a> {
/// Return `IllegalCharacter` Error or `UnexpectedEnd` if EOF
fn unexpected_err(&mut self) {
let offset = self.current_offset();
match self.peek() {
match self.peek_char() {
Some(c) => self.error(diagnostics::invalid_character(c, offset)),
None => self.error(diagnostics::unexpected_end(offset)),
}
Expand Down
66 changes: 31 additions & 35 deletions crates/oxc_parser/src/lexer/numeric.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,19 @@ use crate::diagnostics;
impl<'a> Lexer<'a> {
/// 12.9.3 Numeric Literals with `0` prefix
pub(super) fn read_zero(&mut self) -> Kind {
match self.peek() {
Some('b' | 'B') => self.read_non_decimal(Kind::Binary),
Some('o' | 'O') => self.read_non_decimal(Kind::Octal),
Some('x' | 'X') => self.read_non_decimal(Kind::Hex),
Some('e' | 'E') => {
match self.peek_byte() {
Some(b'b' | b'B') => self.read_non_decimal(Kind::Binary),
Some(b'o' | b'O') => self.read_non_decimal(Kind::Octal),
Some(b'x' | b'X') => self.read_non_decimal(Kind::Hex),
Some(b'e' | b'E') => {
self.consume_char();
self.read_decimal_exponent()
}
Some('.') => {
Some(b'.') => {
self.consume_char();
self.decimal_literal_after_decimal_point_after_digits()
}
Some('n') => {
Some(b'n') => {
self.consume_char();
self.check_after_numeric_literal(Kind::Decimal)
}
Expand All @@ -42,14 +42,14 @@ impl<'a> Lexer<'a> {
fn read_non_decimal(&mut self, kind: Kind) -> Kind {
self.consume_char();

if self.peek().is_some_and(|c| kind.matches_number_char(c)) {
if self.peek_char().is_some_and(|c| kind.matches_number_char(c)) {
self.consume_char();
} else {
self.unexpected_err();
return Kind::Undetermined;
}

while let Some(c) = self.peek() {
while let Some(c) = self.peek_char() {
match c {
'_' => {
self.consume_char();
Expand All @@ -58,7 +58,7 @@ impl<'a> Lexer<'a> {
// call here instead of after we ensure the next character
// is a number character
self.token.set_has_separator();
if self.peek().is_some_and(|c| kind.matches_number_char(c)) {
if self.peek_char().is_some_and(|c| kind.matches_number_char(c)) {
self.consume_char();
} else {
self.unexpected_err();
Expand All @@ -71,35 +71,33 @@ impl<'a> Lexer<'a> {
_ => break,
}
}
if self.peek() == Some('n') {
self.consume_char();
}
self.next_ascii_char_eq(b'n');
self.check_after_numeric_literal(kind)
}

fn read_legacy_octal(&mut self) -> Kind {
let mut kind = Kind::Octal;
loop {
match self.peek() {
Some('0'..='7') => {
match self.peek_byte() {
Some(b'0'..=b'7') => {
self.consume_char();
}
Some('8'..='9') => {
Some(b'8'..=b'9') => {
self.consume_char();
kind = Kind::Decimal;
}
_ => break,
}
}

match self.peek() {
match self.peek_byte() {
// allow 08.5 and 09.5
Some('.') if kind == Kind::Decimal => {
Some(b'.') if kind == Kind::Decimal => {
self.consume_char();
self.decimal_literal_after_decimal_point_after_digits()
}
// allow 08e1 and 09e1
Some('e') if kind == Kind::Decimal => {
Some(b'e') if kind == Kind::Decimal => {
self.consume_char();
self.read_decimal_exponent()
}
Expand All @@ -108,12 +106,12 @@ impl<'a> Lexer<'a> {
}

fn read_decimal_exponent(&mut self) -> Kind {
let kind = match self.peek() {
Some('-') => {
let kind = match self.peek_byte() {
Some(b'-') => {
self.consume_char();
Kind::NegativeExponential
}
Some('+') => {
Some(b'+') => {
self.consume_char();
Kind::PositiveExponential
}
Expand All @@ -124,7 +122,7 @@ impl<'a> Lexer<'a> {
}

fn read_decimal_digits(&mut self) {
if self.peek().is_some_and(|c| c.is_ascii_digit()) {
if self.peek_byte().is_some_and(|b| b.is_ascii_digit()) {
self.consume_char();
} else {
self.unexpected_err();
Expand All @@ -135,23 +133,23 @@ impl<'a> Lexer<'a> {
}

fn read_decimal_digits_after_first_digit(&mut self) {
while let Some(c) = self.peek() {
match c {
'_' => {
while let Some(b) = self.peek_byte() {
match b {
b'_' => {
self.consume_char();
// NOTE: it looks like invalid numeric tokens are still parsed.
// This seems to be a waste. It also requires us to put this
// call here instead of after we ensure the next character
// is an ASCII digit
self.token.set_has_separator();
if self.peek().is_some_and(|c| c.is_ascii_digit()) {
if self.peek_byte().is_some_and(|b| b.is_ascii_digit()) {
self.consume_char();
} else {
self.unexpected_err();
return;
}
}
'0'..='9' => {
b'0'..=b'9' => {
self.consume_char();
}
_ => break,
Expand All @@ -172,16 +170,14 @@ impl<'a> Lexer<'a> {
}

fn optional_decimal_digits(&mut self) {
if self.peek().is_some_and(|c| c.is_ascii_digit()) {
if self.peek_byte().is_some_and(|b| b.is_ascii_digit()) {
self.consume_char();
} else {
return;
self.read_decimal_digits_after_first_digit();
}
self.read_decimal_digits_after_first_digit();
}

fn optional_exponent(&mut self) -> Option<Kind> {
if matches!(self.peek(), Some('e' | 'E')) {
if matches!(self.peek_byte(), Some(b'e' | b'E')) {
self.consume_char();
return Some(self.read_decimal_exponent());
}
Expand All @@ -191,12 +187,12 @@ impl<'a> Lexer<'a> {
fn check_after_numeric_literal(&mut self, kind: Kind) -> Kind {
let offset = self.offset();
// The SourceCharacter immediately following a NumericLiteral must not be an IdentifierStart or DecimalDigit.
let c = self.peek();
let c = self.peek_char();
if c.is_none() || c.is_some_and(|ch| !ch.is_ascii_digit() && !is_identifier_start(ch)) {
return kind;
}
self.consume_char();
while let Some(c) = self.peek() {
while let Some(c) = self.peek_char() {
if is_identifier_start(c) {
self.consume_char();
} else {
Expand Down
6 changes: 3 additions & 3 deletions crates/oxc_parser/src/lexer/punctuation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@ use super::{Kind, Lexer, Token};
impl<'a> Lexer<'a> {
/// Section 12.8 Punctuators
pub(super) fn read_dot(&mut self) -> Kind {
if self.peek() == Some('.') && self.peek2() == Some('.') {
if self.peek_byte2() == Some([b'.', b'.']) {
self.consume_char();
self.consume_char();
return Kind::Dot3;
}
if self.peek().is_some_and(|c| c.is_ascii_digit()) {
if self.peek_byte().is_some_and(|b| b.is_ascii_digit()) {
self.decimal_literal_after_decimal_point()
} else {
Kind::Dot
Expand All @@ -25,7 +25,7 @@ impl<'a> Lexer<'a> {
}
} else if self.next_ascii_char_eq(b'=') {
Some(Kind::LtEq)
} else if self.peek() == Some('!')
} else if self.peek_byte() == Some(b'!')
// SingleLineHTMLOpenComment `<!--` in script mode
&& self.source_type.is_script()
&& self.remaining().starts_with("!--")
Expand Down
10 changes: 6 additions & 4 deletions crates/oxc_parser/src/lexer/regex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,14 +58,16 @@ impl<'a> Lexer<'a> {
let pattern_end = self.offset() - 1; // -1 to exclude `/`
let mut flags = RegExpFlags::empty();

while let Some(ch @ ('$' | '_' | 'a'..='z' | 'A'..='Z' | '0'..='9')) = self.peek() {
while let Some(ch @ (b'$' | b'_' | b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9')) =
self.peek_byte()
{
self.consume_char();
let Ok(flag) = RegExpFlags::try_from(ch) else {
self.error(diagnostics::reg_exp_flag(ch, self.current_offset()));
let Ok(flag) = RegExpFlags::try_from(ch as char) else {
self.error(diagnostics::reg_exp_flag(ch as char, self.current_offset()));
continue;
};
if flags.contains(flag) {
self.error(diagnostics::reg_exp_flag_twice(ch, self.current_offset()));
self.error(diagnostics::reg_exp_flag_twice(ch as char, self.current_offset()));
continue;
}
flags |= flag;
Expand Down
23 changes: 23 additions & 0 deletions crates/oxc_parser/src/lexer/source.rs
Original file line number Diff line number Diff line change
Expand Up @@ -498,6 +498,29 @@ impl<'a> Source<'a> {
}
}

/// Peek next two bytes of source without consuming them.
///
/// Returns `None` if fewer than 2 bytes remain before the end of the source,
/// otherwise `Some` with the 2 bytes read at the current position.
/// The current position is left unchanged.
#[inline]
pub(super) fn peek_byte2(&self) -> Option<[u8; 2]> {
    // Compare the remaining length as plain integers instead of forming `ptr + 1`.
    // This needs no `unsafe` pointer arithmetic and no special-casing of the
    // pointer's absolute address (the limit on `pointer::add` concerns the byte
    // offset and staying within the allocation, not the address value itself).
    //
    // NOTE(review): relies on the invariant `self.end >= self.ptr`, so the
    // subtraction cannot wrap — confirm this is upheld by every mutator of `ptr`.
    let remaining = (self.end as usize) - (self.ptr as usize);

    if remaining >= 2 {
        // SAFETY: At least 2 bytes lie between `ptr` and `end`, so reading 2 bytes
        // from the current position stays in bounds.
        // NOTE(review): assumes `read2` only requires 2 readable bytes at the
        // position — confirm against its `# SAFETY` contract.
        let bytes = unsafe { self.position().read2() };
        Some(bytes)
    } else {
        None
    }
}

/// Peek next byte of source without consuming it, without EOF bounds-check.
///
/// # SAFETY
Expand Down
Loading

0 comments on commit 828202e

Please sign in to comment.