Skip to content

Commit 5f08c69

Browse files
committed
perf(lexer): remove branches from unicode handling (#15328)
#12768 split `next_char`, `next_2_chars`, and `peek_char` into separate functions for the hot and cold paths. That was a good change, but had one side-effect - because the unicode branch is now in a separate function which isn't inlined, the compiler loses knowledge of the context - that `Source` isn't at EOF, and that (in 2 of the 3 methods) the next character is known not to be ASCII. Add unchecked assertions to inform compiler of the known facts, so it can remove 2 branches when calling `chars.next().unwrap()`. This code is on a cold path, so will likely not make a noticeable difference in files which don't contain many Unicode chars (like our benchmark fixtures), but why not?
1 parent e0edaef commit 5f08c69

File tree

1 file changed

+76
-39
lines changed

1 file changed

+76
-39
lines changed

crates/oxc_parser/src/lexer/source.rs

Lines changed: 76 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
use std::{cmp::Ordering, marker::PhantomData, slice, str};
22

3+
use oxc_data_structures::assert_unchecked;
4+
35
use crate::{MAX_LEN, UniquePromise};
46

57
use super::search::SEARCH_BATCH_SIZE;
@@ -371,25 +373,39 @@ impl<'a> Source<'a> {
371373
// So incrementing `ptr` cannot result in `ptr > end`.
372374
// Current byte is ASCII, so incremented `ptr` must be on a UTF-8 character boundary.
373375
unsafe { self.ptr = self.ptr.add(1) };
374-
return Some(byte as char);
376+
Some(byte as char)
377+
} else {
378+
// Multi-byte Unicode character.
379+
// Check invariant that `ptr` is on a UTF-8 character boundary.
380+
debug_assert!(!is_utf8_cont_byte(byte));
381+
382+
// SAFETY: `Source` is not empty, and we just checked next byte is not ASCII
383+
unsafe { self.next_unicode_char() }
375384
}
376-
self.next_unicode_char(byte)
377385
}
378386

387+
/// Get next char of source, and advance position to after it, when next char is a unicode character.
388+
///
389+
/// # SAFETY
390+
/// * `Source` must not be empty.
391+
/// * Next character in source must not be ASCII.
379392
#[expect(clippy::unnecessary_wraps)]
380-
#[cold] // Unicode is rare.
381-
fn next_unicode_char(&mut self, byte: u8) -> Option<char> {
382-
// Multi-byte Unicode character.
383-
// Check invariant that `ptr` is on a UTF-8 character boundary.
384-
debug_assert!(!is_utf8_cont_byte(byte));
385-
393+
#[cold] // Unicode is rare
394+
unsafe fn next_unicode_char(&mut self) -> Option<char> {
386395
// Create a `Chars` iterator, get next char from it, and then update `self.ptr`
387396
// to match `Chars` iterator's updated pointer afterwards.
388397
// `Chars` iterator upholds same invariants as `Source`, so its pointer is guaranteed
389398
// to be valid as `self.ptr`.
390-
let mut chars = self.remaining().chars();
391-
// SAFETY: We know that there's a byte to be consumed, so `chars.next()` must return `Some(_)`
392-
let c = unsafe { chars.next().unwrap_unchecked() };
399+
let remaining = self.remaining();
400+
// Inform compiler that next char in `Source` is non-ASCII, so it can remove some branches from
401+
// `chars.next().unwrap()`. Compiler will not be aware of these invariants if this function is not inlined.
402+
// SAFETY: Caller guarantees these invariants.
403+
unsafe {
404+
assert_unchecked!(!remaining.is_empty());
405+
assert_unchecked!(!remaining.as_bytes()[0].is_ascii());
406+
}
407+
let mut chars = remaining.chars();
408+
let c = chars.next().unwrap();
393409
self.ptr = chars.as_str().as_ptr();
394410
Some(c)
395411
}
@@ -404,27 +420,34 @@ impl<'a> Source<'a> {
404420
// and next 2 bytes are ASCII, so advancing by 2 bytes must put `ptr`
405421
// in bounds and on a UTF-8 character boundary
406422
unsafe { self.ptr = self.ptr.add(2) };
407-
return Some([byte1 as char, byte2 as char]);
408-
}
423+
Some([byte1 as char, byte2 as char])
424+
} else {
425+
// Multi-byte Unicode character.
426+
// Check invariant that `ptr` is on a UTF-8 character boundary.
427+
debug_assert!(!is_utf8_cont_byte(byte1));
409428

410-
// Handle Unicode characters
411-
self.next_2_unicode_chars(byte1)
429+
// SAFETY: `Source` is not empty
430+
unsafe { self.next_2_unicode_chars() }
431+
}
412432
}
413433

414-
#[cold] // Unicode is rare.
415-
fn next_2_unicode_chars(&mut self, byte1: u8) -> Option<[char; 2]> {
416-
// Multi-byte Unicode character.
417-
// Check invariant that `ptr` is on a UTF-8 character boundary.
418-
debug_assert!(!is_utf8_cont_byte(byte1));
419-
434+
/// Get next 2 chars of source, and advance position to after them, when one of next 2 chars is non-ASCII.
435+
///
436+
/// # SAFETY
437+
/// `Source` must not be empty.
438+
#[cold] // Unicode is rare
439+
unsafe fn next_2_unicode_chars(&mut self) -> Option<[char; 2]> {
420440
// Create a `Chars` iterator, get next 2 chars from it, and then update `self.ptr`
421441
// to match `Chars` iterator's updated pointer afterwards.
422442
// `Chars` iterator upholds same invariants as `Source`, so its pointer is guaranteed
423443
// to be valid as `self.ptr`.
424-
let mut chars = self.remaining().chars();
425-
// SAFETY: We know that there's 2 bytes to be consumed, so first call to
426-
// `chars.next()` must return `Some(_)`
427-
let c1 = unsafe { chars.next().unwrap_unchecked() };
444+
let remaining = self.remaining();
445+
// Inform compiler that `Source` is not empty, to remove check from `chars.next().unwrap()`.
446+
// Compiler will not be aware of this invariant if this function is not inlined.
447+
// SAFETY: Caller guarantees `Source` is not empty.
448+
unsafe { assert_unchecked!(!remaining.is_empty()) };
449+
let mut chars = remaining.chars();
450+
let c1 = chars.next().unwrap();
428451
let c2 = chars.next()?;
429452
self.ptr = chars.as_str().as_ptr();
430453
Some([c1, c2])
@@ -512,25 +535,39 @@ impl<'a> Source<'a> {
512535
// Check not at EOF and handle ASCII bytes
513536
let byte = self.peek_byte()?;
514537
if byte.is_ascii() {
515-
return Some(byte as char);
538+
Some(byte as char)
539+
} else {
540+
// Multi-byte Unicode character.
541+
// Check invariant that `ptr` is on a UTF-8 character boundary.
542+
debug_assert!(!is_utf8_cont_byte(byte));
543+
544+
// SAFETY: `Source` is not empty, and we just checked next byte is not ASCII
545+
unsafe { self.peek_unicode_char() }
516546
}
517-
self.peek_unicode_char(byte)
518547
}
519548

549+
/// Peek next char of source, without consuming it, when next char is a unicode character.
550+
///
551+
/// # SAFETY
552+
/// * `Source` must not be empty.
553+
/// * Next character in source must not be ASCII.
520554
#[expect(clippy::unnecessary_wraps)]
521-
#[cold] // Unicode is rare.
522-
fn peek_unicode_char(&self, byte: u8) -> Option<char> {
523-
// Multi-byte Unicode character.
524-
// Check invariant that `ptr` is on a UTF-8 character boundary.
525-
debug_assert!(!is_utf8_cont_byte(byte));
526-
527-
// Create a `Chars` iterator, and get next char from it
528-
let mut chars = self.remaining().chars();
529-
// SAFETY: We know that there's a byte to be consumed, so `chars.next()` must return `Some(_)`.
555+
#[cold] // Unicode is rare
556+
unsafe fn peek_unicode_char(&self) -> Option<char> {
557+
// Create a `Chars` iterator, and get next char from it.
558+
let remaining = self.remaining();
559+
// Inform compiler that next char in `Source` is non-ASCII, so it can remove some branches from
560+
// `chars.next().unwrap()`. Compiler will not be aware of these invariants if this function is not inlined.
561+
// SAFETY: Caller guarantees these invariants.
562+
unsafe {
563+
assert_unchecked!(!remaining.is_empty());
564+
assert_unchecked!(!remaining.as_bytes()[0].is_ascii());
565+
}
566+
let mut chars = remaining.chars();
567+
// We know that there's a byte to be consumed, so `chars.next()` must return `Some(_)`.
530568
// Could just return `chars.next()` here, but making it clear to compiler that this branch
531-
// always returns `Some(_)` may help it optimize the caller. Compiler seems to have difficulty
532-
// "seeing into" `Chars` iterator and making deductions.
533-
let c = unsafe { chars.next().unwrap_unchecked() };
569+
// always returns `Some(_)` may help it optimize the caller.
570+
let c = chars.next().unwrap();
534571
Some(c)
535572
}
536573

0 commit comments

Comments
 (0)