perf(lexer): remove branches from unicode handling (#15328)

overlookmotel · overlookmotel · commit 5f08c691472f · 2025-11-05T15:25:59.000Z
#12768 split `next_char`, `next_2_chars`, and `peek_char` into separate functions for the hot and cold paths. That was a good change, but had one side-effect - because the unicode branch is now in a separate function which isn't inlined, the compiler loses knowledge of the context - that `Source` isn't at EOF, and that (in 2 of the 3 methods) the next character is known not to be ASCII. Add unchecked assertions to inform compiler of the known facts, so it can remove 2 branches when calling `chars.next().unwrap()`. This code is on a cold path, so will likely not make a noticeable difference in files which don't contain many Unicode chars (like our benchmark fixtures), but why not?
diff --git a/crates/oxc_parser/src/lexer/source.rs b/crates/oxc_parser/src/lexer/source.rs
@@ -1,5 +1,7 @@
 use std::{cmp::Ordering, marker::PhantomData, slice, str};
 
+use oxc_data_structures::assert_unchecked;
+
 use crate::{MAX_LEN, UniquePromise};
 
 use super::search::SEARCH_BATCH_SIZE;
@@ -371,25 +373,39 @@ impl<'a> Source<'a> {
             // So incrementing `ptr` cannot result in `ptr > end`.
             // Current byte is ASCII, so incremented `ptr` must be on a UTF-8 character boundary.
             unsafe { self.ptr = self.ptr.add(1) };
-            return Some(byte as char);
+            Some(byte as char)
+        } else {
+            // Multi-byte Unicode character.
+            // Check invariant that `ptr` is on a UTF-8 character boundary.
+            debug_assert!(!is_utf8_cont_byte(byte));
+
+            // SAFETY: `Source` is not empty, and we just checked next byte is not ASCII
+            unsafe { self.next_unicode_char() }
         }
-        self.next_unicode_char(byte)
     }
 
+    /// Get next char of source, and advance position to after it, when next char is a unicode character.
+    ///
+    /// # SAFETY
+    /// * `Source` must not be empty.
+    /// * Next character in source must not be ASCII.
     #[expect(clippy::unnecessary_wraps)]
-    #[cold] // Unicode is rare.
-    fn next_unicode_char(&mut self, byte: u8) -> Option<char> {
-        // Multi-byte Unicode character.
-        // Check invariant that `ptr` is on a UTF-8 character boundary.
-        debug_assert!(!is_utf8_cont_byte(byte));
-
+    #[cold] // Unicode is rare
+    unsafe fn next_unicode_char(&mut self) -> Option<char> {
         // Create a `Chars` iterator, get next char from it, and then update `self.ptr`
         // to match `Chars` iterator's updated pointer afterwards.
         // `Chars` iterator upholds same invariants as `Source`, so its pointer is guaranteed
         // to be valid as `self.ptr`.
-        let mut chars = self.remaining().chars();
-        // SAFETY: We know that there's a byte to be consumed, so `chars.next()` must return `Some(_)`
-        let c = unsafe { chars.next().unwrap_unchecked() };
+        let remaining = self.remaining();
+        // Inform compiler that next char in `Source` is non-ASCII, so it can remove some branches from
+        // `chars.next().unwrap()`. Compiler will not be aware of these invariants if this function is not inlined.
+        // SAFETY: Caller guarantees these invariants.
+        unsafe {
+            assert_unchecked!(!remaining.is_empty());
+            assert_unchecked!(!remaining.as_bytes()[0].is_ascii());
+        }
+        let mut chars = remaining.chars();
+        let c = chars.next().unwrap();
         self.ptr = chars.as_str().as_ptr();
         Some(c)
     }
@@ -404,27 +420,34 @@ impl<'a> Source<'a> {
             // and next 2 bytes are ASCII, so advancing by 2 bytes must put `ptr`
             // in bounds and on a UTF-8 character boundary
             unsafe { self.ptr = self.ptr.add(2) };
-            return Some([byte1 as char, byte2 as char]);
-        }
+            Some([byte1 as char, byte2 as char])
+        } else {
+            // Multi-byte Unicode character.
+            // Check invariant that `ptr` is on a UTF-8 character boundary.
+            debug_assert!(!is_utf8_cont_byte(byte1));
 
-        // Handle Unicode characters
-        self.next_2_unicode_chars(byte1)
+            // SAFETY: `Source` is not empty
+            unsafe { self.next_2_unicode_chars() }
+        }
     }
 
-    #[cold] // Unicode is rare.
-    fn next_2_unicode_chars(&mut self, byte1: u8) -> Option<[char; 2]> {
-        // Multi-byte Unicode character.
-        // Check invariant that `ptr` is on a UTF-8 character boundary.
-        debug_assert!(!is_utf8_cont_byte(byte1));
-
+    /// Get next 2 chars of source, and advance position to after them, when one of next 2 chars is non-ASCII.
+    ///
+    /// # SAFETY
+    /// `Source` must not be empty.
+    #[cold] // Unicode is rare
+    unsafe fn next_2_unicode_chars(&mut self) -> Option<[char; 2]> {
         // Create a `Chars` iterator, get next 2 chars from it, and then update `self.ptr`
         // to match `Chars` iterator's updated pointer afterwards.
         // `Chars` iterator upholds same invariants as `Source`, so its pointer is guaranteed
         // to be valid as `self.ptr`.
-        let mut chars = self.remaining().chars();
-        // SAFETY: We know that there's 2 bytes to be consumed, so first call to
-        // `chars.next()` must return `Some(_)`
-        let c1 = unsafe { chars.next().unwrap_unchecked() };
+        let remaining = self.remaining();
+        // Inform compiler that `Source` is not empty, to remove check from `chars.next().unwrap()`.
+        // Compiler will not be aware of this invariant if this function is not inlined.
+        // SAFETY: Caller guarantees `Source` is not empty.
+        unsafe { assert_unchecked!(!remaining.is_empty()) };
+        let mut chars = remaining.chars();
+        let c1 = chars.next().unwrap();
         let c2 = chars.next()?;
         self.ptr = chars.as_str().as_ptr();
         Some([c1, c2])
@@ -512,25 +535,39 @@ impl<'a> Source<'a> {
         // Check not at EOF and handle ASCII bytes
         let byte = self.peek_byte()?;
         if byte.is_ascii() {
-            return Some(byte as char);
+            Some(byte as char)
+        } else {
+            // Multi-byte Unicode character.
+            // Check invariant that `ptr` is on a UTF-8 character boundary.
+            debug_assert!(!is_utf8_cont_byte(byte));
+
+            // SAFETY: `Source` is not empty, and we just checked next byte is not ASCII
+            unsafe { self.peek_unicode_char() }
         }
-        self.peek_unicode_char(byte)
     }
 
+    /// Peek next char of source, without consuming it, when next char is a unicode character.
+    ///
+    /// # SAFETY
+    /// * `Source` must not be empty.
+    /// * Next character in source must not be ASCII.
     #[expect(clippy::unnecessary_wraps)]
-    #[cold] // Unicode is rare.
-    fn peek_unicode_char(&self, byte: u8) -> Option<char> {
-        // Multi-byte Unicode character.
-        // Check invariant that `ptr` is on a UTF-8 character boundary.
-        debug_assert!(!is_utf8_cont_byte(byte));
-
-        // Create a `Chars` iterator, and get next char from it
-        let mut chars = self.remaining().chars();
-        // SAFETY: We know that there's a byte to be consumed, so `chars.next()` must return `Some(_)`.
+    #[cold] // Unicode is rare
+    unsafe fn peek_unicode_char(&self) -> Option<char> {
+        // Create a `Chars` iterator, and get next char from it.
+        let remaining = self.remaining();
+        // Inform compiler that next char in `Source` is non-ASCII, so it can remove some branches from
+        // `chars.next().unwrap()`. Compiler will not be aware of these invariants if this function is not inlined.
+        // SAFETY: Caller guarantees these invariants.
+        unsafe {
+            assert_unchecked!(!remaining.is_empty());
+            assert_unchecked!(!remaining.as_bytes()[0].is_ascii());
+        }
+        let mut chars = remaining.chars();
+        // We know that there's a byte to be consumed, so `chars.next()` must return `Some(_)`.
         // Could just return `chars.next()` here, but making it clear to compiler that this branch
-        // always returns `Some(_)` may help it optimize the caller. Compiler seems to have difficulty
-        // "seeing into" `Chars` iterator and making deductions.
-        let c = unsafe { chars.next().unwrap_unchecked() };
+        // always returns `Some(_)` may help it optimize the caller.
+        let c = chars.next().unwrap();
         Some(c)
     }