
Commit e3bfff1

perf(codegen): faster splitting comments into lines (#13190)
Follow-up to #13169. This implements the first optimization mentioned in #13169 (comment): iterate over the string byte-by-byte rather than char-by-char.

It's amazing how bad Rust is at string operations. I tried it without unsafe code at first, but Rust inserts checks for whether a slice falls on a UTF-8 char boundary on every single operation, even though it's obvious from the context that these checks can never fail. That made the assembly 4x longer, which is no good, as this is meant to be a tight loop.
1 parent 5668192 commit e3bfff1
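The char-boundary checks described above are a property of safe string indexing: `&text[..index]` must verify at runtime that `index` falls on a UTF-8 char boundary and keeps a panic path for the case where it does not, even when the surrounding code already guarantees it. A minimal sketch of the difference, not taken from the commit (the function names are hypothetical, and the caller is assumed to already know that the byte at `index` is ASCII, e.g. `\n`):

    /// Checked version: both slice operations re-verify that `index` is a char
    /// boundary and carry a panic branch, although the caller already knows the
    /// byte at `index` is ASCII.
    fn split_at_newline_checked(text: &str, index: usize) -> (&str, &str) {
        (&text[..index], &text[index + 1..])
    }

    /// Unchecked version: skips the boundary checks entirely.
    ///
    /// SAFETY: the caller must guarantee `index < text.len()` and that the byte at
    /// `index` is ASCII, so `index` and `index + 1` are both char boundaries.
    unsafe fn split_at_newline_unchecked(text: &str, index: usize) -> (&str, &str) {
        unsafe { (text.get_unchecked(..index), text.get_unchecked(index + 1..)) }
    }

The diff below applies the same idea inside the splitter's hot loop: matching on raw bytes establishes the boundary invariants, and `get_unchecked` slicing avoids re-proving them on every iteration.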

File tree

1 file changed: +75 −25 lines changed


crates/oxc_codegen/src/comment.rs

Lines changed: 75 additions & 25 deletions
@@ -1,66 +1,116 @@
-use std::borrow::Cow;
+use std::{borrow::Cow, iter::FusedIterator};
 
 use rustc_hash::{FxHashMap, FxHashSet};
 
 use oxc_ast::{Comment, CommentKind, ast::Program};
-use oxc_syntax::identifier::is_line_terminator;
+use oxc_syntax::identifier::{LS, PS, is_line_terminator};
 
 use crate::{Codegen, LegalComment, options::CommentOptions};
 
 pub type CommentsMap = FxHashMap</* attached_to */ u32, Vec<Comment>>;
 
+/// Convert `char` to UTF-8 bytes array.
+const fn to_bytes<const N: usize>(ch: char) -> [u8; N] {
+    assert!(ch.len_utf8() == N);
+    let mut bytes = [0u8; N];
+    ch.encode_utf8(&mut bytes);
+    bytes
+}
+
+/// `LS` character as UTF-8 bytes.
+const LS_BYTES: [u8; 3] = to_bytes(LS);
+/// `PS` character as UTF-8 bytes.
+const PS_BYTES: [u8; 3] = to_bytes(PS);
+
+const LS_OR_PS_FIRST_BYTE: u8 = 0xE2;
+
+const _: () = assert!(LS_BYTES[0] == LS_OR_PS_FIRST_BYTE);
+const _: () = assert!(PS_BYTES[0] == LS_OR_PS_FIRST_BYTE);
+const LS_LAST_2_BYTES: [u8; 2] = [LS_BYTES[1], LS_BYTES[2]];
+const PS_LAST_2_BYTES: [u8; 2] = [PS_BYTES[1], PS_BYTES[2]];
+
 /// Custom iterator that splits text on line terminators while handling CRLF as a single unit.
 /// This avoids creating empty strings between CR and LF characters.
 ///
+/// Also splits on irregular line breaks (LS and PS).
+///
 /// # Example
 /// Standard split would turn `"line1\r\nline2"` into `["line1", "", "line2"]` because
-/// it treats \r and \n as separate terminators. This iterator correctly produces
-/// `["line1", "line2"]` by treating \r\n as a single terminator.
+/// it treats `\r` and `\n` as separate terminators. This iterator correctly produces
+/// `["line1", "line2"]` by treating `\r\n` as a single terminator.
 struct LineTerminatorSplitter<'a> {
     text: &'a str,
-    position: usize,
 }
 
 impl<'a> LineTerminatorSplitter<'a> {
     fn new(text: &'a str) -> Self {
-        Self { text, position: 0 }
+        Self { text }
     }
 }
 
 impl<'a> Iterator for LineTerminatorSplitter<'a> {
     type Item = &'a str;
 
     fn next(&mut self) -> Option<Self::Item> {
-        if self.position >= self.text.len() {
+        if self.text.is_empty() {
             return None;
         }
 
-        let start = self.position;
-        let chars = self.text[self.position..].char_indices();
-
-        for (i, c) in chars {
-            if is_line_terminator(c) {
-                let line = &self.text[start..start + i];
-                self.position = start + i + c.len_utf8();
-
-                // If this is CR followed by LF, skip the LF to treat CRLF as a single terminator
-                if c == '\r'
-                    && self.text.as_bytes().get(self.position).is_some_and(|&next| next == b'\n')
-                {
-                    self.position += 1;
+        for (index, &byte) in self.text.as_bytes().iter().enumerate() {
+            match byte {
+                b'\n' => {
+                    // SAFETY: Byte at `index` is `\n`, so `index` and `index + 1` are both UTF-8 char boundaries.
+                    // Therefore, slices up to `index` and from `index + 1` are both valid `&str`s.
+                    unsafe {
+                        let line = self.text.get_unchecked(..index);
+                        self.text = self.text.get_unchecked(index + 1..);
+                        return Some(line);
+                    }
                 }
-
-                return Some(line);
+                b'\r' => {
+                    // SAFETY: Byte at `index` is `\r`, so `index` is on a UTF-8 char boundary
+                    let line = unsafe { self.text.get_unchecked(..index) };
+                    // If the next byte is `\n`, consume it as well
+                    let skip_bytes =
+                        if self.text.as_bytes().get(index + 1) == Some(&b'\n') { 2 } else { 1 };
+                    // SAFETY: `index + skip_bytes` is after `\r` or `\n`, so on a UTF-8 char boundary.
+                    // Therefore slice from `index + skip_bytes` is a valid `&str`.
+                    self.text = unsafe { self.text.get_unchecked(index + skip_bytes..) };
+                    return Some(line);
+                }
+                LS_OR_PS_FIRST_BYTE => {
+                    let next2: [u8; 2] = {
+                        // SAFETY: 0xE2 is always the start of a 3-byte Unicode character,
+                        // so there must be 2 more bytes available to consume
+                        let next2 =
+                            unsafe { self.text.as_bytes().get_unchecked(index + 1..index + 3) };
+                        next2.try_into().unwrap()
+                    };
+                    // If this is LS or PS, treat it as a line terminator
+                    if matches!(next2, LS_LAST_2_BYTES | PS_LAST_2_BYTES) {
+                        // SAFETY: `index` is the start of a 3-byte Unicode character,
+                        // so `index` and `index + 3` are both UTF-8 char boundaries.
+                        // Therefore, slices up to `index` and from `index + 3` are both valid `&str`s.
+                        unsafe {
+                            let line = self.text.get_unchecked(..index);
+                            self.text = self.text.get_unchecked(index + 3..);
+                            return Some(line);
+                        }
+                    }
+                }
+                _ => {}
             }
         }
 
-        // Return the remaining text
-        let line = &self.text[start..];
-        self.position = self.text.len();
+        // No line break found - return the remaining text. Next call will return `None`.
+        let line = self.text;
+        self.text = "";
         Some(line)
     }
 }
 
+impl FusedIterator for LineTerminatorSplitter<'_> {}
+
 impl Codegen<'_> {
     pub(crate) fn build_comments(&mut self, comments: &[Comment]) {
         if self.options.comments == CommentOptions::disabled() {
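Two properties the new code relies on are easy to sanity-check in isolation: LS (U+2028) and PS (U+2029) each encode to three UTF-8 bytes beginning with 0xE2, which is why the loop compares only the two following bytes once it has seen that lead byte, and the documented splitting behaviour treats `\r\n` as a single terminator while also breaking on LS and PS. A small standalone sketch, not part of the commit (`LineTerminatorSplitter` is private to the crate, so the expected output is stated in comments rather than asserted):

    fn main() {
        // LS (U+2028) and PS (U+2029) as UTF-8: three bytes each, sharing the lead
        // byte 0xE2, so a single byte comparison is enough to decide whether the
        // remaining two bytes need to be looked at.
        const LS_UTF8: [u8; 3] = [0xE2, 0x80, 0xA8];
        const PS_UTF8: [u8; 3] = [0xE2, 0x80, 0xA9];
        assert_eq!("\u{2028}".as_bytes(), LS_UTF8);
        assert_eq!("\u{2029}".as_bytes(), PS_UTF8);

        // Input mixing the terminators the iterator handles.
        let text = "line1\r\nline2\u{2028}line3\nline4";
        // Per the doc comment on `LineTerminatorSplitter`, splitting `text` should
        // yield ["line1", "line2", "line3", "line4"]: the `\r\n` pair is consumed
        // as one terminator instead of producing an empty line.
        let _ = text;
    }

The `impl FusedIterator for LineTerminatorSplitter<'_> {}` added at the end is consistent with the new structure: `next()` returns `None` only once `self.text` is empty and the text is never repopulated, so every later call also returns `None`, which is exactly the guarantee `FusedIterator` documents.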

0 commit comments
