
Commit e3bfff1

perf(codegen): faster splitting comments into lines (#13190)
Follow-up to #13169. This implements the first optimization mentioned in #13169 (comment): iterate over the string byte-by-byte rather than char-by-char.

It's amazing how bad Rust is at string operations. I tried it without unsafe code at first, but Rust inserts checks for whether a slice falls on a UTF-8 char boundary on every single operation, even though it's obvious from the context that these checks can never fail. That made the assembly 4x longer, which is no good, as this is meant to be a tight loop.
1 parent 5668192 commit e3bfff1
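The char-boundary checks described above are a property of safe string indexing: `&text[..index]` must verify at runtime that `index` falls on a UTF-8 char boundary and keeps a panic path for the case where it does not, even when the surrounding code already guarantees it. A minimal sketch of the difference, not taken from the commit (the function names are hypothetical, and the caller is assumed to already know that the byte at `index` is ASCII, e.g. `\n`):

    /// Checked version: both slice operations re-verify that `index` is a char
    /// boundary and carry a panic branch, although the caller already knows the
    /// byte at `index` is ASCII.
    fn split_at_newline_checked(text: &str, index: usize) -> (&str, &str) {
        (&text[..index], &text[index + 1..])
    }

    /// Unchecked version: skips the boundary checks entirely.
    ///
    /// SAFETY: the caller must guarantee `index < text.len()` and that the byte at
    /// `index` is ASCII, so `index` and `index + 1` are both char boundaries.
    unsafe fn split_at_newline_unchecked(text: &str, index: usize) -> (&str, &str) {
        unsafe { (text.get_unchecked(..index), text.get_unchecked(index + 1..)) }
    }

The diff below applies the same idea inside the splitter's hot loop: matching on raw bytes establishes the boundary invariants, and `get_unchecked` slicing avoids re-proving them on every iteration.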

File tree

1 file changed: +75 −25 lines changed


crates/oxc_codegen/src/comment.rs

Lines changed: 75 additions & 25 deletions
@@ -1,66 +1,116 @@
-use std::borrow::Cow;
+use std::{borrow::Cow, iter::FusedIterator};
 
 use rustc_hash::{FxHashMap, FxHashSet};
 
 use oxc_ast::{Comment, CommentKind, ast::Program};
-use oxc_syntax::identifier::is_line_terminator;
+use oxc_syntax::identifier::{LS, PS, is_line_terminator};
 
 use crate::{Codegen, LegalComment, options::CommentOptions};
 
 pub type CommentsMap = FxHashMap</* attached_to */ u32, Vec<Comment>>;
 
+/// Convert `char` to UTF-8 bytes array.
+const fn to_bytes<const N: usize>(ch: char) -> [u8; N] {
+    assert!(ch.len_utf8() == N);
+    let mut bytes = [0u8; N];
+    ch.encode_utf8(&mut bytes);
+    bytes
+}
+
+/// `LS` character as UTF-8 bytes.
+const LS_BYTES: [u8; 3] = to_bytes(LS);
+/// `PS` character as UTF-8 bytes.
+const PS_BYTES: [u8; 3] = to_bytes(PS);
+
+const LS_OR_PS_FIRST_BYTE: u8 = 0xE2;
+
+const _: () = assert!(LS_BYTES[0] == LS_OR_PS_FIRST_BYTE);
+const _: () = assert!(PS_BYTES[0] == LS_OR_PS_FIRST_BYTE);
+const LS_LAST_2_BYTES: [u8; 2] = [LS_BYTES[1], LS_BYTES[2]];
+const PS_LAST_2_BYTES: [u8; 2] = [PS_BYTES[1], PS_BYTES[2]];
+
 /// Custom iterator that splits text on line terminators while handling CRLF as a single unit.
 /// This avoids creating empty strings between CR and LF characters.
 ///
+/// Also splits on irregular line breaks (LS and PS).
+///
 /// # Example
 /// Standard split would turn `"line1\r\nline2"` into `["line1", "", "line2"]` because
-/// it treats \r and \n as separate terminators. This iterator correctly produces
-/// `["line1", "line2"]` by treating \r\n as a single terminator.
+/// it treats `\r` and `\n` as separate terminators. This iterator correctly produces
+/// `["line1", "line2"]` by treating `\r\n` as a single terminator.
 struct LineTerminatorSplitter<'a> {
     text: &'a str,
-    position: usize,
 }
 
 impl<'a> LineTerminatorSplitter<'a> {
     fn new(text: &'a str) -> Self {
-        Self { text, position: 0 }
+        Self { text }
     }
 }
 
 impl<'a> Iterator for LineTerminatorSplitter<'a> {
     type Item = &'a str;
 
     fn next(&mut self) -> Option<Self::Item> {
-        if self.position >= self.text.len() {
+        if self.text.is_empty() {
             return None;
         }
 
-        let start = self.position;
-        let chars = self.text[self.position..].char_indices();
-
-        for (i, c) in chars {
-            if is_line_terminator(c) {
-                let line = &self.text[start..start + i];
-                self.position = start + i + c.len_utf8();
-
-                // If this is CR followed by LF, skip the LF to treat CRLF as a single terminator
-                if c == '\r'
-                    && self.text.as_bytes().get(self.position).is_some_and(|&next| next == b'\n')
-                {
-                    self.position += 1;
+        for (index, &byte) in self.text.as_bytes().iter().enumerate() {
+            match byte {
+                b'\n' => {
+                    // SAFETY: Byte at `index` is `\n`, so `index` and `index + 1` are both UTF-8 char boundaries.
+                    // Therefore, slices up to `index` and from `index + 1` are both valid `&str`s.
+                    unsafe {
+                        let line = self.text.get_unchecked(..index);
+                        self.text = self.text.get_unchecked(index + 1..);
+                        return Some(line);
+                    }
                 }
-
-                return Some(line);
+                b'\r' => {
+                    // SAFETY: Byte at `index` is `\r`, so `index` is on a UTF-8 char boundary
+                    let line = unsafe { self.text.get_unchecked(..index) };
+                    // If the next byte is `\n`, consume it as well
+                    let skip_bytes =
+                        if self.text.as_bytes().get(index + 1) == Some(&b'\n') { 2 } else { 1 };
+                    // SAFETY: `index + skip_bytes` is after `\r` or `\n`, so on a UTF-8 char boundary.
+                    // Therefore slice from `index + skip_bytes` is a valid `&str`.
+                    self.text = unsafe { self.text.get_unchecked(index + skip_bytes..) };
+                    return Some(line);
+                }
+                LS_OR_PS_FIRST_BYTE => {
+                    let next2: [u8; 2] = {
+                        // SAFETY: 0xE2 is always the start of a 3-byte Unicode character,
+                        // so there must be 2 more bytes available to consume
+                        let next2 =
+                            unsafe { self.text.as_bytes().get_unchecked(index + 1..index + 3) };
+                        next2.try_into().unwrap()
+                    };
+                    // If this is LS or PS, treat it as a line terminator
+                    if matches!(next2, LS_LAST_2_BYTES | PS_LAST_2_BYTES) {
+                        // SAFETY: `index` is the start of a 3-byte Unicode character,
+                        // so `index` and `index + 3` are both UTF-8 char boundaries.
+                        // Therefore, slices up to `index` and from `index + 3` are both valid `&str`s.
+                        unsafe {
+                            let line = self.text.get_unchecked(..index);
+                            self.text = self.text.get_unchecked(index + 3..);
+                            return Some(line);
+                        }
+                    }
+                }
+                _ => {}
             }
         }
 
-        // Return the remaining text
-        let line = &self.text[start..];
-        self.position = self.text.len();
+        // No line break found - return the remaining text. Next call will return `None`.
+        let line = self.text;
+        self.text = "";
         Some(line)
     }
 }
 
+impl FusedIterator for LineTerminatorSplitter<'_> {}
+
 impl Codegen<'_> {
     pub(crate) fn build_comments(&mut self, comments: &[Comment]) {
         if self.options.comments == CommentOptions::disabled() {
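Two properties the new code relies on are easy to sanity-check in isolation: LS (U+2028) and PS (U+2029) each encode to three UTF-8 bytes beginning with 0xE2, which is why the loop compares only the two following bytes once it has seen that lead byte, and the documented splitting behaviour treats `\r\n` as a single terminator while also breaking on LS and PS. A small standalone sketch, not part of the commit (`LineTerminatorSplitter` is private to the crate, so the expected output is stated in comments rather than asserted):

    fn main() {
        // LS (U+2028) and PS (U+2029) as UTF-8: three bytes each, sharing the lead
        // byte 0xE2, so a single byte comparison is enough to decide whether the
        // remaining two bytes need to be looked at.
        const LS_UTF8: [u8; 3] = [0xE2, 0x80, 0xA8];
        const PS_UTF8: [u8; 3] = [0xE2, 0x80, 0xA9];
        assert_eq!("\u{2028}".as_bytes(), LS_UTF8);
        assert_eq!("\u{2029}".as_bytes(), PS_UTF8);

        // Input mixing the terminators the iterator handles.
        let text = "line1\r\nline2\u{2028}line3\nline4";
        // Per the doc comment on `LineTerminatorSplitter`, splitting `text` should
        // yield ["line1", "line2", "line3", "line4"]: the `\r\n` pair is consumed
        // as one terminator instead of producing an empty line.
        let _ = text;
    }

The `impl FusedIterator for LineTerminatorSplitter<'_> {}` added at the end is consistent with the new structure: `next()` returns `None` only once `self.text` is empty and the text is never repopulated, so every later call also returns `None`, which is exactly the guarantee `FusedIterator` documents.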

0 commit comments
