Skip to content

Commit 6e4329a

Browse files
committed
perf(codegen): faster splitting comments into lines
1 parent ea1e0a6 commit 6e4329a

File tree

1 file changed

+75
-25
lines changed

1 file changed

+75
-25
lines changed

crates/oxc_codegen/src/comment.rs

Lines changed: 75 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,66 +1,116 @@
1-
use std::borrow::Cow;
1+
use std::{borrow::Cow, iter::FusedIterator};
22

33
use rustc_hash::{FxHashMap, FxHashSet};
44

55
use oxc_ast::{Comment, CommentKind, ast::Program};
6-
use oxc_syntax::identifier::is_line_terminator;
6+
use oxc_syntax::identifier::{LS, PS, is_line_terminator};
77

88
use crate::{Codegen, LegalComment, options::CommentOptions};
99

1010
pub type CommentsMap = FxHashMap</* attached_to */ u32, Vec<Comment>>;
1111

12+
/// Convert `char` to UTF-8 bytes array.
13+
const fn to_bytes<const N: usize>(ch: char) -> [u8; N] {
14+
assert!(ch.len_utf8() == N);
15+
let mut bytes = [0u8; N];
16+
ch.encode_utf8(&mut bytes);
17+
bytes
18+
}
19+
20+
/// `LS` character as UTF-8 bytes.
21+
const LS_BYTES: [u8; 3] = to_bytes(LS);
22+
/// `PS` character as UTF-8 bytes.
23+
const PS_BYTES: [u8; 3] = to_bytes(PS);
24+
25+
const LS_OR_PS_FIRST_BYTE: u8 = 0xE2;
26+
27+
const _: () = assert!(LS_BYTES[0] == LS_OR_PS_FIRST_BYTE);
28+
const _: () = assert!(PS_BYTES[0] == LS_OR_PS_FIRST_BYTE);
29+
const LS_LAST_2_BYTES: [u8; 2] = [LS_BYTES[1], LS_BYTES[2]];
30+
const PS_LAST_2_BYTES: [u8; 2] = [PS_BYTES[1], PS_BYTES[2]];
31+
1232
/// Custom iterator that splits text on line terminators while handling CRLF as a single unit.
1333
/// This avoids creating empty strings between CR and LF characters.
1434
///
35+
/// Also splits on irregular line breaks (LS and PS).
36+
///
1537
/// # Example
1638
/// Standard split would turn `"line1\r\nline2"` into `["line1", "", "line2"]` because
17-
/// it treats \r and \n as separate terminators. This iterator correctly produces
18-
/// `["line1", "line2"]` by treating \r\n as a single terminator.
39+
/// it treats `\r` and `\n` as separate terminators. This iterator correctly produces
40+
/// `["line1", "line2"]` by treating `\r\n` as a single terminator.
1941
struct LineTerminatorSplitter<'a> {
2042
text: &'a str,
21-
position: usize,
2243
}
2344

2445
impl<'a> LineTerminatorSplitter<'a> {
2546
fn new(text: &'a str) -> Self {
26-
Self { text, position: 0 }
47+
Self { text }
2748
}
2849
}
2950

3051
impl<'a> Iterator for LineTerminatorSplitter<'a> {
3152
type Item = &'a str;
3253

3354
fn next(&mut self) -> Option<Self::Item> {
34-
if self.position >= self.text.len() {
55+
if self.text.is_empty() {
3556
return None;
3657
}
3758

38-
let start = self.position;
39-
let chars = self.text[self.position..].char_indices();
40-
41-
for (i, c) in chars {
42-
if is_line_terminator(c) {
43-
let line = &self.text[start..start + i];
44-
self.position = start + i + c.len_utf8();
45-
46-
// If this is CR followed by LF, skip the LF to treat CRLF as a single terminator
47-
if c == '\r'
48-
&& self.text.as_bytes().get(self.position).is_some_and(|&next| next == b'\n')
49-
{
50-
self.position += 1;
59+
for (index, &byte) in self.text.as_bytes().iter().enumerate() {
60+
match byte {
61+
b'\n' => {
62+
// SAFETY: Byte at `index` is `\n`, so `index` and `index + 1` are both UTF-8 char boundaries.
63+
// Therefore, slices up to `index` and from `index + 1` are both valid `&str`s.
64+
unsafe {
65+
let line = self.text.get_unchecked(..index);
66+
self.text = self.text.get_unchecked(index + 1..);
67+
return Some(line);
68+
}
5169
}
52-
53-
return Some(line);
70+
b'\r' => {
71+
// SAFETY: Byte at `index` is `\n`, so `index` is on a UTF-8 char boundary
72+
let line = unsafe { self.text.get_unchecked(..index) };
73+
// If the next byte is `\n`, consume it as well
74+
let skip_bytes =
75+
if self.text.as_bytes().get(index + 1) == Some(&b'\n') { 2 } else { 1 };
76+
// SAFETY: `index + skip_bytes` is after `\r` or `\n`, so on a UTF-8 char boundary.
77+
// Therefore slice from `index + skip_bytes` is a valid `&str`.
78+
self.text = unsafe { self.text.get_unchecked(index + skip_bytes..) };
79+
return Some(line);
80+
}
81+
LS_OR_PS_FIRST_BYTE => {
82+
let next2: [u8; 2] = {
83+
// SAFETY: 0xE2 is always the start of a 3-byte Unicode character,
84+
// so there must be 2 more bytes available to consume
85+
let next2 =
86+
unsafe { self.text.as_bytes().get_unchecked(index + 1..index + 3) };
87+
next2.try_into().unwrap()
88+
};
89+
// If this is LS or PS, treat it as a line terminator
90+
if matches!(next2, LS_LAST_2_BYTES | PS_LAST_2_BYTES) {
91+
// SAFETY: `index` is the start of a 3-byte Unicode character,
92+
// so `index` and `index + 3` are both UTF-8 char boundaries.
93+
// Therefore, slices up to `index` and from `index + 3` are both valid `&str`s.
94+
unsafe {
95+
let line = self.text.get_unchecked(..index);
96+
self.text = self.text.get_unchecked(index + 3..);
97+
return Some(line);
98+
}
99+
}
100+
}
101+
_ => {}
54102
}
55103
}
56104

57-
// Return the remaining text
58-
let line = &self.text[start..];
59-
self.position = self.text.len();
105+
// No line break found - return the remaining text. Next call will return `None`.
106+
let line = self.text;
107+
self.text = "";
60108
Some(line)
61109
}
62110
}
63111

112+
impl FusedIterator for LineTerminatorSplitter<'_> {}
113+
64114
impl Codegen<'_> {
65115
pub(crate) fn build_comments(&mut self, comments: &[Comment]) {
66116
if self.options.comments == CommentOptions::disabled() {

0 commit comments

Comments
 (0)