|
1 | | -use std::borrow::Cow; |
| 1 | +use std::{borrow::Cow, iter::FusedIterator}; |
2 | 2 |
|
3 | 3 | use rustc_hash::{FxHashMap, FxHashSet}; |
4 | 4 |
|
5 | 5 | use oxc_ast::{Comment, CommentKind, ast::Program}; |
6 | | -use oxc_syntax::identifier::is_line_terminator; |
| 6 | +use oxc_syntax::identifier::{LS, PS, is_line_terminator}; |
7 | 7 |
|
8 | 8 | use crate::{Codegen, LegalComment, options::CommentOptions}; |
9 | 9 |
|
10 | 10 | pub type CommentsMap = FxHashMap</* attached_to */ u32, Vec<Comment>>; |
11 | 11 |
|
| 12 | +/// Convert `char` to UTF-8 bytes array. |
| 13 | +const fn to_bytes<const N: usize>(ch: char) -> [u8; N] { |
| 14 | + assert!(ch.len_utf8() == N); |
| 15 | + let mut bytes = [0u8; N]; |
| 16 | + ch.encode_utf8(&mut bytes); |
| 17 | + bytes |
| 18 | +} |
| 19 | + |
| 20 | +/// `LS` character as UTF-8 bytes. |
| 21 | +const LS_BYTES: [u8; 3] = to_bytes(LS); |
| 22 | +/// `PS` character as UTF-8 bytes. |
| 23 | +const PS_BYTES: [u8; 3] = to_bytes(PS); |
| 24 | + |
| 25 | +const LS_OR_PS_FIRST_BYTE: u8 = 0xE2; |
| 26 | + |
| 27 | +const _: () = assert!(LS_BYTES[0] == LS_OR_PS_FIRST_BYTE); |
| 28 | +const _: () = assert!(PS_BYTES[0] == LS_OR_PS_FIRST_BYTE); |
| 29 | +const LS_LAST_2_BYTES: [u8; 2] = [LS_BYTES[1], LS_BYTES[2]]; |
| 30 | +const PS_LAST_2_BYTES: [u8; 2] = [PS_BYTES[1], PS_BYTES[2]]; |
| 31 | + |
12 | 32 | /// Custom iterator that splits text on line terminators while handling CRLF as a single unit. |
13 | 33 | /// This avoids creating empty strings between CR and LF characters. |
14 | 34 | /// |
| 35 | +/// Also splits on irregular line breaks (LS and PS). |
| 36 | +/// |
15 | 37 | /// # Example |
16 | 38 | /// Standard split would turn `"line1\r\nline2"` into `["line1", "", "line2"]` because |
17 | | -/// it treats \r and \n as separate terminators. This iterator correctly produces |
18 | | -/// `["line1", "line2"]` by treating \r\n as a single terminator. |
| 39 | +/// it treats `\r` and `\n` as separate terminators. This iterator correctly produces |
| 40 | +/// `["line1", "line2"]` by treating `\r\n` as a single terminator. |
19 | 41 | struct LineTerminatorSplitter<'a> { |
20 | 42 | text: &'a str, |
21 | | - position: usize, |
22 | 43 | } |
23 | 44 |
|
24 | 45 | impl<'a> LineTerminatorSplitter<'a> { |
25 | 46 | fn new(text: &'a str) -> Self { |
26 | | - Self { text, position: 0 } |
| 47 | + Self { text } |
27 | 48 | } |
28 | 49 | } |
29 | 50 |
|
30 | 51 | impl<'a> Iterator for LineTerminatorSplitter<'a> { |
31 | 52 | type Item = &'a str; |
32 | 53 |
|
33 | 54 | fn next(&mut self) -> Option<Self::Item> { |
34 | | - if self.position >= self.text.len() { |
| 55 | + if self.text.is_empty() { |
35 | 56 | return None; |
36 | 57 | } |
37 | 58 |
|
38 | | - let start = self.position; |
39 | | - let chars = self.text[self.position..].char_indices(); |
40 | | - |
41 | | - for (i, c) in chars { |
42 | | - if is_line_terminator(c) { |
43 | | - let line = &self.text[start..start + i]; |
44 | | - self.position = start + i + c.len_utf8(); |
45 | | - |
46 | | - // If this is CR followed by LF, skip the LF to treat CRLF as a single terminator |
47 | | - if c == '\r' |
48 | | - && self.text.as_bytes().get(self.position).is_some_and(|&next| next == b'\n') |
49 | | - { |
50 | | - self.position += 1; |
| 59 | + for (index, &byte) in self.text.as_bytes().iter().enumerate() { |
| 60 | + match byte { |
| 61 | + b'\n' => { |
| 62 | + // SAFETY: Byte at `index` is `\n`, so `index` and `index + 1` are both UTF-8 char boundaries. |
| 63 | + // Therefore, slices up to `index` and from `index + 1` are both valid `&str`s. |
| 64 | + unsafe { |
| 65 | + let line = self.text.get_unchecked(..index); |
| 66 | + self.text = self.text.get_unchecked(index + 1..); |
| 67 | + return Some(line); |
| 68 | + } |
51 | 69 | } |
52 | | - |
53 | | - return Some(line); |
| 70 | + b'\r' => { |
| 71 | + // SAFETY: Byte at `index` is `\n`, so `index` is on a UTF-8 char boundary |
| 72 | + let line = unsafe { self.text.get_unchecked(..index) }; |
| 73 | + // If the next byte is `\n`, consume it as well |
| 74 | + let skip_bytes = |
| 75 | + if self.text.as_bytes().get(index + 1) == Some(&b'\n') { 2 } else { 1 }; |
| 76 | + // SAFETY: `index + skip_bytes` is after `\r` or `\n`, so on a UTF-8 char boundary. |
| 77 | + // Therefore slice from `index + skip_bytes` is a valid `&str`. |
| 78 | + self.text = unsafe { self.text.get_unchecked(index + skip_bytes..) }; |
| 79 | + return Some(line); |
| 80 | + } |
| 81 | + LS_OR_PS_FIRST_BYTE => { |
| 82 | + let next2: [u8; 2] = { |
| 83 | + // SAFETY: 0xE2 is always the start of a 3-byte Unicode character, |
| 84 | + // so there must be 2 more bytes available to consume |
| 85 | + let next2 = |
| 86 | + unsafe { self.text.as_bytes().get_unchecked(index + 1..index + 3) }; |
| 87 | + next2.try_into().unwrap() |
| 88 | + }; |
| 89 | + // If this is LS or PS, treat it as a line terminator |
| 90 | + if matches!(next2, LS_LAST_2_BYTES | PS_LAST_2_BYTES) { |
| 91 | + // SAFETY: `index` is the start of a 3-byte Unicode character, |
| 92 | + // so `index` and `index + 3` are both UTF-8 char boundaries. |
| 93 | + // Therefore, slices up to `index` and from `index + 3` are both valid `&str`s. |
| 94 | + unsafe { |
| 95 | + let line = self.text.get_unchecked(..index); |
| 96 | + self.text = self.text.get_unchecked(index + 3..); |
| 97 | + return Some(line); |
| 98 | + } |
| 99 | + } |
| 100 | + } |
| 101 | + _ => {} |
54 | 102 | } |
55 | 103 | } |
56 | 104 |
|
57 | | - // Return the remaining text |
58 | | - let line = &self.text[start..]; |
59 | | - self.position = self.text.len(); |
| 105 | + // No line break found - return the remaining text. Next call will return `None`. |
| 106 | + let line = self.text; |
| 107 | + self.text = ""; |
60 | 108 | Some(line) |
61 | 109 | } |
62 | 110 | } |
63 | 111 |
|
| 112 | +impl FusedIterator for LineTerminatorSplitter<'_> {} |
| 113 | + |
64 | 114 | impl Codegen<'_> { |
65 | 115 | pub(crate) fn build_comments(&mut self, comments: &[Comment]) { |
66 | 116 | if self.options.comments == CommentOptions::disabled() { |
|
0 commit comments