11use std:: { cmp:: Ordering , marker:: PhantomData , slice, str} ;
22
3+ use oxc_data_structures:: assert_unchecked;
4+
35use crate :: { MAX_LEN , UniquePromise } ;
46
57use super :: search:: SEARCH_BATCH_SIZE ;
@@ -371,25 +373,39 @@ impl<'a> Source<'a> {
371373 // So incrementing `ptr` cannot result in `ptr > end`.
372374 // Current byte is ASCII, so incremented `ptr` must be on a UTF-8 character boundary.
373375 unsafe { self . ptr = self . ptr . add ( 1 ) } ;
374- return Some ( byte as char ) ;
376+ Some ( byte as char )
377+ } else {
378+ // Multi-byte Unicode character.
379+ // Check invariant that `ptr` is on a UTF-8 character boundary.
380+ debug_assert ! ( !is_utf8_cont_byte( byte) ) ;
381+
382+ // SAFETY: `Source` is not empty, and we just checked next byte is not ASCII
383+ unsafe { self . next_unicode_char ( ) }
375384 }
376- self . next_unicode_char ( byte)
377385 }
378386
387+ /// Get next char of source, and advance position to after it, when next char is a unicode character.
388+ ///
389+ /// # SAFETY
390+ /// * `Source` must not be empty.
391+ /// * Next character in source must not be ASCII.
379392 #[ expect( clippy:: unnecessary_wraps) ]
380- #[ cold] // Unicode is rare.
381- fn next_unicode_char ( & mut self , byte : u8 ) -> Option < char > {
382- // Multi-byte Unicode character.
383- // Check invariant that `ptr` is on a UTF-8 character boundary.
384- debug_assert ! ( !is_utf8_cont_byte( byte) ) ;
385-
393+ #[ cold] // Unicode is rare
394+ unsafe fn next_unicode_char ( & mut self ) -> Option < char > {
386395 // Create a `Chars` iterator, get next char from it, and then update `self.ptr`
387396 // to match `Chars` iterator's updated pointer afterwards.
388397 // `Chars` iterator upholds same invariants as `Source`, so its pointer is guaranteed
389398 // to be valid as `self.ptr`.
390- let mut chars = self . remaining ( ) . chars ( ) ;
391- // SAFETY: We know that there's a byte to be consumed, so `chars.next()` must return `Some(_)`
392- let c = unsafe { chars. next ( ) . unwrap_unchecked ( ) } ;
399+ let remaining = self . remaining ( ) ;
400+ // Inform compiler that next char in `Source` is non-ASCII, so it can remove some branches from
401+ // `chars.next().unwrap()`. Compiler will not be aware of these invariants if this function is not inlined.
402+ // SAFETY: Caller guarantees these invariants.
403+ unsafe {
404+ assert_unchecked ! ( !remaining. is_empty( ) ) ;
405+ assert_unchecked ! ( !remaining. as_bytes( ) [ 0 ] . is_ascii( ) ) ;
406+ }
407+ let mut chars = remaining. chars ( ) ;
408+ let c = chars. next ( ) . unwrap ( ) ;
393409 self . ptr = chars. as_str ( ) . as_ptr ( ) ;
394410 Some ( c)
395411 }
@@ -404,27 +420,34 @@ impl<'a> Source<'a> {
404420 // and next 2 bytes are ASCII, so advancing by 2 bytes must put `ptr`
405421 // in bounds and on a UTF-8 character boundary
406422 unsafe { self . ptr = self . ptr . add ( 2 ) } ;
407- return Some ( [ byte1 as char , byte2 as char ] ) ;
408- }
423+ Some ( [ byte1 as char , byte2 as char ] )
424+ } else {
425+ // Multi-byte Unicode character.
426+ // Check invariant that `ptr` is on a UTF-8 character boundary.
427+ debug_assert ! ( !is_utf8_cont_byte( byte1) ) ;
409428
410- // Handle Unicode characters
411- self . next_2_unicode_chars ( byte1)
429+ // SAFETY: `Source` is not empty
430+ unsafe { self . next_2_unicode_chars ( ) }
431+ }
412432 }
413433
414- # [ cold ] // Unicode is rare .
415- fn next_2_unicode_chars ( & mut self , byte1 : u8 ) -> Option < [ char ; 2 ] > {
416- // Multi-byte Unicode character.
417- // Check invariant that `ptr` is on a UTF-8 character boundary .
418- debug_assert ! ( !is_utf8_cont_byte ( byte1 ) ) ;
419-
434+ /// Get next 2 chars of source, and advance position to after them, when one of next 2 chars is non-ASCII .
435+ ///
436+ /// # SAFETY
437+ /// `Source` must not be empty .
438+ # [ cold ] // Unicode is rare
439+ unsafe fn next_2_unicode_chars ( & mut self ) -> Option < [ char ; 2 ] > {
420440 // Create a `Chars` iterator, get next 2 chars from it, and then update `self.ptr`
421441 // to match `Chars` iterator's updated pointer afterwards.
422442 // `Chars` iterator upholds same invariants as `Source`, so its pointer is guaranteed
423443 // to be valid as `self.ptr`.
424- let mut chars = self . remaining ( ) . chars ( ) ;
425- // SAFETY: We know that there's 2 bytes to be consumed, so first call to
426- // `chars.next()` must return `Some(_)`
427- let c1 = unsafe { chars. next ( ) . unwrap_unchecked ( ) } ;
444+ let remaining = self . remaining ( ) ;
445+ // Inform compiler that `Source` is not empty, to remove check from `chars.next().unwrap()`.
446+ // Compiler will not be aware of this invariant if this function is not inlined.
447+ // SAFETY: Caller guarantees `Source` is not empty.
448+ unsafe { assert_unchecked ! ( !remaining. is_empty( ) ) } ;
449+ let mut chars = remaining. chars ( ) ;
450+ let c1 = chars. next ( ) . unwrap ( ) ;
428451 let c2 = chars. next ( ) ?;
429452 self . ptr = chars. as_str ( ) . as_ptr ( ) ;
430453 Some ( [ c1, c2] )
@@ -512,25 +535,39 @@ impl<'a> Source<'a> {
512535 // Check not at EOF and handle ASCII bytes
513536 let byte = self . peek_byte ( ) ?;
514537 if byte. is_ascii ( ) {
515- return Some ( byte as char ) ;
538+ Some ( byte as char )
539+ } else {
540+ // Multi-byte Unicode character.
541+ // Check invariant that `ptr` is on a UTF-8 character boundary.
542+ debug_assert ! ( !is_utf8_cont_byte( byte) ) ;
543+
544+ // SAFETY: `Source` is not empty, and we just checked next byte is not ASCII
545+ unsafe { self . peek_unicode_char ( ) }
516546 }
517- self . peek_unicode_char ( byte)
518547 }
519548
549+ /// Peek next char of source, without consuming it, when next char is a unicode character.
550+ ///
551+ /// # SAFETY
552+ /// * `Source` must not be empty.
553+ /// * Next character in source must not be ASCII.
520554 #[ expect( clippy:: unnecessary_wraps) ]
521- #[ cold] // Unicode is rare.
522- fn peek_unicode_char ( & self , byte : u8 ) -> Option < char > {
523- // Multi-byte Unicode character.
524- // Check invariant that `ptr` is on a UTF-8 character boundary.
525- debug_assert ! ( !is_utf8_cont_byte( byte) ) ;
526-
527- // Create a `Chars` iterator, and get next char from it
528- let mut chars = self . remaining ( ) . chars ( ) ;
529- // SAFETY: We know that there's a byte to be consumed, so `chars.next()` must return `Some(_)`.
555+ #[ cold] // Unicode is rare
556+ unsafe fn peek_unicode_char ( & self ) -> Option < char > {
557+ // Create a `Chars` iterator, and get next char from it.
558+ let remaining = self . remaining ( ) ;
559+ // Inform compiler that next char in `Source` is non-ASCII, so it can remove some branches from
560+ // `chars.next().unwrap()`. Compiler will not be aware of these invariants if this function is not inlined.
561+ // SAFETY: Caller guarantees these invariants.
562+ unsafe {
563+ assert_unchecked ! ( !remaining. is_empty( ) ) ;
564+ assert_unchecked ! ( !remaining. as_bytes( ) [ 0 ] . is_ascii( ) ) ;
565+ }
566+ let mut chars = remaining. chars ( ) ;
567+ // We know that there's a byte to be consumed, so `chars.next()` must return `Some(_)`.
530568 // Could just return `chars.next()` here, but making it clear to compiler that this branch
531- // always returns `Some(_)` may help it optimize the caller. Compiler seems to have difficulty
532- // "seeing into" `Chars` iterator and making deductions.
533- let c = unsafe { chars. next ( ) . unwrap_unchecked ( ) } ;
569+ // always returns `Some(_)` may help it optimize the caller.
570+ let c = chars. next ( ) . unwrap ( ) ;
534571 Some ( c)
535572 }
536573
0 commit comments