@@ -26,18 +26,21 @@ pub(super) const fn utf8_is_cont_byte(byte: u8) -> bool {
2626}
2727
2828/// Reads the next code point out of a byte iterator (assuming a
29- /// UTF-8-like encoding).
29+ /// UTF-8-like encoding) and returns it along with its width in bytes.
3030///
3131/// # Safety
3232///
3333/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
3434#[ unstable( feature = "str_internals" , issue = "none" ) ]
3535#[ inline]
36- pub unsafe fn next_code_point < ' a , I : Iterator < Item = & ' a u8 > > ( bytes : & mut I ) -> Option < u32 > {
36+ #[ allow( dead_code) ]
37+ pub unsafe fn next_code_point_with_width < ' a , I : Iterator < Item = & ' a u8 > > (
38+ bytes : & mut I ,
39+ ) -> Option < ( u32 , usize ) > {
3740 // Decode UTF-8
3841 let x = * bytes. next ( ) ?;
3942 if x < 128 {
40- return Some ( x as u32 ) ;
43+ return Some ( ( x as u32 , 1 ) ) ;
4144 }
4245
4346 // Multibyte case follows
@@ -47,13 +50,15 @@ pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) ->
4750 // SAFETY: `bytes` produces a UTF-8-like string,
4851 // so the iterator must produce a value here.
4952 let y = unsafe { * bytes. next ( ) . unwrap_unchecked ( ) } ;
53+ let mut width = 2 ;
5054 let mut ch = utf8_acc_cont_byte ( init, y) ;
5155 if x >= 0xE0 {
5256 // [[x y z] w] case
5357 // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
5458 // SAFETY: `bytes` produces a UTF-8-like string,
5559 // so the iterator must produce a value here.
5660 let z = unsafe { * bytes. next ( ) . unwrap_unchecked ( ) } ;
61+ width = 3 ;
5762 let y_z = utf8_acc_cont_byte ( ( y & CONT_MASK ) as u32 , z) ;
5863 ch = init << 12 | y_z;
5964 if x >= 0xF0 {
@@ -62,11 +67,25 @@ pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) ->
6267 // SAFETY: `bytes` produces a UTF-8-like string,
6368 // so the iterator must produce a value here.
6469 let w = unsafe { * bytes. next ( ) . unwrap_unchecked ( ) } ;
70+ width = 4 ;
6571 ch = ( init & 7 ) << 18 | utf8_acc_cont_byte ( y_z, w) ;
6672 }
6773 }
6874
69- Some ( ch)
75+ Some ( ( ch, width) )
76+ }
77+
78+ /// Reads the next code point out of a byte iterator (assuming a
79+ /// UTF-8-like encoding).
80+ ///
81+ /// # Safety
82+ ///
83+ /// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
84+ #[ unstable( feature = "str_internals" , issue = "none" ) ]
85+ #[ inline]
86+ pub unsafe fn next_code_point < ' a , I : Iterator < Item = & ' a u8 > > ( bytes : & mut I ) -> Option < u32 > {
87+ // SAFETY: the caller upholds the same contract that `next_code_point_with_width` requires.
88+ Some ( unsafe { next_code_point_with_width ( bytes) } ?. 0 )
7089}
7190
7291/// Reads the last code point out of a byte iterator (assuming a
0 commit comments