@@ -593,16 +593,7 @@ impl char {
593593 #[ stable( feature = "rust1" , since = "1.0.0" ) ]
594594 #[ inline]
595595 pub fn len_utf8 ( self ) -> usize {
596- let code = self as u32 ;
597- if code < MAX_ONE_B {
598- 1
599- } else if code < MAX_TWO_B {
600- 2
601- } else if code < MAX_THREE_B {
602- 3
603- } else {
604- 4
605- }
596+ len_utf8 ( self as u32 )
606597 }
607598
608599 /// Returns the number of 16-bit code units this `char` would need if
@@ -670,36 +661,8 @@ impl char {
670661 #[ stable( feature = "unicode_encode_char" , since = "1.15.0" ) ]
671662 #[ inline]
672663 pub fn encode_utf8 ( self , dst : & mut [ u8 ] ) -> & mut str {
673- let code = self as u32 ;
674- let len = self . len_utf8 ( ) ;
675- match ( len, & mut dst[ ..] ) {
676- ( 1 , [ a, ..] ) => {
677- * a = code as u8 ;
678- }
679- ( 2 , [ a, b, ..] ) => {
680- * a = ( code >> 6 & 0x1F ) as u8 | TAG_TWO_B ;
681- * b = ( code & 0x3F ) as u8 | TAG_CONT ;
682- }
683- ( 3 , [ a, b, c, ..] ) => {
684- * a = ( code >> 12 & 0x0F ) as u8 | TAG_THREE_B ;
685- * b = ( code >> 6 & 0x3F ) as u8 | TAG_CONT ;
686- * c = ( code & 0x3F ) as u8 | TAG_CONT ;
687- }
688- ( 4 , [ a, b, c, d, ..] ) => {
689- * a = ( code >> 18 & 0x07 ) as u8 | TAG_FOUR_B ;
690- * b = ( code >> 12 & 0x3F ) as u8 | TAG_CONT ;
691- * c = ( code >> 6 & 0x3F ) as u8 | TAG_CONT ;
692- * d = ( code & 0x3F ) as u8 | TAG_CONT ;
693- }
694- _ => panic ! (
695- "encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}" ,
696- len,
697- code,
698- dst. len( ) ,
699- ) ,
700- } ;
701- // SAFETY: We just wrote UTF-8 content in, so converting to str is fine.
702- unsafe { from_utf8_unchecked_mut ( & mut dst[ ..len] ) }
664+ // SAFETY: `char` is not a surrogate, so this is valid UTF-8.
665+ unsafe { from_utf8_unchecked_mut ( encode_utf8_raw ( self as u32 , dst) ) }
703666 }
704667
705668 /// Encodes this character as UTF-16 into the provided `u16` buffer,
@@ -739,28 +702,7 @@ impl char {
739702 #[ stable( feature = "unicode_encode_char" , since = "1.15.0" ) ]
740703 #[ inline]
741704 pub fn encode_utf16 ( self , dst : & mut [ u16 ] ) -> & mut [ u16 ] {
742- let mut code = self as u32 ;
743- // SAFETY: each arm checks whether there are enough bits to write into
744- unsafe {
745- if ( code & 0xFFFF ) == code && !dst. is_empty ( ) {
746- // The BMP falls through (assuming non-surrogate, as it should)
747- * dst. get_unchecked_mut ( 0 ) = code as u16 ;
748- slice:: from_raw_parts_mut ( dst. as_mut_ptr ( ) , 1 )
749- } else if dst. len ( ) >= 2 {
750- // Supplementary planes break into surrogates.
751- code -= 0x1_0000 ;
752- * dst. get_unchecked_mut ( 0 ) = 0xD800 | ( ( code >> 10 ) as u16 ) ;
753- * dst. get_unchecked_mut ( 1 ) = 0xDC00 | ( ( code as u16 ) & 0x3FF ) ;
754- slice:: from_raw_parts_mut ( dst. as_mut_ptr ( ) , 2 )
755- } else {
756- panic ! (
757- "encode_utf16: need {} units to encode U+{:X}, but the buffer has {}" ,
758- from_u32_unchecked( code) . len_utf16( ) ,
759- code,
760- dst. len( ) ,
761- )
762- }
763- }
705+ encode_utf16_raw ( self as u32 , dst)
764706 }
765707
766708 /// Returns `true` if this `char` has the `Alphabetic` property.
@@ -1673,3 +1615,100 @@ impl char {
16731615 }
16741616 }
16751617}
1618+
1619+ #[ inline]
1620+ fn len_utf8 ( code : u32 ) -> usize {
1621+ if code < MAX_ONE_B {
1622+ 1
1623+ } else if code < MAX_TWO_B {
1624+ 2
1625+ } else if code < MAX_THREE_B {
1626+ 3
1627+ } else {
1628+ 4
1629+ }
1630+ }
1631+
1632+ /// Encodes a raw u32 value as UTF-8 into the provided byte buffer,
1633+ /// and then returns the subslice of the buffer that contains the encoded character.
1634+ ///
1635+ /// Unlike `char::encode_utf8`, this method also handles codepoints in the surrogate range.
1636+ /// (Creating a `char` in the surrogate range is UB.)
1637+ /// The result is valid [generalized UTF-8] but not valid UTF-8.
1638+ ///
1639+ /// [generalized UTF-8]: https://simonsapin.github.io/wtf-8/#generalized-utf8
1640+ ///
1641+ /// # Panics
1642+ ///
1643+ /// Panics if the buffer is not large enough.
1644+ /// A buffer of length four is large enough to encode any `char`.
1645+ #[ unstable( feature = "char_internals" , reason = "exposed only for libstd" , issue = "none" ) ]
1646+ #[ doc( hidden) ]
1647+ #[ inline]
1648+ pub fn encode_utf8_raw ( code : u32 , dst : & mut [ u8 ] ) -> & mut [ u8 ] {
1649+ let len = len_utf8 ( code) ;
1650+ match ( len, & mut dst[ ..] ) {
1651+ ( 1 , [ a, ..] ) => {
1652+ * a = code as u8 ;
1653+ }
1654+ ( 2 , [ a, b, ..] ) => {
1655+ * a = ( code >> 6 & 0x1F ) as u8 | TAG_TWO_B ;
1656+ * b = ( code & 0x3F ) as u8 | TAG_CONT ;
1657+ }
1658+ ( 3 , [ a, b, c, ..] ) => {
1659+ * a = ( code >> 12 & 0x0F ) as u8 | TAG_THREE_B ;
1660+ * b = ( code >> 6 & 0x3F ) as u8 | TAG_CONT ;
1661+ * c = ( code & 0x3F ) as u8 | TAG_CONT ;
1662+ }
1663+ ( 4 , [ a, b, c, d, ..] ) => {
1664+ * a = ( code >> 18 & 0x07 ) as u8 | TAG_FOUR_B ;
1665+ * b = ( code >> 12 & 0x3F ) as u8 | TAG_CONT ;
1666+ * c = ( code >> 6 & 0x3F ) as u8 | TAG_CONT ;
1667+ * d = ( code & 0x3F ) as u8 | TAG_CONT ;
1668+ }
1669+ _ => panic ! (
1670+ "encode_utf8: need {} bytes to encode U+{:X}, but the buffer has {}" ,
1671+ len,
1672+ code,
1673+ dst. len( ) ,
1674+ ) ,
1675+ } ;
1676+ & mut dst[ ..len]
1677+ }
1678+
1679+ /// Encodes a raw u32 value as UTF-16 into the provided `u16` buffer,
1680+ /// and then returns the subslice of the buffer that contains the encoded character.
1681+ ///
1682+ /// Unlike `char::encode_utf16`, this method also handles codepoints in the surrogate range.
1683+ /// (Creating a `char` in the surrogate range is UB.)
1684+ ///
1685+ /// # Panics
1686+ ///
1687+ /// Panics if the buffer is not large enough.
1688+ /// A buffer of length 2 is large enough to encode any `char`.
1689+ #[ unstable( feature = "char_internals" , reason = "exposed only for libstd" , issue = "none" ) ]
1690+ #[ doc( hidden) ]
1691+ #[ inline]
1692+ pub fn encode_utf16_raw ( mut code : u32 , dst : & mut [ u16 ] ) -> & mut [ u16 ] {
1693+ // SAFETY: each arm checks whether there are enough bits to write into
1694+ unsafe {
1695+ if ( code & 0xFFFF ) == code && !dst. is_empty ( ) {
1696+ // The BMP falls through
1697+ * dst. get_unchecked_mut ( 0 ) = code as u16 ;
1698+ slice:: from_raw_parts_mut ( dst. as_mut_ptr ( ) , 1 )
1699+ } else if dst. len ( ) >= 2 {
1700+ // Supplementary planes break into surrogates.
1701+ code -= 0x1_0000 ;
1702+ * dst. get_unchecked_mut ( 0 ) = 0xD800 | ( ( code >> 10 ) as u16 ) ;
1703+ * dst. get_unchecked_mut ( 1 ) = 0xDC00 | ( ( code as u16 ) & 0x3FF ) ;
1704+ slice:: from_raw_parts_mut ( dst. as_mut_ptr ( ) , 2 )
1705+ } else {
1706+ panic ! (
1707+ "encode_utf16: need {} units to encode U+{:X}, but the buffer has {}" ,
1708+ from_u32_unchecked( code) . len_utf16( ) ,
1709+ code,
1710+ dst. len( ) ,
1711+ )
1712+ }
1713+ }
1714+ }
0 commit comments