@@ -89,6 +89,24 @@ impl CodePoint {
8989 self . value
9090 }
9191
92+ /// Returns the numeric value of the code point if it is a leading surrogate.
93+ #[ inline]
94+ pub fn to_lead_surrogate ( & self ) -> Option < u16 > {
95+ match self . value {
96+ lead @ 0xD800 ..=0xDBFF => Some ( lead as u16 ) ,
97+ _ => None ,
98+ }
99+ }
100+
101+ /// Returns the numeric value of the code point if it is a trailing surrogate.
102+ #[ inline]
103+ pub fn to_trail_surrogate ( & self ) -> Option < u16 > {
104+ match self . value {
105+ trail @ 0xDC00 ..=0xDFFF => Some ( trail as u16 ) ,
106+ _ => None ,
107+ }
108+ }
109+
92110 /// Optionally returns a Unicode scalar value for the code point.
93111 ///
94112 /// Returns `None` if the code point is a surrogate (from U+D800 to U+DFFF).
@@ -117,6 +135,14 @@ impl CodePoint {
#[derive(Clone)]
pub struct Wtf8Buf {
    bytes: Vec<u8>,

    /// Do we know that `bytes` holds a valid UTF-8 encoding? We can easily
    /// know this if we're constructed from a `String` or `&str`.
    ///
    /// It is possible for `bytes` to have valid UTF-8 without this being
    /// set, such as when we're concatenating `&Wtf8`'s and surrogates become
    /// paired, as we don't bother to rescan the entire string.
    ///
    /// This is only a cached hint: it must never take part in equality or
    /// ordering, which are defined by `bytes` alone.
    is_known_utf8: bool,
}

// The comparison traits are implemented by hand rather than derived so that
// `is_known_utf8` is ignored. Two buffers holding the same bytes can carry
// different flag values (the flag may be unset even though the contents are
// valid UTF-8), and they must still compare equal.
impl PartialEq for Wtf8Buf {
    #[inline]
    fn eq(&self, other: &Wtf8Buf) -> bool {
        self.bytes == other.bytes
    }
}

impl Eq for Wtf8Buf {}

impl PartialOrd for Wtf8Buf {
    #[inline]
    fn partial_cmp(&self, other: &Wtf8Buf) -> Option<core::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for Wtf8Buf {
    /// Orders buffers by their encoded bytes only.
    #[inline]
    fn cmp(&self, other: &Wtf8Buf) -> core::cmp::Ordering {
        self.bytes.cmp(&other.bytes)
    }
}
121147
122148impl ops:: Deref for Wtf8Buf {
@@ -147,13 +173,13 @@ impl Wtf8Buf {
147173 /// Creates a new, empty WTF-8 string.
148174 #[ inline]
149175 pub fn new ( ) -> Wtf8Buf {
150- Wtf8Buf { bytes : Vec :: new ( ) }
176+ Wtf8Buf { bytes : Vec :: new ( ) , is_known_utf8 : true }
151177 }
152178
153179 /// Creates a new, empty WTF-8 string with pre-allocated capacity for `capacity` bytes.
154180 #[ inline]
155181 pub fn with_capacity ( capacity : usize ) -> Wtf8Buf {
156- Wtf8Buf { bytes : Vec :: with_capacity ( capacity) }
182+ Wtf8Buf { bytes : Vec :: with_capacity ( capacity) , is_known_utf8 : true }
157183 }
158184
159185 /// Creates a WTF-8 string from a UTF-8 `String`.
@@ -163,7 +189,7 @@ impl Wtf8Buf {
163189 /// Since WTF-8 is a superset of UTF-8, this always succeeds.
164190 #[ inline]
165191 pub fn from_string ( string : String ) -> Wtf8Buf {
166- Wtf8Buf { bytes : string. into_bytes ( ) }
192+ Wtf8Buf { bytes : string. into_bytes ( ) , is_known_utf8 : true }
167193 }
168194
169195 /// Creates a WTF-8 string from a UTF-8 `&str` slice.
@@ -173,11 +199,12 @@ impl Wtf8Buf {
173199 /// Since WTF-8 is a superset of UTF-8, this always succeeds.
174200 #[ inline]
175201 pub fn from_str ( str : & str ) -> Wtf8Buf {
176- Wtf8Buf { bytes : <[ _ ] >:: to_vec ( str. as_bytes ( ) ) }
202+ Wtf8Buf { bytes : <[ _ ] >:: to_vec ( str. as_bytes ( ) ) , is_known_utf8 : true }
177203 }
178204
179205 pub fn clear ( & mut self ) {
180- self . bytes . clear ( )
206+ self . bytes . clear ( ) ;
207+ self . is_known_utf8 = true ;
181208 }
182209
183210 /// Creates a WTF-8 string from a potentially ill-formed UTF-16 slice of 16-bit code units.
@@ -195,15 +222,17 @@ impl Wtf8Buf {
195222 let code_point = unsafe { CodePoint :: from_u32_unchecked ( surrogate as u32 ) } ;
196223 // Skip the WTF-8 concatenation check,
197224 // surrogate pairs are already decoded by decode_utf16
198- string. push_code_point_unchecked ( code_point)
225+ string. push_code_point_unchecked ( code_point) ;
226+ // The string now contains an unpaired surrogate.
227+ string. is_known_utf8 = false ;
199228 }
200229 }
201230 }
202231 string
203232 }
204233
205234 /// Copied from String::push
206- /// This does **not** include the WTF-8 concatenation check.
235+ /// This does **not** include the WTF-8 concatenation check or `is_known_utf8` check .
207236 fn push_code_point_unchecked ( & mut self , code_point : CodePoint ) {
208237 let mut bytes = [ 0 ; 4 ] ;
209238 let bytes = char:: encode_utf8_raw ( code_point. value , & mut bytes) ;
@@ -217,6 +246,9 @@ impl Wtf8Buf {
217246
218247 #[ inline]
219248 pub fn as_mut_slice ( & mut self ) -> & mut Wtf8 {
249+ // Safety: `Wtf8` doesn't expose any way to mutate the bytes that would
250+ // cause them to change from well-formed UTF-8 to ill-formed UTF-8,
251+ // which would break the assumptions of the `is_known_utf8` field.
220252 unsafe { Wtf8 :: from_mut_bytes_unchecked ( & mut self . bytes ) }
221253 }
222254
@@ -313,7 +345,15 @@ impl Wtf8Buf {
313345 self . push_char ( decode_surrogate_pair ( lead, trail) ) ;
314346 self . bytes . extend_from_slice ( other_without_trail_surrogate) ;
315347 }
316- _ => self . bytes . extend_from_slice ( & other. bytes ) ,
348+ _ => {
349+ self . bytes . extend_from_slice ( & other. bytes ) ;
350+
351+ // If we're pushing a string containing a surrogate, we may no
352+ // longer have UTF-8.
353+ if other. next_surrogate ( 0 ) . is_some ( ) {
354+ self . is_known_utf8 = false ;
355+ }
356+ }
317357 }
318358 }
319359
@@ -330,13 +370,19 @@ impl Wtf8Buf {
330370 /// like concatenating ill-formed UTF-16 strings effectively would.
331371 #[ inline]
332372 pub fn push ( & mut self , code_point : CodePoint ) {
333- if let trail @ 0xDC00 ..= 0xDFFF = code_point. to_u32 ( ) {
373+ if let Some ( trail) = code_point. to_trail_surrogate ( ) {
334374 if let Some ( lead) = ( & * self ) . final_lead_surrogate ( ) {
335375 let len_without_lead_surrogate = self . len ( ) - 3 ;
336376 self . bytes . truncate ( len_without_lead_surrogate) ;
337- self . push_char ( decode_surrogate_pair ( lead, trail as u16 ) ) ;
377+ self . push_char ( decode_surrogate_pair ( lead, trail) ) ;
338378 return ;
339379 }
380+
381+ // We're pushing a trailing surrogate.
382+ self . is_known_utf8 = false ;
383+ } else if code_point. to_lead_surrogate ( ) . is_some ( ) {
384+ // We're pushing a leading surrogate.
385+ self . is_known_utf8 = false ;
340386 }
341387
342388 // No newly paired surrogates at the boundary.
@@ -363,9 +409,10 @@ impl Wtf8Buf {
363409 /// (that is, if the string contains surrogates),
364410 /// the original WTF-8 string is returned instead.
365411 pub fn into_string ( self ) -> Result < String , Wtf8Buf > {
366- match self . next_surrogate ( 0 ) {
367- None => Ok ( unsafe { String :: from_utf8_unchecked ( self . bytes ) } ) ,
368- Some ( _) => Err ( self ) ,
412+ if self . is_known_utf8 || self . next_surrogate ( 0 ) . is_none ( ) {
413+ Ok ( unsafe { String :: from_utf8_unchecked ( self . bytes ) } )
414+ } else {
415+ Err ( self )
369416 }
370417 }
371418
@@ -375,6 +422,11 @@ impl Wtf8Buf {
375422 ///
376423 /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”)
377424 pub fn into_string_lossy ( mut self ) -> String {
425+ // Fast path: If we already have UTF-8, we can return it immediately.
426+ if self . is_known_utf8 {
427+ return unsafe { String :: from_utf8_unchecked ( self . bytes ) } ;
428+ }
429+
378430 let mut pos = 0 ;
379431 loop {
380432 match self . next_surrogate ( pos) {
@@ -397,7 +449,7 @@ impl Wtf8Buf {
397449 /// Converts a `Box<Wtf8>` into a `Wtf8Buf`.
398450 pub fn from_box ( boxed : Box < Wtf8 > ) -> Wtf8Buf {
399451 let bytes: Box < [ u8 ] > = unsafe { mem:: transmute ( boxed) } ;
400- Wtf8Buf { bytes : bytes. into_vec ( ) }
452+ Wtf8Buf { bytes : bytes. into_vec ( ) , is_known_utf8 : false }
401453 }
402454}
403455
@@ -575,6 +627,11 @@ impl Wtf8 {
575627 }
576628 }
577629
630+ /// Creates an owned `Wtf8Buf` from a borrowed `Wtf8`.
631+ pub fn to_owned ( & self ) -> Wtf8Buf {
632+ Wtf8Buf { bytes : self . bytes . to_vec ( ) , is_known_utf8 : false }
633+ }
634+
578635 /// Lossily converts the string to UTF-8.
579636 /// Returns a UTF-8 `&str` slice if the contents are well-formed in UTF-8.
580637 ///
@@ -664,7 +721,8 @@ impl Wtf8 {
664721 }
665722
666723 pub fn clone_into ( & self , buf : & mut Wtf8Buf ) {
667- self . bytes . clone_into ( & mut buf. bytes )
724+ self . bytes . clone_into ( & mut buf. bytes ) ;
725+ buf. is_known_utf8 = false ;
668726 }
669727
670728 /// Boxes this `Wtf8`.
@@ -704,12 +762,18 @@ impl Wtf8 {
704762
705763 #[ inline]
706764 pub fn to_ascii_lowercase ( & self ) -> Wtf8Buf {
707- Wtf8Buf { bytes : self . bytes . to_ascii_lowercase ( ) }
765+ Wtf8Buf {
766+ bytes : self . bytes . to_ascii_lowercase ( ) ,
767+ is_known_utf8 : self . next_surrogate ( 0 ) . is_none ( ) ,
768+ }
708769 }
709770
710771 #[ inline]
711772 pub fn to_ascii_uppercase ( & self ) -> Wtf8Buf {
712- Wtf8Buf { bytes : self . bytes . to_ascii_uppercase ( ) }
773+ Wtf8Buf {
774+ bytes : self . bytes . to_ascii_uppercase ( ) ,
775+ is_known_utf8 : self . next_surrogate ( 0 ) . is_none ( ) ,
776+ }
713777 }
714778
715779 #[ inline]
0 commit comments