First prototype of make_uppercase

krtab · krtab · commit 41aea71d3dee · 2025-01-22T17:59:52.000+01:00
diff --git a/library/alloc/src/slice.rs b/library/alloc/src/slice.rs
@@ -79,6 +79,7 @@ use crate::alloc::Global;
 #[cfg(not(no_global_oom_handling))]
 use crate::borrow::ToOwned;
 use crate::boxed::Box;
+use crate::collections::VecDeque;
 use crate::vec::Vec;
 
 // HACK(japaric): With cfg(test) `impl [T]` is not available, these three
@@ -665,6 +666,45 @@ impl<T> [T] {
     }
 }
 
+#[cfg(not(test))]
+impl [u8] {
+    #[rustc_allow_incoherent_impl]
+    #[unstable(issue = "none", feature = "std_internals")]
+    #[allow(dead_code)]
+    /// Uppercases UTF-8 in place: `Ok(n)` means keep `self[..n]`, `Err(q)` means append `q`.
+    /// Safety: `self` must be valid UTF-8; it is decoded here without validation.
+    pub unsafe fn make_utf8_uppercase(&mut self) -> Result<usize, VecDeque<u8>> {
+        let mut queue = VecDeque::new(); // uppercased bytes waiting to be written back
+
+        let mut read_offset = 0; // number of input bytes decoded so far
+        let mut write_offset = 0; // number of output bytes stored into `self` so far
+
+        let mut buffer = [0; 4]; // scratch space for one encoded `char`
+        while let Some((codepoint, width)) = // SAFETY: unread tail is still the caller's UTF-8
+            unsafe { core::str::next_code_point_with_width(&mut self[read_offset..].iter()) }
+        {
+            read_offset += width; // next line is sound: valid UTF-8 decodes to valid `char`s
+            let lowercase_char = unsafe { char::from_u32_unchecked(codepoint) };
+            for c in lowercase_char.to_uppercase() { // one input char may become several
+                let l = c.len_utf8(); // encoded length of `c`
+                c.encode_utf8(&mut buffer); // fills `buffer[..l]`
+                queue.extend(&buffer[..l]); // enqueue only the bytes just encoded
+            }
+            while write_offset < read_offset { // only overwrite already-decoded bytes
+                match queue.pop_front() {
+                    Some(b) => {
+                        self[write_offset] = b;
+                        write_offset += 1;
+                    }
+                    None => break, // nothing pending: output is currently shorter
+                }
+            }
+        }
+        assert_eq!(read_offset, self.len()); // every input byte was decoded
+        if write_offset < read_offset { Ok(write_offset) } else { Err(queue) } // Err may be empty
+    }
+}
+
 #[cfg(not(test))]
 impl [u8] {
     /// Returns a vector containing a copy of this slice where each byte
diff --git a/library/alloc/src/string.rs b/library/alloc/src/string.rs
@@ -1127,6 +1127,18 @@ impl String {
         self.vec.extend_from_slice(string.as_bytes())
     }
 
+    #[unstable(feature = "string_make_uplowercase", issue = "135885")]
+    #[allow(missing_docs)]
+    pub fn make_uppercase(&mut self) { // converts `self` to uppercase, reusing its buffer
+        let mut v = core::mem::take(self).vec; // `self` stays empty while `v` is edited
+        let res = unsafe { v.make_utf8_uppercase() }; // SAFETY: `v` came from a `String`
+        match res {
+            Ok(n) => v.truncate(n), // result is shorter: drop the stale tail bytes
+            Err(queue) => v.extend(queue), // append the bytes that did not fit (may be none)
+        }
+        *self = unsafe { Self::from_utf8_unchecked(v) } // SAFETY: `v` is re-encoded UTF-8 text
+    }
+
     /// Copies elements from `src` range to the end of the string.
     ///
     /// # Panics
diff --git a/library/alloc/tests/lib.rs b/library/alloc/tests/lib.rs
@@ -43,6 +43,7 @@
 #![allow(internal_features)]
 #![deny(fuzzy_provenance_casts)]
 #![deny(unsafe_op_in_unsafe_fn)]
+#![feature(string_make_uplowercase)]
 
 extern crate test;
 
diff --git a/library/alloc/tests/string.rs b/library/alloc/tests/string.rs
@@ -903,3 +903,20 @@ fn test_str_concat() {
     let s: String = format!("{a}{b}");
     assert_eq!(s.as_bytes()[9], 'd' as u8);
 }
+
+#[test]
+fn make_uppercase() {
+    fn test(s: &str) { // checks the in-place result against `str::to_uppercase`
+        let ground_truth = s.to_uppercase();
+        let mut tested = s.to_owned();
+        tested.make_uppercase();
+        assert_eq!(tested, ground_truth);
+    }
+    test(""); // empty input
+    test("abcde"); // ASCII only: length unchanged
+    // output grows: 4 input bytes become 9
+    test("ǰΐ");
+    // output shrinks: 10*3 input bytes become 10*2
+    test("ⱥⱥⱥⱥⱥⱥⱥⱥⱥⱥ");
+    test("aéǅßﬁᾀ"); // mix of 1-, 2- and 3-byte characters
+}
diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs
@@ -57,7 +57,7 @@ pub use lossy::{Utf8Chunk, Utf8Chunks};
 #[stable(feature = "rust1", since = "1.0.0")]
 pub use traits::FromStr;
 #[unstable(feature = "str_internals", issue = "none")]
-pub use validations::{next_code_point, utf8_char_width};
+pub use validations::{next_code_point, next_code_point_with_width, utf8_char_width};
 
 #[inline(never)]
 #[cold]
diff --git a/library/core/src/str/validations.rs b/library/core/src/str/validations.rs
@@ -26,18 +26,21 @@ pub(super) const fn utf8_is_cont_byte(byte: u8) -> bool {
 }
 
 /// Reads the next code point out of a byte iterator (assuming a
-/// UTF-8-like encoding).
+/// UTF-8-like encoding) and returns it along with its width.
 ///
 /// # Safety
 ///
 /// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
 #[unstable(feature = "str_internals", issue = "none")]
 #[inline]
-pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
+#[allow(dead_code)]
+pub unsafe fn next_code_point_with_width<'a, I: Iterator<Item = &'a u8>>(
+    bytes: &mut I,
+) -> Option<(u32, usize)> {
     // Decode UTF-8
     let x = *bytes.next()?;
     if x < 128 {
-        return Some(x as u32);
+        return Some((x as u32, 1));
     }
 
     // Multibyte case follows
@@ -47,13 +50,15 @@ pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) ->
     // SAFETY: `bytes` produces an UTF-8-like string,
     // so the iterator must produce a value here.
     let y = unsafe { *bytes.next().unwrap_unchecked() };
+    let mut width = 2;
     let mut ch = utf8_acc_cont_byte(init, y);
     if x >= 0xE0 {
         // [[x y z] w] case
         // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
         // SAFETY: `bytes` produces an UTF-8-like string,
         // so the iterator must produce a value here.
         let z = unsafe { *bytes.next().unwrap_unchecked() };
+        width = 3;
         let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
         ch = init << 12 | y_z;
         if x >= 0xF0 {
@@ -62,11 +67,25 @@ pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) ->
             // SAFETY: `bytes` produces an UTF-8-like string,
             // so the iterator must produce a value here.
             let w = unsafe { *bytes.next().unwrap_unchecked() };
+            width = 4;
             ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
         }
     }
 
-    Some(ch)
+    Some((ch, width))
+}
+
+/// Reads the next code point out of a byte iterator (assuming a
+/// UTF-8-like encoding), discarding its width; `None` if `bytes` is empty.
+///
+/// # Safety
+///
+/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
+#[unstable(feature = "str_internals", issue = "none")]
+#[inline]
+pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
+    // SAFETY: our caller upholds the same contract `next_code_point_with_width` requires
+    Some(unsafe { next_code_point_with_width(bytes) }?.0) // keep the code point, drop the width
 }
 
 /// Reads the last code point out of a byte iterator (assuming a