char: µoptimise UTF-16 surrogates decoding

mina86 · mina86 · commit 28162ad97024 · 2022-12-23T14:15:33.000+01:00
According to Godbolt¹, on x86_64 using a bitwise AND (`& 0x3ff`) produces slightly better code than using subtraction. The readability of both is pretty much equivalent, so we might just as well use the shorter option. ¹ https://rust.godbolt.org/z/9jM3ejbMx
diff --git a/library/core/src/char/decode.rs b/library/core/src/char/decode.rs
@@ -67,7 +67,7 @@ impl<I: Iterator<Item = u16>> Iterator for DecodeUtf16<I> {
             }
 
             // all ok, so lets decode it.
-            let c = (((u - 0xD800) as u32) << 10 | (u2 - 0xDC00) as u32) + 0x1_0000;
+            let c = (((u & 0x3ff) as u32) << 10 | (u2 & 0x3ff) as u32) + 0x1_0000;
             // SAFETY: we checked that it's a legal unicode value
             Some(Ok(unsafe { from_u32_unchecked(c) }))
         }
diff --git a/library/core/tests/char.rs b/library/core/tests/char.rs
@@ -306,6 +306,10 @@ fn test_decode_utf16() {
     }
     check(&[0xD800, 0x41, 0x42], &[Err(0xD800), Ok('A'), Ok('B')]);
     check(&[0xD800, 0], &[Err(0xD800), Ok('\0')]);
+    check(&[0xD800], &[Err(0xD800)]);
+    check(&[0xD840, 0xDC00], &[Ok('\u{20000}')]);
+    check(&[0xD840, 0xD840, 0xDC00], &[Err(0xD840), Ok('\u{20000}')]);
+    check(&[0xDC00, 0xD840], &[Err(0xDC00), Err(0xD840)]);
 }
 
 #[test]

Original file line number	Diff line number	Diff line change
`@@ -67,7 +67,7 @@ impl<I: Iterator<Item = u16>> Iterator for DecodeUtf16<I> {`
`67`	`67`	`}`
`68`	`68`
`69`	`69`	`// all ok, so lets decode it.`
`70`		`- let c = (((u - 0xD800) as u32) << 10 \| (u2 - 0xDC00) as u32) + 0x1_0000;`
	`70`	`+ let c = (((u & 0x3ff) as u32) << 10 \| (u2 & 0x3ff) as u32) + 0x1_0000;`
`71`	`71`	`// SAFETY: we checked that it's a legal unicode value`
`72`	`72`	`Some(Ok(unsafe { from_u32_unchecked(c) }))`
`73`	`73`	`}`
Original file line number	Diff line number	Diff line change
`@@ -306,6 +306,10 @@ fn test_decode_utf16() {`
`306`	`306`	`}`
`307`	`307`	`check(&[0xD800, 0x41, 0x42], &[Err(0xD800), Ok('A'), Ok('B')]);`
`308`	`308`	`check(&[0xD800, 0], &[Err(0xD800), Ok('\0')]);`
	`309`	`+ check(&[0xD800], &[Err(0xD800)]);`
	`310`	`+ check(&[0xD840, 0xDC00], &[Ok('\u{20000}')]);`
	`311`	`+ check(&[0xD840, 0xD840, 0xDC00], &[Err(0xD840), Ok('\u{20000}')]);`
	`312`	`+ check(&[0xDC00, 0xD840], &[Err(0xDC00), Err(0xD840)]);`
`309`	`313`	`}`
`310`	`314`
`311`	`315`	`#[test]`