From 182044248ca2aa569844a25e73f90e5bc2fd05d3 Mon Sep 17 00:00:00 2001
From: Simon Sapin <simon.sapin@exyr.org>
Date: Thu, 2 Mar 2017 17:27:57 +0100
Subject: [PATCH 1/3] Add Utf8Error::resume_from, to help incremental and/or
 lossy decoding.

Without this, code outside of the standard library needs to reimplement
most of the logic of `from_utf8` to interpret the bytes after `valid_up_to()`.
---
 src/libcollectionstest/lib.rs |  1 +
 src/libcollectionstest/str.rs | 30 ++++++++++++++
 src/libcore/str/mod.rs        | 78 +++++++++++++++++++++++++----------
 3 files changed, 87 insertions(+), 22 deletions(-)
diff --git a/src/libcollectionstest/lib.rs b/src/libcollectionstest/lib.rs
index d97d9b8ab83f6..a7018daf09847 100644
--- a/src/libcollectionstest/lib.rs
+++ b/src/libcollectionstest/lib.rs
@@ -28,6 +28,7 @@
 #![feature(test)]
 #![feature(unboxed_closures)]
 #![feature(unicode)]
+#![feature(utf8_error_resume_from)]
 
 extern crate collections;
 extern crate test;
diff --git a/src/libcollectionstest/str.rs b/src/libcollectionstest/str.rs
index 8071c7e8c20d5..5de74d68b9eca 100644
--- a/src/libcollectionstest/str.rs
+++ b/src/libcollectionstest/str.rs
@@ -540,6 +540,36 @@ fn from_utf8_mostly_ascii() {
     }
 }
 
+#[test]
+fn from_utf8_error() {
+    macro_rules! test {
+        ($input: expr, $expected_valid_up_to: expr, $expected_resume_from: expr) => {
+            let error = from_utf8($input).unwrap_err();
+            assert_eq!(error.valid_up_to(), $expected_valid_up_to);
+            assert_eq!(error.resume_from(), $expected_resume_from);
+        }
+    }
+    test!(b"A\xC3\xA9 \xFF ", 4, Some(5));
+    test!(b"A\xC3\xA9 \x80 ", 4, Some(5));
+    test!(b"A\xC3\xA9 \xC1 ", 4, Some(5));
+    test!(b"A\xC3\xA9 \xC1", 4, Some(5));
+    test!(b"A\xC3\xA9 \xC2", 4, None);
+    test!(b"A\xC3\xA9 \xC2 ", 4, Some(5));
+    test!(b"A\xC3\xA9 \xC2\xC0", 4, Some(5));
+    test!(b"A\xC3\xA9 \xE0", 4, None);
+    test!(b"A\xC3\xA9 \xE0\x9F", 4, Some(5));
+    test!(b"A\xC3\xA9 \xE0\xA0", 4, None);
+    test!(b"A\xC3\xA9 \xE0\xA0\xC0", 4, Some(6));
+    test!(b"A\xC3\xA9 \xE0\xA0 ", 4, Some(6));
+    test!(b"A\xC3\xA9 \xED\xA0\x80 ", 4, Some(5));
+    test!(b"A\xC3\xA9 \xF1", 4, None);
+    test!(b"A\xC3\xA9 \xF1\x80", 4, None);
+    test!(b"A\xC3\xA9 \xF1\x80\x80", 4, None);
+    test!(b"A\xC3\xA9 \xF1 ", 4, Some(5));
+    test!(b"A\xC3\xA9 \xF1\x80 ", 4, Some(6));
+    test!(b"A\xC3\xA9 \xF1\x80\x80 ", 4, Some(7));
+}
+
 #[test]
 fn test_as_bytes() {
     // no null
diff --git a/src/libcore/str/mod.rs b/src/libcore/str/mod.rs
index 52e3301631052..eb13d28e82d23 100644
--- a/src/libcore/str/mod.rs
+++ b/src/libcore/str/mod.rs
@@ -125,13 +125,14 @@ Section: Creating a string
 #[stable(feature = "rust1", since = "1.0.0")]
 pub struct Utf8Error {
     valid_up_to: usize,
+    invalid_length: Option<u8>,
 }
 
 impl Utf8Error {
     /// Returns the index in the given string up to which valid UTF-8 was
     /// verified.
     ///
-    /// It is the maximum index such that `from_utf8(input[..index])`
+    /// It is the maximum index such that `from_utf8(&input[..index])`
     /// would return `Ok(_)`.
     ///
     /// # Examples
@@ -152,6 +153,21 @@ impl Utf8Error {
     /// ```
     #[stable(feature = "utf8_error", since = "1.5.0")]
     pub fn valid_up_to(&self) -> usize { self.valid_up_to }
+
+    /// Provides more information about the failure:
+    ///
+    /// * `None`: the end of the input was reached unexpectedly.
+    ///   `self.valid_up_to()` is 1 to 3 bytes from the end of the input.
+    ///   If a byte stream (such as a file or a network socket) is being decoded incrementally,
+    ///   this could be a valid `char` whose UTF-8 byte sequence is spanning multiple chunks.
+    ///
+    /// * `Some(index)`: an unexpected byte was encountered.
+    ///   The index provided is where decoding should resume
+    ///   (after inserting a U+FFFD REPLACEMENT CHARACTER) in case of lossy decoding.
+    #[unstable(feature = "utf8_error_resume_from", reason ="new", issue = "0")]
+    pub fn resume_from(&self) -> Option<usize> {
+        self.invalid_length.map(|l| self.valid_up_to + l as usize)
+    }
 }
 
 /// Converts a slice of bytes to a string slice.
@@ -300,7 +316,12 @@ pub unsafe fn from_utf8_unchecked(v: &[u8]) -> &str {
 #[stable(feature = "rust1", since = "1.0.0")]
 impl fmt::Display for Utf8Error {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(f, "invalid utf-8: invalid byte near index {}", self.valid_up_to)
+        if let Some(invalid_length) = self.invalid_length {
+            write!(f, "invalid utf-8 sequence of {} bytes from index {}",
+                   invalid_length, self.valid_up_to)
+        } else {
+            write!(f, "incomplete utf-8 byte sequence from index {}", self.valid_up_to)
+        }
     }
 }
 
@@ -1241,17 +1262,20 @@ fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
 
     while index < len {
         let old_offset = index;
-        macro_rules! err { () => {{
-            return Err(Utf8Error {
-                valid_up_to: old_offset
-            })
-        }}}
+        macro_rules! err {
+            ($invalid_length: expr) => {
+                return Err(Utf8Error {
+                    valid_up_to: old_offset,
+                    invalid_length: $invalid_length,
+                })
+            }
+        }
 
         macro_rules! next { () => {{
             index += 1;
             // we needed data, but there was none: error!
             if index >= len {
-                err!()
+                err!(None)
             }
             v[index]
         }}}
@@ -1259,7 +1283,6 @@ fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
         let first = v[index];
         if first >= 128 {
             let w = UTF8_CHAR_WIDTH[first as usize];
-            let second = next!();
             // 2-byte encoding is for codepoints  \u{0080} to  \u{07ff}
             //        first  C2 80        last DF BF
             // 3-byte encoding is for codepoints  \u{0800} to  \u{ffff}
@@ -1279,25 +1302,36 @@ fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
             // UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
             //               %xF4 %x80-8F 2( UTF8-tail )
             match w {
-                2 => if second & !CONT_MASK != TAG_CONT_U8 {err!()},
+                2 => if next!() & !CONT_MASK != TAG_CONT_U8 {
+                    err!(Some(1))
+                },
                 3 => {
-                    match (first, second, next!() & !CONT_MASK) {
-                        (0xE0         , 0xA0 ... 0xBF, TAG_CONT_U8) |
-                        (0xE1 ... 0xEC, 0x80 ... 0xBF, TAG_CONT_U8) |
-                        (0xED         , 0x80 ... 0x9F, TAG_CONT_U8) |
-                        (0xEE ... 0xEF, 0x80 ... 0xBF, TAG_CONT_U8) => {}
-                        _ => err!()
+                    match (first, next!()) {
+                        (0xE0         , 0xA0 ... 0xBF) |
+                        (0xE1 ... 0xEC, 0x80 ... 0xBF) |
+                        (0xED         , 0x80 ... 0x9F) |
+                        (0xEE ... 0xEF, 0x80 ... 0xBF) => {}
+                        _ => err!(Some(1))
+                    }
+                    if next!() & !CONT_MASK != TAG_CONT_U8 {
+                        err!(Some(2))
                     }
                 }
                 4 => {
-                    match (first, second, next!() & !CONT_MASK, next!() & !CONT_MASK) {
-                        (0xF0         , 0x90 ... 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
-                        (0xF1 ... 0xF3, 0x80 ... 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
-                        (0xF4         , 0x80 ... 0x8F, TAG_CONT_U8, TAG_CONT_U8) => {}
-                        _ => err!()
+                    match (first, next!()) {
+                        (0xF0         , 0x90 ... 0xBF) |
+                        (0xF1 ... 0xF3, 0x80 ... 0xBF) |
+                        (0xF4         , 0x80 ... 0x8F) => {}
+                        _ => err!(Some(1))
+                    }
+                    if next!() & !CONT_MASK != TAG_CONT_U8 {
+                        err!(Some(2))
+                    }
+                    if next!() & !CONT_MASK != TAG_CONT_U8 {
+                        err!(Some(3))
                     }
                 }
-                _ => err!()
+                _ => err!(Some(1))
             }
             index += 1;
         } else {

From b5f16a10e9406fc1c19294fee1c33e507a17458e Mon Sep 17 00:00:00 2001
From: Simon Sapin <simon.sapin@exyr.org>
Date: Mon, 6 Mar 2017 22:06:30 +0100
Subject: [PATCH 2/3] Replace Utf8Error::resume_from with Utf8Error::error_len

Their relationship is:

* `resume_from = error_len.map(|l| l + valid_up_to)`
* `error_len` is always one of `None`, `Some(1)`, `Some(2)`, or `Some(3)`.

When I started using resume_from I almost always ended up subtracting
valid_up_to to obtain error_len.
Therefore the latter is what should be provided in the first place.
---
 src/libcollectionstest/lib.rs |  2 +-
 src/libcollectionstest/str.rs | 30 +++++++++++++++---------------
 src/libcore/str/mod.rs        | 22 ++++++++++++----------
 3 files changed, 28 insertions(+), 26 deletions(-)

diff --git a/src/libcollectionstest/lib.rs b/src/libcollectionstest/lib.rs
index a7018daf09847..98d0b1c8e1565 100644
--- a/src/libcollectionstest/lib.rs
+++ b/src/libcollectionstest/lib.rs
@@ -28,7 +28,7 @@
 #![feature(test)]
 #![feature(unboxed_closures)]
 #![feature(unicode)]
-#![feature(utf8_error_resume_from)]
+#![feature(utf8_error_error_len)]
 
 extern crate collections;
 extern crate test;
diff --git a/src/libcollectionstest/str.rs b/src/libcollectionstest/str.rs
index 5de74d68b9eca..c9b7104fec4f0 100644
--- a/src/libcollectionstest/str.rs
+++ b/src/libcollectionstest/str.rs
@@ -543,31 +543,31 @@ fn from_utf8_mostly_ascii() {
 #[test]
 fn from_utf8_error() {
     macro_rules! test {
-        ($input: expr, $expected_valid_up_to: expr, $expected_resume_from: expr) => {
+        ($input: expr, $expected_valid_up_to: expr, $expected_error_len: expr) => {
             let error = from_utf8($input).unwrap_err();
             assert_eq!(error.valid_up_to(), $expected_valid_up_to);
-            assert_eq!(error.resume_from(), $expected_resume_from);
+            assert_eq!(error.error_len(), $expected_error_len);
         }
     }
-    test!(b"A\xC3\xA9 \xFF ", 4, Some(5));
-    test!(b"A\xC3\xA9 \x80 ", 4, Some(5));
-    test!(b"A\xC3\xA9 \xC1 ", 4, Some(5));
-    test!(b"A\xC3\xA9 \xC1", 4, Some(5));
+    test!(b"A\xC3\xA9 \xFF ", 4, Some(1));
+    test!(b"A\xC3\xA9 \x80 ", 4, Some(1));
+    test!(b"A\xC3\xA9 \xC1 ", 4, Some(1));
+    test!(b"A\xC3\xA9 \xC1", 4, Some(1));
     test!(b"A\xC3\xA9 \xC2", 4, None);
-    test!(b"A\xC3\xA9 \xC2 ", 4, Some(5));
-    test!(b"A\xC3\xA9 \xC2\xC0", 4, Some(5));
+    test!(b"A\xC3\xA9 \xC2 ", 4, Some(1));
+    test!(b"A\xC3\xA9 \xC2\xC0", 4, Some(1));
     test!(b"A\xC3\xA9 \xE0", 4, None);
-    test!(b"A\xC3\xA9 \xE0\x9F", 4, Some(5));
+    test!(b"A\xC3\xA9 \xE0\x9F", 4, Some(1));
     test!(b"A\xC3\xA9 \xE0\xA0", 4, None);
-    test!(b"A\xC3\xA9 \xE0\xA0\xC0", 4, Some(6));
-    test!(b"A\xC3\xA9 \xE0\xA0 ", 4, Some(6));
-    test!(b"A\xC3\xA9 \xED\xA0\x80 ", 4, Some(5));
+    test!(b"A\xC3\xA9 \xE0\xA0\xC0", 4, Some(2));
+    test!(b"A\xC3\xA9 \xE0\xA0 ", 4, Some(2));
+    test!(b"A\xC3\xA9 \xED\xA0\x80 ", 4, Some(1));
     test!(b"A\xC3\xA9 \xF1", 4, None);
     test!(b"A\xC3\xA9 \xF1\x80", 4, None);
     test!(b"A\xC3\xA9 \xF1\x80\x80", 4, None);
-    test!(b"A\xC3\xA9 \xF1 ", 4, Some(5));
-    test!(b"A\xC3\xA9 \xF1\x80 ", 4, Some(6));
-    test!(b"A\xC3\xA9 \xF1\x80\x80 ", 4, Some(7));
+    test!(b"A\xC3\xA9 \xF1 ", 4, Some(1));
+    test!(b"A\xC3\xA9 \xF1\x80 ", 4, Some(2));
+    test!(b"A\xC3\xA9 \xF1\x80\x80 ", 4, Some(3));
 }
 
 #[test]
diff --git a/src/libcore/str/mod.rs b/src/libcore/str/mod.rs
index eb13d28e82d23..63b12932c3d62 100644
--- a/src/libcore/str/mod.rs
+++ b/src/libcore/str/mod.rs
@@ -125,7 +125,7 @@ Section: Creating a string
 #[stable(feature = "rust1", since = "1.0.0")]
 pub struct Utf8Error {
     valid_up_to: usize,
-    invalid_length: Option<u8>,
+    error_len: Option<u8>,
 }
 
 impl Utf8Error {
@@ -161,12 +161,14 @@ impl Utf8Error {
     ///   If a byte stream (such as a file or a network socket) is being decoded incrementally,
     ///   this could be a valid `char` whose UTF-8 byte sequence is spanning multiple chunks.
     ///
-    /// * `Some(index)`: an unexpected byte was encountered.
-    ///   The index provided is where decoding should resume
+    /// * `Some(len)`: an unexpected byte was encountered.
+    ///   The length provided is that of the invalid byte sequence
+    ///   that starts at the index given by `valid_up_to()`.
+    ///   Decoding should resume after that sequence
     ///   (after inserting a U+FFFD REPLACEMENT CHARACTER) in case of lossy decoding.
-    #[unstable(feature = "utf8_error_resume_from", reason ="new", issue = "0")]
-    pub fn resume_from(&self) -> Option<usize> {
-        self.invalid_length.map(|l| self.valid_up_to + l as usize)
+    #[unstable(feature = "utf8_error_error_len", reason ="new", issue = "0")]
+    pub fn error_len(&self) -> Option<usize> {
+        self.error_len.map(|len| len as usize)
     }
 }
 
@@ -316,9 +318,9 @@ pub unsafe fn from_utf8_unchecked(v: &[u8]) -> &str {
 #[stable(feature = "rust1", since = "1.0.0")]
 impl fmt::Display for Utf8Error {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        if let Some(invalid_length) = self.invalid_length {
+        if let Some(error_len) = self.error_len {
             write!(f, "invalid utf-8 sequence of {} bytes from index {}",
-                   invalid_length, self.valid_up_to)
+                   error_len, self.valid_up_to)
         } else {
             write!(f, "incomplete utf-8 byte sequence from index {}", self.valid_up_to)
         }
@@ -1263,10 +1265,10 @@ fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> {
     while index < len {
         let old_offset = index;
         macro_rules! err {
-            ($invalid_length: expr) => {
+            ($error_len: expr) => {
                 return Err(Utf8Error {
                     valid_up_to: old_offset,
-                    invalid_length: $invalid_length,
+                    error_len: $error_len,
                 })
             }
         }

From 73370c543ea130a3d6d9097aa56b786c72dc6c94 Mon Sep 17 00:00:00 2001
From: Simon Sapin <simon.sapin@exyr.org>
Date: Mon, 13 Mar 2017 23:54:06 +0100
Subject: [PATCH 3/3] Add tracking issue number for Utf8Error::error_len

---
 src/libcore/str/mod.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/libcore/str/mod.rs b/src/libcore/str/mod.rs
index 63b12932c3d62..2919adc1cbc63 100644
--- a/src/libcore/str/mod.rs
+++ b/src/libcore/str/mod.rs
@@ -166,7 +166,7 @@ impl Utf8Error {
     ///   that starts at the index given by `valid_up_to()`.
     ///   Decoding should resume after that sequence
     ///   (after inserting a U+FFFD REPLACEMENT CHARACTER) in case of lossy decoding.
-    #[unstable(feature = "utf8_error_error_len", reason ="new", issue = "0")]
+    #[unstable(feature = "utf8_error_error_len", reason ="new", issue = "40494")]
     pub fn error_len(&self) -> Option<usize> {
         self.error_len.map(|len| len as usize)
     }