From 493a4b63c1d6791ae7d2001123d8953bd62aa443 Mon Sep 17 00:00:00 2001
From: Huon Wilson <dbau.pp+github@gmail.com>
Date: Sun, 16 Feb 2014 23:52:14 +1100
Subject: [PATCH 1/6] std: iteratize str::is_utf16 & add tests.

Most of the tests are randomly generated with Python 3 and rely on its
UTF-16be encoder/decoder being correct.
---
 src/libstd/str.rs | 96 ++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 78 insertions(+), 18 deletions(-)
diff --git a/src/libstd/str.rs b/src/libstd/str.rs
index 0a7f513581c0d..8214382fb0d0e 100644
--- a/src/libstd/str.rs
+++ b/src/libstd/str.rs
@@ -805,23 +805,23 @@ fn first_non_utf8_index(v: &[u8]) -> Option<uint> {
 
 /// Determines if a vector of `u16` contains valid UTF-16
 pub fn is_utf16(v: &[u16]) -> bool {
-    let len = v.len();
-    let mut i = 0u;
-    while i < len {
-        let u = v[i];
-
-        if  u <= 0xD7FF_u16 || u >= 0xE000_u16 {
-            i += 1u;
+    let mut it = v.iter();
+    macro_rules! next ( ($ret:expr) => {
+            match it.next() { Some(u) => *u, None => return $ret }
+        }
+    )
+    loop {
+        let u = next!(true);
 
-        } else {
-            if i+1u < len { return false; }
-            let u2 = v[i+1u];
-            if u < 0xD7FF_u16 || u > 0xDBFF_u16 { return false; }
-            if u2 < 0xDC00_u16 || u2 > 0xDFFF_u16 { return false; }
-            i += 2u;
+        match char::from_u32(u as u32) {
+            Some(_) => {}
+            None => {
+                let u2 = next!(false);
+                if u < 0xD7FF || u > 0xDBFF ||
+                    u2 < 0xDC00 || u2 > 0xDFFF { return false; }
+            }
         }
     }
-    return true;
 }
 
 /// Iterates over the utf-16 characters in the specified slice, yielding each
@@ -3511,6 +3511,65 @@ mod tests {
         assert!(is_utf8([0xF4, 0x8F, 0xBF, 0xBF]));
     }
 
+    #[test]
+    fn test_is_utf16() {
+        macro_rules! pos ( ($($e:expr),*) => { { $(assert!(is_utf16($e));)* } });
+
+        // non-surrogates
+        pos!([0x0000],
+             [0x0001, 0x0002],
+             [0xD7FF],
+             [0xE000]);
+
+        // surrogate pairs (randomly generated with Python 3's
+        // .encode('utf-16be'))
+        pos!([0xdb54, 0xdf16, 0xd880, 0xdee0, 0xdb6a, 0xdd45],
+             [0xd91f, 0xdeb1, 0xdb31, 0xdd84, 0xd8e2, 0xde14],
+             [0xdb9f, 0xdc26, 0xdb6f, 0xde58, 0xd850, 0xdfae]);
+
+        // mixtures (also random)
+        pos!([0xd921, 0xdcc2, 0x002d, 0x004d, 0xdb32, 0xdf65],
+             [0xdb45, 0xdd2d, 0x006a, 0xdacd, 0xddfe, 0x0006],
+             [0x0067, 0xd8ff, 0xddb7, 0x000f, 0xd900, 0xdc80]);
+
+        // negative tests
+        macro_rules! neg ( ($($e:expr),*) => { { $(assert!(!is_utf16($e));)* } });
+
+        neg!(
+            // surrogate + regular unit
+            [0xdb45, 0x0000],
+            // surrogate + lead surrogate
+            [0xd900, 0xd900],
+            // unterminated surrogate
+            [0xd8ff],
+            // trail surrogate without a lead
+            [0xddb7]);
+
+        // random byte sequences that Python 3's .decode('utf-16be')
+        // failed on
+        neg!([0x5b3d, 0x0141, 0xde9e, 0x8fdc, 0xc6e7],
+             [0xdf5a, 0x82a5, 0x62b9, 0xb447, 0x92f3],
+             [0xda4e, 0x42bc, 0x4462, 0xee98, 0xc2ca],
+             [0xbe00, 0xb04a, 0x6ecb, 0xdd89, 0xe278],
+             [0x0465, 0xab56, 0xdbb6, 0xa893, 0x665e],
+             [0x6b7f, 0x0a19, 0x40f4, 0xa657, 0xdcc5],
+             [0x9b50, 0xda5e, 0x24ec, 0x03ad, 0x6dee],
+             [0x8d17, 0xcaa7, 0xf4ae, 0xdf6e, 0xbed7],
+             [0xdaee, 0x2584, 0x7d30, 0xa626, 0x121a],
+             [0xd956, 0x4b43, 0x7570, 0xccd6, 0x4f4a],
+             [0x9dcf, 0x1b49, 0x4ba5, 0xfce9, 0xdffe],
+             [0x6572, 0xce53, 0xb05a, 0xf6af, 0xdacf],
+             [0x1b90, 0x728c, 0x9906, 0xdb68, 0xf46e],
+             [0x1606, 0xbeca, 0xbe76, 0x860f, 0xdfa5],
+             [0x8b4f, 0xde7a, 0xd220, 0x9fac, 0x2b6f],
+             [0xb8fe, 0xebbe, 0xda32, 0x1a5f, 0x8b8b],
+             [0x934b, 0x8956, 0xc434, 0x1881, 0xddf7],
+             [0x5a95, 0x13fc, 0xf116, 0xd89b, 0x93f9],
+             [0xd640, 0x71f1, 0xdd7d, 0x77eb, 0x1cd8],
+             [0x348b, 0xaef0, 0xdb2c, 0xebf1, 0x1282],
+             [0x50d7, 0xd824, 0x5010, 0xb369, 0x22ea]);
+    }
+
     #[test]
     fn test_raw_from_c_str() {
         unsafe {
@@ -3666,10 +3725,11 @@ mod tests {
 
         for p in pairs.iter() {
             let (s, u) = (*p).clone();
-            assert!(s.to_utf16() == u);
-            assert!(from_utf16(u) == s);
-            assert!(from_utf16(s.to_utf16()) == s);
-            assert!(from_utf16(u).to_utf16() == u);
+            assert!(is_utf16(u));
+            assert_eq!(s.to_utf16(), u);
+            assert_eq!(from_utf16(u), s);
+            assert_eq!(from_utf16(s.to_utf16()), s);
+            assert_eq!(from_utf16(u).to_utf16(), u);
         }
     }
 

From b7656d048feb828af12278a6028c17b0c86241b3 Mon Sep 17 00:00:00 2001
From: Huon Wilson <dbau.pp+github@gmail.com>
Date: Mon, 17 Feb 2014 00:09:45 +1100
Subject: [PATCH 2/6] std: convert str::from_utf16 to an external iterator.

Fixes #12316.
---
 src/libstd/str.rs | 65 +++++++++++++++++++++++++++--------------------
 1 file changed, 38 insertions(+), 27 deletions(-)

diff --git a/src/libstd/str.rs b/src/libstd/str.rs
index 8214382fb0d0e..93abf8bc5428f 100644
--- a/src/libstd/str.rs
+++ b/src/libstd/str.rs
@@ -824,41 +824,52 @@ pub fn is_utf16(v: &[u16]) -> bool {
     }
 }
 
-/// Iterates over the utf-16 characters in the specified slice, yielding each
-/// decoded unicode character to the function provided.
+/// An iterator that decodes UTF-16 encoded codepoints from a vector
+/// of `u16`s.
 ///
-/// # Failures
-///
-/// * Fails on invalid utf-16 data
-pub fn utf16_chars(v: &[u16], f: |char|) {
-    let len = v.len();
-    let mut i = 0u;
-    while i < len && v[i] != 0u16 {
-        let u = v[i];
-
-        if  u <= 0xD7FF_u16 || u >= 0xE000_u16 {
-            f(unsafe { cast::transmute(u as u32) });
-            i += 1u;
+/// Fails when it encounters invalid UTF-16 data.
+pub struct UTF16Chars<'a> {
+    priv iter: vec::Items<'a, u16>
+}
+impl<'a> Iterator<char> for UTF16Chars<'a> {
+    fn next(&mut self) -> Option<char> {
+        let u = match self.iter.next() {
+            Some(u) => *u,
+            None => return None
+        };
+        match char::from_u32(u as u32) {
+            Some(c) => Some(c),
+            None => {
+                let u2 = *self.iter.next().expect("UTF16Chars: unmatched lead surrogate");
+                if u < 0xD7FF || u > 0xDBFF ||
+                    u2 < 0xDC00 || u2 > 0xDFFF {
+                    fail!("UTF16Chars: invalid surrogate pair")
+                }
 
-        } else {
-            let u2 = v[i+1u];
-            assert!(u >= 0xD800_u16 && u <= 0xDBFF_u16);
-            assert!(u2 >= 0xDC00_u16 && u2 <= 0xDFFF_u16);
-            let mut c: u32 = (u - 0xD800_u16) as u32;
-            c = c << 10;
-            c |= (u2 - 0xDC00_u16) as u32;
-            c |= 0x1_0000_u32;
-            f(unsafe { cast::transmute(c) });
-            i += 2u;
+                let mut c = (u - 0xD800) as u32 << 10 | (u2 - 0xDC00) as u32 | 0x1_0000;
+                char::from_u32(c)
+            }
         }
     }
+
+    fn size_hint(&self) -> (uint, Option<uint>) {
+        let (low, high) = self.iter.size_hint();
+        // we could be entirely surrogates (2 elements per char), or
+        // entirely non-surrogates (1 element per char)
+        (low / 2, high)
+    }
+}
+
+/// Create an iterator over the UTF-16 encoded codepoints in `v`.
+///
+/// The iterator fails if it attempts to decode invalid UTF-16 data.
+pub fn utf16_chars<'a>(v: &'a [u16]) -> UTF16Chars<'a> {
+    UTF16Chars { iter : v.iter() }
 }
 
 /// Allocates a new string from the utf-16 slice provided
 pub fn from_utf16(v: &[u16]) -> ~str {
-    let mut buf = with_capacity(v.len());
-    utf16_chars(v, |ch| buf.push_char(ch));
-    buf
+    utf16_chars(v).collect()
 }
 
 /// Allocates a new string with the specified capacity. The string returned is

From a96cea4f5a713eb357bde3d395c5453058be88c5 Mon Sep 17 00:00:00 2001
From: Huon Wilson <dbau.pp+github@gmail.com>
Date: Mon, 17 Feb 2014 00:52:58 +1100
Subject: [PATCH 3/6] str: provide lossy UTF-16 support.

This replaces the iterator with one that handles lone surrogates
gracefully and uses that to implement `from_utf16_lossy` which replaces
invalid `u16`s with U+FFFD.
---
 src/libstd/str.rs | 156 +++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 133 insertions(+), 23 deletions(-)

diff --git a/src/libstd/str.rs b/src/libstd/str.rs
index 93abf8bc5428f..1ef622002c310 100644
--- a/src/libstd/str.rs
+++ b/src/libstd/str.rs
@@ -826,50 +826,142 @@ pub fn is_utf16(v: &[u16]) -> bool {
 
 /// An iterator that decodes UTF-16 encoded codepoints from a vector
 /// of `u16`s.
-///
-/// Fails when it encounters invalid UTF-16 data.
-pub struct UTF16Chars<'a> {
+#[deriving(Clone)]
+pub struct UTF16Items<'a> {
     priv iter: vec::Items<'a, u16>
 }
-impl<'a> Iterator<char> for UTF16Chars<'a> {
-    fn next(&mut self) -> Option<char> {
+/// The possibilities for values decoded from a `u16` stream.
+#[deriving(Eq, TotalEq, Clone)]
+pub enum UTF16Item {
+    /// A valid codepoint.
+    ScalarValue(char),
+    /// An invalid surrogate without its pair.
+    LoneSurrogate(u16)
+}
+
+impl UTF16Item {
+    /// Convert `self` to a `char`, taking `LoneSurrogate`s to the
+    /// replacement character (U+FFFD).
+    #[inline]
+    pub fn to_char_lossy(&self) -> char {
+        match *self {
+            ScalarValue(c) => c,
+            LoneSurrogate(_) => '\uFFFD'
+        }
+    }
+}
+
+impl<'a> Iterator<UTF16Item> for UTF16Items<'a> {
+    fn next(&mut self) -> Option<UTF16Item> {
         let u = match self.iter.next() {
             Some(u) => *u,
             None => return None
         };
-        match char::from_u32(u as u32) {
-            Some(c) => Some(c),
-            None => {
-                let u2 = *self.iter.next().expect("UTF16Chars: unmatched lead surrogate");
-                if u < 0xD7FF || u > 0xDBFF ||
-                    u2 < 0xDC00 || u2 > 0xDFFF {
-                    fail!("UTF16Chars: invalid surrogate pair")
-                }
 
-                let mut c = (u - 0xD800) as u32 << 10 | (u2 - 0xDC00) as u32 | 0x1_0000;
-                char::from_u32(c)
+        if u < 0xD800 || 0xDFFF < u {
+            // not a surrogate
+            Some(ScalarValue(unsafe {cast::transmute(u as u32)}))
+        } else if u >= 0xDC00 {
+            // a trailing surrogate
+            Some(LoneSurrogate(u))
+        } else {
+            // preserve state for rewinding.
+            let old = self.iter;
+
+            let u2 = match self.iter.next() {
+                Some(u2) => *u2,
+                // eof
+                None => return Some(LoneSurrogate(u))
+            };
+            if u2 < 0xDC00 || u2 > 0xDFFF {
+                // not a trailing surrogate so we're not a valid
+                // surrogate pair, so rewind to redecode u2 next time.
+                self.iter = old;
+                return Some(LoneSurrogate(u))
             }
+
+            // all ok, so lets decode it.
+            let c = (u - 0xD800) as u32 << 10 | (u2 - 0xDC00) as u32 | 0x1_0000;
+            Some(ScalarValue(unsafe {cast::transmute(c)}))
         }
     }
 
+    #[inline]
     fn size_hint(&self) -> (uint, Option<uint>) {
         let (low, high) = self.iter.size_hint();
-        // we could be entirely surrogates (2 elements per char), or
-        // entirely non-surrogates (1 element per char)
+        // we could be entirely valid surrogates (2 elements per
+        // char), or entirely non-surrogates (1 element per char)
         (low / 2, high)
     }
 }
 
-/// Create an iterator over the UTF-16 encoded codepoints in `v`.
+/// Create an iterator over the UTF-16 encoded codepoints in `v`,
+/// returning invalid surrogates as `LoneSurrogate`s.
 ///
-/// The iterator fails if it attempts to decode invalid UTF-16 data.
-pub fn utf16_chars<'a>(v: &'a [u16]) -> UTF16Chars<'a> {
-    UTF16Chars { iter : v.iter() }
+/// # Example
+///
+/// ```rust
+/// use std::str;
+/// use std::str::{ScalarValue, LoneSurrogate};
+///
+/// // 𝄞mus<invalid>ic<invalid>
+/// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
+///          0x0073, 0xDD1E, 0x0069, 0x0063,
+///          0xD834];
+///
+/// assert_eq!(str::utf16_items(v).to_owned_vec(),
+///            ~[ScalarValue('𝄞'),
+///              ScalarValue('m'), ScalarValue('u'), ScalarValue('s'),
+///              LoneSurrogate(0xDD1E),
+///              ScalarValue('i'), ScalarValue('c'),
+///              LoneSurrogate(0xD834)]);
+/// ```
+pub fn utf16_items<'a>(v: &'a [u16]) -> UTF16Items<'a> {
+    UTF16Items { iter : v.iter() }
 }
 
-/// Allocates a new string from the utf-16 slice provided
+/// Decode a UTF-16 encoded vector `v` into a string.
+///
+/// # Failure
+///
+/// Fails on invalid UTF-16 data.
+///
+/// # Example
+///
+/// ```rust
+/// use std::str;
+///
+/// // 𝄞music
+/// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
+///          0x0073, 0x0069, 0x0063];
+/// assert_eq!(str::from_utf16(v), ~"𝄞music");
+/// ```
 pub fn from_utf16(v: &[u16]) -> ~str {
-    utf16_chars(v).collect()
+    utf16_items(v).map(|c| {
+            match c {
+                ScalarValue(c) => c,
+                LoneSurrogate(u) => fail!("from_utf16: found lone surrogate {}", u)
+            }
+        }).collect()
+}
+
+/// Decode a UTF-16 encoded vector `v` into a string, replacing
+/// invalid data with the replacement character (U+FFFD).
+///
+/// # Example
+/// ```rust
+/// use std::str;
+///
+/// // 𝄞mus<invalid>ic<invalid>
+/// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
+///          0x0073, 0xDD1E, 0x0069, 0x0063,
+///          0xD834];
+///
+/// assert_eq!(str::from_utf16_lossy(v),
+///            ~"𝄞mus\uFFFDic\uFFFD");
+/// ```
+pub fn from_utf16_lossy(v: &[u16]) -> ~str {
+    utf16_items(v).map(|c| c.to_char_lossy()).collect()
 }
 
 /// Allocates a new string with the specified capacity. The string returned is
@@ -3738,12 +3830,30 @@ mod tests {
             let (s, u) = (*p).clone();
             assert!(is_utf16(u));
             assert_eq!(s.to_utf16(), u);
+
             assert_eq!(from_utf16(u), s);
+            assert_eq!(from_utf16_lossy(u), s);
+
             assert_eq!(from_utf16(s.to_utf16()), s);
             assert_eq!(from_utf16(u).to_utf16(), u);
         }
     }
 
+    #[test]
+    fn test_utf16_lossy() {
+        // completely positive cases tested above.
+        // lead + eof
+        assert_eq!(from_utf16_lossy([0xD800]), ~"\uFFFD");
+        // lead + lead
+        assert_eq!(from_utf16_lossy([0xD800, 0xD800]), ~"\uFFFD\uFFFD");
+
+        // isolated trail
+        assert_eq!(from_utf16_lossy([0x0061, 0xDC00]), ~"a\uFFFD");
+
+        // general
+        assert_eq!(from_utf16_lossy([0xD800, 0xd801, 0xdc8b, 0xD800]), ~"\uFFFD𐒋\uFFFD");
+    }
+
     #[test]
     fn test_char_at() {
         let s = ~"ศไทย中华Việt Nam";

From 35b1b62ddfc31c2e52b65c2f908c0fcbc6465de5 Mon Sep 17 00:00:00 2001
From: Huon Wilson <dbau.pp+github@gmail.com>
Date: Mon, 17 Feb 2014 00:57:16 +1100
Subject: [PATCH 4/6] std: decode even numbered non-BMP planes in the UTF-16
 decoder.

Fixes #12318.
---
 src/libstd/str.rs | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/libstd/str.rs b/src/libstd/str.rs
index 1ef622002c310..34bcb083134c0 100644
--- a/src/libstd/str.rs
+++ b/src/libstd/str.rs
@@ -881,7 +881,7 @@ impl<'a> Iterator<UTF16Item> for UTF16Items<'a> {
             }
 
             // all ok, so lets decode it.
-            let c = (u - 0xD800) as u32 << 10 | (u2 - 0xDC00) as u32 | 0x1_0000;
+            let c = ((u - 0xD800) as u32 << 10 | (u2 - 0xDC00) as u32) + 0x1_0000;
             Some(ScalarValue(unsafe {cast::transmute(c)}))
         }
     }
@@ -3824,7 +3824,10 @@ mod tests {
                 0xdc9c_u16, 0xd801_u16, 0xdc92_u16, 0xd801_u16,
                 0xdc96_u16, 0xd801_u16, 0xdc86_u16, 0x0020_u16,
                 0xd801_u16, 0xdc95_u16, 0xd801_u16, 0xdc86_u16,
-                0x000a_u16 ]) ];
+                0x000a_u16 ]),
+             // Issue #12318, even-numbered non-BMP planes
+             (~"\U00020000",
+              ~[0xD840, 0xDC00])];
 
         for p in pairs.iter() {
             let (s, u) = (*p).clone();

From 4f841ee1509fafdf688a3898e01560ae29ee7836 Mon Sep 17 00:00:00 2001
From: Huon Wilson <dbau.pp+github@gmail.com>
Date: Mon, 17 Feb 2014 09:57:56 +1100
Subject: [PATCH 5/6] std: make str::from_utf16 return an Option.

The rest of the codebase is moving toward avoiding `fail!` so we do it
here too!
---
 src/libnative/io/file.rs |  3 ++-
 src/libstd/os.rs         | 15 +++++++----
 src/libstd/str.rs        | 54 ++++++++++++++++++++++++++--------------
 3 files changed, 48 insertions(+), 24 deletions(-)

diff --git a/src/libnative/io/file.rs b/src/libnative/io/file.rs
index e9c9f51966c60..80f5f74c53a52 100644
--- a/src/libnative/io/file.rs
+++ b/src/libnative/io/file.rs
@@ -571,7 +571,8 @@ pub fn readdir(p: &CString) -> IoResult<~[Path]> {
                         else {
                             let fp_vec = vec::from_buf(
                                 fp_buf, wcslen(fp_buf) as uint);
-                            let fp_str = str::from_utf16(fp_vec);
+                            let fp_str = str::from_utf16(fp_vec)
+                                    .expect("rust_list_dir_wfd_fp_buf returned invalid UTF-16");
                             paths.push(Path::new(fp_str));
                         }
                         more_files = FindNextFileW(find_handle, wfd_ptr as HANDLE);
diff --git a/src/libstd/os.rs b/src/libstd/os.rs
index 719ed62d03d0a..31e88905b30f9 100644
--- a/src/libstd/os.rs
+++ b/src/libstd/os.rs
@@ -88,7 +88,7 @@ pub fn getcwd() -> Path {
             fail!();
         }
     }
-    Path::new(str::from_utf16(buf))
+    Path::new(str::from_utf16(buf).expect("GetCurrentDirectoryW returned invalid UTF-16"))
 }
 
 #[cfg(windows)]
@@ -124,7 +124,12 @@ pub mod win32 {
                 }
                 if k != 0 && done {
                     let sub = buf.slice(0, k as uint);
-                    res = option::Some(str::from_utf16(sub));
+                    // We want to explicitly catch the case when the
+                    // closure returned invalid UTF-16, rather than
+                    // set `res` to None and continue.
+                    let s = str::from_utf16(sub)
+                        .expect("fill_utf16_buf_and_decode: closure created invalid UTF-16");
+                    res = option::Some(s)
                 }
             }
             return res;
@@ -739,7 +744,7 @@ pub fn last_os_error() -> ~str {
                 fail!("[{}] FormatMessage failure", errno());
             }
 
-            str::from_utf16(buf)
+            str::from_utf16(buf).expect("FormatMessageW returned invalid UTF-16")
         }
     }
 
@@ -828,8 +833,8 @@ fn real_args() -> ~[~str] {
             while *ptr.offset(len as int) != 0 { len += 1; }
 
             // Push it onto the list.
-            args.push(vec::raw::buf_as_slice(ptr, len,
-                                             str::from_utf16));
+            let opt_s = vec::raw::buf_as_slice(ptr, len, str::from_utf16);
+            args.push(opt_s.expect("CommandLineToArgvW returned invalid UTF-16"));
         }
     }
 
diff --git a/src/libstd/str.rs b/src/libstd/str.rs
index 34bcb083134c0..20321dad60071 100644
--- a/src/libstd/str.rs
+++ b/src/libstd/str.rs
@@ -920,11 +920,8 @@ pub fn utf16_items<'a>(v: &'a [u16]) -> UTF16Items<'a> {
     UTF16Items { iter : v.iter() }
 }
 
-/// Decode a UTF-16 encoded vector `v` into a string.
-///
-/// # Failure
-///
-/// Fails on invalid UTF-16 data.
+/// Decode a UTF-16 encoded vector `v` into a string, returning `None`
+/// if `v` contains any invalid data.
 ///
 /// # Example
 ///
@@ -932,17 +929,23 @@ pub fn utf16_items<'a>(v: &'a [u16]) -> UTF16Items<'a> {
 /// use std::str;
 ///
 /// // 𝄞music
-/// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
-///          0x0073, 0x0069, 0x0063];
-/// assert_eq!(str::from_utf16(v), ~"𝄞music");
+/// let mut v = [0xD834, 0xDD1E, 0x006d, 0x0075,
+///              0x0073, 0x0069, 0x0063];
+/// assert_eq!(str::from_utf16(v), Some(~"𝄞music"));
+///
+/// // 𝄞mu<invalid>ic
+/// v[4] = 0xD800;
+/// assert_eq!(str::from_utf16(v), None);
 /// ```
-pub fn from_utf16(v: &[u16]) -> ~str {
-    utf16_items(v).map(|c| {
-            match c {
-                ScalarValue(c) => c,
-                LoneSurrogate(u) => fail!("from_utf16: found lone surrogate {}", u)
-            }
-        }).collect()
+pub fn from_utf16(v: &[u16]) -> Option<~str> {
+    let mut s = with_capacity(v.len() / 2);
+    for c in utf16_items(v) {
+        match c {
+            ScalarValue(c) => s.push_char(c),
+            LoneSurrogate(_) => return None
+        }
+    }
+    Some(s)
 }
 
 /// Decode a UTF-16 encoded vector `v` into a string, replacing
@@ -3834,14 +3837,29 @@ mod tests {
             assert!(is_utf16(u));
             assert_eq!(s.to_utf16(), u);
 
-            assert_eq!(from_utf16(u), s);
+            assert_eq!(from_utf16(u).unwrap(), s);
             assert_eq!(from_utf16_lossy(u), s);
 
-            assert_eq!(from_utf16(s.to_utf16()), s);
-            assert_eq!(from_utf16(u).to_utf16(), u);
+            assert_eq!(from_utf16(s.to_utf16()).unwrap(), s);
+            assert_eq!(from_utf16(u).unwrap().to_utf16(), u);
         }
     }
 
+    #[test]
+    fn test_utf16_invalid() {
+        // completely positive cases tested above.
+        // lead + eof
+        assert_eq!(from_utf16([0xD800]), None);
+        // lead + lead
+        assert_eq!(from_utf16([0xD800, 0xD800]), None);
+
+        // isolated trail
+        assert_eq!(from_utf16([0x0061, 0xDC00]), None);
+
+        // general
+        assert_eq!(from_utf16([0xD800, 0xd801, 0xdc8b, 0xD800]), None);
+    }
+
     #[test]
     fn test_utf16_lossy() {
         // completely positive cases tested above.

From c9b4538babbc85b971b19bbeff16bd12a4f4db54 Mon Sep 17 00:00:00 2001
From: Huon Wilson <dbau.pp+github@gmail.com>
Date: Tue, 18 Feb 2014 22:25:32 +1100
Subject: [PATCH 6/6] str: add a function for truncating a vector of u16 at
 NUL.

Many of the functions interacting with Windows APIs allocate a vector of
0's and do not retrieve a length directly from the API call, and so need
to be sure to remove the unmodified junk at the end of the vector.
---
 src/libnative/io/file.rs |  3 ++-
 src/libstd/os.rs         | 10 ++++++---
 src/libstd/str.rs        | 44 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/src/libnative/io/file.rs b/src/libnative/io/file.rs
index 80f5f74c53a52..6d3a156a2b01a 100644
--- a/src/libnative/io/file.rs
+++ b/src/libnative/io/file.rs
@@ -571,7 +571,8 @@ pub fn readdir(p: &CString) -> IoResult<~[Path]> {
                         else {
                             let fp_vec = vec::from_buf(
                                 fp_buf, wcslen(fp_buf) as uint);
-                            let fp_str = str::from_utf16(fp_vec)
+                            let fp_trimmed = str::truncate_utf16_at_nul(fp_vec);
+                            let fp_str = str::from_utf16(fp_trimmed)
                                     .expect("rust_list_dir_wfd_fp_buf returned invalid UTF-16");
                             paths.push(Path::new(fp_str));
                         }
diff --git a/src/libstd/os.rs b/src/libstd/os.rs
index 31e88905b30f9..74e2fceb6cae6 100644
--- a/src/libstd/os.rs
+++ b/src/libstd/os.rs
@@ -88,7 +88,8 @@ pub fn getcwd() -> Path {
             fail!();
         }
     }
-    Path::new(str::from_utf16(buf).expect("GetCurrentDirectoryW returned invalid UTF-16"))
+    Path::new(str::from_utf16(str::truncate_utf16_at_nul(buf))
+              .expect("GetCurrentDirectoryW returned invalid UTF-16"))
 }
 
 #[cfg(windows)]
@@ -744,7 +745,8 @@ pub fn last_os_error() -> ~str {
                 fail!("[{}] FormatMessage failure", errno());
             }
 
-            str::from_utf16(buf).expect("FormatMessageW returned invalid UTF-16")
+            str::from_utf16(str::truncate_utf16_at_nul(buf))
+                .expect("FormatMessageW returned invalid UTF-16")
         }
     }
 
@@ -833,7 +835,9 @@ fn real_args() -> ~[~str] {
             while *ptr.offset(len as int) != 0 { len += 1; }
 
             // Push it onto the list.
-            let opt_s = vec::raw::buf_as_slice(ptr, len, str::from_utf16);
+            let opt_s = vec::raw::buf_as_slice(ptr, len, |buf| {
+                    str::from_utf16(str::truncate_utf16_at_nul(buf))
+                });
             args.push(opt_s.expect("CommandLineToArgvW returned invalid UTF-16"));
         }
     }
diff --git a/src/libstd/str.rs b/src/libstd/str.rs
index 20321dad60071..1f94aaaa7c413 100644
--- a/src/libstd/str.rs
+++ b/src/libstd/str.rs
@@ -920,6 +920,32 @@ pub fn utf16_items<'a>(v: &'a [u16]) -> UTF16Items<'a> {
     UTF16Items { iter : v.iter() }
 }
 
+/// Return a slice of `v` ending at (and not including) the first NUL
+/// (0).
+///
+/// # Example
+///
+/// ```rust
+/// use std::str;
+///
+/// // "abcd"
+/// let mut v = ['a' as u16, 'b' as u16, 'c' as u16, 'd' as u16];
+/// // no NULs so no change
+/// assert_eq!(str::truncate_utf16_at_nul(v), v.as_slice());
+///
+/// // "ab\0d"
+/// v[2] = 0;
+/// assert_eq!(str::truncate_utf16_at_nul(v),
+///            &['a' as u16, 'b' as u16]);
+/// ```
+pub fn truncate_utf16_at_nul<'a>(v: &'a [u16]) -> &'a [u16] {
+    match v.iter().position(|c| *c == 0) {
+        // don't include the 0
+        Some(i) => v.slice_to(i),
+        None => v
+    }
+}
+
 /// Decode a UTF-16 encoded vector `v` into a string, returning `None`
 /// if `v` contains any invalid data.
 ///
@@ -3875,6 +3901,24 @@ mod tests {
         assert_eq!(from_utf16_lossy([0xD800, 0xd801, 0xdc8b, 0xD800]), ~"\uFFFD𐒋\uFFFD");
     }
 
+    #[test]
+    fn test_truncate_utf16_at_nul() {
+        let v = [];
+        assert_eq!(truncate_utf16_at_nul(v), &[]);
+
+        let v = [0, 2, 3];
+        assert_eq!(truncate_utf16_at_nul(v), &[]);
+
+        let v = [1, 0, 3];
+        assert_eq!(truncate_utf16_at_nul(v), &[1]);
+
+        let v = [1, 2, 0];
+        assert_eq!(truncate_utf16_at_nul(v), &[1, 2]);
+
+        let v = [1, 2, 3];
+        assert_eq!(truncate_utf16_at_nul(v), &[1, 2, 3]);
+    }
+
     #[test]
     fn test_char_at() {
         let s = ~"ศไทย中华Việt Nam";