From 493a4b63c1d6791ae7d2001123d8953bd62aa443 Mon Sep 17 00:00:00 2001 From: Huon Wilson Date: Sun, 16 Feb 2014 23:52:14 +1100 Subject: [PATCH 1/6] std: iteratize str::is_utf16 & add tests. Most of the tests are randomly generated with Python 3 and rely on its UTF-16be encoder/decoder being correct. --- src/libstd/str.rs | 96 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 78 insertions(+), 18 deletions(-) diff --git a/src/libstd/str.rs b/src/libstd/str.rs index 0a7f513581c0d..8214382fb0d0e 100644 --- a/src/libstd/str.rs +++ b/src/libstd/str.rs @@ -805,23 +805,23 @@ fn first_non_utf8_index(v: &[u8]) -> Option { /// Determines if a vector of `u16` contains valid UTF-16 pub fn is_utf16(v: &[u16]) -> bool { - let len = v.len(); - let mut i = 0u; - while i < len { - let u = v[i]; - - if u <= 0xD7FF_u16 || u >= 0xE000_u16 { - i += 1u; + let mut it = v.iter(); + macro_rules! next ( ($ret:expr) => { + match it.next() { Some(u) => *u, None => return $ret } + } + ) + loop { + let u = next!(true); - } else { - if i+1u < len { return false; } - let u2 = v[i+1u]; - if u < 0xD7FF_u16 || u > 0xDBFF_u16 { return false; } - if u2 < 0xDC00_u16 || u2 > 0xDFFF_u16 { return false; } - i += 2u; + match char::from_u32(u as u32) { + Some(_) => {} + None => { + let u2 = next!(false); + if u < 0xD7FF || u > 0xDBFF || + u2 < 0xDC00 || u2 > 0xDFFF { return false; } + } } } - return true; } /// Iterates over the utf-16 characters in the specified slice, yielding each @@ -3511,6 +3511,65 @@ mod tests { assert!(is_utf8([0xF4, 0x8F, 0xBF, 0xBF])); } + #[test] + fn test_is_utf16() { + macro_rules! 
pos ( ($($e:expr),*) => { { $(assert!(is_utf16($e));)* } }); + + // non-surrogates + pos!([0x0000], + [0x0001, 0x0002], + [0xD7FF], + [0xE000]); + + // surrogate pairs (randomly generated with Python 3's + // .encode('utf-16be')) + pos!([0xdb54, 0xdf16, 0xd880, 0xdee0, 0xdb6a, 0xdd45], + [0xd91f, 0xdeb1, 0xdb31, 0xdd84, 0xd8e2, 0xde14], + [0xdb9f, 0xdc26, 0xdb6f, 0xde58, 0xd850, 0xdfae]); + + // mixtures (also random) + pos!([0xd921, 0xdcc2, 0x002d, 0x004d, 0xdb32, 0xdf65], + [0xdb45, 0xdd2d, 0x006a, 0xdacd, 0xddfe, 0x0006], + [0x0067, 0xd8ff, 0xddb7, 0x000f, 0xd900, 0xdc80]); + + // negative tests + macro_rules! neg ( ($($e:expr),*) => { { $(assert!(!is_utf16($e));)* } }); + + neg!( + // surrogate + regular unit + [0xdb45, 0x0000], + // surrogate + lead surrogate + [0xd900, 0xd900], + // unterminated surrogate + [0xd8ff], + // trail surrogate without a lead + [0xddb7]); + + // random byte sequences that Python 3's .decode('utf-16be') + // failed on + neg!([0x5b3d, 0x0141, 0xde9e, 0x8fdc, 0xc6e7], + [0xdf5a, 0x82a5, 0x62b9, 0xb447, 0x92f3], + [0xda4e, 0x42bc, 0x4462, 0xee98, 0xc2ca], + [0xbe00, 0xb04a, 0x6ecb, 0xdd89, 0xe278], + [0x0465, 0xab56, 0xdbb6, 0xa893, 0x665e], + [0x6b7f, 0x0a19, 0x40f4, 0xa657, 0xdcc5], + [0x9b50, 0xda5e, 0x24ec, 0x03ad, 0x6dee], + [0x8d17, 0xcaa7, 0xf4ae, 0xdf6e, 0xbed7], + [0xdaee, 0x2584, 0x7d30, 0xa626, 0x121a], + [0xd956, 0x4b43, 0x7570, 0xccd6, 0x4f4a], + [0x9dcf, 0x1b49, 0x4ba5, 0xfce9, 0xdffe], + [0x6572, 0xce53, 0xb05a, 0xf6af, 0xdacf], + [0x1b90, 0x728c, 0x9906, 0xdb68, 0xf46e], + [0x1606, 0xbeca, 0xbe76, 0x860f, 0xdfa5], + [0x8b4f, 0xde7a, 0xd220, 0x9fac, 0x2b6f], + [0xb8fe, 0xebbe, 0xda32, 0x1a5f, 0x8b8b], + [0x934b, 0x8956, 0xc434, 0x1881, 0xddf7], + [0x5a95, 0x13fc, 0xf116, 0xd89b, 0x93f9], + [0xd640, 0x71f1, 0xdd7d, 0x77eb, 0x1cd8], + [0x348b, 0xaef0, 0xdb2c, 0xebf1, 0x1282], + [0x50d7, 0xd824, 0x5010, 0xb369, 0x22ea]); + } + #[test] fn test_raw_from_c_str() { unsafe { @@ -3666,10 +3725,11 @@ mod tests { for p in 
pairs.iter() { let (s, u) = (*p).clone(); - assert!(s.to_utf16() == u); - assert!(from_utf16(u) == s); - assert!(from_utf16(s.to_utf16()) == s); - assert!(from_utf16(u).to_utf16() == u); + assert!(is_utf16(u)); + assert_eq!(s.to_utf16(), u); + assert_eq!(from_utf16(u), s); + assert_eq!(from_utf16(s.to_utf16()), s); + assert_eq!(from_utf16(u).to_utf16(), u); } } From b7656d048feb828af12278a6028c17b0c86241b3 Mon Sep 17 00:00:00 2001 From: Huon Wilson Date: Mon, 17 Feb 2014 00:09:45 +1100 Subject: [PATCH 2/6] std: convert str::from_utf16 to an external iterator. Fixes #12316. --- src/libstd/str.rs | 65 +++++++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 27 deletions(-) diff --git a/src/libstd/str.rs b/src/libstd/str.rs index 8214382fb0d0e..93abf8bc5428f 100644 --- a/src/libstd/str.rs +++ b/src/libstd/str.rs @@ -824,41 +824,52 @@ pub fn is_utf16(v: &[u16]) -> bool { } } -/// Iterates over the utf-16 characters in the specified slice, yielding each -/// decoded unicode character to the function provided. +/// An iterator that decodes UTF-16 encoded codepoints from a vector +/// of `u16`s. /// -/// # Failures -/// -/// * Fails on invalid utf-16 data -pub fn utf16_chars(v: &[u16], f: |char|) { - let len = v.len(); - let mut i = 0u; - while i < len && v[i] != 0u16 { - let u = v[i]; - - if u <= 0xD7FF_u16 || u >= 0xE000_u16 { - f(unsafe { cast::transmute(u as u32) }); - i += 1u; +/// Fails when it encounters invalid UTF-16 data. 
+pub struct UTF16Chars<'a> { + priv iter: vec::Items<'a, u16> +} +impl<'a> Iterator for UTF16Chars<'a> { + fn next(&mut self) -> Option { + let u = match self.iter.next() { + Some(u) => *u, + None => return None + }; + match char::from_u32(u as u32) { + Some(c) => Some(c), + None => { + let u2 = *self.iter.next().expect("UTF16Chars: unmatched lead surrogate"); + if u < 0xD7FF || u > 0xDBFF || + u2 < 0xDC00 || u2 > 0xDFFF { + fail!("UTF16Chars: invalid surrogate pair") + } - } else { - let u2 = v[i+1u]; - assert!(u >= 0xD800_u16 && u <= 0xDBFF_u16); - assert!(u2 >= 0xDC00_u16 && u2 <= 0xDFFF_u16); - let mut c: u32 = (u - 0xD800_u16) as u32; - c = c << 10; - c |= (u2 - 0xDC00_u16) as u32; - c |= 0x1_0000_u32; - f(unsafe { cast::transmute(c) }); - i += 2u; + let mut c = (u - 0xD800) as u32 << 10 | (u2 - 0xDC00) as u32 | 0x1_0000; + char::from_u32(c) + } } } + + fn size_hint(&self) -> (uint, Option) { + let (low, high) = self.iter.size_hint(); + // we could be entirely surrogates (2 elements per char), or + // entirely non-surrogates (1 element per char) + (low / 2, high) + } +} + +/// Create an iterator over the UTF-16 encoded codepoints in `v`. +/// +/// The iterator fails if it attempts to decode invalid UTF-16 data. +pub fn utf16_chars<'a>(v: &'a [u16]) -> UTF16Chars<'a> { + UTF16Chars { iter : v.iter() } } /// Allocates a new string from the utf-16 slice provided pub fn from_utf16(v: &[u16]) -> ~str { - let mut buf = with_capacity(v.len()); - utf16_chars(v, |ch| buf.push_char(ch)); - buf + utf16_chars(v).collect() } /// Allocates a new string with the specified capacity. The string returned is From a96cea4f5a713eb357bde3d395c5453058be88c5 Mon Sep 17 00:00:00 2001 From: Huon Wilson Date: Mon, 17 Feb 2014 00:52:58 +1100 Subject: [PATCH 3/6] str: provide lossy UTF-16 support. This replaces the iterator with one that handles lone surrogates gracefully and uses that to implement `from_utf16_lossy` which replaces invalid `u16`s with U+FFFD. 
--- src/libstd/str.rs | 156 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 133 insertions(+), 23 deletions(-) diff --git a/src/libstd/str.rs b/src/libstd/str.rs index 93abf8bc5428f..1ef622002c310 100644 --- a/src/libstd/str.rs +++ b/src/libstd/str.rs @@ -826,50 +826,142 @@ pub fn is_utf16(v: &[u16]) -> bool { /// An iterator that decodes UTF-16 encoded codepoints from a vector /// of `u16`s. -/// -/// Fails when it encounters invalid UTF-16 data. -pub struct UTF16Chars<'a> { +#[deriving(Clone)] +pub struct UTF16Items<'a> { priv iter: vec::Items<'a, u16> } -impl<'a> Iterator for UTF16Chars<'a> { - fn next(&mut self) -> Option { +/// The possibilities for values decoded from a `u16` stream. +#[deriving(Eq, TotalEq, Clone)] +pub enum UTF16Item { + /// A valid codepoint. + ScalarValue(char), + /// An invalid surrogate without its pair. + LoneSurrogate(u16) +} + +impl UTF16Item { + /// Convert `self` to a `char`, taking `LoneSurrogate`s to the + /// replacement character (U+FFFD). + #[inline] + pub fn to_char_lossy(&self) -> char { + match *self { + ScalarValue(c) => c, + LoneSurrogate(_) => '\uFFFD' + } + } +} + +impl<'a> Iterator for UTF16Items<'a> { + fn next(&mut self) -> Option { let u = match self.iter.next() { Some(u) => *u, None => return None }; - match char::from_u32(u as u32) { - Some(c) => Some(c), - None => { - let u2 = *self.iter.next().expect("UTF16Chars: unmatched lead surrogate"); - if u < 0xD7FF || u > 0xDBFF || - u2 < 0xDC00 || u2 > 0xDFFF { - fail!("UTF16Chars: invalid surrogate pair") - } - let mut c = (u - 0xD800) as u32 << 10 | (u2 - 0xDC00) as u32 | 0x1_0000; - char::from_u32(c) + if u < 0xD800 || 0xDFFF < u { + // not a surrogate + Some(ScalarValue(unsafe {cast::transmute(u as u32)})) + } else if u >= 0xDC00 { + // a trailing surrogate + Some(LoneSurrogate(u)) + } else { + // preserve state for rewinding. 
+ let old = self.iter; + + let u2 = match self.iter.next() { + Some(u2) => *u2, + // eof + None => return Some(LoneSurrogate(u)) + }; + if u2 < 0xDC00 || u2 > 0xDFFF { + // not a trailing surrogate so we're not a valid + // surrogate pair, so rewind to redecode u2 next time. + self.iter = old; + return Some(LoneSurrogate(u)) } + + // all ok, so lets decode it. + let c = (u - 0xD800) as u32 << 10 | (u2 - 0xDC00) as u32 | 0x1_0000; + Some(ScalarValue(unsafe {cast::transmute(c)})) } } + #[inline] fn size_hint(&self) -> (uint, Option) { let (low, high) = self.iter.size_hint(); - // we could be entirely surrogates (2 elements per char), or - // entirely non-surrogates (1 element per char) + // we could be entirely valid surrogates (2 elements per + // char), or entirely non-surrogates (1 element per char) (low / 2, high) } } -/// Create an iterator over the UTF-16 encoded codepoints in `v`. +/// Create an iterator over the UTF-16 encoded codepoints in `v`, +/// returning invalid surrogates as `LoneSurrogate`s. /// -/// The iterator fails if it attempts to decode invalid UTF-16 data. -pub fn utf16_chars<'a>(v: &'a [u16]) -> UTF16Chars<'a> { - UTF16Chars { iter : v.iter() } +/// # Example +/// +/// ```rust +/// use std::str; +/// use std::str::{ScalarValue, LoneSurrogate}; +/// +/// // 𝄞music +/// let v = [0xD834, 0xDD1E, 0x006d, 0x0075, +/// 0x0073, 0xDD1E, 0x0069, 0x0063, +/// 0xD834]; +/// +/// assert_eq!(str::utf16_items(v).to_owned_vec(), +/// ~[ScalarValue('𝄞'), +/// ScalarValue('m'), ScalarValue('u'), ScalarValue('s'), +/// LoneSurrogate(0xDD1E), +/// ScalarValue('i'), ScalarValue('c'), +/// LoneSurrogate(0xD834)]); +/// ``` +pub fn utf16_items<'a>(v: &'a [u16]) -> UTF16Items<'a> { + UTF16Items { iter : v.iter() } } -/// Allocates a new string from the utf-16 slice provided +/// Decode a UTF-16 encoded vector `v` into a string. +/// +/// # Failure +/// +/// Fails on invalid UTF-16 data. 
+/// +/// # Example +/// +/// ```rust +/// use std::str; +/// +/// // 𝄞music +/// let v = [0xD834, 0xDD1E, 0x006d, 0x0075, +/// 0x0073, 0x0069, 0x0063]; +/// assert_eq!(str::from_utf16(v), ~"𝄞music"); +/// ``` pub fn from_utf16(v: &[u16]) -> ~str { - utf16_chars(v).collect() + utf16_items(v).map(|c| { + match c { + ScalarValue(c) => c, + LoneSurrogate(u) => fail!("from_utf16: found lone surrogate {}", u) + } + }).collect() +} + +/// Decode a UTF-16 encoded vector `v` into a string, replacing +/// invalid data with the replacement character (U+FFFD). +/// +/// # Example +/// ```rust +/// use std::str; +/// +/// // 𝄞music +/// let v = [0xD834, 0xDD1E, 0x006d, 0x0075, +/// 0x0073, 0xDD1E, 0x0069, 0x0063, +/// 0xD834]; +/// +/// assert_eq!(str::from_utf16_lossy(v), +/// ~"𝄞mus\uFFFDic\uFFFD"); +/// ``` +pub fn from_utf16_lossy(v: &[u16]) -> ~str { + utf16_items(v).map(|c| c.to_char_lossy()).collect() } /// Allocates a new string with the specified capacity. The string returned is @@ -3738,12 +3830,30 @@ mod tests { let (s, u) = (*p).clone(); assert!(is_utf16(u)); assert_eq!(s.to_utf16(), u); + assert_eq!(from_utf16(u), s); + assert_eq!(from_utf16_lossy(u), s); + assert_eq!(from_utf16(s.to_utf16()), s); assert_eq!(from_utf16(u).to_utf16(), u); } } + #[test] + fn test_utf16_lossy() { + // completely positive cases tested above. + // lead + eof + assert_eq!(from_utf16_lossy([0xD800]), ~"\uFFFD"); + // lead + lead + assert_eq!(from_utf16_lossy([0xD800, 0xD800]), ~"\uFFFD\uFFFD"); + + // isolated trail + assert_eq!(from_utf16_lossy([0x0061, 0xDC00]), ~"a\uFFFD"); + + // general + assert_eq!(from_utf16_lossy([0xD800, 0xd801, 0xdc8b, 0xD800]), ~"\uFFFD𐒋\uFFFD"); + } + #[test] fn test_char_at() { let s = ~"ศไทย中华Việt Nam"; From 35b1b62ddfc31c2e52b65c2f908c0fcbc6465de5 Mon Sep 17 00:00:00 2001 From: Huon Wilson Date: Mon, 17 Feb 2014 00:57:16 +1100 Subject: [PATCH 4/6] std: decode even numbered non-BMP planes in the UTF-16 decoder. Fixes #12318. 
--- src/libstd/str.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/libstd/str.rs b/src/libstd/str.rs index 1ef622002c310..34bcb083134c0 100644 --- a/src/libstd/str.rs +++ b/src/libstd/str.rs @@ -881,7 +881,7 @@ impl<'a> Iterator for UTF16Items<'a> { } // all ok, so lets decode it. - let c = (u - 0xD800) as u32 << 10 | (u2 - 0xDC00) as u32 | 0x1_0000; + let c = ((u - 0xD800) as u32 << 10 | (u2 - 0xDC00) as u32) + 0x1_0000; Some(ScalarValue(unsafe {cast::transmute(c)})) } } @@ -3824,7 +3824,10 @@ mod tests { 0xdc9c_u16, 0xd801_u16, 0xdc92_u16, 0xd801_u16, 0xdc96_u16, 0xd801_u16, 0xdc86_u16, 0x0020_u16, 0xd801_u16, 0xdc95_u16, 0xd801_u16, 0xdc86_u16, - 0x000a_u16 ]) ]; + 0x000a_u16 ]), + // Issue #12318, even-numbered non-BMP planes + (~"\U00020000", + ~[0xD840, 0xDC00])]; for p in pairs.iter() { let (s, u) = (*p).clone(); From 4f841ee1509fafdf688a3898e01560ae29ee7836 Mon Sep 17 00:00:00 2001 From: Huon Wilson Date: Mon, 17 Feb 2014 09:57:56 +1100 Subject: [PATCH 5/6] std: make str::from_utf16 return an Option. The rest of the codebase is moving toward avoiding `fail!` so we do it here too! 
--- src/libnative/io/file.rs | 3 ++- src/libstd/os.rs | 15 +++++++---- src/libstd/str.rs | 54 ++++++++++++++++++++++++++-------------- 3 files changed, 48 insertions(+), 24 deletions(-) diff --git a/src/libnative/io/file.rs b/src/libnative/io/file.rs index e9c9f51966c60..80f5f74c53a52 100644 --- a/src/libnative/io/file.rs +++ b/src/libnative/io/file.rs @@ -571,7 +571,8 @@ pub fn readdir(p: &CString) -> IoResult<~[Path]> { else { let fp_vec = vec::from_buf( fp_buf, wcslen(fp_buf) as uint); - let fp_str = str::from_utf16(fp_vec); + let fp_str = str::from_utf16(fp_vec) + .expect("rust_list_dir_wfd_fp_buf returned invalid UTF-16"); paths.push(Path::new(fp_str)); } more_files = FindNextFileW(find_handle, wfd_ptr as HANDLE); diff --git a/src/libstd/os.rs b/src/libstd/os.rs index 719ed62d03d0a..31e88905b30f9 100644 --- a/src/libstd/os.rs +++ b/src/libstd/os.rs @@ -88,7 +88,7 @@ pub fn getcwd() -> Path { fail!(); } } - Path::new(str::from_utf16(buf)) + Path::new(str::from_utf16(buf).expect("GetCurrentDirectoryW returned invalid UTF-16")) } #[cfg(windows)] @@ -124,7 +124,12 @@ pub mod win32 { } if k != 0 && done { let sub = buf.slice(0, k as uint); - res = option::Some(str::from_utf16(sub)); + // We want to explicitly catch the case when the + // closure returned invalid UTF-16, rather than + // set `res` to None and continue. + let s = str::from_utf16(sub) + .expect("fill_utf16_buf_and_decode: closure created invalid UTF-16"); + res = option::Some(s) } } return res; @@ -739,7 +744,7 @@ pub fn last_os_error() -> ~str { fail!("[{}] FormatMessage failure", errno()); } - str::from_utf16(buf) + str::from_utf16(buf).expect("FormatMessageW returned invalid UTF-16") } } @@ -828,8 +833,8 @@ fn real_args() -> ~[~str] { while *ptr.offset(len as int) != 0 { len += 1; } // Push it onto the list. 
- args.push(vec::raw::buf_as_slice(ptr, len, - str::from_utf16)); + let opt_s = vec::raw::buf_as_slice(ptr, len, str::from_utf16); + args.push(opt_s.expect("CommandLineToArgvW returned invalid UTF-16")); } } diff --git a/src/libstd/str.rs b/src/libstd/str.rs index 34bcb083134c0..20321dad60071 100644 --- a/src/libstd/str.rs +++ b/src/libstd/str.rs @@ -920,11 +920,8 @@ pub fn utf16_items<'a>(v: &'a [u16]) -> UTF16Items<'a> { UTF16Items { iter : v.iter() } } -/// Decode a UTF-16 encoded vector `v` into a string. -/// -/// # Failure -/// -/// Fails on invalid UTF-16 data. +/// Decode a UTF-16 encoded vector `v` into a string, returning `None` +/// if `v` contains any invalid data. /// /// # Example /// @@ -932,17 +929,23 @@ pub fn utf16_items<'a>(v: &'a [u16]) -> UTF16Items<'a> { /// use std::str; /// /// // 𝄞music -/// let v = [0xD834, 0xDD1E, 0x006d, 0x0075, -/// 0x0073, 0x0069, 0x0063]; -/// assert_eq!(str::from_utf16(v), ~"𝄞music"); +/// let mut v = [0xD834, 0xDD1E, 0x006d, 0x0075, +/// 0x0073, 0x0069, 0x0063]; +/// assert_eq!(str::from_utf16(v), Some(~"𝄞music")); +/// +/// // 𝄞muic +/// v[4] = 0xD800; +/// assert_eq!(str::from_utf16(v), None); /// ``` -pub fn from_utf16(v: &[u16]) -> ~str { - utf16_items(v).map(|c| { - match c { - ScalarValue(c) => c, - LoneSurrogate(u) => fail!("from_utf16: found lone surrogate {}", u) - } - }).collect() +pub fn from_utf16(v: &[u16]) -> Option<~str> { + let mut s = with_capacity(v.len() / 2); + for c in utf16_items(v) { + match c { + ScalarValue(c) => s.push_char(c), + LoneSurrogate(_) => return None + } + } + Some(s) } /// Decode a UTF-16 encoded vector `v` into a string, replacing @@ -3834,14 +3837,29 @@ mod tests { assert!(is_utf16(u)); assert_eq!(s.to_utf16(), u); - assert_eq!(from_utf16(u), s); + assert_eq!(from_utf16(u).unwrap(), s); assert_eq!(from_utf16_lossy(u), s); - assert_eq!(from_utf16(s.to_utf16()), s); - assert_eq!(from_utf16(u).to_utf16(), u); + assert_eq!(from_utf16(s.to_utf16()).unwrap(), s); + 
assert_eq!(from_utf16(u).unwrap().to_utf16(), u); } } + #[test] + fn test_utf16_invalid() { + // completely positive cases tested above. + // lead + eof + assert_eq!(from_utf16([0xD800]), None); + // lead + lead + assert_eq!(from_utf16([0xD800, 0xD800]), None); + + // isolated trail + assert_eq!(from_utf16([0x0061, 0xDC00]), None); + + // general + assert_eq!(from_utf16([0xD800, 0xd801, 0xdc8b, 0xD800]), None); + } + #[test] fn test_utf16_lossy() { // completely positive cases tested above. From c9b4538babbc85b971b19bbeff16bd12a4f4db54 Mon Sep 17 00:00:00 2001 From: Huon Wilson Date: Tue, 18 Feb 2014 22:25:32 +1100 Subject: [PATCH 6/6] str: add a function for truncating a vector of u16 at NUL. Many of the functions interacting with Windows APIs allocate a vector of 0's and do not retrieve a length directly from the API call, and so need to be sure to remove the unmodified junk at the end of the vector. --- src/libnative/io/file.rs | 3 ++- src/libstd/os.rs | 10 ++++++--- src/libstd/str.rs | 44 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 4 deletions(-) diff --git a/src/libnative/io/file.rs b/src/libnative/io/file.rs index 80f5f74c53a52..6d3a156a2b01a 100644 --- a/src/libnative/io/file.rs +++ b/src/libnative/io/file.rs @@ -571,7 +571,8 @@ pub fn readdir(p: &CString) -> IoResult<~[Path]> { else { let fp_vec = vec::from_buf( fp_buf, wcslen(fp_buf) as uint); - let fp_str = str::from_utf16(fp_vec) + let fp_trimmed = str::truncate_utf16_at_nul(fp_vec); + let fp_str = str::from_utf16(fp_trimmed) .expect("rust_list_dir_wfd_fp_buf returned invalid UTF-16"); paths.push(Path::new(fp_str)); } diff --git a/src/libstd/os.rs b/src/libstd/os.rs index 31e88905b30f9..74e2fceb6cae6 100644 --- a/src/libstd/os.rs +++ b/src/libstd/os.rs @@ -88,7 +88,8 @@ pub fn getcwd() -> Path { fail!(); } } - Path::new(str::from_utf16(buf).expect("GetCurrentDirectoryW returned invalid UTF-16")) + Path::new(str::from_utf16(str::truncate_utf16_at_nul(buf)) + 
.expect("GetCurrentDirectoryW returned invalid UTF-16")) } #[cfg(windows)] @@ -744,7 +745,8 @@ pub fn last_os_error() -> ~str { fail!("[{}] FormatMessage failure", errno()); } - str::from_utf16(buf).expect("FormatMessageW returned invalid UTF-16") + str::from_utf16(str::truncate_utf16_at_nul(buf)) + .expect("FormatMessageW returned invalid UTF-16") } } @@ -833,7 +835,9 @@ fn real_args() -> ~[~str] { while *ptr.offset(len as int) != 0 { len += 1; } // Push it onto the list. - let opt_s = vec::raw::buf_as_slice(ptr, len, str::from_utf16); + let opt_s = vec::raw::buf_as_slice(ptr, len, |buf| { + str::from_utf16(str::truncate_utf16_at_nul(buf)) + }); args.push(opt_s.expect("CommandLineToArgvW returned invalid UTF-16")); } } diff --git a/src/libstd/str.rs b/src/libstd/str.rs index 20321dad60071..1f94aaaa7c413 100644 --- a/src/libstd/str.rs +++ b/src/libstd/str.rs @@ -920,6 +920,32 @@ pub fn utf16_items<'a>(v: &'a [u16]) -> UTF16Items<'a> { UTF16Items { iter : v.iter() } } +/// Return a slice of `v` ending at (and not including) the first NUL +/// (0). +/// +/// # Example +/// +/// ```rust +/// use std::str; +/// +/// // "abcd" +/// let mut v = ['a' as u16, 'b' as u16, 'c' as u16, 'd' as u16]; +/// // no NULs so no change +/// assert_eq!(str::truncate_utf16_at_nul(v), v.as_slice()); +/// +/// // "ab\0d" +/// v[2] = 0; +/// assert_eq!(str::truncate_utf16_at_nul(v), +/// &['a' as u16, 'b' as u16]); +/// ``` +pub fn truncate_utf16_at_nul<'a>(v: &'a [u16]) -> &'a [u16] { + match v.iter().position(|c| *c == 0) { + // don't include the 0 + Some(i) => v.slice_to(i), + None => v + } +} + /// Decode a UTF-16 encoded vector `v` into a string, returning `None` /// if `v` contains any invalid data. 
/// @@ -3875,6 +3901,24 @@ mod tests { assert_eq!(from_utf16_lossy([0xD800, 0xd801, 0xdc8b, 0xD800]), ~"\uFFFD𐒋\uFFFD"); } + #[test] + fn test_truncate_utf16_at_nul() { + let v = []; + assert_eq!(truncate_utf16_at_nul(v), &[]); + + let v = [0, 2, 3]; + assert_eq!(truncate_utf16_at_nul(v), &[]); + + let v = [1, 0, 3]; + assert_eq!(truncate_utf16_at_nul(v), &[1]); + + let v = [1, 2, 0]; + assert_eq!(truncate_utf16_at_nul(v), &[1, 2]); + + let v = [1, 2, 3]; + assert_eq!(truncate_utf16_at_nul(v), &[1, 2, 3]); + } + #[test] fn test_char_at() { let s = ~"ศไทย中华Việt Nam";