Skip to content

Commit d8f3dbd

Browse files
committed
Extract wtf8 argument parsing to sys_common
This commit extracts the WTF-8 argument parsing logic from sys::windows::args to sys_common::args. This allows the same logic to be used for other targets which use WTF-8 (for example, UEFI). This was originally a part of #100316. Signed-off-by: Ayush Singh <ayushsingh1325@gmail.com>
1 parent 57d3c58 commit d8f3dbd

File tree

3 files changed

+198
-187
lines changed

3 files changed

+198
-187
lines changed

library/std/src/sys/windows/args.rs

Lines changed: 1 addition & 187 deletions
Original file line numberDiff line numberDiff line change
@@ -9,29 +9,14 @@ mod tests;
99
use crate::ffi::OsString;
1010
use crate::fmt;
1111
use crate::io;
12-
use crate::marker::PhantomData;
13-
use crate::num::NonZeroU16;
1412
use crate::os::windows::prelude::*;
1513
use crate::path::PathBuf;
16-
use crate::ptr::NonNull;
1714
use crate::sys::c;
1815
use crate::sys::process::ensure_no_nuls;
1916
use crate::sys::windows::os::current_exe;
17+
use crate::sys_common::args::{parse_lp_cmd_line, WStrUnits};
2018
use crate::vec;
2119

22-
use core::iter;
23-
24-
/// This is the const equivalent to `NonZeroU16::new(n).unwrap()`
25-
///
26-
/// FIXME: This can be removed once `Option::unwrap` is stably const.
27-
/// See the `const_option` feature (#67441).
28-
const fn non_zero_u16(n: u16) -> NonZeroU16 {
29-
match NonZeroU16::new(n) {
30-
Some(n) => n,
31-
None => panic!("called `unwrap` on a `None` value"),
32-
}
33-
}
34-
3520
pub fn args() -> Args {
3621
// SAFETY: `GetCommandLineW` returns a pointer to a null terminated UTF-16
3722
// string so it's safe for `WStrUnits` to use.
@@ -45,128 +30,6 @@ pub fn args() -> Args {
4530
}
4631
}
4732

48-
/// Implements the Windows command-line argument parsing algorithm.
49-
///
50-
/// Microsoft's documentation for the Windows CLI argument format can be found at
51-
/// <https://docs.microsoft.com/en-us/cpp/cpp/main-function-command-line-args?view=msvc-160#parsing-c-command-line-arguments>
52-
///
53-
/// A more in-depth explanation is here:
54-
/// <https://daviddeley.com/autohotkey/parameters/parameters.htm#WIN>
55-
///
56-
/// Windows includes a function to do command line parsing in shell32.dll.
57-
/// However, this is not used for two reasons:
58-
///
59-
/// 1. Linking with that DLL causes the process to be registered as a GUI application.
60-
/// GUI applications add a bunch of overhead, even if no windows are drawn. See
61-
/// <https://randomascii.wordpress.com/2018/12/03/a-not-called-function-can-cause-a-5x-slowdown/>.
62-
///
63-
/// 2. It does not follow the modern C/C++ argv rules outlined in the first two links above.
64-
///
65-
/// This function was tested for equivalence to the C/C++ parsing rules using an
66-
/// extensive test suite available at
67-
/// <https://github.com/ChrisDenton/winarg/tree/std>.
68-
fn parse_lp_cmd_line<'a, F: Fn() -> OsString>(
69-
lp_cmd_line: Option<WStrUnits<'a>>,
70-
exe_name: F,
71-
) -> Vec<OsString> {
72-
const BACKSLASH: NonZeroU16 = non_zero_u16(b'\\' as u16);
73-
const QUOTE: NonZeroU16 = non_zero_u16(b'"' as u16);
74-
const TAB: NonZeroU16 = non_zero_u16(b'\t' as u16);
75-
const SPACE: NonZeroU16 = non_zero_u16(b' ' as u16);
76-
77-
let mut ret_val = Vec::new();
78-
// If the cmd line pointer is null or it points to an empty string then
79-
// return the name of the executable as argv[0].
80-
if lp_cmd_line.as_ref().and_then(|cmd| cmd.peek()).is_none() {
81-
ret_val.push(exe_name());
82-
return ret_val;
83-
}
84-
let mut code_units = lp_cmd_line.unwrap();
85-
86-
// The executable name at the beginning is special.
87-
let mut in_quotes = false;
88-
let mut cur = Vec::new();
89-
for w in &mut code_units {
90-
match w {
91-
// A quote mark always toggles `in_quotes` no matter what because
92-
// there are no escape characters when parsing the executable name.
93-
QUOTE => in_quotes = !in_quotes,
94-
// If not `in_quotes` then whitespace ends argv[0].
95-
SPACE | TAB if !in_quotes => break,
96-
// In all other cases the code unit is taken literally.
97-
_ => cur.push(w.get()),
98-
}
99-
}
100-
// Skip whitespace.
101-
code_units.advance_while(|w| w == SPACE || w == TAB);
102-
ret_val.push(OsString::from_wide(&cur));
103-
104-
// Parse the arguments according to these rules:
105-
// * All code units are taken literally except space, tab, quote and backslash.
106-
// * When not `in_quotes`, space and tab separate arguments. Consecutive spaces and tabs are
107-
// treated as a single separator.
108-
// * A space or tab `in_quotes` is taken literally.
109-
// * A quote toggles `in_quotes` mode unless it's escaped. An escaped quote is taken literally.
110-
// * A quote can be escaped if preceded by an odd number of backslashes.
111-
// * If any number of backslashes is immediately followed by a quote then the number of
112-
// backslashes is halved (rounding down).
113-
// * Backslashes not followed by a quote are all taken literally.
114-
// * If `in_quotes` then a quote can also be escaped using another quote
115-
// (i.e. two consecutive quotes become one literal quote).
116-
let mut cur = Vec::new();
117-
let mut in_quotes = false;
118-
while let Some(w) = code_units.next() {
119-
match w {
120-
// If not `in_quotes`, a space or tab ends the argument.
121-
SPACE | TAB if !in_quotes => {
122-
ret_val.push(OsString::from_wide(&cur[..]));
123-
cur.truncate(0);
124-
125-
// Skip whitespace.
126-
code_units.advance_while(|w| w == SPACE || w == TAB);
127-
}
128-
// Backslashes can escape quotes or backslashes but only if consecutive backslashes are followed by a quote.
129-
BACKSLASH => {
130-
let backslash_count = code_units.advance_while(|w| w == BACKSLASH) + 1;
131-
if code_units.peek() == Some(QUOTE) {
132-
cur.extend(iter::repeat(BACKSLASH.get()).take(backslash_count / 2));
133-
// The quote is escaped if there are an odd number of backslashes.
134-
if backslash_count % 2 == 1 {
135-
code_units.next();
136-
cur.push(QUOTE.get());
137-
}
138-
} else {
139-
// If there is no quote on the end then there is no escaping.
140-
cur.extend(iter::repeat(BACKSLASH.get()).take(backslash_count));
141-
}
142-
}
143-
// If `in_quotes` and not backslash escaped (see above) then a quote either
144-
// unsets `in_quotes` or is escaped by another quote.
145-
QUOTE if in_quotes => match code_units.peek() {
146-
// Two consecutive quotes when `in_quotes` produces one literal quote.
147-
Some(QUOTE) => {
148-
cur.push(QUOTE.get());
149-
code_units.next();
150-
}
151-
// Otherwise unset `in_quotes`.
152-
Some(_) => in_quotes = false,
153-
// The end of the command line.
154-
// Push `cur` even if empty, which we do by breaking while `in_quotes` is still set.
155-
None => break,
156-
},
157-
// If not `in_quotes` and not BACKSLASH escaped (see above) then a quote sets `in_quotes`.
158-
QUOTE => in_quotes = true,
159-
// Everything else is always taken literally.
160-
_ => cur.push(w.get()),
161-
}
162-
}
163-
// Push the final argument, if any.
164-
if !cur.is_empty() || in_quotes {
165-
ret_val.push(OsString::from_wide(&cur[..]));
166-
}
167-
ret_val
168-
}
169-
17033
pub struct Args {
17134
parsed_args_list: vec::IntoIter<OsString>,
17235
}
@@ -199,55 +62,6 @@ impl ExactSizeIterator for Args {
19962
}
20063
}
20164

202-
/// A safe iterator over a LPWSTR
203-
/// (aka a pointer to a series of UTF-16 code units terminated by a NULL).
204-
struct WStrUnits<'a> {
205-
// The pointer must never be null...
206-
lpwstr: NonNull<u16>,
207-
// ...and the memory it points to must be valid for this lifetime.
208-
lifetime: PhantomData<&'a [u16]>,
209-
}
210-
impl WStrUnits<'_> {
211-
/// Create the iterator. Returns `None` if `lpwstr` is null.
212-
///
213-
/// SAFETY: `lpwstr` must point to a null-terminated wide string that lives
214-
/// at least as long as the lifetime of this struct.
215-
unsafe fn new(lpwstr: *const u16) -> Option<Self> {
216-
Some(Self { lpwstr: NonNull::new(lpwstr as _)?, lifetime: PhantomData })
217-
}
218-
fn peek(&self) -> Option<NonZeroU16> {
219-
// SAFETY: It's always safe to read the current item because we don't
220-
// ever move out of the array's bounds.
221-
unsafe { NonZeroU16::new(*self.lpwstr.as_ptr()) }
222-
}
223-
/// Advance the iterator while `predicate` returns true.
224-
/// Returns the number of items it advanced by.
225-
fn advance_while<P: FnMut(NonZeroU16) -> bool>(&mut self, mut predicate: P) -> usize {
226-
let mut counter = 0;
227-
while let Some(w) = self.peek() {
228-
if !predicate(w) {
229-
break;
230-
}
231-
counter += 1;
232-
self.next();
233-
}
234-
counter
235-
}
236-
}
237-
impl Iterator for WStrUnits<'_> {
238-
// This can never return zero as that marks the end of the string.
239-
type Item = NonZeroU16;
240-
fn next(&mut self) -> Option<NonZeroU16> {
241-
// SAFETY: If NULL is reached we immediately return.
242-
// Therefore it's safe to advance the pointer after that.
243-
unsafe {
244-
let next = self.peek()?;
245-
self.lpwstr = NonNull::new_unchecked(self.lpwstr.as_ptr().add(1));
246-
Some(next)
247-
}
248-
}
249-
}
250-
25165
#[derive(Debug)]
25266
pub(crate) enum Arg {
25367
/// Add quotes (if needed)

0 commit comments

Comments
 (0)