Skip to content

Commit

Permalink
Auto merge of #87580 - ChrisDenton:win-arg-parse-2008, r=m-ou-se
Browse files Browse the repository at this point in the history
Update Windows Argument Parsing

Fixes #44650

The Windows command line is passed to applications [as a single string](https://docs.microsoft.com/en-us/archive/blogs/larryosterman/the-windows-command-line-is-just-a-string) which the application then parses to get a list of arguments. The standard rules (as used by C/C++) for parsing the command line have slightly changed over the years, most recently in 2008 which added new escaping rules.

This PR implements the new rules as [described on MSDN](https://docs.microsoft.com/en-us/cpp/cpp/main-function-command-line-args?view=msvc-160#parsing-c-command-line-arguments) and [further detailed here](https://daviddeley.com/autohotkey/parameters/parameters.htm#WIN). It has been tested against the behaviour of C++ by calling a C++ program that outputs its raw command line and the contents of `argv`. See [my repo](https://github.com/ChrisDenton/winarg/tree/std) if anyone wants to reproduce my work.

For an overview of how this PR changes argument parsing behavior and why we feel it is warranted see #87580 (comment).

For some examples see: #87580 (comment)
  • Loading branch information
bors committed Sep 2, 2021
2 parents fcce644 + e26dda5 commit 1cf8fdd
Show file tree
Hide file tree
Showing 4 changed files with 201 additions and 123 deletions.
1 change: 1 addition & 0 deletions library/std/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,7 @@
#![feature(const_ip)]
#![feature(const_ipv4)]
#![feature(const_ipv6)]
#![feature(const_option)]
#![feature(const_raw_ptr_deref)]
#![feature(const_socketaddr)]
#![feature(const_trait_impl)]
Expand Down
257 changes: 152 additions & 105 deletions library/std/src/sys/windows/args.rs
Original file line number Diff line number Diff line change
@@ -1,23 +1,30 @@
#![allow(dead_code)] // runtime init functions not used during testing
//! The Windows command line is just a string
//! <https://docs.microsoft.com/en-us/archive/blogs/larryosterman/the-windows-command-line-is-just-a-string>
//!
//! This module implements the parsing necessary to turn that string into a list of arguments.

#[cfg(test)]
mod tests;

use crate::ffi::OsString;
use crate::fmt;
use crate::marker::PhantomData;
use crate::num::NonZeroU16;
use crate::os::windows::prelude::*;
use crate::path::PathBuf;
use crate::slice;
use crate::ptr::NonNull;
use crate::sys::c;
use crate::sys::windows::os::current_exe;
use crate::vec;

use core::iter;

pub fn args() -> Args {
// SAFETY: `GetCommandLineW` returns a pointer to a null terminated UTF-16
// string so it's safe for `WStrUnits` to use.
unsafe {
let lp_cmd_line = c::GetCommandLineW();
let parsed_args_list = parse_lp_cmd_line(lp_cmd_line as *const u16, || {
let parsed_args_list = parse_lp_cmd_line(WStrUnits::new(lp_cmd_line), || {
current_exe().map(PathBuf::into_os_string).unwrap_or_else(|_| OsString::new())
});

Expand All @@ -28,129 +35,120 @@ pub fn args() -> Args {
/// Implements the Windows command-line argument parsing algorithm.
///
/// Microsoft's documentation for the Windows CLI argument format can be found at
/// <https://docs.microsoft.com/en-us/previous-versions//17w5ykft(v=vs.85)>.
/// <https://docs.microsoft.com/en-us/cpp/cpp/main-function-command-line-args?view=msvc-160#parsing-c-command-line-arguments>
///
/// Windows includes a function to do this in shell32.dll,
/// but linking with that DLL causes the process to be registered as a GUI application.
/// A more in-depth explanation is here:
/// <https://daviddeley.com/autohotkey/parameters/parameters.htm#WIN>
///
/// Windows includes a function to do command line parsing in shell32.dll.
/// However, this is not used for two reasons:
///
/// 1. Linking with that DLL causes the process to be registered as a GUI application.
/// GUI applications add a bunch of overhead, even if no windows are drawn. See
/// <https://randomascii.wordpress.com/2018/12/03/a-not-called-function-can-cause-a-5x-slowdown/>.
///
/// This function was tested for equivalence to the shell32.dll implementation in
/// Windows 10 Pro v1803, using an exhaustive test suite available at
/// <https://gist.github.com/notriddle/dde431930c392e428055b2dc22e638f5> or
/// <https://paste.gg/p/anonymous/47d6ed5f5bd549168b1c69c799825223>.
unsafe fn parse_lp_cmd_line<F: Fn() -> OsString>(
lp_cmd_line: *const u16,
/// 2. It does not follow the modern C/C++ argv rules outlined in the first two links above.
///
/// This function was tested for equivalence to the C/C++ parsing rules using an
/// extensive test suite available at
/// <https://github.com/ChrisDenton/winarg/tree/std>.
fn parse_lp_cmd_line<'a, F: Fn() -> OsString>(
lp_cmd_line: Option<WStrUnits<'a>>,
exe_name: F,
) -> Vec<OsString> {
const BACKSLASH: u16 = '\\' as u16;
const QUOTE: u16 = '"' as u16;
const TAB: u16 = '\t' as u16;
const SPACE: u16 = ' ' as u16;
const BACKSLASH: NonZeroU16 = NonZeroU16::new(b'\\' as u16).unwrap();
const QUOTE: NonZeroU16 = NonZeroU16::new(b'"' as u16).unwrap();
const TAB: NonZeroU16 = NonZeroU16::new(b'\t' as u16).unwrap();
const SPACE: NonZeroU16 = NonZeroU16::new(b' ' as u16).unwrap();

let mut ret_val = Vec::new();
if lp_cmd_line.is_null() || *lp_cmd_line == 0 {
// If the cmd line pointer is null or it points to an empty string then
// return the name of the executable as argv[0].
if lp_cmd_line.as_ref().and_then(|cmd| cmd.peek()).is_none() {
ret_val.push(exe_name());
return ret_val;
}
let mut cmd_line = {
let mut end = 0;
while *lp_cmd_line.offset(end) != 0 {
end += 1;
}
slice::from_raw_parts(lp_cmd_line, end as usize)
};
let mut code_units = lp_cmd_line.unwrap();

// The executable name at the beginning is special.
cmd_line = match cmd_line[0] {
// The executable name ends at the next quote mark,
// no matter what.
QUOTE => {
let args = {
let mut cut = cmd_line[1..].splitn(2, |&c| c == QUOTE);
if let Some(exe) = cut.next() {
ret_val.push(OsString::from_wide(exe));
}
cut.next()
};
if let Some(args) = args {
args
} else {
return ret_val;
}
}
// Implement quirk: when they say whitespace here,
// they include the entire ASCII control plane:
// "However, if lpCmdLine starts with any amount of whitespace, CommandLineToArgvW
// will consider the first argument to be an empty string. Excess whitespace at the
// end of lpCmdLine is ignored."
0..=SPACE => {
ret_val.push(OsString::new());
&cmd_line[1..]
}
// The executable name ends at the next whitespace,
// no matter what.
_ => {
let args = {
let mut cut = cmd_line.splitn(2, |&c| c > 0 && c <= SPACE);
if let Some(exe) = cut.next() {
ret_val.push(OsString::from_wide(exe));
}
cut.next()
};
if let Some(args) = args {
args
} else {
return ret_val;
}
let mut in_quotes = false;
let mut cur = Vec::new();
for w in &mut code_units {
match w {
// A quote mark always toggles `in_quotes` no matter what because
// there are no escape characters when parsing the executable name.
QUOTE => in_quotes = !in_quotes,
// If not `in_quotes` then whitespace ends argv[0].
SPACE | TAB if !in_quotes => break,
// In all other cases the code unit is taken literally.
_ => cur.push(w.get()),
}
};
}
// Skip whitespace.
code_units.advance_while(|w| w == SPACE || w == TAB);
ret_val.push(OsString::from_wide(&cur));

// Parse the arguments according to these rules:
// * All code units are taken literally except space, tab, quote and backslash.
// * When not `in_quotes`, space and tab separate arguments. Consecutive spaces and tabs are
// treated as a single separator.
// * A space or tab `in_quotes` is taken literally.
// * A quote toggles `in_quotes` mode unless it's escaped. An escaped quote is taken literally.
// * A quote can be escaped if preceded by an odd number of backslashes.
// * If any number of backslashes is immediately followed by a quote then the number of
// backslashes is halved (rounding down).
// * Backslashes not followed by a quote are all taken literally.
// * If `in_quotes` then a quote can also be escaped using another quote
// (i.e. two consecutive quotes become one literal quote).
let mut cur = Vec::new();
let mut in_quotes = false;
let mut was_in_quotes = false;
let mut backslash_count: usize = 0;
for &c in cmd_line {
match c {
// backslash
BACKSLASH => {
backslash_count += 1;
was_in_quotes = false;
while let Some(w) = code_units.next() {
match w {
// If not `in_quotes`, a space or tab ends the argument.
SPACE | TAB if !in_quotes => {
ret_val.push(OsString::from_wide(&cur[..]));
cur.truncate(0);

// Skip whitespace.
code_units.advance_while(|w| w == SPACE || w == TAB);
}
QUOTE if backslash_count % 2 == 0 => {
cur.extend(iter::repeat(b'\\' as u16).take(backslash_count / 2));
backslash_count = 0;
if was_in_quotes {
cur.push('"' as u16);
was_in_quotes = false;
// Backslashes can escape quotes or backslashes but only if consecutive backslashes are followed by a quote.
BACKSLASH => {
let backslash_count = code_units.advance_while(|w| w == BACKSLASH) + 1;
if code_units.peek() == Some(QUOTE) {
cur.extend(iter::repeat(BACKSLASH.get()).take(backslash_count / 2));
// The quote is escaped if there are an odd number of backslashes.
if backslash_count % 2 == 1 {
code_units.next();
cur.push(QUOTE.get());
}
} else {
was_in_quotes = in_quotes;
in_quotes = !in_quotes;
// If there is no quote on the end then there is no escaping.
cur.extend(iter::repeat(BACKSLASH.get()).take(backslash_count));
}
}
QUOTE if backslash_count % 2 != 0 => {
cur.extend(iter::repeat(b'\\' as u16).take(backslash_count / 2));
backslash_count = 0;
was_in_quotes = false;
cur.push(b'"' as u16);
}
SPACE | TAB if !in_quotes => {
cur.extend(iter::repeat(b'\\' as u16).take(backslash_count));
if !cur.is_empty() || was_in_quotes {
ret_val.push(OsString::from_wide(&cur[..]));
cur.truncate(0);
// If `in_quotes` and not backslash escaped (see above) then a quote either
// unsets `in_quote` or is escaped by another quote.
QUOTE if in_quotes => match code_units.peek() {
// Two consecutive quotes when `in_quotes` produces one literal quote.
Some(QUOTE) => {
cur.push(QUOTE.get());
code_units.next();
}
backslash_count = 0;
was_in_quotes = false;
}
_ => {
cur.extend(iter::repeat(b'\\' as u16).take(backslash_count));
backslash_count = 0;
was_in_quotes = false;
cur.push(c);
}
// Otherwise set `in_quotes`.
Some(_) => in_quotes = false,
// The end of the command line.
// Push `cur` even if empty, which we do by breaking while `in_quotes` is still set.
None => break,
},
// If not `in_quotes` and not BACKSLASH escaped (see above) then a quote sets `in_quote`.
QUOTE => in_quotes = true,
// Everything else is always taken literally.
_ => cur.push(w.get()),
}
}
cur.extend(iter::repeat(b'\\' as u16).take(backslash_count));
// include empty quoted strings at the end of the arguments list
if !cur.is_empty() || was_in_quotes || in_quotes {
// Push the final argument, if any.
if !cur.is_empty() || in_quotes {
ret_val.push(OsString::from_wide(&cur[..]));
}
ret_val
Expand Down Expand Up @@ -187,3 +185,52 @@ impl ExactSizeIterator for Args {
self.parsed_args_list.len()
}
}

/// A safe iterator over a LPWSTR
/// (aka a pointer to a series of UTF-16 code units terminated by a NULL).
struct WStrUnits<'a> {
// The pointer must never be null...
lpwstr: NonNull<u16>,
// ...and the memory it points to must be valid for this lifetime.
lifetime: PhantomData<&'a [u16]>,
}
impl WStrUnits<'_> {
/// Create the iterator. Returns `None` if `lpwstr` is null.
///
/// SAFETY: `lpwstr` must point to a null-terminated wide string that lives
/// at least as long as the lifetime of this struct.
unsafe fn new(lpwstr: *const u16) -> Option<Self> {
Some(Self { lpwstr: NonNull::new(lpwstr as _)?, lifetime: PhantomData })
}
fn peek(&self) -> Option<NonZeroU16> {
// SAFETY: It's always safe to read the current item because we don't
// ever move out of the array's bounds.
unsafe { NonZeroU16::new(*self.lpwstr.as_ptr()) }
}
/// Advance the iterator while `predicate` returns true.
/// Returns the number of items it advanced by.
fn advance_while<P: FnMut(NonZeroU16) -> bool>(&mut self, mut predicate: P) -> usize {
let mut counter = 0;
while let Some(w) = self.peek() {
if !predicate(w) {
break;
}
counter += 1;
self.next();
}
counter
}
}
impl Iterator for WStrUnits<'_> {
// This can never return zero as that marks the end of the string.
type Item = NonZeroU16;
fn next(&mut self) -> Option<NonZeroU16> {
// SAFETY: If NULL is reached we immediately return.
// Therefore it's safe to advance the pointer after that.
unsafe {
let next = self.peek()?;
self.lpwstr = NonNull::new_unchecked(self.lpwstr.as_ptr().add(1));
Some(next)
}
}
}
Loading

0 comments on commit 1cf8fdd

Please sign in to comment.