-
-
Notifications
You must be signed in to change notification settings - Fork 2.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
More accurate argv parsing/serialization on Windows #18309
Conversation
This adds `ArgIteratorWindows`, which faithfully replicates the quoting and escaping behavior observed in `CommandLineToArgvW` and should make Zig applications play better with processes that abuse these quirks.
…processes on Windows The old implementation had a bug in it in that it didn't quote empty strings, but it also didn't properly follow the special quoting rules required for the first argument (the executable name). This new implementation serializes the argv correctly such that it can be parsed by the `CommandLineToArgvW` algorithm.
da0590f
to
4d9c4ab
Compare
Awesome work, love the methodology used! I modified your
Test codeconst std = @import("std");
const Allocator = std.mem.Allocator;
pub fn main() !void {
var gpa = std.heap.GeneralPurposeAllocator(.{}){};
defer std.debug.assert(gpa.deinit() == .ok);
const allocator = gpa.allocator();
var random = std.rand.DefaultPrng.init(0);
const rand = random.random();
var i: usize = 0;
while (true) : (i += 1) {
const rand_cmd_line = try randomCommandLine(allocator, rand);
defer allocator.free(rand_cmd_line);
// TEMPORARY to avoid known difference for 0-length command lines
if (rand_cmd_line.len == 0 or rand_cmd_line[0] == '\x00') continue;
try testCommandLine(allocator, rand_cmd_line);
if (i % 1000 == 0) {
std.debug.print("{}\r", .{i});
}
}
}
fn testCommandLine(allocator: Allocator, cmd_line: []const u8) !void {
const cmd_line_w = try std.unicode.utf8ToUtf16LeWithNull(allocator, cmd_line);
defer allocator.free(cmd_line_w);
const win_args = try windowsParseCommandLine(allocator, cmd_line_w);
defer {
for (win_args) |win_arg| {
allocator.free(win_arg);
}
allocator.free(win_args);
}
var zig_args = try std.process.ArgIteratorWindows.init(allocator, cmd_line_w);
defer zig_args.deinit();
var zig_args_list = std.ArrayList([]const u8).init(allocator);
defer zig_args_list.deinit();
while (zig_args.next()) |zig_arg| {
try zig_args_list.append(zig_arg);
}
if (getDiffIndex(win_args, zig_args_list.items)) |diff_index| {
std.debug.print("cmd_line: \"{}\"\n", .{std.zig.fmtEscapes(cmd_line)});
std.debug.print("first difference at index {}\n", .{diff_index});
if (diff_index >= win_args.len)
std.debug.print("expected: null\n", .{})
else
std.debug.print("expected: \"{}\"\n", .{std.zig.fmtEscapes(win_args[diff_index])});
if (diff_index >= zig_args_list.items.len)
std.debug.print("actual: null\n", .{})
else
std.debug.print("actual: \"{}\"\n", .{std.zig.fmtEscapes(zig_args_list.items[diff_index])});
std.debug.print("win:", .{});
for (win_args) |arg| {
std.debug.print(" \"{}\"", .{std.zig.fmtEscapes(arg)});
}
std.debug.print("\n", .{});
std.debug.print("zig:", .{});
for (zig_args_list.items) |arg| {
std.debug.print(" \"{}\"", .{std.zig.fmtEscapes(arg)});
}
std.debug.print("\n", .{});
return error.ParsedArgsNotEqual;
}
}
fn getDiffIndex(a: []const []const u8, b: []const []const u8) ?usize {
if (a.len != b.len) return @min(a.len, b.len);
for (a, b, 0..) |a_str, b_str, i| {
if (!std.mem.eql(u8, a_str, b_str)) return i;
}
return null;
}
fn windowsParseCommandLine(allocator: Allocator, cmd_line_w: [:0]const u16) ![][:0]u8 {
var argc: c_int = undefined;
const argv_w = CommandLineToArgvW(cmd_line_w, &argc) orelse return error.Unexpected;
defer std.os.windows.LocalFree(@ptrCast(argv_w));
const args = try allocator.alloc([:0]u8, @intCast(argc));
for (argv_w, 0..@intCast(argc)) |arg_w, i| {
args[i] = try std.unicode.utf16leToUtf8AllocZ(allocator, std.mem.span(arg_w));
}
return args;
}
fn randomCommandLine(allocator: Allocator, rand: std.rand.Random) ![]const u8 {
const Choice = enum {
backslash,
quote,
space,
tab,
control,
printable,
};
const choices = rand.uintAtMostBiased(u16, 256);
var buf = try std.ArrayList(u8).initCapacity(allocator, choices);
errdefer buf.deinit();
for (0..choices) |_| {
const choice = rand.enumValue(Choice);
const byte = switch (choice) {
.backslash => '\\',
.quote => '"',
.space => ' ',
.tab => '\t',
.control => switch (rand.uintAtMostBiased(u8, 0x21)) {
0x21 => '\x7F',
else => |b| b,
},
.printable => '!' + rand.uintAtMostBiased(u8, '~' - '!'),
};
try buf.append(byte);
}
return buf.toOwnedSlice();
}
extern "shell32" fn CommandLineToArgvW(lpCmdLine: std.os.windows.LPCWSTR, pNumArgs: *c_int) ?[*]std.os.windows.LPCWSTR; The only difference found in 16,000,000+ iterations so far is around passing a 0-length string to
Not sure if/how that should be handled on the Zig side (I currently specifically skip it in the test code--comment that line out if you want to hit that case yourself). |
FWIW I made sure to document that the only difference in behavior between It's also worth taking into consideration how
This means that the child process' command-line string retrieved via You can verify the behavior of //! echo_cmd_line.zig
const std = @import("std");
pub fn main() !void {
const cmd_line_w = std.os.windows.kernel32.GetCommandLineW();
const stdout = std.io.getStdOut().writer();
try stdout.print("{s}", .{std.unicode.fmtUtf16le(std.mem.span(cmd_line_w))});
} zig build-exe ./echo_cmd_line.zig //! test_cmd_line.zig
const std = @import("std");
const windows = std.os.windows;
pub fn main() !void {
var gpa = std.heap.GeneralPurposeAllocator(.{}){};
defer std.debug.assert(gpa.deinit() == .ok);
const allocator = gpa.allocator();
const echo_app_name = get_echo_app_name: {
var argc: c_int = undefined;
const argv_w = CommandLineToArgvW(windows.kernel32.GetCommandLineW(), &argc) orelse return error.Unexpected;
defer windows.LocalFree(@ptrCast(argv_w));
if (argc < 2) return error.MissingEchoAppNameArg;
break :get_echo_app_name try std.unicode.utf16leToUtf8Alloc(allocator, std.mem.span(argv_w[1]));
};
defer allocator.free(echo_app_name);
const quoted_echo_app_name = try std.fmt.allocPrint(allocator, "\"{s}\"", .{echo_app_name});
defer allocator.free(quoted_echo_app_name);
inline for (.{
.{ echo_app_name, quoted_echo_app_name },
.{ echo_app_name, null },
.{ echo_app_name, "" },
.{ null, quoted_echo_app_name },
}) |case| {
const app_name, const cmd_line = case;
const echoed_cmd_line = try spawnChildReadStdout(allocator, app_name, cmd_line);
defer allocator.free(echoed_cmd_line);
std.debug.print("{s}\n", .{echoed_cmd_line});
}
}
extern "shell32" fn CommandLineToArgvW(lpCmdLine: windows.LPCWSTR, pNumArgs: *c_int) ?[*]windows.LPCWSTR;
fn spawnChildReadStdout(allocator: std.mem.Allocator, app_name: ?[]const u8, cmd_line: ?[]const u8) ![]u8 {
const child_stdout, const child_proc = spawn: {
const app_name_w = if (app_name != null)
try std.unicode.utf8ToUtf16LeWithNull(allocator, app_name.?)
else
null;
defer if (app_name_w != null) allocator.free(app_name_w.?);
const cmd_line_w = if (cmd_line != null)
try std.unicode.utf8ToUtf16LeWithNull(allocator, cmd_line.?)
else
null;
defer if (cmd_line_w != null) allocator.free(cmd_line_w.?);
var child_stdout_rd: windows.HANDLE = undefined;
var child_stdout_wr: windows.HANDLE = undefined;
try windows.CreatePipe(&child_stdout_rd, &child_stdout_wr, &.{
.nLength = @sizeOf(windows.SECURITY_ATTRIBUTES),
.lpSecurityDescriptor = null,
.bInheritHandle = windows.TRUE,
});
errdefer windows.CloseHandle(child_stdout_rd);
defer windows.CloseHandle(child_stdout_wr);
try windows.SetHandleInformation(child_stdout_rd, windows.HANDLE_FLAG_INHERIT, 0);
var startup_info: windows.STARTUPINFOW = .{
.cb = @sizeOf(windows.STARTUPINFOW),
.lpReserved = null,
.lpDesktop = null,
.lpTitle = null,
.dwX = 0,
.dwY = 0,
.dwXSize = 0,
.dwYSize = 0,
.dwXCountChars = 0,
.dwYCountChars = 0,
.dwFillAttribute = 0,
.dwFlags = windows.STARTF_USESTDHANDLES,
.wShowWindow = 0,
.cbReserved2 = 0,
.lpReserved2 = null,
.hStdInput = null,
.hStdOutput = child_stdout_wr,
.hStdError = null,
};
var proc_info: windows.PROCESS_INFORMATION = undefined;
try windows.CreateProcessW(
app_name_w orelse null,
cmd_line_w orelse null,
null,
null,
windows.TRUE,
0,
null,
null,
&startup_info,
&proc_info,
);
windows.CloseHandle(proc_info.hThread);
break :spawn .{ @as(std.fs.File, .{ .handle = child_stdout_rd }), proc_info.hProcess };
};
defer windows.CloseHandle(child_proc);
defer windows.WaitForSingleObjectEx(child_proc, windows.INFINITE, false) catch {};
return try child_stdout.reader().readAllAlloc(allocator, 4096);
} zig run ./test_cmd_line.zig -- ./echo_cmd_line.exe
(I think it's worth noting that the output still uses forward slashes, which suggests that Windows doesn't really mess with the provided paths.) Edit: I forgot to add, the |
…ToArgvW On Windows, the command line arguments of a program are a single WTF-16 encoded string and its up to the program to split it into an array of strings. In C/C++, the entry point of the C runtime takes care of splitting the command line and passing argc/argv to the main function. ziglang#18309 updated ArgIteratorWindows to match the behavior of CommandLineToArgvW, but it turns out that CommandLineToArgvW's behavior does not match the behavior of the C runtime post-2008. In 2008, the C runtime argv splitting changed how it handles consecutive double quotes within a quoted argument (it's now considered an escaped quote, e.g. `"foo""bar"` post-2008 would get parsed into `foo"bar`). This commit makes ArgIteratorWindows match the behavior of the post-2008 C runtime, and adds a standalone test that verifies the behavior matches both the MSVC and MinGW argv splitting exactly in all cases (it checks that randomly generated command line strings get split the same way). The motivation here is roughly the same as when the same change was made in Rust (rust-lang/rust#87580), that is (paraphrased): - Consistent behavior between Zig and modern C/C++ programs - Allows users to escape double quotes in a way that can be more straightforward Additionally, the suggested mitigation for BatBadBut (https://flatt.tech/research/posts/batbadbut-you-cant-securely-execute-commands-on-windows/) relies on the post-2008 argv splitting behavior for roundtripping of the arguments given to `cmd.exe`. Note: it's not necessary for the suggested mitigation to work, but it is necessary for the suggested escaping to be parsed back into the intended argv by ArgIteratorWindows.
…ToArgvW On Windows, the command line arguments of a program are a single WTF-16 encoded string and it's up to the program to split it into an array of strings. In C/C++, the entry point of the C runtime takes care of splitting the command line and passing argc/argv to the main function. ziglang#18309 updated ArgIteratorWindows to match the behavior of CommandLineToArgvW, but it turns out that CommandLineToArgvW's behavior does not match the behavior of the C runtime post-2008. In 2008, the C runtime argv splitting changed how it handles consecutive double quotes within a quoted argument (it's now considered an escaped quote, e.g. `"foo""bar"` post-2008 would get parsed into `foo"bar`), and the rules around argv[0] were also changed. This commit makes ArgIteratorWindows match the behavior of the post-2008 C runtime, and adds a standalone test that verifies the behavior matches both the MSVC and MinGW argv splitting exactly in all cases (it checks that randomly generated command line strings get split the same way). The motivation here is roughly the same as when the same change was made in Rust (rust-lang/rust#87580), that is (paraphrased): - Consistent behavior between Zig and modern C/C++ programs - Allows users to escape double quotes in a way that can be more straightforward Additionally, the suggested mitigation for BatBadBut (https://flatt.tech/research/posts/batbadbut-you-cant-securely-execute-commands-on-windows/) relies on the post-2008 argv splitting behavior for roundtripping of the arguments given to `cmd.exe`. Note: it's not necessary for the suggested mitigation to work, but it is necessary for the suggested escaping to be parsed back into the intended argv by ArgIteratorWindows after being run through a `.bat` file.
…ToArgvW On Windows, the command line arguments of a program are a single WTF-16 encoded string and it's up to the program to split it into an array of strings. In C/C++, the entry point of the C runtime takes care of splitting the command line and passing argc/argv to the main function. ziglang#18309 updated ArgIteratorWindows to match the behavior of CommandLineToArgvW, but it turns out that CommandLineToArgvW's behavior does not match the behavior of the C runtime post-2008. In 2008, the C runtime argv splitting changed how it handles consecutive double quotes within a quoted argument (it's now considered an escaped quote, e.g. `"foo""bar"` post-2008 would get parsed into `foo"bar`), and the rules around argv[0] were also changed. This commit makes ArgIteratorWindows match the behavior of the post-2008 C runtime, and adds a standalone test that verifies the behavior matches both the MSVC and MinGW argv splitting exactly in all cases (it checks that randomly generated command line strings get split the same way). The motivation here is roughly the same as when the same change was made in Rust (rust-lang/rust#87580), that is (paraphrased): - Consistent behavior between Zig and modern C/C++ programs - Allows users to escape double quotes in a way that can be more straightforward Additionally, the suggested mitigation for BatBadBut (https://flatt.tech/research/posts/batbadbut-you-cant-securely-execute-commands-on-windows/) relies on the post-2008 argv splitting behavior for roundtripping of the arguments given to `cmd.exe`. Note: it's not necessary for the suggested mitigation to work, but it is necessary for the suggested escaping to be parsed back into the intended argv by ArgIteratorWindows after being run through a `.bat` file.
…ToArgvW On Windows, the command line arguments of a program are a single WTF-16 encoded string and it's up to the program to split it into an array of strings. In C/C++, the entry point of the C runtime takes care of splitting the command line and passing argc/argv to the main function. ziglang#18309 updated ArgIteratorWindows to match the behavior of CommandLineToArgvW, but it turns out that CommandLineToArgvW's behavior does not match the behavior of the C runtime post-2008. In 2008, the C runtime argv splitting changed how it handles consecutive double quotes within a quoted argument (it's now considered an escaped quote, e.g. `"foo""bar"` post-2008 would get parsed into `foo"bar`), and the rules around argv[0] were also changed. This commit makes ArgIteratorWindows match the behavior of the post-2008 C runtime, and adds a standalone test that verifies the behavior matches both the MSVC and MinGW argv splitting exactly in all cases (it checks that randomly generated command line strings get split the same way). The motivation here is roughly the same as when the same change was made in Rust (rust-lang/rust#87580), that is (paraphrased): - Consistent behavior between Zig and modern C/C++ programs - Allows users to escape double quotes in a way that can be more straightforward Additionally, the suggested mitigation for BatBadBut (https://flatt.tech/research/posts/batbadbut-you-cant-securely-execute-commands-on-windows/) relies on the post-2008 argv splitting behavior for roundtripping of the arguments given to `cmd.exe`. Note: it's not necessary for the suggested mitigation to work, but it is necessary for the suggested escaping to be parsed back into the intended argv by ArgIteratorWindows after being run through a `.bat` file.
…ToArgvW On Windows, the command line arguments of a program are a single WTF-16 encoded string and it's up to the program to split it into an array of strings. In C/C++, the entry point of the C runtime takes care of splitting the command line and passing argc/argv to the main function. ziglang#18309 updated ArgIteratorWindows to match the behavior of CommandLineToArgvW, but it turns out that CommandLineToArgvW's behavior does not match the behavior of the C runtime post-2008. In 2008, the C runtime argv splitting changed how it handles consecutive double quotes within a quoted argument (it's now considered an escaped quote, e.g. `"foo""bar"` post-2008 would get parsed into `foo"bar`), and the rules around argv[0] were also changed. This commit makes ArgIteratorWindows match the behavior of the post-2008 C runtime, and adds a standalone test that verifies the behavior matches both the MSVC and MinGW argv splitting exactly in all cases (it checks that randomly generated command line strings get split the same way). The motivation here is roughly the same as when the same change was made in Rust (rust-lang/rust#87580), that is (paraphrased): - Consistent behavior between Zig and modern C/C++ programs - Allows users to escape double quotes in a way that can be more straightforward Additionally, the suggested mitigation for BatBadBut (https://flatt.tech/research/posts/batbadbut-you-cant-securely-execute-commands-on-windows/) relies on the post-2008 argv splitting behavior for roundtripping of the arguments given to `cmd.exe`. Note: it's not necessary for the suggested mitigation to work, but it is necessary for the suggested escaping to be parsed back into the intended argv by ArgIteratorWindows after being run through a `.bat` file.
…ToArgvW On Windows, the command line arguments of a program are a single WTF-16 encoded string and it's up to the program to split it into an array of strings. In C/C++, the entry point of the C runtime takes care of splitting the command line and passing argc/argv to the main function. ziglang#18309 updated ArgIteratorWindows to match the behavior of CommandLineToArgvW, but it turns out that CommandLineToArgvW's behavior does not match the behavior of the C runtime post-2008. In 2008, the C runtime argv splitting changed how it handles consecutive double quotes within a quoted argument (it's now considered an escaped quote, e.g. `"foo""bar"` post-2008 would get parsed into `foo"bar`), and the rules around argv[0] were also changed. This commit makes ArgIteratorWindows match the behavior of the post-2008 C runtime, and adds a standalone test that verifies the behavior matches both the MSVC and MinGW argv splitting exactly in all cases (it checks that randomly generated command line strings get split the same way). The motivation here is roughly the same as when the same change was made in Rust (rust-lang/rust#87580), that is (paraphrased): - Consistent behavior between Zig and modern C/C++ programs - Allows users to escape double quotes in a way that can be more straightforward Additionally, the suggested mitigation for BatBadBut (https://flatt.tech/research/posts/batbadbut-you-cant-securely-execute-commands-on-windows/) relies on the post-2008 argv splitting behavior for roundtripping of the arguments given to `cmd.exe`. Note: it's not necessary for the suggested mitigation to work, but it is necessary for the suggested escaping to be parsed back into the intended argv by ArgIteratorWindows after being run through a `.bat` file.
The first commit adds
ArgIteratorWindows
, which faithfully replicates the quoting and escaping behavior observed inCommandLineToArgvW
and should make Zig applications play better with processes that abuse these quirks.ArgIterator
is updated to use this implementation on Windows.The new implementation used
ArgIteratorGeneral
as a base but the new rules were written by me, from scratch, by following public documentation and manually observing the behavior ofCommandLineToArgvW
on Windows 10; they weren't ported from a different project or based on disassembly.I left
ArgIteratorGeneral
as is, but since it is now only used for response files it might make sense to rename itArgIteratorResponseFile
and remove the Windows-specific wide stringinit
overload.The second commit updates the argv-to-command-line serialization in
ChildProcess
on Windows to follow these rules properly (including the special quoting rules for the first argument), which fixes a bug where empty arguments weren't quoted and thus ignored by child processes.The PR includes test cases for both parsing and serialization. If you would like to independently verify that the argv parsing matches
CommandLineToArgvW
you can paste the test cases fromtest "ArgIteratorWindows"
into this program and run it (only works on Windows for obvious reasons):This program will print out URI-escaped lines that help you compare the output of the Windows and Zig implementations to the expected result.
Happy to answer any questions or concerns you may have.