xref: /drstd/src/std/sys/windows/args.rs (revision 9670759b785600bf6315e4173e46a602f16add7a)
1 //! The Windows command line is just a string
2 //! <https://docs.microsoft.com/en-us/archive/blogs/larryosterman/the-windows-command-line-is-just-a-string>
3 //!
4 //! This module implements the parsing necessary to turn that string into a list of arguments.
5 
6 #[cfg(test)]
7 mod tests;
8 
9 use crate::std::ffi::OsString;
10 use crate::std::fmt;
11 use crate::std::io;
12 use crate::std::num::NonZeroU16;
13 use crate::std::os::windows::prelude::*;
14 use crate::std::path::{Path, PathBuf};
15 use crate::std::sys::path::get_long_path;
16 use crate::std::sys::process::ensure_no_nuls;
17 use crate::std::sys::windows::os::current_exe;
18 use crate::std::sys::{c, to_u16s};
19 use crate::std::sys_common::wstr::WStrUnits;
20 use crate::std::vec;
21 
22 use crate::std::iter;
23 
/// Const-context stand-in for `NonZeroU16::new(n).unwrap()`.
///
/// Returns `n` wrapped as a `NonZeroU16`.
///
/// # Panics
///
/// Panics (at compile time when evaluated in a const context) if `n` is zero.
///
/// FIXME: This can be removed once `Option::unwrap` is stably const.
/// See the `const_option` feature (#67441).
const fn non_zero_u16(n: u16) -> NonZeroU16 {
    if let Some(non_zero) = NonZeroU16::new(n) {
        non_zero
    } else {
        // Mirror the message `Option::unwrap` would produce.
        panic!("called `unwrap` on a `None` value")
    }
}
34 
args() -> Args35 pub fn args() -> Args {
36     // SAFETY: `GetCommandLineW` returns a pointer to a null terminated UTF-16
37     // string so it's safe for `WStrUnits` to use.
38     unsafe {
39         let lp_cmd_line = c::GetCommandLineW();
40         let parsed_args_list = parse_lp_cmd_line(WStrUnits::new(lp_cmd_line), || {
41             current_exe()
42                 .map(PathBuf::into_os_string)
43                 .unwrap_or_else(|_| OsString::new())
44         });
45 
46         Args {
47             parsed_args_list: parsed_args_list.into_iter(),
48         }
49     }
50 }
51 
/// Implements the Windows command-line argument parsing algorithm.
///
/// Microsoft's documentation for the Windows CLI argument format can be found at
/// <https://docs.microsoft.com/en-us/cpp/cpp/main-function-command-line-args?view=msvc-160#parsing-c-command-line-arguments>
///
/// A more in-depth explanation is here:
/// <https://daviddeley.com/autohotkey/parameters/parameters.htm#WIN>
///
/// Windows includes a function to do command line parsing in shell32.dll.
/// However, this is not used for two reasons:
///
/// 1. Linking with that DLL causes the process to be registered as a GUI application.
/// GUI applications add a bunch of overhead, even if no windows are drawn. See
/// <https://randomascii.wordpress.com/2018/12/03/a-not-called-function-can-cause-a-5x-slowdown/>.
///
/// 2. It does not follow the modern C/C++ argv rules outlined in the first two links above.
///
/// This function was tested for equivalence to the C/C++ parsing rules using an
/// extensive test suite available at
/// <https://github.com/ChrisDenton/winarg/tree/std>.
///
/// # Parameters
///
/// * `lp_cmd_line` - the UTF-16 code units of the command line, or `None` if
///   there is no command line.
/// * `exe_name` - called only when `lp_cmd_line` is `None` or empty, to supply
///   the value used as `argv[0]`.
///
/// # Returns
///
/// The parsed arguments. The first element is always `argv[0]`: either the
/// result of `exe_name()` (missing/empty command line) or the executable name
/// parsed from the start of the command line.
fn parse_lp_cmd_line<'a, F: Fn() -> OsString>(
    lp_cmd_line: Option<WStrUnits<'a>>,
    exe_name: F,
) -> Vec<OsString> {
    const BACKSLASH: NonZeroU16 = non_zero_u16(b'\\' as u16);
    const QUOTE: NonZeroU16 = non_zero_u16(b'"' as u16);
    const TAB: NonZeroU16 = non_zero_u16(b'\t' as u16);
    const SPACE: NonZeroU16 = non_zero_u16(b' ' as u16);

    let mut ret_val = Vec::new();
    // If the cmd line pointer is null or it points to an empty string then
    // return the name of the executable as argv[0].
    if lp_cmd_line.as_ref().and_then(|cmd| cmd.peek()).is_none() {
        ret_val.push(exe_name());
        return ret_val;
    }
    // Cannot panic: the check above returned early for `None`.
    let mut code_units = lp_cmd_line.unwrap();

    // The executable name at the beginning is special.
    let mut in_quotes = false;
    let mut cur = Vec::new();
    for w in &mut code_units {
        match w {
            // A quote mark always toggles `in_quotes` no matter what because
            // there are no escape characters when parsing the executable name.
            QUOTE => in_quotes = !in_quotes,
            // If not `in_quotes` then whitespace ends argv[0].
            SPACE | TAB if !in_quotes => break,
            // In all other cases the code unit is taken literally.
            _ => cur.push(w.get()),
        }
    }
    // Skip whitespace.
    code_units.advance_while(|w| w == SPACE || w == TAB);
    ret_val.push(OsString::from_wide(&cur));

    // Parse the arguments according to these rules:
    // * All code units are taken literally except space, tab, quote and backslash.
    // * When not `in_quotes`, space and tab separate arguments. Consecutive spaces and tabs are
    // treated as a single separator.
    // * A space or tab `in_quotes` is taken literally.
    // * A quote toggles `in_quotes` mode unless it's escaped. An escaped quote is taken literally.
    // * A quote can be escaped if preceded by an odd number of backslashes.
    // * If any number of backslashes is immediately followed by a quote then the number of
    // backslashes is halved (rounding down).
    // * Backslashes not followed by a quote are all taken literally.
    // * If `in_quotes` then a quote can also be escaped using another quote
    // (i.e. two consecutive quotes become one literal quote).
    let mut cur = Vec::new();
    let mut in_quotes = false;
    // `while let` rather than `for`: the loop body also advances and peeks `code_units`.
    while let Some(w) = code_units.next() {
        match w {
            // If not `in_quotes`, a space or tab ends the argument.
            SPACE | TAB if !in_quotes => {
                ret_val.push(OsString::from_wide(&cur[..]));
                cur.truncate(0);

                // Skip whitespace.
                code_units.advance_while(|w| w == SPACE || w == TAB);
            }
            // Backslashes can escape quotes or backslashes but only if consecutive backslashes are followed by a quote.
            BACKSLASH => {
                // Count the whole run of backslashes; the `+ 1` accounts for the one
                // already consumed by `next()` above.
                let backslash_count = code_units.advance_while(|w| w == BACKSLASH) + 1;
                if code_units.peek() == Some(QUOTE) {
                    cur.extend(iter::repeat(BACKSLASH.get()).take(backslash_count / 2));
                    // The quote is escaped if there are an odd number of backslashes.
                    // (With an even count the quote is left unconsumed and is handled
                    // by one of the `QUOTE` arms on the next iteration.)
                    if backslash_count % 2 == 1 {
                        code_units.next();
                        cur.push(QUOTE.get());
                    }
                } else {
                    // If there is no quote on the end then there is no escaping.
                    cur.extend(iter::repeat(BACKSLASH.get()).take(backslash_count));
                }
            }
            // If `in_quotes` and not backslash escaped (see above) then a quote either
            // unsets `in_quotes` or is escaped by another quote.
            QUOTE if in_quotes => match code_units.peek() {
                // Two consecutive quotes when `in_quotes` produces one literal quote.
                Some(QUOTE) => {
                    cur.push(QUOTE.get());
                    code_units.next();
                }
                // Otherwise unset `in_quotes`.
                Some(_) => in_quotes = false,
                // The end of the command line.
                // Push `cur` even if empty, which we do by breaking while `in_quotes` is still set.
                None => break,
            },
            // If not `in_quotes` and not BACKSLASH escaped (see above) then a quote sets `in_quotes`.
            QUOTE => in_quotes = true,
            // Everything else is always taken literally.
            _ => cur.push(w.get()),
        }
    }
    // Push the final argument, if any.
    // `in_quotes` being set here means the command line ended inside (or right at the
    // close of) a quoted argument, which must be emitted even when it is empty.
    if !cur.is_empty() || in_quotes {
        ret_val.push(OsString::from_wide(&cur[..]));
    }
    ret_val
}
173 
/// Iterator over the arguments of the current process, as produced by `args()`.
///
/// All arguments are parsed up front; this type only walks the resulting list.
pub struct Args {
    // Owning iterator over the already-parsed arguments (argv[0] first).
    parsed_args_list: vec::IntoIter<OsString>,
}
177 
178 impl fmt::Debug for Args {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result179     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
180         self.parsed_args_list.as_slice().fmt(f)
181     }
182 }
183 
// Forward iteration simply delegates to the underlying `vec::IntoIter`.
impl Iterator for Args {
    type Item = OsString;
    // Yields the next remaining argument from the front, or `None` when exhausted.
    fn next(&mut self) -> Option<OsString> {
        self.parsed_args_list.next()
    }
    // Forwards the size hint of the underlying iterator.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.parsed_args_list.size_hint()
    }
}
193 
// Reverse iteration also delegates to the underlying `vec::IntoIter`.
impl DoubleEndedIterator for Args {
    // Yields the next remaining argument from the back, or `None` when exhausted.
    fn next_back(&mut self) -> Option<OsString> {
        self.parsed_args_list.next_back()
    }
}
199 
// The number of remaining arguments is taken from the underlying iterator.
impl ExactSizeIterator for Args {
    // Returns how many arguments have not been yielded yet.
    fn len(&self) -> usize {
        self.parsed_args_list.len()
    }
}
205 
/// A single argument to be appended to a command line by `append_arg`.
///
/// The variant selects how the argument text is treated when it is written
/// into the command line.
#[derive(Debug)]
pub(crate) enum Arg {
    /// Add quotes (if needed) and escape embedded quotes/backslashes.
    Regular(OsString),
    /// Append raw string without quoting or escaping.
    Raw(OsString),
}
213 
/// Quoting policy chosen by `append_arg` for one argument.
enum Quote {
    /// Every arg is quoted (and escaped).
    Always,
    /// Args containing a space or tab, and empty args, are quoted; all are escaped.
    Auto,
    /// Arg appended without any changes: no quoting and no escaping (#29494).
    Never,
}
222 
append_arg(cmd: &mut Vec<u16>, arg: &Arg, force_quotes: bool) -> io::Result<()>223 pub(crate) fn append_arg(cmd: &mut Vec<u16>, arg: &Arg, force_quotes: bool) -> io::Result<()> {
224     let (arg, quote) = match arg {
225         Arg::Regular(arg) => (
226             arg,
227             if force_quotes {
228                 Quote::Always
229             } else {
230                 Quote::Auto
231             },
232         ),
233         Arg::Raw(arg) => (arg, Quote::Never),
234     };
235 
236     // If an argument has 0 characters then we need to quote it to ensure
237     // that it actually gets passed through on the command line or otherwise
238     // it will be dropped entirely when parsed on the other end.
239     ensure_no_nuls(arg)?;
240     let arg_bytes = arg.as_encoded_bytes();
241     let (quote, escape) = match quote {
242         Quote::Always => (true, true),
243         Quote::Auto => (
244             arg_bytes.iter().any(|c| *c == b' ' || *c == b'\t') || arg_bytes.is_empty(),
245             true,
246         ),
247         Quote::Never => (false, false),
248     };
249     if quote {
250         cmd.push('"' as u16);
251     }
252 
253     let mut backslashes: usize = 0;
254     for x in arg.encode_wide() {
255         if escape {
256             if x == '\\' as u16 {
257                 backslashes += 1;
258             } else {
259                 if x == '"' as u16 {
260                     // Add n+1 backslashes to total 2n+1 before internal '"'.
261                     cmd.extend((0..=backslashes).map(|_| '\\' as u16));
262                 }
263                 backslashes = 0;
264             }
265         }
266         cmd.push(x);
267     }
268 
269     if quote {
270         // Add n backslashes to total 2n before ending '"'.
271         cmd.extend((0..backslashes).map(|_| '\\' as u16));
272         cmd.push('"' as u16);
273     }
274     Ok(())
275 }
276 
make_bat_command_line( script: &[u16], args: &[Arg], force_quotes: bool, ) -> io::Result<Vec<u16>>277 pub(crate) fn make_bat_command_line(
278     script: &[u16],
279     args: &[Arg],
280     force_quotes: bool,
281 ) -> io::Result<Vec<u16>> {
282     // Set the start of the command line to `cmd.exe /c "`
283     // It is necessary to surround the command in an extra pair of quotes,
284     // hence the trailing quote here. It will be closed after all arguments
285     // have been added.
286     let mut cmd: Vec<u16> = "cmd.exe /d /c \"".encode_utf16().collect();
287 
288     // Push the script name surrounded by its quote pair.
289     cmd.push(b'"' as u16);
290     // Windows file names cannot contain a `"` character or end with `\\`.
291     // If the script name does then return an error.
292     if script.contains(&(b'"' as u16)) || script.last() == Some(&(b'\\' as u16)) {
293         return Err(io::const_io_error!(
294             io::ErrorKind::InvalidInput,
295             "Windows file names may not contain `\"` or end with `\\`"
296         ));
297     }
298     cmd.extend_from_slice(script.strip_suffix(&[0]).unwrap_or(script));
299     cmd.push(b'"' as u16);
300 
301     // Append the arguments.
302     // FIXME: This needs tests to ensure that the arguments are properly
303     // reconstructed by the batch script by default.
304     for arg in args {
305         cmd.push(' ' as u16);
306         // Make sure to always quote special command prompt characters, including:
307         // * Characters `cmd /?` says require quotes.
308         // * `%` for environment variables, as in `%TMP%`.
309         // * `|<>` pipe/redirect characters.
310         const SPECIAL: &[u8] = b"\t &()[]{}^=;!'+,`~%|<>";
311         let force_quotes = match arg {
312             Arg::Regular(arg) if !force_quotes => {
313                 arg.as_encoded_bytes().iter().any(|c| SPECIAL.contains(c))
314             }
315             _ => force_quotes,
316         };
317         append_arg(&mut cmd, arg, force_quotes)?;
318     }
319 
320     // Close the quote we left opened earlier.
321     cmd.push(b'"' as u16);
322 
323     Ok(cmd)
324 }
325 
326 /// Takes a path and tries to return a non-verbatim path.
327 ///
328 /// This is necessary because cmd.exe does not support verbatim paths.
to_user_path(path: &Path) -> io::Result<Vec<u16>>329 pub(crate) fn to_user_path(path: &Path) -> io::Result<Vec<u16>> {
330     from_wide_to_user_path(to_u16s(path)?)
331 }
/// Takes a UTF-16 path and tries to return an equivalent non-verbatim path.
///
/// * `\\?\C:\...` is turned into `C:\...` and `\\?\UNC\...` into `\\...`, but
///   only if `GetFullPathNameW` maps the stripped path back to exactly itself;
///   otherwise the original verbatim path is returned.
/// * Paths longer than `LEGACY_MAX_PATH` (260) code units are returned unchanged.
/// * Every other path is passed to `get_long_path(path, false)`.
///
/// NOTE(review): `path` is presumably nul-terminated — the comparisons below
/// drop its final unit and the rewritten paths get a `0` pushed — confirm
/// against callers.
pub(crate) fn from_wide_to_user_path(mut path: Vec<u16>) -> io::Result<Vec<u16>> {
    use crate::std::ptr;
    use crate::std::sys::windows::fill_utf16_buf;

    // UTF-16 encoded code points, used in parsing and building UTF-16 paths.
    // All of these are in the ASCII range so they can be cast directly to `u16`.
    const SEP: u16 = b'\\' as _;
    const QUERY: u16 = b'?' as _;
    const COLON: u16 = b':' as _;
    const U: u16 = b'U' as _;
    const N: u16 = b'N' as _;
    const C: u16 = b'C' as _;

    // Early return if the path is too long to remove the verbatim prefix.
    const LEGACY_MAX_PATH: usize = 260;
    if path.len() > LEGACY_MAX_PATH {
        return Ok(path);
    }

    match &path[..] {
        // `\\?\C:\...` => `C:\...`
        [SEP, SEP, QUERY, SEP, _, COLON, SEP, ..] => unsafe {
            // Pointer to the part after the 4-unit `\\?\` prefix.
            let lpfilename = path[4..].as_ptr();
            fill_utf16_buf(
                |buffer, size| c::GetFullPathNameW(lpfilename, size, buffer, ptr::null_mut()),
                |full_path: &[u16]| {
                    // Only drop the prefix if the resolved path equals the stripped
                    // path (excluding the final unit of `path`).
                    if full_path == &path[4..path.len() - 1] {
                        let mut path: Vec<u16> = full_path.into();
                        path.push(0);
                        path
                    } else {
                        // Resolution changed the path; keep the verbatim original.
                        path
                    }
                },
            )
        },
        // `\\?\UNC\...` => `\\...`
        [SEP, SEP, QUERY, SEP, U, N, C, SEP, ..] => unsafe {
            // Change the `C` in `UNC\` to `\` so we can get a slice that starts with `\\`.
            path[6] = b'\\' as u16;
            let lpfilename = path[6..].as_ptr();
            fill_utf16_buf(
                |buffer, size| c::GetFullPathNameW(lpfilename, size, buffer, ptr::null_mut()),
                |full_path: &[u16]| {
                    // Same round-trip check as above, against the `\\...` form.
                    if full_path == &path[6..path.len() - 1] {
                        let mut path: Vec<u16> = full_path.into();
                        path.push(0);
                        path
                    } else {
                        // Restore the 'C' in "UNC".
                        path[6] = b'C' as u16;
                        path
                    }
                },
            )
        },
        // For everything else, leave the path unchanged.
        _ => get_long_path(path, false),
    }
}
391 }
392