Move most of the peculiar argument handling for external calls into the parser (#13089)

# Description

We've had a lot of different issues and PRs related to arg handling with externals since the rewrite of `run-external` in #12921:

- #12950
- #12955
- #13000
- #13001
- #13021
- #13027
- #13028
- #13073

Many of these are caused by the argument handling of external calls and `run-external` being very special and involving the parser handing quoted strings over to `run-external` so that it knows whether to expand tildes and globs and so on. This is really unusual and also makes it harder to use `run-external`, and also harder to understand it (and probably is part of the reason why it was rewritten in the first place).

This PR moves a lot more of that work over to the parser, so that by the time `run-external` gets it, it's dealing with much more normal Nushell values. In particular:

- Unquoted strings are handled as globs with no expand
- The unescaped-but-quoted handling of strings was removed, and the parser constructs normal looking strings instead, removing internal quotes so that `run-external` doesn't have to do it
- Bare word interpolation is now supported and expansion is done in this case
- Expressions typed as `Glob` containing `Expr::StringInterpolation` now produce `Value::Glob` instead, with the quoted status from the expr passed through so we know if it was a bare word
- Bare word interpolation for values typed as `glob` is now possible, but not implemented
- Because expansion is now triggered by `Value::Glob(_, false)` instead of looking at the expr, externals now support glob types

# User-Facing Changes

- Bare word interpolation works for external command options, and otherwise embedded in other strings:

  ```nushell
  ^echo --foo=(2 + 2) # prints --foo=4
  ^echo -foo=$"(2 + 2)" # prints -foo=4
  ^echo foo="(2 + 2)" # prints (no interpolation!) foo=(2 + 2)
  ^echo foo,(2 + 2),bar # prints foo,4,bar
  ```

- Bare word interpolation expands for external command head/args:

  ```nushell
  let name = "exa"
  ~/.cargo/bin/($name) # this works, and expands the tilde
  ^$"~/.cargo/bin/($name)" # this doesn't expand the tilde
  ^echo ~/($name)/* # this glob is expanded
  ^echo $"~/($name)/*" # this isn't expanded
  ```

- Ndots are now supported for the head of an external command (`^.../foo` works)

- Glob values are now supported for head/args of an external command, and expanded appropriately:

  ```nushell
  ^("~/.cargo/bin/exa" | into glob) # the tilde is expanded
  ^echo ("*.txt" | into glob) # this glob is expanded
  ```

- `run-external` now works more like any other command, without expecting a special call convention for its args:

  ```nushell
  run-external echo "'foo'"
  # before PR: 'foo'
  # after PR: foo
  run-external echo "*.txt"
  # before PR: (glob is expanded)
  # after PR: *.txt
  ```

# Tests + Formatting

Lots of tests added and cleaned up. Some tests that weren't active on Windows changed to use `nu --testbin cococo` so that they can work. Added a test for Linux only to make sure tilde expansion of commands works, because changing `HOME` there causes `~` to reliably change.

- 🟢 `toolkit fmt`
- 🟢 `toolkit clippy`
- 🟢 `toolkit test`
- 🟢 `toolkit test stdlib`

# After Submitting

- [ ] release notes: make sure to mention the new syntaxes that are supported
2024-06-19 21:00:03 -07:00
parent 44aa0a2de4
commit bdc32345bd
13 changed files with 880 additions and 476 deletions
--- a/crates/nu-parser/src/flatten.rs
+++ b/crates/nu-parser/src/flatten.rs
@ -26,6 +26,7 @@ pub enum FlatShape {
    Flag,
    Float,
    Garbage,
+    GlobInterpolation,
    GlobPattern,
    Int,
    InternalCall(DeclId),
@ -67,6 +68,7 @@ impl FlatShape {
            FlatShape::Flag => "shape_flag",
            FlatShape::Float => "shape_float",
            FlatShape::Garbage => "shape_garbage",
+            FlatShape::GlobInterpolation => "shape_glob_interpolation",
            FlatShape::GlobPattern => "shape_globpattern",
            FlatShape::Int => "shape_int",
            FlatShape::InternalCall(_) => "shape_internalcall",
@ -277,7 +279,7 @@ fn flatten_expression_into(
            output[arg_start..].sort();
        }
        Expr::ExternalCall(head, args) => {
-            if let Expr::String(..) = &head.expr {
+            if let Expr::String(..) | Expr::GlobPattern(..) = &head.expr {
                output.push((head.span, FlatShape::External));
            } else {
                flatten_expression_into(working_set, head, output);
@ -286,7 +288,7 @@ fn flatten_expression_into(
            for arg in args.as_ref() {
                match arg {
                    ExternalArgument::Regular(expr) => {
-                        if let Expr::String(..) = &expr.expr {
+                        if let Expr::String(..) | Expr::GlobPattern(..) = &expr.expr {
                            output.push((expr.span, FlatShape::ExternalArg));
                        } else {
                            flatten_expression_into(working_set, expr, output);
@ -431,6 +433,25 @@ fn flatten_expression_into(
            }
            output.extend(flattened);
        }
+        Expr::GlobInterpolation(exprs, quoted) => {
+            let mut flattened = vec![];
+            for expr in exprs {
+                flatten_expression_into(working_set, expr, &mut flattened);
+            }
+
+            if *quoted {
+                // If we aren't a bare word interpolation, also highlight the outer quotes
+                output.push((
+                    Span::new(expr.span.start, expr.span.start + 2),
+                    FlatShape::GlobInterpolation,
+                ));
+                flattened.push((
+                    Span::new(expr.span.end - 1, expr.span.end),
+                    FlatShape::GlobInterpolation,
+                ));
+            }
+            output.extend(flattened);
+        }
        Expr::Record(list) => {
            let outer_span = expr.span;
            let mut last_end = outer_span.start;
--- a/crates/nu-parser/src/parser.rs
+++ b/crates/nu-parser/src/parser.rs
@ -16,7 +16,6 @@ use nu_protocol::{
    IN_VARIABLE_ID,
 };
 use std::{
-    borrow::Cow,
    collections::{HashMap, HashSet},
    num::ParseIntError,
    str,
@ -222,6 +221,209 @@ pub(crate) fn check_call(
    }
 }

+/// Parses a string in the arg or head position of an external call.
+///
+/// If the string begins with `r#`, it is parsed as a raw string. If it doesn't contain any quotes
+/// or parentheses, it is parsed as a glob pattern so that tilde and glob expansion can be handled
+/// by `run-external`. Otherwise, we use a custom state machine to put together an interpolated
+/// string, where each balanced pair of quotes is parsed as a separate part of the string, and then
+/// concatenated together.
+///
+/// For example, `-foo="bar\nbaz"` becomes `$"-foo=bar\nbaz"`
+///
+/// In the state-machine case, the kind of expression returned depends on whether the whole word
+/// is quoted and on whether every part parsed to a plain `Expr::String`:
+///
+/// - all parts plain strings, quoted: `Expr::String`, typed `Type::String`
+/// - all parts plain strings, not quoted: `Expr::GlobPattern(_, false)`, typed `Type::Glob`
+/// - otherwise, quoted: `Expr::StringInterpolation`, typed `Type::String`
+/// - otherwise, not quoted: `Expr::GlobInterpolation(_, false)`, typed `Type::Glob`
+///
+/// The returned expression always carries the original `span` of the whole word.
+fn parse_external_string(working_set: &mut StateWorkingSet, span: Span) -> Expression {
+    let contents = &working_set.get_span_contents(span);
+
+    if contents.starts_with(b"r#") {
+        parse_raw_string(working_set, span)
+    } else if contents
+        .iter()
+        .any(|b| matches!(b, b'"' | b'\'' | b'(' | b')'))
+    {
+        // Scanner state while walking the bytes of the word. In both variants `from` is an
+        // offset into `contents` (i.e. relative to `span.start`) where the current part began.
+        enum State {
+            // Outside of any quotes.
+            Bare {
+                from: usize,
+            },
+            // Inside a quoted part (optionally introduced by `$`).
+            Quote {
+                from: usize,
+                // The quote byte that opened this part; only the same byte can close it.
+                quote_char: u8,
+                // Set when the previous byte was an unescaped backslash inside double quotes.
+                escaped: bool,
+                // Number of quote levels still considered open; the part ends when it hits 0.
+                depth: i32,
+            },
+        }
+        // Find the spans of parts of the string that can be parsed as their own strings for
+        // concatenation.
+        //
+        // By passing each of these parts to `parse_string()`, we can eliminate the quotes and also
+        // handle string interpolation.
+        //
+        // `make_span` converts a pair of offsets within `contents` into an absolute span.
+        let make_span = |from: usize, index: usize| Span {
+            start: span.start + from,
+            end: span.start + index,
+        };
+        let mut spans = vec![];
+        let mut state = State::Bare { from: 0 };
+        let mut index = 0;
+        while index < contents.len() {
+            let ch = contents[index];
+            match &mut state {
+                State::Bare { from } => match ch {
+                    b'"' | b'\'' => {
+                        // Push bare string
+                        if index != *from {
+                            spans.push(make_span(*from, index));
+                        }
+                        // then transition to other state
+                        state = State::Quote {
+                            from: index,
+                            quote_char: ch,
+                            escaped: false,
+                            depth: 1,
+                        };
+                    }
+                    b'$' => {
+                        // A `$` only matters when it is immediately followed by a quote;
+                        // otherwise it is consumed like any other bare byte.
+                        if let Some(&quote_char @ (b'"' | b'\'')) = contents.get(index + 1) {
+                            // Start a dollar quote (interpolated string)
+                            if index != *from {
+                                spans.push(make_span(*from, index));
+                            }
+                            // The part starts at the `$`, so the dollar sign is included in the
+                            // span that is later handed to `parse_string()`.
+                            state = State::Quote {
+                                from: index,
+                                quote_char,
+                                escaped: false,
+                                depth: 1,
+                            };
+                            // Skip over two chars (the dollar sign and the quote)
+                            // `continue` bypasses the shared `index += 1` at the loop's end.
+                            index += 2;
+                            continue;
+                        }
+                    }
+                    // Continue to consume
+                    _ => (),
+                },
+                State::Quote {
+                    from,
+                    quote_char,
+                    escaped,
+                    depth,
+                } => match ch {
+                    ch if ch == *quote_char && !*escaped => {
+                        // Count if there are more than `depth` quotes remaining
+                        // (the count starts at the current byte, so it includes this quote)
+                        if contents[index..]
+                            .iter()
+                            .filter(|b| *b == quote_char)
+                            .count() as i32
+                            > *depth
+                        {
+                            // Increment depth to be greedy
+                            *depth += 1;
+                        } else {
+                            // Decrement depth
+                            *depth -= 1;
+                        }
+                        if *depth == 0 {
+                            // End of string
+                            // (`index + 1` so that the closing quote is part of the span)
+                            spans.push(make_span(*from, index + 1));
+                            // go back to Bare state
+                            state = State::Bare { from: index + 1 };
+                        }
+                    }
+                    b'\\' if !*escaped && *quote_char == b'"' => {
+                        // The next token is escaped so it doesn't count (only for double quote)
+                        *escaped = true;
+                    }
+                    _ => {
+                        // Any other byte (including an escaped quote or an escaped backslash)
+                        // ends the escape.
+                        *escaped = false;
+                    }
+                },
+            }
+            index += 1;
+        }
+
+        // Add the final span
+        // This covers both a trailing bare run and a quote that was never closed; the latter is
+        // pushed as-is. NOTE(review): presumably `parse_string()` reports the unterminated quote
+        // -- confirm.
+        match state {
+            State::Bare { from } | State::Quote { from, .. } => {
+                if from < contents.len() {
+                    spans.push(make_span(from, contents.len()));
+                }
+            }
+        }
+
+        // Log the spans that will be parsed
+        if log::log_enabled!(log::Level::Trace) {
+            let contents = spans
+                .iter()
+                .map(|span| String::from_utf8_lossy(working_set.get_span_contents(*span)))
+                .collect::<Vec<_>>();
+
+            trace!("parsing: external string, parts: {contents:?}")
+        }
+
+        // Check if the whole thing is quoted. If not, it should be a glob
+        // Only the `$"..."` form is checked explicitly here; everything else is delegated to
+        // `is_quoted`, which is defined outside this function. NOTE(review): presumably it checks
+        // for matching leading/trailing quote characters -- confirm.
+        let quoted =
+            (contents.len() >= 3 && contents.starts_with(b"$\"") && contents.ends_with(b"\""))
+                || is_quoted(contents);
+
+        // Parse each as its own string
+        let exprs: Vec<Expression> = spans
+            .into_iter()
+            .map(|span| parse_string(working_set, span))
+            .collect();
+
+        if exprs
+            .iter()
+            .all(|expr| matches!(expr.expr, Expr::String(..)))
+        {
+            // If the exprs are all strings anyway, just collapse into a single string.
+            let string = exprs
+                .into_iter()
+                .map(|expr| {
+                    let Expr::String(contents) = expr.expr else {
+                        unreachable!("already checked that this was a String")
+                    };
+                    contents
+                })
+                .collect::<String>();
+            // A fully quoted word stays a plain string; anything else becomes a glob pattern
+            // (with its flag set to `false`) so the unquoted state is visible downstream.
+            if quoted {
+                Expression::new(working_set, Expr::String(string), span, Type::String)
+            } else {
+                Expression::new(
+                    working_set,
+                    Expr::GlobPattern(string, false),
+                    span,
+                    Type::Glob,
+                )
+            }
+        } else {
+            // Flatten any string interpolations contained within the exprs.
+            // Sub-expressions of a nested `StringInterpolation` are spliced in directly; every
+            // other expression is kept as a single element.
+            let exprs = exprs
+                .into_iter()
+                .flat_map(|expr| match expr.expr {
+                    Expr::StringInterpolation(subexprs) => subexprs,
+                    _ => vec![expr],
+                })
+                .collect();
+            // Make an interpolation out of the expressions. Use `GlobInterpolation` if it's a bare
+            // word, so that the unquoted state can get passed through to `run-external`.
+            if quoted {
+                Expression::new(
+                    working_set,
+                    Expr::StringInterpolation(exprs),
+                    span,
+                    Type::String,
+                )
+            } else {
+                Expression::new(
+                    working_set,
+                    Expr::GlobInterpolation(exprs, false),
+                    span,
+                    Type::Glob,
+                )
+            }
+        }
+    } else {
+        // No quotes or parentheses anywhere in the word: hand the whole span to the glob
+        // pattern parser.
+        parse_glob_pattern(working_set, span)
+    }
+}
+
 fn parse_external_arg(working_set: &mut StateWorkingSet, span: Span) -> ExternalArgument {
    let contents = working_set.get_span_contents(span);

@ -229,8 +431,6 @@ fn parse_external_arg(working_set: &mut StateWorkingSet, span: Span) -> External
        ExternalArgument::Regular(parse_dollar_expr(working_set, span))
    } else if contents.starts_with(b"[") {
        ExternalArgument::Regular(parse_list_expression(working_set, span, &SyntaxShape::Any))
-    } else if contents.starts_with(b"r#") {
-        ExternalArgument::Regular(parse_raw_string(working_set, span))
    } else if contents.len() > 3
        && contents.starts_with(b"...")
        && (contents[3] == b'$' || contents[3] == b'[' || contents[3] == b'(')
@ -241,18 +441,7 @@ fn parse_external_arg(working_set: &mut StateWorkingSet, span: Span) -> External
            &SyntaxShape::List(Box::new(SyntaxShape::Any)),
        ))
    } else {
-        // Eval stage trims the quotes, so we don't have to do the same thing when parsing.
-        let (contents, err) = unescape_string_preserving_quotes(contents, span);
-        if let Some(err) = err {
-            working_set.error(err);
-        }
-
-        ExternalArgument::Regular(Expression::new(
-            working_set,
-            Expr::String(contents),
-            span,
-            Type::String,
-        ))
+        ExternalArgument::Regular(parse_external_string(working_set, span))
    }
 }

@ -274,18 +463,7 @@ pub fn parse_external_call(working_set: &mut StateWorkingSet, spans: &[Span]) ->
        let arg = parse_expression(working_set, &[head_span]);
        Box::new(arg)
    } else {
-        // Eval stage will unquote the string, so we don't bother with that here
-        let (contents, err) = unescape_string_preserving_quotes(&head_contents, head_span);
-        if let Some(err) = err {
-            working_set.error(err)
-        }
-
-        Box::new(Expression::new(
-            working_set,
-            Expr::String(contents),
-            head_span,
-            Type::String,
-        ))
+        Box::new(parse_external_string(working_set, head_span))
    };

    let args = spans[1..]
@ -2639,23 +2817,6 @@ pub fn unescape_unquote_string(bytes: &[u8], span: Span) -> (String, Option<Pars
    }
 }

-/// XXX: This is here temporarily as a patch, but we should replace this with properly representing
-/// the quoted state of a string in the AST
-fn unescape_string_preserving_quotes(bytes: &[u8], span: Span) -> (String, Option<ParseError>) {
-    let (bytes, err) = if bytes.starts_with(b"\"") {
-        let (bytes, err) = unescape_string(bytes, span);
-        (Cow::Owned(bytes), err)
-    } else {
-        (Cow::Borrowed(bytes), None)
-    };
-
-    // The original code for args used lossy conversion here, even though that's not what we
-    // typically use for strings. Revisit whether that's actually desirable later, but don't
-    // want to introduce a breaking change for this patch.
-    let token = String::from_utf8_lossy(&bytes).into_owned();
-    (token, err)
-}
-
 pub fn parse_string(working_set: &mut StateWorkingSet, span: Span) -> Expression {
    trace!("parsing: string");

@ -6012,7 +6173,7 @@ pub fn discover_captures_in_expr(
        }
        Expr::String(_) => {}
        Expr::RawString(_) => {}
-        Expr::StringInterpolation(exprs) => {
+        Expr::StringInterpolation(exprs) | Expr::GlobInterpolation(exprs, _) => {
            for expr in exprs {
                discover_captures_in_expr(working_set, expr, seen, seen_blocks, output)?;
            }