Change group-by to accept cell paths (#9020)

Closes #9003. This PR changes `group-by` so that its optional argument is interpreted as a cell path. In turn, this lets users use `?` to ignore rows that are missing the column they wish to group on. For example:

```
> [{foo: 123}, {foo: 234}, {bar: 345}] | group-by foo
Error: nu::shell::column_not_found

  × Cannot find column
   ╭─[entry #3:1:1]
 1 │ [{foo: 123}, {foo: 234}, {bar: 345}] | group-by foo
   ·                          ─────┬────             ─┬─
   ·                               │                  ╰── cannot find column 'foo'
   ·                               ╰── value originates here
   ╰────

> [{foo: 123}, {foo: 234}, {bar: 345}] | group-by foo?
╭─────┬───────────────╮
│ 123 │ [table 1 row] │
│ 234 │ [table 1 row] │
╰─────┴───────────────╯
```

~~This removes the ability to pass `group-by` a closure or block (I wasn't able to figure out how to make the 2 features coexist), and so it is a breaking change. I think this is OK; I didn't even know `group-by` could accept a closure or block because there was no example for that functionality.~~
2023-05-17 16:34:44 -07:00 · 2023-05-17 16:34:44 -07:00 · 9ce61dc677
commit 9ce61dc677
parent b912d4c1ea
4 changed files with 212 additions and 157 deletions
--- a/crates/nu-command/src/example_test.rs
+++ b/crates/nu-command/src/example_test.rs
@@ -10,8 +10,8 @@ pub fn test_examples(cmd: impl Command + 'static) {
 mod test_examples {
    use super::super::{
        Ansi, Date, Enumerate, Flatten, From, Get, Into, IntoString, LetEnv, Math, MathEuler,
-        MathPi, MathRound, ParEach, Path, Random, Sort, SortBy, Split, SplitColumn, SplitRow, Str,
-        StrJoin, StrLength, StrReplace, Update, Url, Values, Wrap,
+        MathPi, MathRound, ParEach, Path, PathParse, Random, Sort, SortBy, Split, SplitColumn,
+        SplitRow, Str, StrJoin, StrLength, StrReplace, Update, Url, Values, Wrap,
    };
    use crate::{Each, To};
    use nu_cmd_lang::example_support::{
@@ -85,6 +85,7 @@ mod test_examples {
            working_set.add_decl(Box::new(MathRound));
            working_set.add_decl(Box::new(Mut));
            working_set.add_decl(Box::new(Path));
+            working_set.add_decl(Box::new(PathParse));
            working_set.add_decl(Box::new(ParEach));
            working_set.add_decl(Box::new(Random));
            working_set.add_decl(Box::new(Sort));
--- a/crates/nu-command/src/filters/group_by.rs
+++ b/crates/nu-command/src/filters/group_by.rs
@@ -1,9 +1,8 @@
 use nu_engine::{eval_block, CallExt};
-use nu_protocol::ast::Call;
+use nu_protocol::ast::{Call, CellPath};
 use nu_protocol::engine::{Closure, Command, EngineState, Stack};
 use nu_protocol::{
-    Example, IntoPipelineData, PipelineData, ShellError, Signature, Span, Spanned, SyntaxShape,
-    Type, Value,
+    Example, IntoPipelineData, PipelineData, ShellError, Signature, Span, SyntaxShape, Type, Value,
 };

 use indexmap::IndexMap;
@@ -26,7 +25,16 @@ impl Command for GroupBy {
                Type::List(Box::new(Type::Any)),
                Type::Record(vec![]),
            )])
-            .optional("grouper", SyntaxShape::Any, "the grouper value to use")
+            .optional(
+                "grouper",
+                SyntaxShape::OneOf(vec![
+                    SyntaxShape::CellPath,
+                    SyntaxShape::Block,
+                    SyntaxShape::Closure(None),
+                    SyntaxShape::Closure(Some(vec![SyntaxShape::Any])),
+                ]),
+                "the path to the column to group on",
+            )
    }

    fn usage(&self) -> &str {
@@ -50,6 +58,33 @@ impl Command for GroupBy {
                example: r#"ls | group-by type"#,
                result: None,
            },
+            Example {
+                description: "Group items by the \"foo\" column's values, ignoring records without a \"foo\" column",
+                example: r#"open cool.json | group-by foo?"#,
+                result: None,
+            },
+            Example {
+                description: "Group using a block which is evaluated against each input value",
+                example: "[foo.txt bar.csv baz.txt] | group-by { path parse | get extension }",
+                result: Some(Value::Record {
+                    cols: vec!["txt".to_string(), "csv".to_string()],
+                    vals: vec![
+                        Value::List {
+                            vals: vec![
+                                Value::test_string("foo.txt"),
+                                Value::test_string("baz.txt"),
+                            ],
+                            span: Span::test_data(),
+                        },
+                        Value::List {
+                            vals: vec![Value::test_string("bar.csv")],
+                            span: Span::test_data(),
+                        },
+                    ],
+                    span: Span::test_data(),
+                }),
+            },
+
            Example {
                description: "You can also group by raw values by leaving out the argument",
                example: "['1' '3' '1' '3' '2' '1' '1'] | group-by",
@@ -81,131 +116,163 @@ impl Command for GroupBy {
    }
 }

-enum Grouper {
-    ByColumn(Option<Spanned<String>>),
-    ByBlock,
-}
-
 pub fn group_by(
    engine_state: &EngineState,
    stack: &mut Stack,
    call: &Call,
    input: PipelineData,
 ) -> Result<PipelineData, ShellError> {
-    let name = call.head;
+    let span = call.head;

    let grouper: Option<Value> = call.opt(engine_state, stack, 0)?;
    let values: Vec<Value> = input.into_iter().collect();
-    let mut keys: Vec<Result<String, ShellError>> = vec![];
-    let mut group_strategy = Grouper::ByColumn(None);

    if values.is_empty() {
        return Err(ShellError::GenericError(
            "expected table from pipeline".into(),
            "requires a table input".into(),
-            Some(name),
+            Some(span),
            None,
            Vec::new(),
        ));
    }

-    let first = values[0].clone();
-
-    let value_list = Value::List {
-        vals: values.clone(),
-        span: name,
-    };
-
-    match grouper {
+    let group_value = match grouper {
+        Some(Value::CellPath { val, span }) => group_cell_path(val, values, span)?,
        Some(Value::Block { .. }) | Some(Value::Closure { .. }) => {
            let block: Option<Closure> = call.opt(engine_state, stack, 0)?;
-            let error_key = "error";
-
-            for value in values {
-                if let Some(capture_block) = &block {
-                    let mut stack = stack.captures_to_stack(&capture_block.captures);
-                    let block = engine_state.get_block(capture_block.block_id);
-                    let pipeline = eval_block(
-                        engine_state,
-                        &mut stack,
-                        block,
-                        value.into_pipeline_data(),
-                        call.redirect_stdout,
-                        call.redirect_stderr,
-                    );
-
-                    match pipeline {
-                        Ok(s) => {
-                            let collection: Vec<Value> = s.into_iter().collect();
-
-                            if collection.len() > 1 {
-                                return Err(ShellError::GenericError(
-                                    "expected one value from the block".into(),
-                                    "requires a table with one value for grouping".into(),
-                                    Some(name),
-                                    None,
-                                    Vec::new(),
-                                ));
-                            }
-
-                            let value = match collection.get(0) {
-                                Some(Value::Error { .. }) | None => Value::string(error_key, name),
-                                Some(return_value) => return_value.clone(),
-                            };
-
-                            keys.push(value.as_string());
-                        }
-                        Err(_) => {
-                            keys.push(Ok(error_key.into()));
-                        }
-                    }
-                }
-            }
-
-            group_strategy = Grouper::ByBlock;
+            group_closure(&values, span, block, stack, engine_state, call)?
        }
-        Some(other) => {
-            group_strategy = Grouper::ByColumn(Some(Spanned {
-                item: other.as_string()?,
-                span: name,
-            }));
+        None => group_no_grouper(values, span)?,
+        _ => {
+            return Err(ShellError::TypeMismatch {
+                err_message: "unsupported grouper type".to_string(),
+                span,
+            })
        }
-        _ => {}
-    }
-
-    let name = if let Ok(span) = first.span() {
-        span
-    } else {
-        name
    };

-    let group_value = match group_strategy {
-        Grouper::ByBlock => {
-            let map = keys;
-
-            let block = Box::new(move |idx: usize, row: &Value| match map.get(idx) {
-                Some(Ok(key)) => Ok(key.clone()),
-                Some(Err(reason)) => Err(reason.clone()),
-                None => row.as_string(),
-            });
-
-            data_group(&value_list, &Some(block), name)
-        }
-        Grouper::ByColumn(column_name) => group(&column_name, &value_list, name),
-    };
-
-    Ok(PipelineData::Value(group_value?, None))
+    Ok(PipelineData::Value(group_value, None))
 }

-#[allow(clippy::type_complexity)]
-pub fn data_group(
-    values: &Value,
-    grouper: &Option<Box<dyn Fn(usize, &Value) -> Result<String, ShellError> + Send>>,
+pub fn group_cell_path(
+    column_name: CellPath,
+    values: Vec<Value>,
    span: Span,
 ) -> Result<Value, ShellError> {
    let mut groups: IndexMap<String, Vec<Value>> = IndexMap::new();

-    for (idx, value) in values.clone().into_pipeline_data().into_iter().enumerate() {
+    for value in values.into_iter() {
+        let group_key = value
+            .clone()
+            .follow_cell_path(&column_name.members, false)?;
+        if matches!(group_key, Value::Nothing { .. }) {
+            continue; // likely the result of a failed optional access, ignore this value
+        }
+
+        let group_key = group_key.as_string()?;
+        let group = groups.entry(group_key).or_default();
+        group.push(value);
+    }
+
+    let mut cols = vec![];
+    let mut vals = vec![];
+
+    for (k, v) in groups {
+        cols.push(k.to_string());
+        vals.push(Value::List { vals: v, span });
+    }
+
+    Ok(Value::Record { cols, vals, span })
+}
+
+pub fn group_no_grouper(values: Vec<Value>, span: Span) -> Result<Value, ShellError> {
+    let mut groups: IndexMap<String, Vec<Value>> = IndexMap::new();
+
+    for value in values.into_iter() {
+        let group_key = value.as_string()?;
+        let group = groups.entry(group_key).or_default();
+        group.push(value);
+    }
+
+    let mut cols = vec![];
+    let mut vals = vec![];
+
+    for (k, v) in groups {
+        cols.push(k.to_string());
+        vals.push(Value::List { vals: v, span });
+    }
+
+    Ok(Value::Record { cols, vals, span })
+}
+
+// TODO: refactor this, it's a bit of a mess
+fn group_closure(
+    values: &Vec<Value>,
+    span: Span,
+    block: Option<Closure>,
+    stack: &mut Stack,
+    engine_state: &EngineState,
+    call: &Call,
+) -> Result<Value, ShellError> {
+    let error_key = "error";
+    let mut keys: Vec<Result<String, ShellError>> = vec![];
+    let value_list = Value::List {
+        vals: values.clone(),
+        span,
+    };
+
+    for value in values {
+        if let Some(capture_block) = &block {
+            let mut stack = stack.captures_to_stack(&capture_block.captures);
+            let block = engine_state.get_block(capture_block.block_id);
+            let pipeline = eval_block(
+                engine_state,
+                &mut stack,
+                block,
+                value.clone().into_pipeline_data(),
+                call.redirect_stdout,
+                call.redirect_stderr,
+            );
+
+            match pipeline {
+                Ok(s) => {
+                    let collection: Vec<Value> = s.into_iter().collect();
+
+                    if collection.len() > 1 {
+                        return Err(ShellError::GenericError(
+                            "expected one value from the block".into(),
+                            "requires a table with one value for grouping".into(),
+                            Some(span),
+                            None,
+                            Vec::new(),
+                        ));
+                    }
+
+                    let value = match collection.get(0) {
+                        Some(Value::Error { .. }) | None => Value::string(error_key, span),
+                        Some(return_value) => return_value.clone(),
+                    };
+
+                    keys.push(value.as_string());
+                }
+                Err(_) => {
+                    keys.push(Ok(error_key.into()));
+                }
+            }
+        }
+    }
+    let map = keys;
+    let block = Box::new(move |idx: usize, row: &Value| match map.get(idx) {
+        Some(Ok(key)) => Ok(key.clone()),
+        Some(Err(reason)) => Err(reason.clone()),
+        None => row.as_string(),
+    });
+
+    let grouper = &Some(block);
+    let mut groups: IndexMap<String, Vec<Value>> = IndexMap::new();
+
+    for (idx, value) in value_list.into_pipeline_data().into_iter().enumerate() {
        let group_key = if let Some(ref grouper) = grouper {
            grouper(idx, &value)
        } else {
@@ -227,48 +294,6 @@ pub fn data_group(
    Ok(Value::Record { cols, vals, span })
 }

-pub fn group(
-    column_name: &Option<Spanned<String>>,
-    values: &Value,
-    span: Span,
-) -> Result<Value, ShellError> {
-    let name = span;
-
-    let grouper = if let Some(column_name) = column_name {
-        Grouper::ByColumn(Some(column_name.clone()))
-    } else {
-        Grouper::ByColumn(None)
-    };
-
-    match grouper {
-        Grouper::ByColumn(Some(column_name)) => {
-            let block = Box::new(move |_, row: &Value| {
-                if let Value::Error { error } = row {
-                    return Err(*error.clone());
-                };
-                match row.get_data_by_key(&column_name.item) {
-                    Some(group_key) => Ok(group_key.as_string()?),
-                    None => Err(ShellError::CantFindColumn {
-                        col_name: column_name.item.to_string(),
-                        span: column_name.span,
-                        src_span: row.expect_span(),
-                    }),
-                }
-            });
-
-            data_group(values, &Some(block), name)
-        }
-        Grouper::ByColumn(None) => {
-            let block = Box::new(move |_, row: &Value| row.as_string());
-
-            data_group(values, &Some(block), name)
-        }
-        Grouper::ByBlock => Err(ShellError::NushellFailed {
-            msg: "Block not implemented: This should never happen.".into(),
-        }),
-    }
-}
-
 #[cfg(test)]
 mod test {
    use super::*;
--- a/crates/nu-command/src/filters/split_by.rs
+++ b/crates/nu-command/src/filters/split_by.rs
@@ -3,7 +3,8 @@ use nu_engine::CallExt;
 use nu_protocol::ast::Call;
 use nu_protocol::engine::{Command, EngineState, Stack};
 use nu_protocol::{
-    Example, PipelineData, ShellError, Signature, Span, Spanned, SyntaxShape, Type, Value,
+    Example, IntoPipelineData, PipelineData, ShellError, Signature, Span, Spanned, SyntaxShape,
+    Type, Value,
 };

 #[derive(Clone)]
@@ -182,6 +183,36 @@ pub fn split(
    }
 }

+#[allow(clippy::type_complexity)]
+fn data_group(
+    values: &Value,
+    grouper: &Option<Box<dyn Fn(usize, &Value) -> Result<String, ShellError> + Send>>,
+    span: Span,
+) -> Result<Value, ShellError> {
+    let mut groups: IndexMap<String, Vec<Value>> = IndexMap::new();
+
+    for (idx, value) in values.clone().into_pipeline_data().into_iter().enumerate() {
+        let group_key = if let Some(ref grouper) = grouper {
+            grouper(idx, &value)
+        } else {
+            value.as_string()
+        };
+
+        let group = groups.entry(group_key?).or_default();
+        group.push(value);
+    }
+
+    let mut cols = vec![];
+    let mut vals = vec![];
+
+    for (k, v) in groups {
+        cols.push(k.to_string());
+        vals.push(Value::List { vals: v, span });
+    }
+
+    Ok(Value::Record { cols, vals, span })
+}
+
 #[allow(clippy::type_complexity)]
 pub fn data_split(
    value: PipelineData,
@@ -203,7 +234,7 @@ pub fn data_split(
            _,
        ) => {
            for (idx, list) in grouped_rows.iter().enumerate() {
-                match super::group_by::data_group(list, splitter, span) {
+                match data_group(list, splitter, span) {
                    Ok(grouped) => {
                        if let Value::Record {
                            vals: li,
--- a/crates/nu-command/tests/commands/group_by.rs
+++ b/crates/nu-command/tests/commands/group_by.rs
@@ -72,7 +72,7 @@ fn errors_if_given_unknown_column_name() {
 }

 #[test]
-fn errors_if_block_given_evaluates_more_than_one_row() {
+fn errors_if_column_not_found() {
    Playground::setup("group_by_test_3", |dirs, sandbox| {
        sandbox.with_files(vec![FileWithContentToBeTrimmed(
            "los_tres_caballeros.csv",
@@ -92,21 +92,19 @@ fn errors_if_block_given_evaluates_more_than_one_row() {
            "#
        ));

-        assert!(actual.err.contains("value originates here"),);
-        assert!(actual.err.contains("cannot find column"),);
+        assert!(actual.err.contains("did you mean 'type'"),);
    })
 }

 #[test]
 fn errors_if_input_empty() {
-    Playground::setup("group_by_empty_test", |dirs, _sandbox| {
-        let actual = nu!(
-            cwd: dirs.test(), pipeline(
-            r#"
-            group-by date
-        "#
-        ));
-
-        assert!(actual.err.contains("expected table from pipeline"));
-    });
+    let actual = nu!("group-by date");
+    assert!(actual.err.contains("expected table from pipeline"));
+}
+
+#[test]
+fn optional_cell_path_works() {
+    let actual = nu!("[{foo: 123}, {foo: 234}, {bar: 345}] | group-by foo? | to nuon");
+    let expected = r#"{"123": [[foo]; [123]], "234": [[foo]; [234]]}"#;
+    assert_eq!(actual.out, expected)
 }