Change group-by to accept cell paths (#9020)

Closes #9003. This PR changes `group-by` so that its optional argument is interpreted as a cell path. In turn, this lets users use `?` to ignore rows that are missing the column they wish to group on. For example: ``` > [{foo: 123}, {foo: 234}, {bar: 345}] | group-by foo Error: nu:🐚:column_not_found × Cannot find column ╭─[entry #3:1:1] 1 │ [{foo: 123}, {foo: 234}, {bar: 345}] | group-by foo · ─────┬──── ─┬─ · │ ╰── cannot find column 'foo' · ╰── value originates here ╰──── > [{foo: 123}, {foo: 234}, {bar: 345}] | group-by foo? ╭─────┬───────────────╮ │ 123 │ [table 1 row] │ │ 234 │ [table 1 row] │ ╰─────┴───────────────╯ ``` ~~This removes the ability to pass `group-by` a closure or block (I wasn't able to figure out how to make the 2 features coexist), and so it is a breaking change. I think this is OK; I didn't even know `group-by` could accept a closure or block because there was no example for that functionality.~~
2025-08-09 10:15:41 +02:00 · 2023-05-17 16:34:44 -07:00
parent b912d4c1ea
commit 9ce61dc677
4 changed files with 212 additions and 157 deletions
--- a/crates/nu-command/src/example_test.rs
+++ b/crates/nu-command/src/example_test.rs
@ -10,8 +10,8 @@ pub fn test_examples(cmd: impl Command + 'static) {
 mod test_examples {
    use super::super::{
        Ansi, Date, Enumerate, Flatten, From, Get, Into, IntoString, LetEnv, Math, MathEuler,
-        MathPi, MathRound, ParEach, Path, Random, Sort, SortBy, Split, SplitColumn, SplitRow, Str,
+        MathPi, MathRound, ParEach, Path, PathParse, Random, Sort, SortBy, Split, SplitColumn,
-        StrJoin, StrLength, StrReplace, Update, Url, Values, Wrap,
+        SplitRow, Str, StrJoin, StrLength, StrReplace, Update, Url, Values, Wrap,
    };
    use crate::{Each, To};
    use nu_cmd_lang::example_support::{
@ -85,6 +85,7 @@ mod test_examples {
            working_set.add_decl(Box::new(MathRound));
            working_set.add_decl(Box::new(Mut));
            working_set.add_decl(Box::new(Path));
            working_set.add_decl(Box::new(PathParse));
            working_set.add_decl(Box::new(ParEach));
            working_set.add_decl(Box::new(Random));
            working_set.add_decl(Box::new(Sort));
--- a/crates/nu-command/src/filters/group_by.rs
+++ b/crates/nu-command/src/filters/group_by.rs
@ -1,9 +1,8 @@
 use nu_engine::{eval_block, CallExt};
-use nu_protocol::ast::Call;
+use nu_protocol::ast::{Call, CellPath};
 use nu_protocol::engine::{Closure, Command, EngineState, Stack};
 use nu_protocol::{
-    Example, IntoPipelineData, PipelineData, ShellError, Signature, Span, Spanned, SyntaxShape,
+    Example, IntoPipelineData, PipelineData, ShellError, Signature, Span, SyntaxShape, Type, Value,
    Type, Value,
 };
 use indexmap::IndexMap;
@ -26,7 +25,16 @@ impl Command for GroupBy {
                Type::List(Box::new(Type::Any)),
                Type::Record(vec![]),
            )])
-            .optional("grouper", SyntaxShape::Any, "the grouper value to use")
+            .optional(
                "grouper",
                SyntaxShape::OneOf(vec![
                    SyntaxShape::CellPath,
                    SyntaxShape::Block,
                    SyntaxShape::Closure(None),
                    SyntaxShape::Closure(Some(vec![SyntaxShape::Any])),
                ]),
                "the path to the column to group on",
            )
    }
    fn usage(&self) -> &str {
@ -50,6 +58,33 @@ impl Command for GroupBy {
                example: r#"ls | group-by type"#,
                result: None,
            },
            Example {
                description: "Group items by the \"foo\" column's values, ignoring records without a \"foo\" column",
                example: r#"open cool.json | group-by foo?"#,
                result: None,
            },
            Example {
                description: "Group using a block which is evaluated against each input value",
                example: "[foo.txt bar.csv baz.txt] | group-by { path parse | get extension }",
                result: Some(Value::Record {
                    cols: vec!["txt".to_string(), "csv".to_string()],
                    vals: vec![
                        Value::List {
                            vals: vec![
                                Value::test_string("foo.txt"),
                                Value::test_string("baz.txt"),
                            ],
                            span: Span::test_data(),
                        },
                        Value::List {
                            vals: vec![Value::test_string("bar.csv")],
                            span: Span::test_data(),
                        },
                    ],
                    span: Span::test_data(),
                }),
            },
            Example {
                description: "You can also group by raw values by leaving out the argument",
                example: "['1' '3' '1' '3' '2' '1' '1'] | group-by",
@ -81,131 +116,163 @@ impl Command for GroupBy {
    }
 }
 enum Grouper {
    ByColumn(Option<Spanned<String>>),
    ByBlock,
 }
 pub fn group_by(
    engine_state: &EngineState,
    stack: &mut Stack,
    call: &Call,
    input: PipelineData,
 ) -> Result<PipelineData, ShellError> {
-    let name = call.head;
+    let span = call.head;
    let grouper: Option<Value> = call.opt(engine_state, stack, 0)?;
    let values: Vec<Value> = input.into_iter().collect();
    let mut keys: Vec<Result<String, ShellError>> = vec![];
    let mut group_strategy = Grouper::ByColumn(None);
    if values.is_empty() {
        return Err(ShellError::GenericError(
            "expected table from pipeline".into(),
            "requires a table input".into(),
-            Some(name),
+            Some(span),
            None,
            Vec::new(),
        ));
    }
-    let first = values[0].clone();
+    let group_value = match grouper {
-
+        Some(Value::CellPath { val, span }) => group_cell_path(val, values, span)?,
    let value_list = Value::List {
        vals: values.clone(),
        span: name,
    };
    match grouper {
        Some(Value::Block { .. }) | Some(Value::Closure { .. }) => {
            let block: Option<Closure> = call.opt(engine_state, stack, 0)?;
-            let error_key = "error";
+            group_closure(&values, span, block, stack, engine_state, call)?
            for value in values {
                if let Some(capture_block) = &block {
                    let mut stack = stack.captures_to_stack(&capture_block.captures);
                    let block = engine_state.get_block(capture_block.block_id);
                    let pipeline = eval_block(
                        engine_state,
                        &mut stack,
                        block,
                        value.into_pipeline_data(),
                        call.redirect_stdout,
                        call.redirect_stderr,
                    );
                    match pipeline {
                        Ok(s) => {
                            let collection: Vec<Value> = s.into_iter().collect();
                            if collection.len() > 1 {
                                return Err(ShellError::GenericError(
                                    "expected one value from the block".into(),
                                    "requires a table with one value for grouping".into(),
                                    Some(name),
                                    None,
                                    Vec::new(),
                                ));
                            }
                            let value = match collection.get(0) {
                                Some(Value::Error { .. }) | None => Value::string(error_key, name),
                                Some(return_value) => return_value.clone(),
                            };
                            keys.push(value.as_string());
                        }
                        Err(_) => {
                            keys.push(Ok(error_key.into()));
                        }
                    }
                }
            }
            group_strategy = Grouper::ByBlock;
        }
-        Some(other) => {
+        None => group_no_grouper(values, span)?,
-            group_strategy = Grouper::ByColumn(Some(Spanned {
+        _ => {
-                item: other.as_string()?,
+            return Err(ShellError::TypeMismatch {
-                span: name,
+                err_message: "unsupported grouper type".to_string(),
-            }));
+                span,
            })
        }
        _ => {}
    }
    let name = if let Ok(span) = first.span() {
        span
    } else {
        name
    };
-    let group_value = match group_strategy {
+    Ok(PipelineData::Value(group_value, None))
        Grouper::ByBlock => {
            let map = keys;
            let block = Box::new(move |idx: usize, row: &Value| match map.get(idx) {
                Some(Ok(key)) => Ok(key.clone()),
                Some(Err(reason)) => Err(reason.clone()),
                None => row.as_string(),
            });
            data_group(&value_list, &Some(block), name)
        }
        Grouper::ByColumn(column_name) => group(&column_name, &value_list, name),
    };
    Ok(PipelineData::Value(group_value?, None))
 }
-#[allow(clippy::type_complexity)]
+pub fn group_cell_path(
-pub fn data_group(
+    column_name: CellPath,
-    values: &Value,
+    values: Vec<Value>,
    grouper: &Option<Box<dyn Fn(usize, &Value) -> Result<String, ShellError> + Send>>,
    span: Span,
 ) -> Result<Value, ShellError> {
    let mut groups: IndexMap<String, Vec<Value>> = IndexMap::new();
-    for (idx, value) in values.clone().into_pipeline_data().into_iter().enumerate() {
+    for value in values.into_iter() {
        let group_key = value
            .clone()
            .follow_cell_path(&column_name.members, false)?;
        if matches!(group_key, Value::Nothing { .. }) {
            continue; // likely the result of a failed optional access, ignore this value
        }
        let group_key = group_key.as_string()?;
        let group = groups.entry(group_key).or_default();
        group.push(value);
    }
    let mut cols = vec![];
    let mut vals = vec![];
    for (k, v) in groups {
        cols.push(k.to_string());
        vals.push(Value::List { vals: v, span });
    }
    Ok(Value::Record { cols, vals, span })
 }
 pub fn group_no_grouper(values: Vec<Value>, span: Span) -> Result<Value, ShellError> {
    let mut groups: IndexMap<String, Vec<Value>> = IndexMap::new();
    for value in values.into_iter() {
        let group_key = value.as_string()?;
        let group = groups.entry(group_key).or_default();
        group.push(value);
    }
    let mut cols = vec![];
    let mut vals = vec![];
    for (k, v) in groups {
        cols.push(k.to_string());
        vals.push(Value::List { vals: v, span });
    }
    Ok(Value::Record { cols, vals, span })
 }
 // TODO: refactor this, it's a bit of a mess
 fn group_closure(
    values: &Vec<Value>,
    span: Span,
    block: Option<Closure>,
    stack: &mut Stack,
    engine_state: &EngineState,
    call: &Call,
 ) -> Result<Value, ShellError> {
    let error_key = "error";
    let mut keys: Vec<Result<String, ShellError>> = vec![];
    let value_list = Value::List {
        vals: values.clone(),
        span,
    };
    for value in values {
        if let Some(capture_block) = &block {
            let mut stack = stack.captures_to_stack(&capture_block.captures);
            let block = engine_state.get_block(capture_block.block_id);
            let pipeline = eval_block(
                engine_state,
                &mut stack,
                block,
                value.clone().into_pipeline_data(),
                call.redirect_stdout,
                call.redirect_stderr,
            );
            match pipeline {
                Ok(s) => {
                    let collection: Vec<Value> = s.into_iter().collect();
                    if collection.len() > 1 {
                        return Err(ShellError::GenericError(
                            "expected one value from the block".into(),
                            "requires a table with one value for grouping".into(),
                            Some(span),
                            None,
                            Vec::new(),
                        ));
                    }
                    let value = match collection.get(0) {
                        Some(Value::Error { .. }) | None => Value::string(error_key, span),
                        Some(return_value) => return_value.clone(),
                    };
                    keys.push(value.as_string());
                }
                Err(_) => {
                    keys.push(Ok(error_key.into()));
                }
            }
        }
    }
    let map = keys;
    let block = Box::new(move |idx: usize, row: &Value| match map.get(idx) {
        Some(Ok(key)) => Ok(key.clone()),
        Some(Err(reason)) => Err(reason.clone()),
        None => row.as_string(),
    });
    let grouper = &Some(block);
    let mut groups: IndexMap<String, Vec<Value>> = IndexMap::new();
    for (idx, value) in value_list.into_pipeline_data().into_iter().enumerate() {
        let group_key = if let Some(ref grouper) = grouper {
            grouper(idx, &value)
        } else {
@ -227,48 +294,6 @@ pub fn data_group(
    Ok(Value::Record { cols, vals, span })
 }
 pub fn group(
    column_name: &Option<Spanned<String>>,
    values: &Value,
    span: Span,
 ) -> Result<Value, ShellError> {
    let name = span;
    let grouper = if let Some(column_name) = column_name {
        Grouper::ByColumn(Some(column_name.clone()))
    } else {
        Grouper::ByColumn(None)
    };
    match grouper {
        Grouper::ByColumn(Some(column_name)) => {
            let block = Box::new(move |_, row: &Value| {
                if let Value::Error { error } = row {
                    return Err(*error.clone());
                };
                match row.get_data_by_key(&column_name.item) {
                    Some(group_key) => Ok(group_key.as_string()?),
                    None => Err(ShellError::CantFindColumn {
                        col_name: column_name.item.to_string(),
                        span: column_name.span,
                        src_span: row.expect_span(),
                    }),
                }
            });
            data_group(values, &Some(block), name)
        }
        Grouper::ByColumn(None) => {
            let block = Box::new(move |_, row: &Value| row.as_string());
            data_group(values, &Some(block), name)
        }
        Grouper::ByBlock => Err(ShellError::NushellFailed {
            msg: "Block not implemented: This should never happen.".into(),
        }),
    }
 }
 #[cfg(test)]
 mod test {
    use super::*;
--- a/crates/nu-command/src/filters/split_by.rs
+++ b/crates/nu-command/src/filters/split_by.rs
@ -3,7 +3,8 @@ use nu_engine::CallExt;
 use nu_protocol::ast::Call;
 use nu_protocol::engine::{Command, EngineState, Stack};
 use nu_protocol::{
-    Example, PipelineData, ShellError, Signature, Span, Spanned, SyntaxShape, Type, Value,
+    Example, IntoPipelineData, PipelineData, ShellError, Signature, Span, Spanned, SyntaxShape,
    Type, Value,
 };
 #[derive(Clone)]
@ -182,6 +183,36 @@ pub fn split(
    }
 }
 #[allow(clippy::type_complexity)]
 fn data_group(
    values: &Value,
    grouper: &Option<Box<dyn Fn(usize, &Value) -> Result<String, ShellError> + Send>>,
    span: Span,
 ) -> Result<Value, ShellError> {
    let mut groups: IndexMap<String, Vec<Value>> = IndexMap::new();
    for (idx, value) in values.clone().into_pipeline_data().into_iter().enumerate() {
        let group_key = if let Some(ref grouper) = grouper {
            grouper(idx, &value)
        } else {
            value.as_string()
        };
        let group = groups.entry(group_key?).or_default();
        group.push(value);
    }
    let mut cols = vec![];
    let mut vals = vec![];
    for (k, v) in groups {
        cols.push(k.to_string());
        vals.push(Value::List { vals: v, span });
    }
    Ok(Value::Record { cols, vals, span })
 }
 #[allow(clippy::type_complexity)]
 pub fn data_split(
    value: PipelineData,
@ -203,7 +234,7 @@ pub fn data_split(
            _,
        ) => {
            for (idx, list) in grouped_rows.iter().enumerate() {
-                match super::group_by::data_group(list, splitter, span) {
+                match data_group(list, splitter, span) {
                    Ok(grouped) => {
                        if let Value::Record {
                            vals: li,
--- a/crates/nu-command/tests/commands/group_by.rs
+++ b/crates/nu-command/tests/commands/group_by.rs
@ -72,7 +72,7 @@ fn errors_if_given_unknown_column_name() {
 }
 #[test]
-fn errors_if_block_given_evaluates_more_than_one_row() {
+fn errors_if_column_not_found() {
    Playground::setup("group_by_test_3", |dirs, sandbox| {
        sandbox.with_files(vec![FileWithContentToBeTrimmed(
            "los_tres_caballeros.csv",
@ -92,21 +92,19 @@ fn errors_if_block_given_evaluates_more_than_one_row() {
            "#
        ));
-        assert!(actual.err.contains("value originates here"),);
+        assert!(actual.err.contains("did you mean 'type'"),);
        assert!(actual.err.contains("cannot find column"),);
    })
 }
 #[test]
 fn errors_if_input_empty() {
-    Playground::setup("group_by_empty_test", |dirs, _sandbox| {
+    let actual = nu!("group-by date");
-        let actual = nu!(
+    assert!(actual.err.contains("expected table from pipeline"));
-            cwd: dirs.test(), pipeline(
+}
-            r#"
+
-            group-by date
+#[test]
-        "#
+fn optional_cell_path_works() {
-        ));
+    let actual = nu!("[{foo: 123}, {foo: 234}, {bar: 345}] | group-by foo? | to nuon");
-
+    let expected = r#"{"123": [[foo]; [123]], "234": [[foo]; [234]]}"#;
-        assert!(actual.err.contains("expected table from pipeline"));
+    assert_eq!(actual.out, expected)
    });
 }