add multiple grouper support to group-by (#14337)

- closes #14330 Related: - #2607 - #14019 - #14316 # Description This PR changes `group-by` to support grouping by multiple `grouper` arguments. # Changes - No grouper: no change in behavior - Single grouper - `--to-table=false`: no change in behavior - `--to-table=true`: - closure grouper: named group0 - cell-path grouper: named after the cell-path - Multiple groupers: - `--to-table=false`: nested groups - `--to-table=true`: one column for each grouper argument, followed by the `items` column - columns corresponding to cell-paths are named after them - columns corresponding to closure groupers are named `group{i}` where `i` is the index of the grouper argument # Examples ```nushell > [1 3 1 3 2 1 1] | group-by ╭───┬───────────╮ │ │ ╭───┬───╮ │ │ 1 │ │ 0 │ 1 │ │ │ │ │ 1 │ 1 │ │ │ │ │ 2 │ 1 │ │ │ │ │ 3 │ 1 │ │ │ │ ╰───┴───╯ │ │ │ ╭───┬───╮ │ │ 3 │ │ 0 │ 3 │ │ │ │ │ 1 │ 3 │ │ │ │ ╰───┴───╯ │ │ │ ╭───┬───╮ │ │ 2 │ │ 0 │ 2 │ │ │ │ ╰───┴───╯ │ ╰───┴───────────╯ > [1 3 1 3 2 1 1] | group-by --to-table ╭─#─┬─group─┬───items───╮ │ 0 │ 1 │ ╭───┬───╮ │ │ │ │ │ 0 │ 1 │ │ │ │ │ │ 1 │ 1 │ │ │ │ │ │ 2 │ 1 │ │ │ │ │ │ 3 │ 1 │ │ │ │ │ ╰───┴───╯ │ │ 1 │ 3 │ ╭───┬───╮ │ │ │ │ │ 0 │ 3 │ │ │ │ │ │ 1 │ 3 │ │ │ │ │ ╰───┴───╯ │ │ 2 │ 2 │ ╭───┬───╮ │ │ │ │ │ 0 │ 2 │ │ │ │ │ ╰───┴───╯ │ ╰─#─┴─group─┴───items───╯ > [1 3 1 3 2 1 1] | group-by { $in >= 2 } ╭───────┬───────────╮ │ │ ╭───┬───╮ │ │ false │ │ 0 │ 1 │ │ │ │ │ 1 │ 1 │ │ │ │ │ 2 │ 1 │ │ │ │ │ 3 │ 1 │ │ │ │ ╰───┴───╯ │ │ │ ╭───┬───╮ │ │ true │ │ 0 │ 3 │ │ │ │ │ 1 │ 3 │ │ │ │ │ 2 │ 2 │ │ │ │ ╰───┴───╯ │ ╰───────┴───────────╯ > [1 3 1 3 2 1 1] | group-by { $in >= 2 } --to-table ╭─#─┬─group0─┬───items───╮ │ 0 │ false │ ╭───┬───╮ │ │ │ │ │ 0 │ 1 │ │ │ │ │ │ 1 │ 1 │ │ │ │ │ │ 2 │ 1 │ │ │ │ │ │ 3 │ 1 │ │ │ │ │ ╰───┴───╯ │ │ 1 │ true │ ╭───┬───╮ │ │ │ │ │ 0 │ 3 │ │ │ │ │ │ 1 │ 3 │ │ │ │ │ │ 2 │ 2 │ │ │ │ │ ╰───┴───╯ │ ╰─#─┴─group0─┴───items───╯ ``` ```nushell let data = [ [name, lang, year]; [andres, rb, "2019"], [jt, rs, "2019"], [storm, 
rs, "2021"] ] > $data ╭─#─┬──name──┬─lang─┬─year─╮ │ 0 │ andres │ rb │ 2019 │ │ 1 │ jt │ rs │ 2019 │ │ 2 │ storm │ rs │ 2021 │ ╰─#─┴──name──┴─lang─┴─year─╯ ``` ```nushell > $data | group-by lang ╭────┬──────────────────────────────╮ │ │ ╭─#─┬──name──┬─lang─┬─year─╮ │ │ rb │ │ 0 │ andres │ rb │ 2019 │ │ │ │ ╰─#─┴──name──┴─lang─┴─year─╯ │ │ │ ╭─#─┬─name──┬─lang─┬─year─╮ │ │ rs │ │ 0 │ jt │ rs │ 2019 │ │ │ │ │ 1 │ storm │ rs │ 2021 │ │ │ │ ╰─#─┴─name──┴─lang─┴─year─╯ │ ╰────┴──────────────────────────────╯ ``` Group column is now named after the grouper, to allow multiple groupers. ```nushell > $data | group-by lang --to-table # column names changed! ╭─#─┬─lang─┬────────────items─────────────╮ │ 0 │ rb │ ╭─#─┬──name──┬─lang─┬─year─╮ │ │ │ │ │ 0 │ andres │ rb │ 2019 │ │ │ │ │ ╰─#─┴──name──┴─lang─┴─year─╯ │ │ 1 │ rs │ ╭─#─┬─name──┬─lang─┬─year─╮ │ │ │ │ │ 0 │ jt │ rs │ 2019 │ │ │ │ │ │ 1 │ storm │ rs │ 2021 │ │ │ │ │ ╰─#─┴─name──┴─lang─┴─year─╯ │ ╰─#─┴─lang─┴────────────items─────────────╯ ``` Grouping by multiple columns makes finer grained aggregations possible. ```nushell > $data | group-by lang year --to-table ╭─#─┬─lang─┬─year─┬────────────items─────────────╮ │ 0 │ rb │ 2019 │ ╭─#─┬──name──┬─lang─┬─year─╮ │ │ │ │ │ │ 0 │ andres │ rb │ 2019 │ │ │ │ │ │ ╰─#─┴──name──┴─lang─┴─year─╯ │ │ 1 │ rs │ 2019 │ ╭─#─┬─name─┬─lang─┬─year─╮ │ │ │ │ │ │ 0 │ jt │ rs │ 2019 │ │ │ │ │ │ ╰─#─┴─name─┴─lang─┴─year─╯ │ │ 2 │ rs │ 2021 │ ╭─#─┬─name──┬─lang─┬─year─╮ │ │ │ │ │ │ 0 │ storm │ rs │ 2021 │ │ │ │ │ │ ╰─#─┴─name──┴─lang─┴─year─╯ │ ╰─#─┴─lang─┴─year─┴────────────items─────────────╯ ``` Grouping by multiple columns, without `--to-table` returns a nested structure. This is equivalent to `$data | group-by year | split-by lang`, making `split-by` obsolete. 
```nushell > $data | group-by lang year ╭────┬─────────────────────────────────────────╮ │ │ ╭──────┬──────────────────────────────╮ │ │ rb │ │ │ ╭─#─┬──name──┬─lang─┬─year─╮ │ │ │ │ │ 2019 │ │ 0 │ andres │ rb │ 2019 │ │ │ │ │ │ │ ╰─#─┴──name──┴─lang─┴─year─╯ │ │ │ │ ╰──────┴──────────────────────────────╯ │ │ │ ╭──────┬─────────────────────────────╮ │ │ rs │ │ │ ╭─#─┬─name─┬─lang─┬─year─╮ │ │ │ │ │ 2019 │ │ 0 │ jt │ rs │ 2019 │ │ │ │ │ │ │ ╰─#─┴─name─┴─lang─┴─year─╯ │ │ │ │ │ │ ╭─#─┬─name──┬─lang─┬─year─╮ │ │ │ │ │ 2021 │ │ 0 │ storm │ rs │ 2021 │ │ │ │ │ │ │ ╰─#─┴─name──┴─lang─┴─year─╯ │ │ │ │ ╰──────┴─────────────────────────────╯ │ ╰────┴─────────────────────────────────────────╯ ``` From #2607: > Here's a couple more examples without much explanation. This one shows adding two grouping keys. I'm always wanting to add more columns when using group-by and it just-work™️ `gb.exe -f movies-2.csv -k 3,2 -s 7 --skip_header` > > ``` > k:3 | k:2 | count | sum:7 > -----------------------+-----------+-------+-------------------- > 20th Century Fox | Drama | 1 | 117.09 > 20th Century Fox | Romance | 1 | 39.66 > CBS | Comedy | 1 | 77.09 > Disney | Animation | 4 | 1264.23 > Disney | Comedy | 4 | 950.27 > Fox | Comedy | 5 | 661.85 > Independent | Comedy | 7 | 399.07 > Independent | Drama | 4 | 69.75 > Independent | Romance | 7 | 1048.75 > Independent | romance | 1 | 29.37 > ... 
> ``` This example can be achieved like this: ```nushell > open movies-2.csv | group-by "Lead Studio" Genre --to-table | insert count {get items | length} | insert sum { get items."Worldwide Gross" | math sum} | reject items | sort-by "Lead Studio" Genre ╭─#──┬──────Lead Studio──────┬───Genre───┬─count─┬───sum───╮ │ 0 │ 20th Century Fox │ Drama │ 1 │ 117.09 │ │ 1 │ 20th Century Fox │ Romance │ 1 │ 39.66 │ │ 2 │ CBS │ Comedy │ 1 │ 77.09 │ │ 3 │ Disney │ Animation │ 4 │ 1264.23 │ │ 4 │ Disney │ Comedy │ 4 │ 950.27 │ │ 5 │ Fox │ Comedy │ 5 │ 661.85 │ │ 6 │ Fox │ comedy │ 1 │ 60.72 │ │ 7 │ Independent │ Comedy │ 7 │ 399.07 │ │ 8 │ Independent │ Drama │ 4 │ 69.75 │ │ 9 │ Independent │ Romance │ 7 │ 1048.75 │ │ 10 │ Independent │ romance │ 1 │ 29.37 │ ... ```
2025-08-21 01:03:49 +02:00 · 2024-11-15 15:40:49 +03:00
parent f7832c0e82
commit b6e84879b6
2 changed files with 226 additions and 60 deletions
--- a/crates/nu-command/src/filters/group_by.rs
+++ b/crates/nu-command/src/filters/group_by.rs
@@ -1,6 +1,6 @@
 use indexmap::IndexMap;
 use nu_engine::{command_prelude::*, ClosureEval};
-use nu_protocol::engine::Closure;
+use nu_protocol::{engine::Closure, IntoValue};
 #[derive(Clone)]
 pub struct GroupBy;
@@ -22,7 +22,7 @@ impl Command for GroupBy {
                "Return a table with \"groups\" and \"items\" columns",
                None,
            )
-            .optional(
+            .rest(
                "grouper",
                SyntaxShape::OneOf(vec![
                    SyntaxShape::CellPath,
@@ -135,7 +135,89 @@ impl Command for GroupBy {
                        Value::test_string("false"),
                    ]),
                })),
-            }
+            },
            Example {
                description: "Group items by multiple columns' values",
                example: r#"[
        [name, lang, year];
        [andres, rb, "2019"],
        [jt, rs, "2019"],
        [storm, rs, "2021"]
    ]
    | group-by lang year"#,
                result: Some(Value::test_record(record! {
                    "rb" => Value::test_record(record! {
                        "2019" => Value::test_list(
                            vec![Value::test_record(record! {
                                    "name" => Value::test_string("andres"),
                                    "lang" => Value::test_string("rb"),
                                    "year" => Value::test_string("2019"),
                            })],
                        ),
                    }),
                    "rs" => Value::test_record(record! {
                            "2019" => Value::test_list(
                                vec![Value::test_record(record! {
                                        "name" => Value::test_string("jt"),
                                        "lang" => Value::test_string("rs"),
                                        "year" => Value::test_string("2019"),
                                })],
                            ),
                            "2021" => Value::test_list(
                                vec![Value::test_record(record! {
                                        "name" => Value::test_string("storm"),
                                        "lang" => Value::test_string("rs"),
                                        "year" => Value::test_string("2021"),
                                })],
                            ),
                    }),
                }))
            },
            Example {
                description: "Group items by multiple columns' values",
                example: r#"[
        [name, lang, year];
        [andres, rb, "2019"],
        [jt, rs, "2019"],
        [storm, rs, "2021"]
    ]
    | group-by lang year --to-table"#,
                result: Some(Value::test_list(vec![
                    Value::test_record(record! {
                        "lang" => Value::test_string("rb"),
                        "year" => Value::test_string("2019"),
                        "items" => Value::test_list(vec![
                            Value::test_record(record! {
                                "name" => Value::test_string("andres"),
                                "lang" => Value::test_string("rb"),
                                "year" => Value::test_string("2019"),
                            })
                        ]),
                    }),
                    Value::test_record(record! {
                        "lang" => Value::test_string("rs"),
                        "year" => Value::test_string("2019"),
                        "items" => Value::test_list(vec![
                            Value::test_record(record! {
                                "name" => Value::test_string("jt"),
                                "lang" => Value::test_string("rs"),
                                "year" => Value::test_string("2019"),
                            })
                        ]),
                    }),
                    Value::test_record(record! {
                        "lang" => Value::test_string("rs"),
                        "year" => Value::test_string("2021"),
                        "items" => Value::test_list(vec![
                            Value::test_record(record! {
                                "name" => Value::test_string("storm"),
                                "lang" => Value::test_string("rs"),
                                "year" => Value::test_string("2021"),
                            })
                        ]),
                    }),
                ]))
            },
        ]
    }
 }
@@ -147,7 +229,7 @@ pub fn group_by(
    input: PipelineData,
 ) -> Result<PipelineData, ShellError> {
    let head = call.head;
-    let grouper: Option<Value> = call.opt(engine_state, stack, 0)?;
+    let groupers: Vec<Value> = call.rest(engine_state, stack, 0)?;
    let to_table = call.has_flag(engine_state, stack, "to-table")?;
    let config = engine_state.get_config();
@@ -156,29 +238,22 @@ pub fn group_by(
        return Ok(Value::record(Record::new(), head).into_pipeline_data());
    }
-    let groups = match grouper {
+    let mut groupers = groupers.into_iter();
-        Some(grouper) => {
+
-            let span = grouper.span();
+    let grouped = if let Some(grouper) = groupers.next() {
-            match grouper {
+        let mut groups = Grouped::new(&grouper, values, config, engine_state, stack)?;
-                Value::CellPath { val, .. } => group_cell_path(val, values, config)?,
+        for grouper in groupers {
-                Value::Closure { val, .. } => {
+            groups.subgroup(&grouper, config, engine_state, stack)?;
                    group_closure(values, span, *val, engine_state, stack)?
        }
-                _ => {
+        groups
-                    return Err(ShellError::TypeMismatch {
+    } else {
-                        err_message: "unsupported grouper type".to_string(),
+        Grouped::empty(values, config)
                        span,
                    })
                }
            }
        }
        None => group_no_grouper(values, config)?,
    };
    let value = if to_table {
-        groups_to_table(groups, head)
+        grouped.into_table(head)
    } else {
-        groups_to_record(groups, head)
+        grouped.into_record(head)
    };
    Ok(value.into_pipeline_data())
@@ -207,20 +282,6 @@ fn group_cell_path(
    Ok(groups)
 }
 fn group_no_grouper(
    values: Vec<Value>,
    config: &nu_protocol::Config,
 ) -> Result<IndexMap<String, Vec<Value>>, ShellError> {
    let mut groups = IndexMap::<_, Vec<_>>::new();
    for value in values.into_iter() {
        let key = value.to_abbreviated_string(config);
        groups.entry(key).or_default().push(value);
    }
    Ok(groups)
 }
 fn group_closure(
    values: Vec<Value>,
    span: Span,
@@ -244,32 +305,137 @@ fn group_closure(
    Ok(groups)
 }
-fn groups_to_record(groups: IndexMap<String, Vec<Value>>, span: Span) -> Value {
+struct Grouped {
-    Value::record(
+    grouper: Option<String>,
-        groups
+    groups: Tree,
            .into_iter()
            .map(|(k, v)| (k, Value::list(v, span)))
            .collect(),
        span,
    )
 }
-fn groups_to_table(groups: IndexMap<String, Vec<Value>>, span: Span) -> Value {
+enum Tree {
-    Value::list(
+    Leaf(IndexMap<String, Vec<Value>>),
-        groups
+    Branch(IndexMap<String, Grouped>),
-            .into_iter()
+}
-            .map(|(group, items)| {
+
-                Value::record(
+impl Grouped {
-                    record! {
+    fn empty(values: Vec<Value>, config: &nu_protocol::Config) -> Self {
-                        "group" => Value::string(group, span),
+        let mut groups = IndexMap::<_, Vec<_>>::new();
-                        "items" => Value::list(items, span),
+
-                    },
+        for value in values.into_iter() {
            let key = value.to_abbreviated_string(config);
            groups.entry(key).or_default().push(value);
        }
        Self {
            grouper: Some("group".into()),
            groups: Tree::Leaf(groups),
        }
    }
    fn new(
        grouper: &Value,
        values: Vec<Value>,
        config: &nu_protocol::Config,
        engine_state: &EngineState,
        stack: &mut Stack,
    ) -> Result<Self, ShellError> {
        let span = grouper.span();
        let groups = match grouper {
            Value::CellPath { val, .. } => group_cell_path(val.clone(), values, config)?,
            Value::Closure { val, .. } => {
                group_closure(values, span, Closure::clone(val), engine_state, stack)?
            }
            _ => {
                return Err(ShellError::TypeMismatch {
                    err_message: "unsupported grouper type".to_string(),
                    span,
-                )
+                })
            }
        };
        let grouper = grouper.as_cell_path().ok().map(CellPath::to_column_name);
        Ok(Self {
            grouper,
            groups: Tree::Leaf(groups),
        })
    }
    fn subgroup(
        &mut self,
        grouper: &Value,
        config: &nu_protocol::Config,
        engine_state: &EngineState,
        stack: &mut Stack,
    ) -> Result<(), ShellError> {
        let groups = match &mut self.groups {
            Tree::Leaf(groups) => std::mem::take(groups)
                .into_iter()
                .map(|(key, values)| -> Result<_, ShellError> {
                    let leaf = Self::new(grouper, values, config, engine_state, stack)?;
                    Ok((key, leaf))
                })
                .collect::<Result<IndexMap<_, _>, ShellError>>()?,
            Tree::Branch(nested_groups) => {
                let mut nested_groups = std::mem::take(nested_groups);
                for v in nested_groups.values_mut() {
                    v.subgroup(grouper, config, engine_state, stack)?;
                }
                nested_groups
            }
        };
        self.groups = Tree::Branch(groups);
        Ok(())
    }
    /// Flattens the grouping tree into a list of records, one per innermost
    /// group, with every value tagged with the `head` span.
    ///
    /// `_into_table` builds each row starting with the `items` entry and adds
    /// grouper entries afterwards, so every row is reversed here before it is
    /// turned into a record value.
    fn into_table(self, head: Span) -> Value {
        let mut table: Vec<Value> = Vec::new();
        for row in self._into_table(head, 0) {
            let ordered: Record = row.into_iter().rev().collect();
            table.push(ordered.into_value(head));
        }
        table.into_value(head)
    }
    fn _into_table(self, head: Span, index: usize) -> Vec<Record> {
        let grouper = self.grouper.unwrap_or_else(|| format!("group{index}"));
        match self.groups {
            Tree::Leaf(leaf) => leaf
                .into_iter()
                .map(|(group, values)| {
                    [
                        ("items".to_string(), values.into_value(head)),
                        (grouper.clone(), group.into_value(head)),
                    ]
                    .into_iter()
                    .collect()
                })
                .collect::<Vec<Record>>(),
            Tree::Branch(branch) => branch
                .into_iter()
                .flat_map(|(group, items)| {
                    let mut inner = items._into_table(head, index + 1);
                    for row in &mut inner {
                        row.insert(grouper.clone(), group.clone().into_value(head));
                    }
                    inner
                })
                .collect(),
-        span,
+        }
-    )
+    }
    fn into_record(self, head: Span) -> Value {
        match self.groups {
            Tree::Leaf(leaf) => Value::record(
                leaf.into_iter()
                    .map(|(k, v)| (k, v.into_value(head)))
                    .collect(),
                head,
            ),
            Tree::Branch(branch) => {
                let values = branch
                    .into_iter()
                    .map(|(k, v)| (k, v.into_record(head)))
                    .collect();
                Value::record(values, head)
            }
        }
    }
 }
 #[cfg(test)]
--- a/crates/nu-std/testing.nu
+++ b/crates/nu-std/testing.nu
@@ -79,7 +79,7 @@ def create-test-record [] nothing -> record<before-each: string, after-each: str
        | group-by --to-table annotation
        | update items {|x|
            $x.items.function_name
-            | if $x.group in ["test", "test-skip"] {
+            | if $x.annotation in ["test", "test-skip"] {
                $in
            } else {
                get 0