This commit is ongoing work for making Nu working with data processing

a joy. Fundamentally we embrace functional programming principles for transforming the dataset from any format picked up by Nu. This table processing "primitive" commands will build up and make pipelines composable with data processing capabilities allowing us the valuate, reduce, and map, the tables as far as even composing this declartively. On this regard, `split-by` expects some table with grouped data and we can use it further in interesting ways (Eg. collecting labels for visualizing the data in charts and/or suit it for a particular chart of our interest).
2025-08-09 21:47:46 +02:00 · 2019-10-29 16:04:31 -05:00
parent b6824d8b88
commit 392ff286b2
6 changed files with 461 additions and 54 deletions
--- a/src/commands/group_by.rs
+++ b/src/commands/group_by.rs
@ -36,59 +36,154 @@ impl WholeStreamCommand for GroupBy {
    }
 }

-fn group_by(
+pub fn group_by(
    GroupByArgs { column_name }: GroupByArgs,
    RunnableContext { input, name, .. }: RunnableContext,
 ) -> Result<OutputStream, ShellError> {
    let stream = async_stream! {
        let values: Vec<Tagged<Value>> = input.values.collect().await;
-        let mut groups = indexmap::IndexMap::new();

-        for value in values {
-            let group_key = value.get_data_by_key(&column_name.item);
-
-            if group_key.is_none() {
-
-                let possibilities = value.data_descriptors();
-
-                let mut possible_matches: Vec<_> = possibilities
-                    .iter()
-                    .map(|x| (natural::distance::levenshtein_distance(x, &column_name.item), x))
-                    .collect();
-
-                possible_matches.sort();
-
-                let err = {
-                    if possible_matches.len() > 0 {
-                        ShellError::labeled_error(
-                            "Unknown column",
-                            format!("did you mean '{}'?", possible_matches[0].1),
-                            &column_name.tag,)
-                    } else {
-                        ShellError::labeled_error(
-                            "Unknown column",
-                            "row does not contain this column",
-                            &column_name.tag,
-                        )
-                    }
-                };
-
-                yield Err(err)
-            } else {
-                let group_key = group_key.unwrap().as_string()?;
-                let mut group = groups.entry(group_key).or_insert(vec![]);
-                group.push(value);
+        if values.is_empty() {
+            yield Err(ShellError::labeled_error(
+                    "Expected table from pipeline",
+                    "requires a table input",
+                    column_name.span()
+                ))
+        } else {
+            match group(&column_name, values, name) {
+                Ok(grouped) => yield ReturnSuccess::value(grouped),
+                Err(err) => yield Err(err)
            }
        }
-
-        let mut out = TaggedDictBuilder::new(name.clone());
-
-        for (k,v) in groups.iter() {
-            out.insert(k, Value::table(v));
-        }
-
-        yield ReturnSuccess::value(out)
    };

    Ok(stream.to_output_stream())
 }
+
+pub fn group(
+    column_name: &Tagged<String>,
+    values: Vec<Tagged<Value>>,
+    tag: impl Into<Tag>,
+) -> Result<Tagged<Value>, ShellError> {
+    let tag = tag.into();
+
+    let mut groups = indexmap::IndexMap::new();
+
+    for value in values {
+        let group_key = value.get_data_by_key(column_name);
+
+        if group_key.is_none() {
+            let possibilities = value.data_descriptors();
+
+            let mut possible_matches: Vec<_> = possibilities
+                .iter()
+                .map(|x| (natural::distance::levenshtein_distance(x, column_name), x))
+                .collect();
+
+            possible_matches.sort();
+
+            if possible_matches.len() > 0 {
+                return Err(ShellError::labeled_error(
+                    "Unknown column",
+                    format!("did you mean '{}'?", possible_matches[0].1),
+                    column_name.tag(),
+                ));
+            } else {
+                return Err(ShellError::labeled_error(
+                    "Unknown column",
+                    "row does not contain this column",
+                    column_name.tag(),
+                ));
+            }
+        }
+
+        let group_key = group_key.unwrap().as_string()?;
+        let group = groups.entry(group_key).or_insert(vec![]);
+        group.push(value);
+    }
+
+    let mut out = TaggedDictBuilder::new(&tag);
+
+    for (k, v) in groups.iter() {
+        out.insert(k, Value::table(v));
+    }
+
+    Ok(out.into_tagged_value())
+}
+
+#[cfg(test)]
+mod tests {
+
+    use crate::commands::group_by::group;
+    use crate::data::meta::*;
+    use crate::Value;
+    use indexmap::IndexMap;
+
+    fn string(input: impl Into<String>) -> Tagged<Value> {
+        Value::string(input.into()).tagged_unknown()
+    }
+
+    fn row(entries: IndexMap<String, Tagged<Value>>) -> Tagged<Value> {
+        Value::row(entries).tagged_unknown()
+    }
+
+    fn table(list: &Vec<Tagged<Value>>) -> Tagged<Value> {
+        Value::table(list).tagged_unknown()
+    }
+
+    #[test]
+    fn groups_table_by_key() {
+        let for_key = String::from("date").tagged_unknown();
+
+        let nu_releases = vec![
+            row(
+                indexmap! {"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("August 23-2019")},
+            ),
+            row(
+                indexmap! {"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("August 23-2019")},
+            ),
+            row(
+                indexmap! {"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("October 10-2019")},
+            ),
+            row(
+                indexmap! {"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("Sept 24-2019")},
+            ),
+            row(
+                indexmap! {"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("October 10-2019")},
+            ),
+            row(
+                indexmap! {"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("Sept 24-2019")},
+            ),
+            row(
+                indexmap! {"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("October 10-2019")},
+            ),
+            row(
+                indexmap! {"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("Sept 24-2019")},
+            ),
+            row(
+                indexmap! {"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("August 23-2019")},
+            ),
+        ];
+
+        assert_eq!(
+            group(&for_key, nu_releases, Tag::unknown()).unwrap(),
+            row(indexmap! {
+                "August 23-2019".into() =>  table(&vec![
+                    row(indexmap!{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("August 23-2019")}),
+                    row(indexmap!{"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("August 23-2019")}),
+                    row(indexmap!{"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("August 23-2019")})
+                ]),
+                "October 10-2019".into() =>  table(&vec![
+                    row(indexmap!{"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("October 10-2019")}),
+                    row(indexmap!{"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("October 10-2019")}),
+                    row(indexmap!{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("October 10-2019")})
+                ]),
+                "Sept 24-2019".into() =>  table(&vec![
+                    row(indexmap!{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("Sept 24-2019")}),
+                    row(indexmap!{"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("Sept 24-2019")}),
+                    row(indexmap!{"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("Sept 24-2019")})
+                ]),
+            })
+        );
+    }
+}
--- a/src/commands/split_by.rs
+++ b/src/commands/split_by.rs
@ -0,0 +1,256 @@
+use crate::commands::WholeStreamCommand;
+use crate::data::TaggedDictBuilder;
+use crate::errors::ShellError;
+use crate::prelude::*;
+
+pub struct SplitBy;
+
+#[derive(Deserialize)]
+pub struct SplitByArgs {
+    column_name: Tagged<String>,
+}
+
+impl WholeStreamCommand for SplitBy {
+    fn name(&self) -> &str {
+        "split-by"
+    }
+
+    fn signature(&self) -> Signature {
+        Signature::build("split-by").required(
+            "column_name",
+            SyntaxShape::String,
+            "the name of the column within the nested table to split by",
+        )
+    }
+
+    fn usage(&self) -> &str {
+        "Creates a new table with the data from the inner tables splitted by the column given."
+    }
+
+    fn run(
+        &self,
+        args: CommandArgs,
+        registry: &CommandRegistry,
+    ) -> Result<OutputStream, ShellError> {
+        args.process(registry, split_by)?.run()
+    }
+}
+
+pub fn split_by(
+    SplitByArgs { column_name }: SplitByArgs,
+    RunnableContext { input, name, .. }: RunnableContext,
+) -> Result<OutputStream, ShellError> {
+    let stream = async_stream! {
+        let values: Vec<Tagged<Value>> = input.values.collect().await;
+
+        if values.len() > 1 || values.is_empty() {
+            yield Err(ShellError::labeled_error(
+                    "Expected table from pipeline",
+                    "requires a table input",
+                    column_name.span()
+                ))
+        } else {
+            match split(&column_name, &values[0], name) {
+                Ok(split) => yield ReturnSuccess::value(split),
+                Err(err) => yield Err(err),
+            }
+        }
+    };
+
+    Ok(stream.to_output_stream())
+}
+
+pub fn split(
+    column_name: &Tagged<String>,
+    value: &Tagged<Value>,
+    tag: impl Into<Tag>,
+) -> Result<Tagged<Value>, ShellError> {
+    let origin_tag = tag.into();
+
+    let mut splits = indexmap::IndexMap::new();
+
+    match value {
+        Tagged {
+            item: Value::Row(group_sets),
+            ..
+        } => {
+            for (group_key, group_value) in group_sets.entries.iter() {
+                match *group_value {
+                    Tagged {
+                        item: Value::Table(ref dataset),
+                        ..
+                    } => {
+                        let group = crate::commands::group_by::group(
+                            &column_name,
+                            dataset.to_vec(),
+                            &origin_tag,
+                        )?;
+
+                        match group {
+                            Tagged {
+                                item: Value::Row(o),
+                                ..
+                            } => {
+                                for (split_label, subset) in o.entries.into_iter() {
+                                    match subset {
+                                        Tagged {
+                                            item: Value::Table(subset),
+                                            tag,
+                                        } => {
+                                            let s = splits
+                                                .entry(split_label.clone())
+                                                .or_insert(indexmap::IndexMap::new());
+                                            s.insert(
+                                                group_key.clone(),
+                                                Value::table(&subset).tagged(tag),
+                                            );
+                                        }
+                                        other => {
+                                            return Err(ShellError::type_error(
+                                                "a table value",
+                                                other.tagged_type_name(),
+                                            ))
+                                        }
+                                    }
+                                }
+                            }
+                            _ => {
+                                return Err(ShellError::type_error(
+                                    "a table value",
+                                    group.tagged_type_name(),
+                                ))
+                            }
+                        }
+                    }
+                    ref other => {
+                        return Err(ShellError::type_error(
+                            "a table value",
+                            other.tagged_type_name(),
+                        ))
+                    }
+                }
+            }
+        }
+        _ => {
+            return Err(ShellError::type_error(
+                "a table value",
+                value.tagged_type_name(),
+            ))
+        }
+    }
+
+    let mut out = TaggedDictBuilder::new(&origin_tag);
+
+    for (k, v) in splits.into_iter() {
+        out.insert(k, Value::row(v));
+    }
+
+    Ok(out.into_tagged_value())
+}
+#[cfg(test)]
+mod tests {
+
+    use crate::commands::split_by::split;
+    use crate::data::meta::*;
+    use crate::Value;
+    use indexmap::IndexMap;
+
+    fn string(input: impl Into<String>) -> Tagged<Value> {
+        Value::string(input.into()).tagged_unknown()
+    }
+
+    fn row(entries: IndexMap<String, Tagged<Value>>) -> Tagged<Value> {
+        Value::row(entries).tagged_unknown()
+    }
+
+    fn table(list: &Vec<Tagged<Value>>) -> Tagged<Value> {
+        Value::table(list).tagged_unknown()
+    }
+
+    #[test]
+    fn splits_inner_tables_by_key() {
+        let for_key = String::from("country").tagged_unknown();
+
+        let nu_releases = row(indexmap! {
+            "August 23-2019".into() =>  table(&vec![
+                    row(indexmap!{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("August 23-2019")}),
+                    row(indexmap!{"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("August 23-2019")}),
+                    row(indexmap!{"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("August 23-2019")})
+            ]),
+            "Sept 24-2019".into() =>  table(&vec![
+                    row(indexmap!{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("Sept 24-2019")}),
+                    row(indexmap!{"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("Sept 24-2019")}),
+                    row(indexmap!{"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("Sept 24-2019")})
+            ]),
+            "October 10-2019".into() =>  table(&vec![
+                    row(indexmap!{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("October 10-2019")}),
+                    row(indexmap!{"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("October 10-2019")}),
+                    row(indexmap!{"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("October 10-2019")})
+            ])
+        });
+
+        assert_eq!(
+            split(&for_key, &nu_releases, Tag::unknown()).unwrap(),
+            Value::row(indexmap! {
+                "EC".into() => row(indexmap! {
+                    "August 23-2019".into() => table(&vec![
+                        row(indexmap!{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("August 23-2019")})
+                    ]),
+                    "Sept 24-2019".into() => table(&vec![
+                        row(indexmap!{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("Sept 24-2019")})
+                    ]),
+                    "October 10-2019".into() => table(&vec![
+                        row(indexmap!{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("October 10-2019")})
+                    ])
+                }),
+                "NZ".into() => row(indexmap! {
+                    "August 23-2019".into() => table(&vec![
+                        row(indexmap!{"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("August 23-2019")})
+                    ]),
+                    "Sept 24-2019".into() => table(&vec![
+                        row(indexmap!{"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("Sept 24-2019")})
+                    ]),
+                    "October 10-2019".into() => table(&vec![
+                        row(indexmap!{"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("October 10-2019")})
+                    ])
+                }),
+                "US".into() => row(indexmap! {
+                    "August 23-2019".into() => table(&vec![
+                        row(indexmap!{"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("August 23-2019")})
+                    ]),
+                    "Sept 24-2019".into() => table(&vec![
+                        row(indexmap!{"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("Sept 24-2019")})
+                    ]),
+                    "October 10-2019".into() => table(&vec![
+                        row(indexmap!{"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("October 10-2019")})
+                    ])
+                })
+            }).tagged_unknown()
+        );
+    }
+
+    #[test]
+    fn errors_if_key_within_some_inner_table_is_missing() {
+        let for_key = String::from("country").tagged_unknown();
+
+        let nu_releases = row(indexmap! {
+            "August 23-2019".into() =>  table(&vec![
+                    row(indexmap!{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("August 23-2019")}),
+                    row(indexmap!{"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("August 23-2019")}),
+                    row(indexmap!{"name".into() => string("YK"),  "country".into() => string("US"), "date".into() => string("August 23-2019")})
+            ]),
+            "Sept 24-2019".into() =>  table(&vec![
+                    row(indexmap!{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("Sept 24-2019")}),
+                    row(indexmap!{"name".into() => Value::string("JT").tagged(Tag::from(Span::new(5,10))), "date".into() => string("Sept 24-2019")}),
+                    row(indexmap!{"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("Sept 24-2019")})
+            ]),
+            "October 10-2019".into() =>  table(&vec![
+                    row(indexmap!{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("October 10-2019")}),
+                    row(indexmap!{"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("October 10-2019")}),
+                    row(indexmap!{"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("October 10-2019")})
+            ])
+        });
+
+        assert!(split(&for_key, &nu_releases, Tag::from(Span::new(5, 10))).is_err());
+    }
+}