From 392ff286b2357790f66467f7683623bfb701f4a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9s=20N=2E=20Robalino?= Date: Tue, 29 Oct 2019 16:04:31 -0500 Subject: [PATCH 1/2] This commit is ongoing work for making data processing with Nu a joy. Fundamentally we embrace functional programming principles for transforming the dataset from any format picked up by Nu. These table processing "primitive" commands will build up and make pipelines composable with data processing capabilities, allowing us to evaluate, reduce, and map the tables, even to the point of composing these operations declaratively. In this regard, `split-by` expects a table with grouped data, and we can use it further in interesting ways (e.g. collecting labels for visualizing the data in charts and/or shaping it for a particular chart of interest). --- README.md | 1 + src/cli.rs | 1 + src/commands.rs | 2 + src/commands/group_by.rs | 183 +++++++++++++++++++++------- src/commands/split_by.rs | 256 +++++++++++++++++++++++++++++++++++++++ tests/commands_test.rs | 72 +++++++++-- 6 files changed, 461 insertions(+), 54 deletions(-) create mode 100644 src/commands/split_by.rs diff --git a/README.md b/README.md index 64ff0e801..1deb11a20 100644 --- a/README.md +++ b/README.md @@ -262,6 +262,7 @@ Nu adheres closely to a set of goals that make up its design philosophy. As feat | pivot --header-row | Pivot the tables, making columns into rows and vice versa | | reject ...columns | Remove the given columns from the table | | reverse | Reverses the table. | +| split-by column | Creates a new table with the data from the inner tables splitted by the column given | | skip amount | Skip a number of rows | | skip-while condition | Skips rows while the condition matches. 
| | sort-by ...columns | Sort by the given columns | diff --git a/src/cli.rs b/src/cli.rs index 9661cb320..a66dd6cbb 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -259,6 +259,7 @@ pub async fn cli() -> Result<(), Box> { whole_stream_command(Previous), whole_stream_command(Debug), whole_stream_command(Shells), + whole_stream_command(SplitBy), whole_stream_command(SplitColumn), whole_stream_command(SplitRow), whole_stream_command(Lines), diff --git a/src/commands.rs b/src/commands.rs index c75ca8119..0a71a9363 100644 --- a/src/commands.rs +++ b/src/commands.rs @@ -55,6 +55,7 @@ pub(crate) mod shells; pub(crate) mod size; pub(crate) mod skip_while; pub(crate) mod sort_by; +pub(crate) mod split_by; pub(crate) mod split_column; pub(crate) mod split_row; pub(crate) mod table; @@ -129,6 +130,7 @@ pub(crate) use shells::Shells; pub(crate) use size::Size; pub(crate) use skip_while::SkipWhile; pub(crate) use sort_by::SortBy; +pub(crate) use split_by::SplitBy; pub(crate) use split_column::SplitColumn; pub(crate) use split_row::SplitRow; pub(crate) use table::Table; diff --git a/src/commands/group_by.rs b/src/commands/group_by.rs index f36d3f57d..66c1360f5 100644 --- a/src/commands/group_by.rs +++ b/src/commands/group_by.rs @@ -36,59 +36,154 @@ impl WholeStreamCommand for GroupBy { } } -fn group_by( +pub fn group_by( GroupByArgs { column_name }: GroupByArgs, RunnableContext { input, name, .. }: RunnableContext, ) -> Result { let stream = async_stream! 
{ let values: Vec> = input.values.collect().await; - let mut groups = indexmap::IndexMap::new(); - for value in values { - let group_key = value.get_data_by_key(&column_name.item); - - if group_key.is_none() { - - let possibilities = value.data_descriptors(); - - let mut possible_matches: Vec<_> = possibilities - .iter() - .map(|x| (natural::distance::levenshtein_distance(x, &column_name.item), x)) - .collect(); - - possible_matches.sort(); - - let err = { - if possible_matches.len() > 0 { - ShellError::labeled_error( - "Unknown column", - format!("did you mean '{}'?", possible_matches[0].1), - &column_name.tag,) - } else { - ShellError::labeled_error( - "Unknown column", - "row does not contain this column", - &column_name.tag, - ) - } - }; - - yield Err(err) - } else { - let group_key = group_key.unwrap().as_string()?; - let mut group = groups.entry(group_key).or_insert(vec![]); - group.push(value); + if values.is_empty() { + yield Err(ShellError::labeled_error( + "Expected table from pipeline", + "requires a table input", + column_name.span() + )) + } else { + match group(&column_name, values, name) { + Ok(grouped) => yield ReturnSuccess::value(grouped), + Err(err) => yield Err(err) } } - - let mut out = TaggedDictBuilder::new(name.clone()); - - for (k,v) in groups.iter() { - out.insert(k, Value::table(v)); - } - - yield ReturnSuccess::value(out) }; Ok(stream.to_output_stream()) } + +pub fn group( + column_name: &Tagged, + values: Vec>, + tag: impl Into, +) -> Result, ShellError> { + let tag = tag.into(); + + let mut groups = indexmap::IndexMap::new(); + + for value in values { + let group_key = value.get_data_by_key(column_name); + + if group_key.is_none() { + let possibilities = value.data_descriptors(); + + let mut possible_matches: Vec<_> = possibilities + .iter() + .map(|x| (natural::distance::levenshtein_distance(x, column_name), x)) + .collect(); + + possible_matches.sort(); + + if possible_matches.len() > 0 { + return Err(ShellError::labeled_error( + 
"Unknown column", + format!("did you mean '{}'?", possible_matches[0].1), + column_name.tag(), + )); + } else { + return Err(ShellError::labeled_error( + "Unknown column", + "row does not contain this column", + column_name.tag(), + )); + } + } + + let group_key = group_key.unwrap().as_string()?; + let group = groups.entry(group_key).or_insert(vec![]); + group.push(value); + } + + let mut out = TaggedDictBuilder::new(&tag); + + for (k, v) in groups.iter() { + out.insert(k, Value::table(v)); + } + + Ok(out.into_tagged_value()) +} + +#[cfg(test)] +mod tests { + + use crate::commands::group_by::group; + use crate::data::meta::*; + use crate::Value; + use indexmap::IndexMap; + + fn string(input: impl Into) -> Tagged { + Value::string(input.into()).tagged_unknown() + } + + fn row(entries: IndexMap>) -> Tagged { + Value::row(entries).tagged_unknown() + } + + fn table(list: &Vec>) -> Tagged { + Value::table(list).tagged_unknown() + } + + #[test] + fn groups_table_by_key() { + let for_key = String::from("date").tagged_unknown(); + + let nu_releases = vec![ + row( + indexmap! {"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("August 23-2019")}, + ), + row( + indexmap! {"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("August 23-2019")}, + ), + row( + indexmap! {"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("October 10-2019")}, + ), + row( + indexmap! {"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("Sept 24-2019")}, + ), + row( + indexmap! {"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("October 10-2019")}, + ), + row( + indexmap! {"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("Sept 24-2019")}, + ), + row( + indexmap! 
{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("October 10-2019")}, + ), + row( + indexmap! {"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("Sept 24-2019")}, + ), + row( + indexmap! {"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("August 23-2019")}, + ), + ]; + + assert_eq!( + group(&for_key, nu_releases, Tag::unknown()).unwrap(), + row(indexmap! { + "August 23-2019".into() => table(&vec![ + row(indexmap!{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("August 23-2019")}), + row(indexmap!{"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("August 23-2019")}), + row(indexmap!{"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("August 23-2019")}) + ]), + "October 10-2019".into() => table(&vec![ + row(indexmap!{"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("October 10-2019")}), + row(indexmap!{"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("October 10-2019")}), + row(indexmap!{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("October 10-2019")}) + ]), + "Sept 24-2019".into() => table(&vec![ + row(indexmap!{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("Sept 24-2019")}), + row(indexmap!{"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("Sept 24-2019")}), + row(indexmap!{"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("Sept 24-2019")}) + ]), + }) + ); + } +} diff --git a/src/commands/split_by.rs b/src/commands/split_by.rs new file mode 100644 index 000000000..b995b041d --- /dev/null +++ b/src/commands/split_by.rs @@ -0,0 +1,256 @@ +use crate::commands::WholeStreamCommand; +use 
crate::data::TaggedDictBuilder; +use crate::errors::ShellError; +use crate::prelude::*; + +pub struct SplitBy; + +#[derive(Deserialize)] +pub struct SplitByArgs { + column_name: Tagged, +} + +impl WholeStreamCommand for SplitBy { + fn name(&self) -> &str { + "split-by" + } + + fn signature(&self) -> Signature { + Signature::build("split-by").required( + "column_name", + SyntaxShape::String, + "the name of the column within the nested table to split by", + ) + } + + fn usage(&self) -> &str { + "Creates a new table with the data from the inner tables splitted by the column given." + } + + fn run( + &self, + args: CommandArgs, + registry: &CommandRegistry, + ) -> Result { + args.process(registry, split_by)?.run() + } +} + +pub fn split_by( + SplitByArgs { column_name }: SplitByArgs, + RunnableContext { input, name, .. }: RunnableContext, +) -> Result { + let stream = async_stream! { + let values: Vec> = input.values.collect().await; + + if values.len() > 1 || values.is_empty() { + yield Err(ShellError::labeled_error( + "Expected table from pipeline", + "requires a table input", + column_name.span() + )) + } else { + match split(&column_name, &values[0], name) { + Ok(split) => yield ReturnSuccess::value(split), + Err(err) => yield Err(err), + } + } + }; + + Ok(stream.to_output_stream()) +} + +pub fn split( + column_name: &Tagged, + value: &Tagged, + tag: impl Into, +) -> Result, ShellError> { + let origin_tag = tag.into(); + + let mut splits = indexmap::IndexMap::new(); + + match value { + Tagged { + item: Value::Row(group_sets), + .. + } => { + for (group_key, group_value) in group_sets.entries.iter() { + match *group_value { + Tagged { + item: Value::Table(ref dataset), + .. + } => { + let group = crate::commands::group_by::group( + &column_name, + dataset.to_vec(), + &origin_tag, + )?; + + match group { + Tagged { + item: Value::Row(o), + .. 
+ } => { + for (split_label, subset) in o.entries.into_iter() { + match subset { + Tagged { + item: Value::Table(subset), + tag, + } => { + let s = splits + .entry(split_label.clone()) + .or_insert(indexmap::IndexMap::new()); + s.insert( + group_key.clone(), + Value::table(&subset).tagged(tag), + ); + } + other => { + return Err(ShellError::type_error( + "a table value", + other.tagged_type_name(), + )) + } + } + } + } + _ => { + return Err(ShellError::type_error( + "a table value", + group.tagged_type_name(), + )) + } + } + } + ref other => { + return Err(ShellError::type_error( + "a table value", + other.tagged_type_name(), + )) + } + } + } + } + _ => { + return Err(ShellError::type_error( + "a table value", + value.tagged_type_name(), + )) + } + } + + let mut out = TaggedDictBuilder::new(&origin_tag); + + for (k, v) in splits.into_iter() { + out.insert(k, Value::row(v)); + } + + Ok(out.into_tagged_value()) +} +#[cfg(test)] +mod tests { + + use crate::commands::split_by::split; + use crate::data::meta::*; + use crate::Value; + use indexmap::IndexMap; + + fn string(input: impl Into) -> Tagged { + Value::string(input.into()).tagged_unknown() + } + + fn row(entries: IndexMap>) -> Tagged { + Value::row(entries).tagged_unknown() + } + + fn table(list: &Vec>) -> Tagged { + Value::table(list).tagged_unknown() + } + + #[test] + fn splits_inner_tables_by_key() { + let for_key = String::from("country").tagged_unknown(); + + let nu_releases = row(indexmap! 
{ + "August 23-2019".into() => table(&vec![ + row(indexmap!{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("August 23-2019")}), + row(indexmap!{"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("August 23-2019")}), + row(indexmap!{"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("August 23-2019")}) + ]), + "Sept 24-2019".into() => table(&vec![ + row(indexmap!{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("Sept 24-2019")}), + row(indexmap!{"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("Sept 24-2019")}), + row(indexmap!{"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("Sept 24-2019")}) + ]), + "October 10-2019".into() => table(&vec![ + row(indexmap!{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("October 10-2019")}), + row(indexmap!{"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("October 10-2019")}), + row(indexmap!{"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("October 10-2019")}) + ]) + }); + + assert_eq!( + split(&for_key, &nu_releases, Tag::unknown()).unwrap(), + Value::row(indexmap! { + "EC".into() => row(indexmap! { + "August 23-2019".into() => table(&vec![ + row(indexmap!{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("August 23-2019")}) + ]), + "Sept 24-2019".into() => table(&vec![ + row(indexmap!{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("Sept 24-2019")}) + ]), + "October 10-2019".into() => table(&vec![ + row(indexmap!{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("October 10-2019")}) + ]) + }), + "NZ".into() => row(indexmap! 
{ + "August 23-2019".into() => table(&vec![ + row(indexmap!{"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("August 23-2019")}) + ]), + "Sept 24-2019".into() => table(&vec![ + row(indexmap!{"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("Sept 24-2019")}) + ]), + "October 10-2019".into() => table(&vec![ + row(indexmap!{"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("October 10-2019")}) + ]) + }), + "US".into() => row(indexmap! { + "August 23-2019".into() => table(&vec![ + row(indexmap!{"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("August 23-2019")}) + ]), + "Sept 24-2019".into() => table(&vec![ + row(indexmap!{"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("Sept 24-2019")}) + ]), + "October 10-2019".into() => table(&vec![ + row(indexmap!{"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("October 10-2019")}) + ]) + }) + }).tagged_unknown() + ); + } + + #[test] + fn errors_if_key_within_some_inner_table_is_missing() { + let for_key = String::from("country").tagged_unknown(); + + let nu_releases = row(indexmap! 
{ + "August 23-2019".into() => table(&vec![ + row(indexmap!{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("August 23-2019")}), + row(indexmap!{"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("August 23-2019")}), + row(indexmap!{"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("August 23-2019")}) + ]), + "Sept 24-2019".into() => table(&vec![ + row(indexmap!{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("Sept 24-2019")}), + row(indexmap!{"name".into() => Value::string("JT").tagged(Tag::from(Span::new(5,10))), "date".into() => string("Sept 24-2019")}), + row(indexmap!{"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("Sept 24-2019")}) + ]), + "October 10-2019".into() => table(&vec![ + row(indexmap!{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("October 10-2019")}), + row(indexmap!{"name".into() => string("JT"), "country".into() => string("NZ"), "date".into() => string("October 10-2019")}), + row(indexmap!{"name".into() => string("YK"), "country".into() => string("US"), "date".into() => string("October 10-2019")}) + ]) + }); + + assert!(split(&for_key, &nu_releases, Tag::from(Span::new(5, 10))).is_err()); + } +} diff --git a/tests/commands_test.rs b/tests/commands_test.rs index 87e1182b1..7b31c6ae4 100644 --- a/tests/commands_test.rs +++ b/tests/commands_test.rs @@ -9,10 +9,10 @@ fn group_by() { sandbox.with_files(vec![FileWithContentToBeTrimmed( "los_tres_caballeros.csv", r#" - first_name,last_name,rusty_luck,type - Andrés,Robalino,1,A - Jonathan,Turner,1,B - Yehuda,Katz,1,A + first_name,last_name,rusty_at,type + Andrés,Robalino,10/11/2013,A + Jonathan,Turner,10/12/2013,B + Yehuda,Katz,10/11/2013,A "#, )]); @@ -20,8 +20,8 @@ fn group_by() { cwd: dirs.test(), h::pipeline( r#" open los_tres_caballeros.csv - | group-by type - 
| get A + | group-by rusty_at + | get "10/11/2013" | count | echo $it "# @@ -37,10 +37,10 @@ fn group_by_errors_if_unknown_column_name() { sandbox.with_files(vec![FileWithContentToBeTrimmed( "los_tres_caballeros.csv", r#" - first_name,last_name,rusty_luck,type - Andrés,Robalino,1,A - Jonathan,Turner,1,B - Yehuda,Katz,1,A + first_name,last_name,rusty_at,type + Andrés,Robalino,10/11/2013,A + Jonathan,Turner,10/12/2013,B + Yehuda,Katz,10/11/2013,A "#, )]); @@ -56,6 +56,58 @@ fn group_by_errors_if_unknown_column_name() { }) } +#[test] +fn split_by() { + Playground::setup("split_by_test_1", |dirs, sandbox| { + sandbox.with_files(vec![FileWithContentToBeTrimmed( + "los_tres_caballeros.csv", + r#" + first_name,last_name,rusty_at,type + Andrés,Robalino,10/11/2013,A + Jonathan,Turner,10/12/2013,B + Yehuda,Katz,10/11/2013,A + "#, + )]); + + let actual = nu!( + cwd: dirs.test(), h::pipeline( + r#" + open los_tres_caballeros.csv + | group-by rusty_at + | split-by type + | get A."10/11/2013" + | count + | echo $it + "# + )); + + assert_eq!(actual, "2"); + }) +} + +#[test] +fn split_by_errors_if_no_table_given_as_input() { + Playground::setup("split_by_test_2", |dirs, sandbox| { + sandbox.with_files(vec![ + EmptyFile("los.txt"), + EmptyFile("tres.txt"), + EmptyFile("amigos.txt"), + EmptyFile("arepas.clu"), + ]); + + let actual = nu_error!( + cwd: dirs.test(), h::pipeline( + r#" + ls + | get name + | split-by type + "# + )); + + assert!(actual.contains("Expected table from pipeline")); + }) +} + #[test] fn first_gets_first_rows_by_amount() { Playground::setup("first_test_1", |dirs, sandbox| { From 889d2bb378c1f5cbe552acdcf06be90bfb6bd478 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9s=20N=2E=20Robalino?= Date: Sun, 3 Nov 2019 16:36:47 -0500 Subject: [PATCH 2/2] Isolate feature. 
--- README.md | 1 - features.toml | 8 ++++++++ src/cli.rs | 3 ++- src/commands.rs | 6 ++++++ tests/commands_test.rs | 2 ++ 5 files changed, 18 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 1deb11a20..64ff0e801 100644 --- a/README.md +++ b/README.md @@ -262,7 +262,6 @@ Nu adheres closely to a set of goals that make up its design philosophy. As feat | pivot --header-row | Pivot the tables, making columns into rows and vice versa | | reject ...columns | Remove the given columns from the table | | reverse | Reverses the table. | -| split-by column | Creates a new table with the data from the inner tables splitted by the column given | | skip amount | Skip a number of rows | | skip-while condition | Skips rows while the condition matches. | | sort-by ...columns | Sort by the given columns | diff --git a/features.toml b/features.toml index f7cea6d9e..e1cf56e33 100644 --- a/features.toml +++ b/features.toml @@ -10,4 +10,12 @@ reason = """ This is laying the groundwork for merging coloring and parsing. It also makes token_nodes.atomic() naturally work with coloring, which is pretty useful on its own. """ +enabled = false + +[data_processing_primitives] + +description = "Groundwork so tables can be data processed" +reason = """ +These will allow take tables and be able to transform, process, and explore. 
+""" enabled = false \ No newline at end of file diff --git a/src/cli.rs b/src/cli.rs index a66dd6cbb..d53e587a3 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -259,7 +259,6 @@ pub async fn cli() -> Result<(), Box> { whole_stream_command(Previous), whole_stream_command(Debug), whole_stream_command(Shells), - whole_stream_command(SplitBy), whole_stream_command(SplitColumn), whole_stream_command(SplitRow), whole_stream_command(Lines), @@ -319,6 +318,8 @@ pub async fn cli() -> Result<(), Box> { whole_stream_command(Table), whole_stream_command(Version), whole_stream_command(Which), + #[cfg(data_processing_primitives)] + whole_stream_command(SplitBy), ]); #[cfg(feature = "clipboard")] diff --git a/src/commands.rs b/src/commands.rs index 0a71a9363..5300a39ec 100644 --- a/src/commands.rs +++ b/src/commands.rs @@ -55,7 +55,10 @@ pub(crate) mod shells; pub(crate) mod size; pub(crate) mod skip_while; pub(crate) mod sort_by; + +#[cfg(data_processing_primitives)] pub(crate) mod split_by; + pub(crate) mod split_column; pub(crate) mod split_row; pub(crate) mod table; @@ -130,7 +133,10 @@ pub(crate) use shells::Shells; pub(crate) use size::Size; pub(crate) use skip_while::SkipWhile; pub(crate) use sort_by::SortBy; + +#[cfg(data_processing_primitives)] pub(crate) use split_by::SplitBy; + pub(crate) use split_column::SplitColumn; pub(crate) use split_row::SplitRow; pub(crate) use table::Table; diff --git a/tests/commands_test.rs b/tests/commands_test.rs index 7b31c6ae4..547f1e008 100644 --- a/tests/commands_test.rs +++ b/tests/commands_test.rs @@ -56,6 +56,7 @@ fn group_by_errors_if_unknown_column_name() { }) } +#[cfg(data_processing_primitives)] #[test] fn split_by() { Playground::setup("split_by_test_1", |dirs, sandbox| { @@ -85,6 +86,7 @@ fn split_by() { }) } +#[cfg(data_processing_primitives)] #[test] fn split_by_errors_if_no_table_given_as_input() { Playground::setup("split_by_test_2", |dirs, sandbox| {