From f37f29b441cf98a33cb918724ae3c1c638245223 Mon Sep 17 00:00:00 2001 From: Ryan Blecher Date: Mon, 30 Dec 2019 23:05:02 -0500 Subject: [PATCH] Add uniq command (#1132) * start playing with ways to use the uniq command * WIP * Got uniq working, but still need to figure out args issue and add tests * Add some tests for uniq * fmt * remove commented out code * Add documentation and some additional tests showing uniq values and rows. Also removed args TODO * add changes that didn't get committed * whoops, I didn't save the docs correctly... * fmt * Add a test for uniq with nested json * Add another test * Fix unique-ness when json keys are out of order and make the test json more complicated --- crates/nu-protocol/src/value.rs | 4 +- crates/nu-protocol/src/value/dict.rs | 10 ++ crates/nu-protocol/src/value/primitive.rs | 2 +- crates/nu-protocol/src/value/range.rs | 4 +- docs/commands/uniq.rs | 36 +++++++ src/cli.rs | 1 + src/commands.rs | 2 + src/commands/uniq.rs | 48 +++++++++ src/utils.rs | 4 + tests/commands/mod.rs | 1 + tests/commands/uniq.rs | 118 ++++++++++++++++++++++ tests/commands/where_.rs | 2 +- tests/fixtures/formats/nested_uniq.json | 72 +++++++++++++ 13 files changed, 298 insertions(+), 6 deletions(-) create mode 100644 docs/commands/uniq.rs create mode 100644 src/commands/uniq.rs create mode 100644 tests/commands/uniq.rs create mode 100644 tests/fixtures/formats/nested_uniq.json diff --git a/crates/nu-protocol/src/value.rs b/crates/nu-protocol/src/value.rs index 5082799996..2569f41f33 100644 --- a/crates/nu-protocol/src/value.rs +++ b/crates/nu-protocol/src/value.rs @@ -23,7 +23,7 @@ use serde::{Deserialize, Serialize}; use std::path::PathBuf; use std::time::SystemTime; -#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash, Serialize, Deserialize)] pub enum UntaggedValue { Primitive(Primitive), Row(Dictionary), @@ -182,7 +182,7 @@ impl UntaggedValue { } } 
-#[derive(Debug, Clone, PartialOrd, PartialEq, Ord, Eq, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialOrd, PartialEq, Ord, Eq, Hash, Serialize, Deserialize)] pub struct Value { pub value: UntaggedValue, pub tag: Tag, diff --git a/crates/nu-protocol/src/value/dict.rs b/crates/nu-protocol/src/value/dict.rs index 0cb47e6232..2d02d8da62 100644 --- a/crates/nu-protocol/src/value/dict.rs +++ b/crates/nu-protocol/src/value/dict.rs @@ -7,6 +7,7 @@ use indexmap::IndexMap; use nu_source::{b, DebugDocBuilder, PrettyDebug, Spanned, Tag}; use serde::{Deserialize, Serialize}; use std::cmp::{Ord, Ordering, PartialOrd}; +use std::hash::{Hash, Hasher}; #[derive(Debug, Default, Serialize, Deserialize, PartialEq, Eq, Clone, Getters, new)] pub struct Dictionary { @@ -14,6 +15,15 @@ pub struct Dictionary { pub entries: IndexMap<String, Value>, } +impl Hash for Dictionary { + fn hash<H: Hasher>(&self, state: &mut H) { + let mut entries = self.entries.clone(); + entries.sort_keys(); + entries.keys().collect::<Vec<&String>>().hash(state); + entries.values().collect::<Vec<&Value>>().hash(state); + } +} + impl PartialOrd for Dictionary { fn partial_cmp(&self, other: &Dictionary) -> Option<Ordering> { let this: Vec<&String> = self.entries.keys().collect(); diff --git a/crates/nu-protocol/src/value/primitive.rs b/crates/nu-protocol/src/value/primitive.rs index 6176be9974..47dae4c203 100644 --- a/crates/nu-protocol/src/value/primitive.rs +++ b/crates/nu-protocol/src/value/primitive.rs @@ -12,7 +12,7 @@ use num_traits::cast::{FromPrimitive, ToPrimitive}; use serde::{Deserialize, Serialize}; use std::path::PathBuf; -#[derive(Debug, Clone, Ord, PartialOrd, Eq, PartialEq, Deserialize, Serialize)] +#[derive(Debug, Clone, Ord, PartialOrd, Eq, PartialEq, Hash, Deserialize, Serialize)] pub enum Primitive { Nothing, #[serde(with = "serde_bigint")] diff --git a/crates/nu-protocol/src/value/range.rs b/crates/nu-protocol/src/value/range.rs index f1f9a90fa8..81876ff9f6 100644 --- a/crates/nu-protocol/src/value/range.rs +++ 
b/crates/nu-protocol/src/value/range.rs @@ -3,7 +3,7 @@ use derive_new::new; use nu_source::{b, DebugDocBuilder, Spanned}; use serde::{Deserialize, Serialize}; -#[derive(Debug, Copy, Clone, PartialEq, PartialOrd, Eq, Ord, Serialize, Deserialize, Hash)] +#[derive(Debug, Copy, Clone, PartialEq, PartialOrd, Eq, Ord, Hash, Serialize, Deserialize)] pub enum RangeInclusion { Inclusive, Exclusive, @@ -25,7 +25,7 @@ impl RangeInclusion { } } -#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Serialize, Deserialize, new)] +#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash, Serialize, Deserialize, new)] pub struct Range { pub from: (Spanned<Primitive>, RangeInclusion), pub to: (Spanned<Primitive>, RangeInclusion), diff --git a/docs/commands/uniq.rs b/docs/commands/uniq.rs new file mode 100644 index 0000000000..d80f66d164 --- /dev/null +++ b/docs/commands/uniq.rs @@ -0,0 +1,36 @@ +# uniq + +Returns unique rows or values from a dataset. + +## Examples + +Given a file `test.csv` + +``` +first_name,last_name,rusty_at,type +Andrés,Robalino,10/11/2013,A +Andrés,Robalino,10/11/2013,A +Jonathan,Turner,10/12/2013,B +Yehuda,Katz,10/11/2013,A +``` + +``` +> `open test.csv | uniq` +━━━┯━━━━━━━━━━━━┯━━━━━━━━━━━┯━━━━━━━━━━━━┯━━━━━━ + # │ first_name │ last_name │ rusty_at │ type +───┼────────────┼───────────┼────────────┼────── +0 │ Andrés │ Robalino │ 10/11/2013 │ A +1 │ Jonathan │ Turner │ 10/12/2013 │ B +2 │ Yehuda │ Katz │ 10/11/2013 │ A +━━━┷━━━━━━━━━━━━┷━━━━━━━━━━━┷━━━━━━━━━━━━┷━━━━━━ +``` + +``` +> `open test.csv | get type | uniq` +━━━┯━━━━━━━━━ +# │ <value> +───┼───────── +0 │ A +1 │ B +━━━┷━━━━━━━━━ +``` diff --git a/src/cli.rs b/src/cli.rs index 617ebffac9..0551e246c1 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -294,6 +294,7 @@ pub async fn cli() -> Result<(), Box<dyn Error>> { whole_stream_command(Default), whole_stream_command(SkipWhile), whole_stream_command(Range), + whole_stream_command(Uniq), // Table manipulation whole_stream_command(Wrap), whole_stream_command(Pivot), diff --git a/src/commands.rs 
b/src/commands.rs index 818abef32a..200f6fd115 100644 --- a/src/commands.rs +++ b/src/commands.rs @@ -90,6 +90,7 @@ pub(crate) mod to_tsv; pub(crate) mod to_url; pub(crate) mod to_yaml; pub(crate) mod trim; +pub(crate) mod uniq; pub(crate) mod version; pub(crate) mod what; pub(crate) mod where_; @@ -185,6 +186,7 @@ pub(crate) use to_tsv::ToTSV; pub(crate) use to_url::ToURL; pub(crate) use to_yaml::ToYAML; pub(crate) use trim::Trim; +pub(crate) use uniq::Uniq; pub(crate) use version::Version; pub(crate) use what::What; pub(crate) use where_::Where; diff --git a/src/commands/uniq.rs b/src/commands/uniq.rs new file mode 100644 index 0000000000..878354078d --- /dev/null +++ b/src/commands/uniq.rs @@ -0,0 +1,48 @@ +use crate::commands::WholeStreamCommand; +use crate::context::CommandRegistry; +use crate::prelude::*; +use indexmap::set::IndexSet; +use nu_errors::ShellError; +use nu_protocol::{ReturnSuccess, Signature}; + +#[derive(Deserialize)] +struct UniqArgs {} + +pub struct Uniq; + +impl WholeStreamCommand for Uniq { + fn name(&self) -> &str { + "uniq" + } + + fn signature(&self) -> Signature { + Signature::build("uniq") + } + + fn usage(&self) -> &str { + "Return the unique rows" + } + + fn run( + &self, + args: CommandArgs, + registry: &CommandRegistry, + ) -> Result<OutputStream, ShellError> { + args.process(registry, uniq)?.run() + } +} + +fn uniq( + UniqArgs {}: UniqArgs, + RunnableContext { input, .. }: RunnableContext, +) -> Result<OutputStream, ShellError> { + let stream = async_stream!
{ + let uniq_values: IndexSet<_> = input.values.collect().await; + + for item in uniq_values.iter().map(|row| ReturnSuccess::value(row.clone())) { + yield item; + } + }; + + Ok(stream.to_output_stream()) +} diff --git a/src/utils.rs b/src/utils.rs index d25aaa0bfc..fa7e93c15a 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -325,6 +325,10 @@ mod tests { loc: fixtures().join("jonathan.xml"), at: 0 }, + Res { + loc: fixtures().join("nested_uniq.json"), + at: 0 + }, Res { loc: fixtures().join("sample.bson"), at: 0 diff --git a/tests/commands/mod.rs b/tests/commands/mod.rs index badd7628f8..80c2d48ca8 100644 --- a/tests/commands/mod.rs +++ b/tests/commands/mod.rs @@ -26,5 +26,6 @@ mod save; mod sort_by; mod split_by; mod split_column; +mod uniq; mod where_; mod wrap; diff --git a/tests/commands/uniq.rs b/tests/commands/uniq.rs new file mode 100644 index 0000000000..f908224004 --- /dev/null +++ b/tests/commands/uniq.rs @@ -0,0 +1,118 @@ +use nu_test_support::fs::Stub::FileWithContentToBeTrimmed; +use nu_test_support::playground::Playground; +use nu_test_support::{nu, pipeline}; + +#[test] +fn uniq_rows() { + Playground::setup("uniq_test_1", |dirs, sandbox| { + sandbox.with_files(vec![FileWithContentToBeTrimmed( + "los_tres_caballeros.csv", + r#" + first_name,last_name,rusty_at,type + Andrés,Robalino,10/11/2013,A + Jonathan,Turner,10/12/2013,B + Yehuda,Katz,10/11/2013,A + Jonathan,Turner,10/12/2013,B + Yehuda,Katz,10/11/2013,A + "#, + )]); + + let actual = nu!( + cwd: dirs.test(), pipeline( + r#" + open los_tres_caballeros.csv + | uniq + | count + | echo $it + "# + )); + + assert_eq!(actual, "3"); + }) +} + +#[test] +fn uniq_columns() { + Playground::setup("uniq_test_2", |dirs, sandbox| { + sandbox.with_files(vec![FileWithContentToBeTrimmed( + "los_tres_caballeros.csv", + r#" + first_name,last_name,rusty_at,type + Andrés,Robalino,10/11/2013,A + Jonathan,Turner,10/12/2013,B + Yehuda,Katz,10/11/2013,A + Jonathan,Turner,10/12/2013,B + Yehuda,Katz,10/11/2013,A + "#, + )]); + 
+ let actual = nu!( + cwd: dirs.test(), pipeline( + r#" + open los_tres_caballeros.csv + | pick rusty_at type + | uniq + | count + | echo $it + "# + )); + + assert_eq!(actual, "2"); + }) +} + +#[test] +fn uniq_values() { + Playground::setup("uniq_test_3", |dirs, sandbox| { + sandbox.with_files(vec![FileWithContentToBeTrimmed( + "los_tres_caballeros.csv", + r#" + first_name,last_name,rusty_at,type + Andrés,Robalino,10/11/2013,A + Jonathan,Turner,10/12/2013,B + Yehuda,Katz,10/11/2013,A + Jonathan,Turner,10/12/2013,B + Yehuda,Katz,10/11/2013,A + "#, + )]); + + let actual = nu!( + cwd: dirs.test(), pipeline( + r#" + open los_tres_caballeros.csv + | pick get type + | uniq + | count + | echo $it + "# + )); + + assert_eq!(actual, "2"); + }) +} + +#[test] +fn uniq_when_keys_out_of_order() { + let actual = nu!( + cwd: "tests/fixtures/formats", pipeline( + r#" + echo '[{"a": "a", "b": [1,2,3]},{"b": [1,2,3], "a": "a"}]' + | from-json + | uniq + | count + | echo $it + "# + )); + + assert_eq!(actual, "1"); +} + +#[test] +fn uniq_nested_json_structures() { + let actual = nu!( + cwd: "tests/fixtures/formats", + "open nested_uniq.json | uniq | count | echo $it" + ); + + assert_eq!(actual, "3"); +} diff --git a/tests/commands/where_.rs b/tests/commands/where_.rs index ca84bd7f99..4b8f4af979 100644 --- a/tests/commands/where_.rs +++ b/tests/commands/where_.rs @@ -7,7 +7,7 @@ fn filters_by_unit_size_comparison() { "ls | where size > 1kb | sort-by size | get name | first 1 | trim | echo $it" ); - assert_eq!(actual, "cargo_sample.toml"); + assert_eq!(actual, "nested_uniq.json"); } #[test] diff --git a/tests/fixtures/formats/nested_uniq.json b/tests/fixtures/formats/nested_uniq.json new file mode 100644 index 0000000000..ac3f132d18 --- /dev/null +++ b/tests/fixtures/formats/nested_uniq.json @@ -0,0 +1,72 @@ +[ + { + "name": "this is duplicated", + "nesting": [ + { + "a": "a", + "b": "b" + }, + { + "c": "c", + "d": "d" + } + ], + "can_be_ordered_differently": { + "array": [1, 2, 3, 4, 
5], + "something": { "else": "works" } + } + }, + { + "can_be_ordered_differently": { + "something": { "else": "works" }, + "array": [1, 2, 3, 4, 5] + }, + "nesting": [ + { + "b": "b", + "a": "a" + }, + { + "d": "d", + "c": "c" + } + ], + "name": "this is duplicated" + }, + { + "name": "this is unique", + "nesting": [ + { + "a": "b", + "b": "a" + }, + { + "c": "d", + "d": "c" + } + ], + "can_be_ordered_differently": { + "array": [], + "something": { "else": "does not work" } + } + }, + { + "name": "this is unique", + "nesting": [ + { + "a": "a", + "b": "b", + "c": "c" + }, + { + "d": "d", + "e": "e", + "f": "f" + } + ], + "can_be_ordered_differently": { + "array": [], + "something": { "else": "works" } + } + } +]