uniq-by command (#7295)

New command `uniq-by` that returns unique rows, comparing only the given column(s).

Closes https://github.com/nushell/nushell/issues/7109
This commit is contained in:
raccmonteiro 2022-12-02 10:36:01 +00:00 committed by GitHub
parent f491d3e1e1
commit fcdc474731
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 442 additions and 10 deletions

View File

@ -132,6 +132,7 @@ pub fn create_default_context() -> EngineState {
SplitList,
Transpose,
Uniq,
UniqBy,
Upsert,
Update,
UpdateCells,

View File

@ -41,6 +41,7 @@ mod split_by;
mod take;
mod transpose;
mod uniq;
mod uniq_by;
mod update;
mod update_cells;
mod upsert;
@ -93,6 +94,7 @@ pub use split_by::SplitBy;
pub use take::*;
pub use transpose::Transpose;
pub use uniq::*;
pub use uniq_by::UniqBy;
pub use update::Update;
pub use update_cells::UpdateCells;
pub use upsert::Upsert;

View File

@ -1,7 +1,8 @@
use nu_protocol::ast::Call;
use nu_protocol::engine::{Command, EngineState, Stack};
use nu_protocol::{
Category, Example, IntoPipelineData, PipelineData, Signature, Span, Type, Value,
Category, Example, IntoPipelineData, PipelineData, PipelineMetadata, Signature, Span, Type,
Value,
};
#[derive(Clone)]
@ -63,7 +64,19 @@ impl Command for Uniq {
call: &Call,
input: PipelineData,
) -> Result<nu_protocol::PipelineData, nu_protocol::ShellError> {
uniq(engine_state, stack, call, input)
let mapper = Box::new(move |ms: ItemMapperState| -> ValueCounter {
item_mapper(ms.item, ms.flag_ignore_case)
});
let metadata = input.metadata();
uniq(
engine_state,
stack,
call,
input.into_iter().collect(),
mapper,
metadata,
)
}
fn examples(&self) -> Vec<Example> {
@ -123,7 +136,16 @@ impl Command for Uniq {
}
}
struct ValueCounter {
/// Per-item input passed to the mapper closure that `uniq` calls for every
/// input value.
pub struct ItemMapperState {
/// The input value currently being mapped.
pub item: Value,
/// Whether the `ignore-case` flag was set on the call.
pub flag_ignore_case: bool,
}
/// Mapper used by plain `uniq`: wraps `item` in a `ValueCounter` built with
/// `ValueCounter::new`, so the item itself serves as the comparison key.
fn item_mapper(item: Value, flag_ignore_case: bool) -> ValueCounter {
ValueCounter::new(item, flag_ignore_case)
}
pub struct ValueCounter {
val: Value,
val_to_compare: Value,
count: i64,
@ -137,12 +159,15 @@ impl PartialEq<Self> for ValueCounter {
impl ValueCounter {
fn new(val: Value, flag_ignore_case: bool) -> Self {
Self::new_vals_to_compare(val.clone(), flag_ignore_case, val)
}
pub fn new_vals_to_compare(val: Value, flag_ignore_case: bool, vals_to_compare: Value) -> Self {
ValueCounter {
val: val.clone(),
val,
val_to_compare: if flag_ignore_case {
clone_to_lowercase(&val)
clone_to_lowercase(&vals_to_compare)
} else {
val
vals_to_compare
},
count: 1,
}
@ -193,22 +218,29 @@ fn generate_results_with_count(head: Span, uniq_values: Vec<ValueCounter>) -> Ve
.collect()
}
fn uniq(
pub fn uniq(
_engine_state: &EngineState,
_stack: &mut Stack,
call: &Call,
input: PipelineData,
input: Vec<Value>,
item_mapper: Box<dyn Fn(ItemMapperState) -> ValueCounter>,
metadata: Option<PipelineMetadata>,
) -> Result<nu_protocol::PipelineData, nu_protocol::ShellError> {
let head = call.head;
let flag_show_count = call.has_flag("count");
let flag_show_repeated = call.has_flag("repeated");
let flag_ignore_case = call.has_flag("ignore-case");
let flag_only_uniques = call.has_flag("unique");
let metadata = input.metadata();
// let metadata = input.metadata();
let mut uniq_values = input
.into_iter()
.map(|item| ValueCounter::new(item, flag_ignore_case))
.map(|item| {
item_mapper(ItemMapperState {
item,
flag_ignore_case,
})
})
.fold(Vec::<ValueCounter>::new(), |mut counter, item| {
match counter
.iter_mut()

View File

@ -0,0 +1,174 @@
pub use super::uniq;
use nu_engine::column::nonexistent_column;
use nu_engine::CallExt;
use nu_protocol::ast::Call;
use nu_protocol::engine::{Command, EngineState, Stack};
use nu_protocol::{
Category, Example, PipelineData, ShellError, Signature, Span, SyntaxShape, Type, Value,
};
/// The `uniq-by` command: returns the distinct values of the input, compared
/// by the given column(s).
#[derive(Clone)]
pub struct UniqBy;
impl Command for UniqBy {
    /// Name under which the command is registered.
    fn name(&self) -> &str {
        "uniq-by"
    }

    /// Declares the rest parameter (`columns`) and the `count`, `repeated`,
    /// `ignore-case` and `unique` switches.
    fn signature(&self) -> Signature {
        Signature::build("uniq-by")
            .input_output_types(vec![(Type::Table(vec![]), Type::Table(vec![]))])
            .rest("columns", SyntaxShape::Any, "the column(s) to filter by")
            .switch(
                "count",
                "Return a table containing the distinct input values together with their counts",
                Some('c'),
            )
            .switch(
                "repeated",
                "Return the input values that occur more than once",
                Some('d'),
            )
            .switch(
                "ignore-case",
                "Ignore differences in case when comparing input values",
                Some('i'),
            )
            .switch(
                "unique",
                "Return the input values that occur once only",
                Some('u'),
            )
            .category(Category::Filters)
    }

    /// One-line help text.
    fn usage(&self) -> &str {
        "Return the distinct values in the input by the given column(s)."
    }

    /// Extra keywords under which the command can be found.
    fn search_terms(&self) -> Vec<&str> {
        vec!["distinct", "deduplicate"]
    }

    /// Collects the input, validates the requested columns against it and
    /// delegates to `uniq` with a mapper that compares rows by those columns.
    ///
    /// # Errors
    ///
    /// Returns `ShellError::MissingParameter` when no column is given, and
    /// propagates any error produced by `validate` or `uniq`.
    fn run(
        &self,
        engine_state: &EngineState,
        stack: &mut Stack,
        call: &Call,
        input: PipelineData,
    ) -> Result<nu_protocol::PipelineData, nu_protocol::ShellError> {
        let columns: Vec<String> = call.rest(engine_state, stack, 0)?;

        if columns.is_empty() {
            return Err(ShellError::MissingParameter("columns".into(), call.head));
        }

        // Read the metadata first: `into_iter` below consumes `input`.
        let metadata = input.metadata();

        let vec: Vec<_> = input.into_iter().collect();

        // Validation only needs to look at the rows, so lend them instead of
        // deep-cloning the entire input.
        validate(&vec, &columns, call.head)?;

        let mapper = Box::new(item_mapper_by_col(columns));

        uniq(engine_state, stack, call, vec, mapper, metadata)
    }

    /// Examples shown in the help output (and run by the example test).
    fn examples(&self) -> Vec<Example> {
        vec![Example {
            description: "Get rows from table filtered by column uniqueness ",
            example: "[[fruit count]; [apple 9] [apple 2] [pear 3] [orange 7]] | uniq-by fruit",
            result: Some(Value::List {
                vals: vec![
                    Value::test_record(
                        vec!["fruit", "count"],
                        vec![Value::test_string("apple"), Value::test_int(9)],
                    ),
                    Value::test_record(
                        vec!["fruit", "count"],
                        vec![Value::test_string("pear"), Value::test_int(3)],
                    ),
                    Value::test_record(
                        vec!["fruit", "count"],
                        vec![Value::test_string("orange"), Value::test_int(7)],
                    ),
                ],
                span: Span::test_data(),
            }),
        }]
    }
}

/// Checks that `columns` can be applied to the collected input `vec`.
///
/// Only the first value is inspected, and only when it is a record.
///
/// # Errors
///
/// * a generic "no values to work with" error when `vec` is empty;
/// * a generic "expected name" error when `columns` is empty;
/// * `ShellError::CantFindColumn` when a requested column is absent from the
///   first record.
fn validate(vec: &[Value], columns: &[String], span: Span) -> Result<(), ShellError> {
    let first = match vec.first() {
        Some(first) => first,
        None => {
            return Err(ShellError::GenericError(
                "no values to work with".to_string(),
                "".to_string(),
                None,
                Some("no values to work with".to_string()),
                Vec::new(),
            ));
        }
    };

    if let Value::Record {
        cols,
        span: val_span,
        ..
    } = first
    {
        if columns.is_empty() {
            // This uses the same format as the 'requires a column name' error in split_by.rs
            return Err(ShellError::GenericError(
                "expected name".into(),
                "requires a column name to filter table data".into(),
                Some(span),
                None,
                Vec::new(),
            ));
        }

        if let Some(nonexistent) = nonexistent_column(columns.to_vec(), cols.to_vec()) {
            return Err(ShellError::CantFindColumn(nonexistent, span, *val_span));
        }
    }

    Ok(())
}
fn get_data_by_columns(columns: &[String], item: &Value) -> Vec<Value> {
columns
.iter()
.filter_map(|col| item.get_data_by_key(col))
.collect::<Vec<_>>()
}
/// Builds the item mapper used by `uniq-by`.
///
/// The returned closure keeps the whole item as the value of the resulting
/// `ValueCounter`, but uses only the values found under `cols` (gathered into
/// a list) as the key the item is compared by.
fn item_mapper_by_col(cols: Vec<String>) -> impl Fn(crate::ItemMapperState) -> crate::ValueCounter {
    // The closure is returned as-is; the caller already boxes the result, so
    // boxing here too would only add a second layer of indirection.
    move |ms: crate::ItemMapperState| -> crate::ValueCounter {
        let item_column_values = get_data_by_columns(&cols, &ms.item);

        let col_vals = Value::List {
            vals: item_column_values,
            span: Span { start: 0, end: 0 },
        };

        crate::ValueCounter::new_vals_to_compare(ms.item, ms.flag_ignore_case, col_vals)
    }
}
#[cfg(test)]
mod test {
    use super::UniqBy;

    /// Hands the command to the crate-wide `test_examples` helper.
    #[test]
    fn test_examples() {
        // Called by full path: a module-level `use` of the helper would
        // collide with this function's own name.
        crate::test_examples(UniqBy)
    }
}

View File

@ -88,6 +88,7 @@ mod touch;
mod transpose;
mod try_;
mod uniq;
mod uniq_by;
mod update;
mod upsert;
mod url;

View File

@ -0,0 +1,222 @@
use nu_test_support::fs::Stub::FileWithContentToBeTrimmed;
use nu_test_support::playground::Playground;
use nu_test_support::{nu, pipeline};
/// Rows sharing a `last_name` collapse into one: the two `Turner` rows count
/// once, leaving 3 of the 4 CSV rows.
#[test]
fn removes_duplicate_rows() {
    // The sandbox name is specific to `uniq-by` so it cannot clash with a
    // sandbox directory created by another command's tests running in parallel.
    Playground::setup("uniq_by_test_1", |dirs, sandbox| {
        sandbox.with_files(vec![FileWithContentToBeTrimmed(
            "los_tres_caballeros.csv",
            r#"
                first_name,last_name,rusty_at,type
                Andrés,Robalino,10/11/2013,A
                Afonso,Turner,10/12/2013,B
                Yehuda,Katz,10/11/2013,A
                Jonathan,Turner,11/12/2011,O
            "#,
        )]);

        let actual = nu!(
            cwd: dirs.test(), pipeline(
            r#"
                open los_tres_caballeros.csv
                | uniq-by last_name
                | length
            "#
        ));

        assert_eq!(actual.out, "3");
    })
}
/// Two records with the same `a` value are duplicates for `uniq-by a`, even
/// when their keys are written in a different order and `b` differs.
#[test]
fn uniq_when_keys_out_of_order() {
    let actual = nu!(
        cwd: "tests/fixtures/formats", pipeline(
        r#"
            [{"a": "a", "b": [1,2,3]}, {"b": [1,2,3,4], "a": "a"}]
            | uniq-by a
        "#
    ));

    let expected = nu!(
        cwd: "tests/fixtures/formats", pipeline(
        r#"
            echo [{"a": "a", "b": [1,2,3]}]
        "#
    ));

    assert_eq!(actual.out, expected.out);
}
/// With `--count`, every distinct `item` is reported with the number of times
/// it occurred: `A` twice, `B` once.
#[test]
fn uniq_counting() {
    // "A" appears twice in the input.
    let count_of_a = nu!(
        cwd: "tests/fixtures/formats", pipeline(
        r#"
            ["A", "B", "A"]
            | wrap item
            | uniq-by item --count
            | flatten
            | where item == A
            | get count
            | get 0
        "#
    ));
    assert_eq!(count_of_a.out, "2");

    // "B" appears once.
    let count_of_b = nu!(
        cwd: "tests/fixtures/formats", pipeline(
        r#"
            echo ["A", "B", "A"]
            | wrap item
            | uniq-by item --count
            | flatten
            | where item == B
            | get count
            | get 0
        "#
    ));
    assert_eq!(count_of_b.out, "1");
}
/// With `--unique`, only values that occur exactly once are kept: `1` occurs
/// twice and is dropped.
#[test]
fn uniq_unique() {
    let actual = nu!(
        cwd: "tests/fixtures/formats", pipeline(
        r#"
            echo [1 2 3 4 1 5]
            | wrap item
            | uniq-by item --unique
            | get item
        "#
    ));

    let expected = nu!(
        cwd: "tests/fixtures/formats", pipeline(
        r#"
            echo [2 3 4 5]
        "#
    ));

    assert_eq!(actual.out, expected.out);
}
/// `uniq-by fruit` keeps the first row seen for each `fruit`; the comparison
/// is case-sensitive, so `apple` and `Apple` are both kept.
#[test]
fn table() {
    let actual = nu!(
        cwd: "tests/fixtures/formats", pipeline(
        r#"
            [[fruit day]; [apple monday] [apple friday] [Apple friday] [apple monday] [pear monday] [orange tuesday]]
            | uniq-by fruit
        "#
    ));

    let expected = nu!(
        cwd: "tests/fixtures/formats", pipeline(
        r#"
            echo [[fruit day]; [apple monday] [Apple friday] [pear monday] [orange tuesday]]
        "#
    ));

    assert_eq!(actual.out, expected.out);
}
/// With several columns, rows are duplicates only when all of the named
/// columns match: just the second `apple monday` row is dropped.
#[test]
fn uniq_by_multiple_columns() {
    let actual = nu!(
        cwd: "tests/fixtures/formats", pipeline(
        r#"
            [[fruit day]; [apple monday] [apple friday] [Apple friday] [apple monday] [pear monday] [orange tuesday]]
            | uniq-by fruit day
        "#
    ));

    let expected = nu!(
        cwd: "tests/fixtures/formats", pipeline(
        r#"
            echo [[fruit day]; [apple monday] [apple friday] [Apple friday] [pear monday] [orange tuesday]]
        "#
    ));

    assert_eq!(actual.out, expected.out);
}
/// With `-i`, case differences inside nested table values are ignored: the
/// third row differs from the first only by `Bitoque` vs `bitoque` and is
/// therefore treated as a duplicate.
#[test]
fn table_with_ignore_case() {
    let actual = nu!(
        cwd: "tests/fixtures/formats", pipeline(
        r#"
            [[origin, people];
                [World, (
                    [[name, meal];
                        ['Geremias', {plate: 'bitoque', carbs: 100}]
                    ]
                )],
                [World, (
                    [[name, meal];
                        ['Martin', {plate: 'bitoque', carbs: 100}]
                    ]
                )],
                [World, (
                    [[name, meal];
                        ['Geremias', {plate: 'Bitoque', carbs: 100}]
                    ]
                )],
            ] | uniq-by people -i
        "#
    ));

    let expected = nu!(
        cwd: "tests/fixtures/formats", pipeline(
        r#"
            echo [[origin, people];
                [World, (
                    [[name, meal];
                        ['Geremias', {plate: 'bitoque', carbs: 100}]
                    ]
                )],
                [World, (
                    [[name, meal];
                        ['Martin', {plate: 'bitoque', carbs: 100}]
                    ]
                )],
            ]
        "#
    ));

    assert_eq!(actual.out, expected.out);
}
/// Calling `uniq-by` without any column name is reported as a missing
/// parameter.
#[test]
fn missing_parameter() {
    let outcome = nu!(
        cwd: "tests/fixtures/formats", pipeline(
        r#"
            [11 22 33] | uniq-by
        "#
    ));

    let stderr = &outcome.err;
    assert!(stderr.contains("missing parameter"));
}
/// Naming a column that the table does not have is reported as a
/// column-not-found error mentioning that column.
#[test]
fn wrong_column() {
    let outcome = nu!(
        cwd: "tests/fixtures/formats", pipeline(
        r#"
            [[fruit day]; [apple monday] [apple friday]]
            | uniq-by column1
        "#
    ));

    let stderr = &outcome.err;
    assert!(stderr.contains("cannot find column 'column1'"));
}