uniq and uniq-by optimization (#7477) (#7534)

# Description

Refactored `uniq` to replace its quadratic-time lookup with a HashMap; each
Value is converted to a string to serve as the key.
I tried to use HashableValue, but it does not look very developed yet, and
that approach was getting more complex and difficult.

This improves performance on large data sets.

Fixes https://github.com/nushell/nushell/issues/7477


# Tests + Formatting
```
> let data = fetch "https://home.treasury.gov/system/files/276/yield-curve-rates-1990-2021.csv"
> $data | uniq
```

It keeps the original attribute order in records:
```
> [ {b:2, a:1} {a:1, b:2} ] | uniq 
╭───┬───┬───╮
│ # │ b │ a │
├───┼───┼───┤
│ 0 │ 2 │ 1 │
╰───┴───┴───╯
```
This commit is contained in:
raccmonteiro 2023-01-04 19:35:49 +00:00 committed by GitHub
parent f0e87da830
commit 75cb3fcc5f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 95 additions and 26 deletions

View File

@ -1,9 +1,13 @@
use crate::formats::value_to_string;
use itertools::Itertools;
use nu_protocol::ast::Call; use nu_protocol::ast::Call;
use nu_protocol::engine::{Command, EngineState, Stack}; use nu_protocol::engine::{Command, EngineState, Stack};
use nu_protocol::{ use nu_protocol::{
Category, Example, IntoPipelineData, PipelineData, PipelineMetadata, Signature, Span, Type, Category, Example, IntoPipelineData, PipelineData, PipelineMetadata, ShellError, Signature,
Value, Span, Type, Value,
}; };
use std::collections::hash_map::IntoIter;
use std::collections::HashMap;
#[derive(Clone)] #[derive(Clone)]
pub struct Uniq; pub struct Uniq;
@ -65,7 +69,7 @@ impl Command for Uniq {
input: PipelineData, input: PipelineData,
) -> Result<nu_protocol::PipelineData, nu_protocol::ShellError> { ) -> Result<nu_protocol::PipelineData, nu_protocol::ShellError> {
let mapper = Box::new(move |ms: ItemMapperState| -> ValueCounter { let mapper = Box::new(move |ms: ItemMapperState| -> ValueCounter {
item_mapper(ms.item, ms.flag_ignore_case) item_mapper(ms.item, ms.flag_ignore_case, ms.index)
}); });
let metadata = input.metadata(); let metadata = input.metadata();
@ -139,16 +143,18 @@ impl Command for Uniq {
pub struct ItemMapperState { pub struct ItemMapperState {
pub item: Value, pub item: Value,
pub flag_ignore_case: bool, pub flag_ignore_case: bool,
pub index: usize,
} }
fn item_mapper(item: Value, flag_ignore_case: bool) -> ValueCounter { fn item_mapper(item: Value, flag_ignore_case: bool, index: usize) -> ValueCounter {
ValueCounter::new(item, flag_ignore_case) ValueCounter::new(item, flag_ignore_case, index)
} }
pub struct ValueCounter { pub struct ValueCounter {
val: Value, val: Value,
val_to_compare: Value, val_to_compare: Value,
count: i64, count: i64,
index: usize,
} }
impl PartialEq<Self> for ValueCounter { impl PartialEq<Self> for ValueCounter {
@ -158,18 +164,24 @@ impl PartialEq<Self> for ValueCounter {
} }
impl ValueCounter { impl ValueCounter {
fn new(val: Value, flag_ignore_case: bool) -> Self { fn new(val: Value, flag_ignore_case: bool, index: usize) -> Self {
Self::new_vals_to_compare(val.clone(), flag_ignore_case, val) Self::new_vals_to_compare(val.clone(), flag_ignore_case, val, index)
} }
pub fn new_vals_to_compare(val: Value, flag_ignore_case: bool, vals_to_compare: Value) -> Self { pub fn new_vals_to_compare(
val: Value,
flag_ignore_case: bool,
vals_to_compare: Value,
index: usize,
) -> Self {
ValueCounter { ValueCounter {
val, val,
val_to_compare: if flag_ignore_case { val_to_compare: if flag_ignore_case {
clone_to_lowercase(&vals_to_compare) clone_to_lowercase(&vals_to_compare.with_span(Span::unknown()))
} else { } else {
vals_to_compare vals_to_compare.with_span(Span::unknown())
}, },
count: 1, count: 1,
index,
} }
} }
} }
@ -201,6 +213,40 @@ fn clone_to_lowercase(value: &Value) -> Value {
} }
} }
/// Returns `val` with the columns of every record sorted by column name.
///
/// Records are rewritten so that their `(column, value)` pairs are ordered by
/// column name; the sort is stable, so pairs with equal names keep their
/// relative order. The function recurses into record values and list elements.
/// Any other kind of value is returned unchanged. Spans are carried over as-is.
fn sort_attributes(val: Value) -> Value {
    match val {
        Value::Record { cols, vals, span } => {
            // Keep each column name attached to its value while sorting so the
            // pairing cannot drift.
            let mut fields: Vec<(String, Value)> = cols.into_iter().zip(vals).collect();
            fields.sort_by(|a, b| a.0.cmp(&b.0));

            // Split the sorted pairs back into parallel vectors by moving them,
            // which avoids deep-cloning every nested value.
            let (cols, vals): (Vec<String>, Vec<Value>) = fields
                .into_iter()
                .map(|(col, field_val)| (col, sort_attributes(field_val)))
                .unzip();

            Value::Record { cols, vals, span }
        }
        Value::List { vals, span } => Value::List {
            vals: vals.into_iter().map(sort_attributes).collect(),
            span,
        },
        other => other,
    }
}
/// Produces the string key that identifies `item`'s comparison value.
///
/// The comparison value is cloned and its record attributes are sorted before
/// it is converted to a string; otherwise records holding the same fields in a
/// different order could yield different keys.
///
/// Returns the error reported by `value_to_string` if the conversion fails.
fn generate_key(item: &ValueCounter) -> Result<String, ShellError> {
    let comparable = item.val_to_compare.clone();
    let canonical = sort_attributes(comparable);
    value_to_string(&canonical, Span::unknown())
}
fn generate_results_with_count(head: Span, uniq_values: Vec<ValueCounter>) -> Vec<Value> { fn generate_results_with_count(head: Span, uniq_values: Vec<ValueCounter>) -> Vec<Value> {
uniq_values uniq_values
.into_iter() .into_iter()
@ -227,36 +273,52 @@ pub fn uniq(
let flag_ignore_case = call.has_flag("ignore-case"); let flag_ignore_case = call.has_flag("ignore-case");
let flag_only_uniques = call.has_flag("unique"); let flag_only_uniques = call.has_flag("unique");
let mut uniq_values = input let uniq_values = input
.into_iter() .into_iter()
.map_while(|item| { .enumerate()
.map_while(|(index, item)| {
if nu_utils::ctrl_c::was_pressed(&ctrlc) { if nu_utils::ctrl_c::was_pressed(&ctrlc) {
return None; return None;
} }
Some(item_mapper(ItemMapperState { Some(item_mapper(ItemMapperState {
item, item,
flag_ignore_case, flag_ignore_case,
index,
})) }))
}) })
.fold(Vec::<ValueCounter>::new(), |mut counter, item| { .into_iter()
match counter .try_fold(
.iter_mut() HashMap::<String, ValueCounter>::new(),
.find(|x| x.val_to_compare == item.val_to_compare) |mut counter, item| {
{ let key = generate_key(&item);
Some(x) => x.count += 1,
None => counter.push(item), match key {
}; Ok(key) => {
counter match counter.get_mut(&key) {
}); Some(x) => x.count += 1,
None => {
counter.insert(key, item);
}
};
Ok(counter)
}
Err(err) => Err(err),
}
},
);
let mut uniq_values: HashMap<String, ValueCounter> = uniq_values?;
if flag_show_repeated { if flag_show_repeated {
uniq_values.retain(|value_count_pair| value_count_pair.count > 1); uniq_values.retain(|_v, value_count_pair| value_count_pair.count > 1);
} }
if flag_only_uniques { if flag_only_uniques {
uniq_values.retain(|value_count_pair| value_count_pair.count == 1); uniq_values.retain(|_v, value_count_pair| value_count_pair.count == 1);
} }
let uniq_values = sort(uniq_values.into_iter());
let result = if flag_show_count { let result = if flag_show_count {
generate_results_with_count(head, uniq_values) generate_results_with_count(head, uniq_values)
} else { } else {
@ -271,6 +333,12 @@ pub fn uniq(
.set_metadata(metadata)) .set_metadata(metadata))
} }
/// Discards the map keys and returns the counters ordered by ascending `index`.
///
/// The sort is stable, so counters with equal `index` keep the order in which
/// the iterator yielded them.
fn sort(iter: IntoIter<String, ValueCounter>) -> Vec<ValueCounter> {
    iter.map(|(_key, counter)| counter)
        .sorted_by_key(|counter| counter.index)
        .collect_vec()
}
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use super::*; use super::*;

View File

@ -157,7 +157,7 @@ fn item_mapper_by_col(cols: Vec<String>) -> impl Fn(crate::ItemMapperState) -> c
span: Span::unknown(), span: Span::unknown(),
}; };
crate::ValueCounter::new_vals_to_compare(ms.item, ms.flag_ignore_case, col_vals) crate::ValueCounter::new_vals_to_compare(ms.item, ms.flag_ignore_case, col_vals, ms.index)
}) })
} }

View File

@ -19,6 +19,7 @@ pub use command::To;
pub use html::ToHtml; pub use html::ToHtml;
pub use json::ToJson; pub use json::ToJson;
pub use md::ToMd; pub use md::ToMd;
pub use nuon::value_to_string;
pub use nuon::ToNuon; pub use nuon::ToNuon;
pub use text::ToText; pub use text::ToText;
pub use tsv::ToTsv; pub use tsv::ToTsv;

View File

@ -50,7 +50,7 @@ impl Command for ToNuon {
} }
} }
fn value_to_string(v: &Value, span: Span) -> Result<String, ShellError> { pub fn value_to_string(v: &Value, span: Span) -> Result<String, ShellError> {
match v { match v {
Value::Binary { val, .. } => { Value::Binary { val, .. } => {
let mut s = String::with_capacity(2 * val.len()); let mut s = String::with_capacity(2 * val.len());