uniq and uniq-by optimization (#7477) (#7534)

# Description

Refactored `uniq` to replace its quadratic-time duplicate search with a
HashMap lookup; each Value is converted to a string to serve as the key.
I tried to use HashableValue, but it does not look fully developed yet,
and that approach was becoming more complex and difficult.

This improves performance on large data sets.

Fixes https://github.com/nushell/nushell/issues/7477


# Tests + Formatting
```
> let data = fetch "https://home.treasury.gov/system/files/276/yield-curve-rates-1990-2021.csv"
> $data | uniq
```

It keeps the original attribute order in records:
```
> [ {b:2, a:1} {a:1, b:2} ] | uniq 
╭───┬───┬───╮
│ # │ b │ a │
├───┼───┼───┤
│ 0 │ 2 │ 1 │
╰───┴───┴───╯
```
This commit is contained in:
raccmonteiro 2023-01-04 19:35:49 +00:00 committed by GitHub
parent f0e87da830
commit 75cb3fcc5f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 95 additions and 26 deletions

View File

@ -1,9 +1,13 @@
use crate::formats::value_to_string;
use itertools::Itertools;
use nu_protocol::ast::Call;
use nu_protocol::engine::{Command, EngineState, Stack};
use nu_protocol::{
Category, Example, IntoPipelineData, PipelineData, PipelineMetadata, Signature, Span, Type,
Value,
Category, Example, IntoPipelineData, PipelineData, PipelineMetadata, ShellError, Signature,
Span, Type, Value,
};
use std::collections::hash_map::IntoIter;
use std::collections::HashMap;
#[derive(Clone)]
pub struct Uniq;
@ -65,7 +69,7 @@ impl Command for Uniq {
input: PipelineData,
) -> Result<nu_protocol::PipelineData, nu_protocol::ShellError> {
let mapper = Box::new(move |ms: ItemMapperState| -> ValueCounter {
item_mapper(ms.item, ms.flag_ignore_case)
item_mapper(ms.item, ms.flag_ignore_case, ms.index)
});
let metadata = input.metadata();
@ -139,16 +143,18 @@ impl Command for Uniq {
/// Per-item input handed to an item-mapper closure, which turns it into a
/// `ValueCounter`.
pub struct ItemMapperState {
/// The pipeline item being processed.
pub item: Value,
/// When true, the comparison value is built through `clone_to_lowercase`.
pub flag_ignore_case: bool,
/// Zero-based position of the item in the input stream (from `enumerate`).
pub index: usize,
}
fn item_mapper(item: Value, flag_ignore_case: bool) -> ValueCounter {
ValueCounter::new(item, flag_ignore_case)
fn item_mapper(item: Value, flag_ignore_case: bool, index: usize) -> ValueCounter {
ValueCounter::new(item, flag_ignore_case, index)
}
/// A distinct input value together with its occurrence count and the position
/// at which it was first seen.
pub struct ValueCounter {
// The item as it appeared in the input.
val: Value,
// The value used for duplicate detection; its span is replaced with
// `Span::unknown()` and it is lowercased when ignore-case is requested.
val_to_compare: Value,
// Starts at 1 and is incremented for every further item with the same key.
count: i64,
// Input position of the item; used to sort the results back into input order.
index: usize,
}
impl PartialEq<Self> for ValueCounter {
@ -158,18 +164,24 @@ impl PartialEq<Self> for ValueCounter {
}
impl ValueCounter {
fn new(val: Value, flag_ignore_case: bool) -> Self {
Self::new_vals_to_compare(val.clone(), flag_ignore_case, val)
fn new(val: Value, flag_ignore_case: bool, index: usize) -> Self {
Self::new_vals_to_compare(val.clone(), flag_ignore_case, val, index)
}
pub fn new_vals_to_compare(val: Value, flag_ignore_case: bool, vals_to_compare: Value) -> Self {
pub fn new_vals_to_compare(
val: Value,
flag_ignore_case: bool,
vals_to_compare: Value,
index: usize,
) -> Self {
ValueCounter {
val,
val_to_compare: if flag_ignore_case {
clone_to_lowercase(&vals_to_compare)
clone_to_lowercase(&vals_to_compare.with_span(Span::unknown()))
} else {
vals_to_compare
vals_to_compare.with_span(Span::unknown())
},
count: 1,
index,
}
}
}
@ -201,6 +213,40 @@ fn clone_to_lowercase(value: &Value) -> Value {
}
}
/// Returns `val` with the columns of every record ordered by column name.
///
/// Records are rebuilt with their `(column, value)` pairs sorted by column
/// name, and the function recurses into record values and list elements so
/// nested records are reordered as well. Any other kind of value is returned
/// unchanged. Spans are carried over as-is.
fn sort_attributes(val: Value) -> Value {
    match val {
        Value::Record { cols, vals, span } => {
            // Sort the pairs once, then split them back into the two parallel
            // vectors in a single pass. This avoids cloning the whole sorted
            // vector (and every nested value) just to extract the column names.
            let (sorted_cols, sorted_vals): (Vec<String>, Vec<Value>) = cols
                .into_iter()
                .zip(vals)
                .sorted_by(|a, b| a.0.cmp(&b.0))
                .map(|(col, inner)| (col, sort_attributes(inner)))
                .unzip();
            Value::Record {
                cols: sorted_cols,
                vals: sorted_vals,
                span,
            }
        }
        Value::List { vals, span } => Value::List {
            vals: vals.into_iter().map(sort_attributes).collect_vec(),
            span,
        },
        other => other,
    }
}
/// Builds the string used as the hash-map key for `item`.
///
/// The key is the `value_to_string` rendering of the item's comparison value
/// after its record columns have been put into sorted order.
///
/// # Errors
/// Propagates any `ShellError` returned by `value_to_string`.
fn generate_key(item: &ValueCounter) -> Result<String, ShellError> {
    // Without normalising column order, two records holding the same
    // attributes in a different order could serialize to different keys.
    let normalized = sort_attributes(item.val_to_compare.clone());
    let key_span = Span::unknown();
    value_to_string(&normalized, key_span)
}
fn generate_results_with_count(head: Span, uniq_values: Vec<ValueCounter>) -> Vec<Value> {
uniq_values
.into_iter()
@ -227,36 +273,52 @@ pub fn uniq(
let flag_ignore_case = call.has_flag("ignore-case");
let flag_only_uniques = call.has_flag("unique");
let mut uniq_values = input
let uniq_values = input
.into_iter()
.map_while(|item| {
.enumerate()
.map_while(|(index, item)| {
if nu_utils::ctrl_c::was_pressed(&ctrlc) {
return None;
}
Some(item_mapper(ItemMapperState {
item,
flag_ignore_case,
index,
}))
})
.fold(Vec::<ValueCounter>::new(), |mut counter, item| {
match counter
.iter_mut()
.find(|x| x.val_to_compare == item.val_to_compare)
{
Some(x) => x.count += 1,
None => counter.push(item),
};
counter
});
.into_iter()
.try_fold(
HashMap::<String, ValueCounter>::new(),
|mut counter, item| {
let key = generate_key(&item);
match key {
Ok(key) => {
match counter.get_mut(&key) {
Some(x) => x.count += 1,
None => {
counter.insert(key, item);
}
};
Ok(counter)
}
Err(err) => Err(err),
}
},
);
let mut uniq_values: HashMap<String, ValueCounter> = uniq_values?;
if flag_show_repeated {
uniq_values.retain(|value_count_pair| value_count_pair.count > 1);
uniq_values.retain(|_v, value_count_pair| value_count_pair.count > 1);
}
if flag_only_uniques {
uniq_values.retain(|value_count_pair| value_count_pair.count == 1);
uniq_values.retain(|_v, value_count_pair| value_count_pair.count == 1);
}
let uniq_values = sort(uniq_values.into_iter());
let result = if flag_show_count {
generate_results_with_count(head, uniq_values)
} else {
@ -271,6 +333,12 @@ pub fn uniq(
.set_metadata(metadata))
}
/// Drops the hash-map keys and returns the counters ordered by ascending
/// `index`, i.e. by the input position recorded for each one.
fn sort(iter: IntoIter<String, ValueCounter>) -> Vec<ValueCounter> {
    // Hash-map iteration order is arbitrary, so input order has to be
    // re-established from the stored index.
    iter.map(|(_key, counter)| counter)
        .sorted_by_key(|counter| counter.index)
        .collect()
}
#[cfg(test)]
mod test {
use super::*;

View File

@ -157,7 +157,7 @@ fn item_mapper_by_col(cols: Vec<String>) -> impl Fn(crate::ItemMapperState) -> c
span: Span::unknown(),
};
crate::ValueCounter::new_vals_to_compare(ms.item, ms.flag_ignore_case, col_vals)
crate::ValueCounter::new_vals_to_compare(ms.item, ms.flag_ignore_case, col_vals, ms.index)
})
}

View File

@ -19,6 +19,7 @@ pub use command::To;
pub use html::ToHtml;
pub use json::ToJson;
pub use md::ToMd;
pub use nuon::value_to_string;
pub use nuon::ToNuon;
pub use text::ToText;
pub use tsv::ToTsv;

View File

@ -50,7 +50,7 @@ impl Command for ToNuon {
}
}
fn value_to_string(v: &Value, span: Span) -> Result<String, ShellError> {
pub fn value_to_string(v: &Value, span: Span) -> Result<String, ShellError> {
match v {
Value::Binary { val, .. } => {
let mut s = String::with_capacity(2 * val.len());