From 75cb3fcc5f5d293266f1782a59b371e693fea3e0 Mon Sep 17 00:00:00 2001 From: raccmonteiro <3062698+raccmonteiro@users.noreply.github.com> Date: Wed, 4 Jan 2023 19:35:49 +0000 Subject: [PATCH] `uniq` and `uniq-by` optimization (#7477) (#7534) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Description Refactored the quadratic complexity on `uniq` to use a HashMap, as key I converted the Value to string. I tried to use the HashableValue, but it looks it is not very developed yet and it was getting more complex and difficult. This improves performance on large data sets. Fixes https://github.com/nushell/nushell/issues/7477 # Tests + Formatting ``` > let data = fetch "https://home.treasury.gov/system/files/276/yield-curve-rates-1990-2021.csv" > $data | uniq ``` it keeps original attribute order in Records: ``` > [ {b:2, a:1} {a:1, b:2} ] | uniq ╭───┬───┬───╮ │ # │ b │ a │ ├───┼───┼───┤ │ 0 │ 2 │ 1 │ ╰───┴───┴───╯ ``` --- crates/nu-command/src/filters/uniq.rs | 116 ++++++++++++++++++----- crates/nu-command/src/filters/uniq_by.rs | 2 +- crates/nu-command/src/formats/to/mod.rs | 1 + crates/nu-command/src/formats/to/nuon.rs | 2 +- 4 files changed, 95 insertions(+), 26 deletions(-) diff --git a/crates/nu-command/src/filters/uniq.rs b/crates/nu-command/src/filters/uniq.rs index 2ffd399ab..9af7d481e 100644 --- a/crates/nu-command/src/filters/uniq.rs +++ b/crates/nu-command/src/filters/uniq.rs @@ -1,9 +1,13 @@ +use crate::formats::value_to_string; +use itertools::Itertools; use nu_protocol::ast::Call; use nu_protocol::engine::{Command, EngineState, Stack}; use nu_protocol::{ - Category, Example, IntoPipelineData, PipelineData, PipelineMetadata, Signature, Span, Type, - Value, + Category, Example, IntoPipelineData, PipelineData, PipelineMetadata, ShellError, Signature, + Span, Type, Value, }; +use std::collections::hash_map::IntoIter; +use std::collections::HashMap; #[derive(Clone)] pub struct Uniq; @@ -65,7 +69,7 @@ impl Command for Uniq { input: PipelineData, ) -> Result { let mapper = Box::new(move |ms: ItemMapperState| -> ValueCounter { - item_mapper(ms.item, ms.flag_ignore_case) + item_mapper(ms.item, ms.flag_ignore_case, ms.index) }); let metadata = input.metadata(); @@ -139,16 +143,18 @@ impl Command for Uniq { pub struct ItemMapperState { pub item: Value, pub flag_ignore_case: bool, + pub index: usize, } -fn item_mapper(item: Value, flag_ignore_case: bool) -> ValueCounter { - ValueCounter::new(item, flag_ignore_case) +fn item_mapper(item: Value, flag_ignore_case: bool, index: usize) -> ValueCounter { + ValueCounter::new(item, flag_ignore_case, index) } pub struct ValueCounter { val: Value, val_to_compare: Value, count: i64, + index: usize, } impl PartialEq for ValueCounter { @@ -158,18 +164,24 @@ impl PartialEq for ValueCounter { } impl ValueCounter { - fn new(val: Value, flag_ignore_case: bool) -> Self { - Self::new_vals_to_compare(val.clone(), flag_ignore_case, val) + fn new(val: Value, flag_ignore_case: bool, index: usize) -> Self { + Self::new_vals_to_compare(val.clone(), flag_ignore_case, val, index) } - pub fn new_vals_to_compare(val: Value, flag_ignore_case: bool, vals_to_compare: Value) -> Self { + pub fn new_vals_to_compare( + val: Value, + flag_ignore_case: bool, + vals_to_compare: Value, + index: usize, + ) -> Self { ValueCounter { val, val_to_compare: if flag_ignore_case { - clone_to_lowercase(&vals_to_compare) + clone_to_lowercase(&vals_to_compare.with_span(Span::unknown())) } else { - vals_to_compare + vals_to_compare.with_span(Span::unknown()) }, count: 1, + index, } } } @@ -201,6 +213,40 @@ fn clone_to_lowercase(value: &Value) -> Value { } } +fn sort_attributes(val: Value) -> Value { + match val { + Value::Record { cols, vals, span } => { + let sorted = cols + .into_iter() + .zip(vals) + .sorted_by(|a, b| a.0.cmp(&b.0)) + .collect_vec(); + + let sorted_cols = sorted.clone().into_iter().map(|a| a.0).collect_vec(); + let sorted_vals = sorted + .into_iter() + .map(|a| sort_attributes(a.1)) + .collect_vec(); + + Value::Record { + cols: sorted_cols, + vals: sorted_vals, + span, + } + } + Value::List { vals, span } => Value::List { + vals: vals.into_iter().map(sort_attributes).collect_vec(), + span, + }, + other => other, + } +} + +fn generate_key(item: &ValueCounter) -> Result { + let value = sort_attributes(item.val_to_compare.clone()); //otherwise, keys could be different for Records + value_to_string(&value, Span::unknown()) +} + fn generate_results_with_count(head: Span, uniq_values: Vec) -> Vec { uniq_values .into_iter() @@ -227,36 +273,52 @@ pub fn uniq( let flag_ignore_case = call.has_flag("ignore-case"); let flag_only_uniques = call.has_flag("unique"); - let mut uniq_values = input + let uniq_values = input .into_iter() - .map_while(|item| { + .enumerate() + .map_while(|(index, item)| { if nu_utils::ctrl_c::was_pressed(&ctrlc) { return None; } Some(item_mapper(ItemMapperState { item, flag_ignore_case, + index, })) }) - .fold(Vec::::new(), |mut counter, item| { - match counter - .iter_mut() - .find(|x| x.val_to_compare == item.val_to_compare) - { - Some(x) => x.count += 1, - None => counter.push(item), - }; - counter - }); + .into_iter() + .try_fold( + HashMap::::new(), + |mut counter, item| { + let key = generate_key(&item); + + match key { + Ok(key) => { + match counter.get_mut(&key) { + Some(x) => x.count += 1, + None => { + counter.insert(key, item); + } + }; + Ok(counter) + } + Err(err) => Err(err), + } + }, + ); + + let mut uniq_values: HashMap = uniq_values?; if flag_show_repeated { - uniq_values.retain(|value_count_pair| value_count_pair.count > 1); + uniq_values.retain(|_v, value_count_pair| value_count_pair.count > 1); } if flag_only_uniques { - uniq_values.retain(|value_count_pair| value_count_pair.count == 1); + uniq_values.retain(|_v, value_count_pair| value_count_pair.count == 1); } + let uniq_values = sort(uniq_values.into_iter()); + let result = if flag_show_count { generate_results_with_count(head, uniq_values) } else { @@ -271,6 +333,12 @@ pub fn uniq( .set_metadata(metadata)) } +fn sort(iter: IntoIter) -> Vec { + iter.map(|item| item.1) + .sorted_by(|a, b| a.index.cmp(&b.index)) + .collect() +} + #[cfg(test)] mod test { use super::*; diff --git a/crates/nu-command/src/filters/uniq_by.rs b/crates/nu-command/src/filters/uniq_by.rs index 90c3f74ca..0be5de0db 100644 --- a/crates/nu-command/src/filters/uniq_by.rs +++ b/crates/nu-command/src/filters/uniq_by.rs @@ -157,7 +157,7 @@ fn item_mapper_by_col(cols: Vec) -> impl Fn(crate::ItemMapperState) -> c span: Span::unknown(), }; - crate::ValueCounter::new_vals_to_compare(ms.item, ms.flag_ignore_case, col_vals) + crate::ValueCounter::new_vals_to_compare(ms.item, ms.flag_ignore_case, col_vals, ms.index) }) } diff --git a/crates/nu-command/src/formats/to/mod.rs b/crates/nu-command/src/formats/to/mod.rs index 5397970ed..14bbf91df 100644 --- a/crates/nu-command/src/formats/to/mod.rs +++ b/crates/nu-command/src/formats/to/mod.rs @@ -19,6 +19,7 @@ pub use command::To; pub use html::ToHtml; pub use json::ToJson; pub use md::ToMd; +pub use nuon::value_to_string; pub use nuon::ToNuon; pub use text::ToText; pub use tsv::ToTsv; diff --git a/crates/nu-command/src/formats/to/nuon.rs b/crates/nu-command/src/formats/to/nuon.rs index d2adb4344..fc20356bc 100644 --- a/crates/nu-command/src/formats/to/nuon.rs +++ b/crates/nu-command/src/formats/to/nuon.rs @@ -50,7 +50,7 @@ impl Command for ToNuon { } } -fn value_to_string(v: &Value, span: Span) -> Result { +pub fn value_to_string(v: &Value, span: Span) -> Result { match v { Value::Binary { val, .. } => { let mut s = String::with_capacity(2 * val.len());