From 852de7921232c0c01fa6a410567126c1a9c8a5e7 Mon Sep 17 00:00:00 2001 From: WindSoilder Date: Fri, 13 May 2022 19:48:47 +0800 Subject: [PATCH] Implement histogram command (#5518) * finish histogram * adjust comment * add test for histogram * add Date to test * move hashable value back inside chart package --- .../nu-command/src/charting/hashable_value.rs | 317 ++++++++++++++++++ crates/nu-command/src/charting/histogram.rs | 254 ++++++++++++++ crates/nu-command/src/charting/mod.rs | 4 + crates/nu-command/src/default_context.rs | 5 + crates/nu-command/src/lib.rs | 2 + crates/nu-command/tests/commands/histogram.rs | 71 +++- crates/nu-protocol/src/signature.rs | 2 + 7 files changed, 640 insertions(+), 15 deletions(-) create mode 100644 crates/nu-command/src/charting/hashable_value.rs create mode 100644 crates/nu-command/src/charting/histogram.rs create mode 100644 crates/nu-command/src/charting/mod.rs diff --git a/crates/nu-command/src/charting/hashable_value.rs b/crates/nu-command/src/charting/hashable_value.rs new file mode 100644 index 000000000..2a99b2f69 --- /dev/null +++ b/crates/nu-command/src/charting/hashable_value.rs @@ -0,0 +1,317 @@ +use chrono::{DateTime, FixedOffset}; +use nu_protocol::{ShellError, Span, Value}; +use std::hash::{Hash, Hasher}; + +/// A subset of [Value](nu_protocol::Value), which is hashable. +/// And it means that we can put the value into something like [HashMap](std::collections::HashMap) or [HashSet](std::collections::HashSet) +/// for further usage like value statistics. +/// +/// For now the main way to create a [HashableValue] is using [from_value](HashableValue::from_value). +/// +/// Please note that although each variant contains a `span` field, that field is ignored during hashing and equality comparison.
+/// Which means that the following will be true: +/// ```text +/// assert_eq!(HashableValue::Bool {val: true, span: Span{start: 0, end: 1}}, HashableValue::Bool {val: true, span: Span{start: 90, end: 1000}}) +/// ``` +#[derive(Eq, Debug)] +pub enum HashableValue { + Bool { + val: bool, + span: Span, + }, + Int { + val: i64, + span: Span, + }, + Float { + val: [u8; 8], // because f64 is not hashable, we save it as [u8;8] array to make it hashable. + span: Span, + }, + Filesize { + val: i64, + span: Span, + }, + Duration { + val: i64, + span: Span, + }, + Date { + val: DateTime, + span: Span, + }, + String { + val: String, + span: Span, + }, + Binary { + val: Vec, + span: Span, + }, +} + +impl Default for HashableValue { + fn default() -> Self { + HashableValue::Bool { + val: false, + span: Span { start: 0, end: 0 }, + } + } +} + +impl HashableValue { + /// Try to convert from `value` to self. + /// + /// A fallback `span` is required because the given value may not carry a span of its own; it is then used to report the error. + /// + /// If the given value is not hashable (mainly because it is structured data), an error will be returned.
+ pub fn from_value(value: Value, span: Span) -> Result { + match value { + Value::Bool { val, span } => Ok(HashableValue::Bool { val, span }), + Value::Int { val, span } => Ok(HashableValue::Int { val, span }), + Value::Filesize { val, span } => Ok(HashableValue::Filesize { val, span }), + Value::Duration { val, span } => Ok(HashableValue::Duration { val, span }), + Value::Date { val, span } => Ok(HashableValue::Date { val, span }), + Value::Float { val, span } => Ok(HashableValue::Float { + val: val.to_ne_bytes(), + span, + }), + Value::String { val, span } => Ok(HashableValue::String { val, span }), + Value::Binary { val, span } => Ok(HashableValue::Binary { val, span }), + + _ => { + let input_span = value.span().unwrap_or(span); + Err(ShellError::UnsupportedInput( + format!("input value {value:?} is not hashable"), + input_span, + )) + } + } + } + + /// Convert from self to nu's core data type `Value`. + pub fn into_value(self) -> Value { + match self { + HashableValue::Bool { val, span } => Value::Bool { val, span }, + HashableValue::Int { val, span } => Value::Int { val, span }, + HashableValue::Filesize { val, span } => Value::Filesize { val, span }, + HashableValue::Duration { val, span } => Value::Duration { val, span }, + HashableValue::Date { val, span } => Value::Date { val, span }, + HashableValue::Float { val, span } => Value::Float { + val: f64::from_ne_bytes(val), + span, + }, + HashableValue::String { val, span } => Value::String { val, span }, + HashableValue::Binary { val, span } => Value::Binary { val, span }, + } + } +} + +impl Hash for HashableValue { + fn hash(&self, state: &mut H) { + match self { + HashableValue::Bool { val, .. } => val.hash(state), + HashableValue::Int { val, .. } => val.hash(state), + HashableValue::Filesize { val, .. } => val.hash(state), + HashableValue::Duration { val, .. } => val.hash(state), + HashableValue::Date { val, .. } => val.hash(state), + HashableValue::Float { val, .. 
} => val.hash(state), + HashableValue::String { val, .. } => val.hash(state), + HashableValue::Binary { val, .. } => val.hash(state), + } + } +} + +impl PartialEq for HashableValue { + fn eq(&self, other: &Self) -> bool { + match (self, other) { + (HashableValue::Bool { val: lhs, .. }, HashableValue::Bool { val: rhs, .. }) => { + lhs == rhs + } + (HashableValue::Int { val: lhs, .. }, HashableValue::Int { val: rhs, .. }) => { + lhs == rhs + } + ( + HashableValue::Filesize { val: lhs, .. }, + HashableValue::Filesize { val: rhs, .. }, + ) => lhs == rhs, + ( + HashableValue::Duration { val: lhs, .. }, + HashableValue::Duration { val: rhs, .. }, + ) => lhs == rhs, + (HashableValue::Date { val: lhs, .. }, HashableValue::Date { val: rhs, .. }) => { + lhs == rhs + } + (HashableValue::Float { val: lhs, .. }, HashableValue::Float { val: rhs, .. }) => { + lhs == rhs + } + (HashableValue::String { val: lhs, .. }, HashableValue::String { val: rhs, .. }) => { + lhs == rhs + } + (HashableValue::Binary { val: lhs, .. }, HashableValue::Binary { val: rhs, .. 
}) => { + lhs == rhs + } + _ => false, + } + } +} + +#[cfg(test)] +mod test { + use super::*; + use nu_protocol::ast::{CellPath, PathMember}; + use std::collections::{HashMap, HashSet}; + + #[test] + fn from_value() { + let span = Span::test_data(); + let values = vec![ + ( + Value::Bool { val: true, span }, + HashableValue::Bool { val: true, span }, + ), + ( + Value::Int { val: 1, span }, + HashableValue::Int { val: 1, span }, + ), + ( + Value::Filesize { val: 1, span }, + HashableValue::Filesize { val: 1, span }, + ), + ( + Value::Duration { val: 1, span }, + HashableValue::Duration { val: 1, span }, + ), + ( + Value::Date { + val: DateTime::::parse_from_rfc2822( + "Wed, 18 Feb 2015 23:16:09 GMT", + ) + .unwrap(), + span, + }, + HashableValue::Date { + val: DateTime::::parse_from_rfc2822( + "Wed, 18 Feb 2015 23:16:09 GMT", + ) + .unwrap(), + span, + }, + ), + ( + Value::String { + val: "1".to_string(), + span, + }, + HashableValue::String { + val: "1".to_string(), + span, + }, + ), + ( + Value::Binary { val: vec![1], span }, + HashableValue::Binary { val: vec![1], span }, + ), + ]; + for (val, expect_hashable_val) in values.into_iter() { + assert_eq!( + HashableValue::from_value(val, Span { start: 0, end: 0 }).unwrap(), + expect_hashable_val + ); + } + } + + #[test] + fn from_unhashable_value() { + let span = Span::test_data(); + let values = [ + Value::List { + vals: vec![Value::Bool { val: true, span }], + span, + }, + Value::Block { + val: 0, + captures: HashMap::new(), + span, + }, + Value::Nothing { span }, + Value::Error { + error: ShellError::DidYouMean("what?".to_string(), span), + }, + Value::CellPath { + val: CellPath { + members: vec![PathMember::Int { val: 0, span }], + }, + span, + }, + ]; + for v in values { + assert!(HashableValue::from_value(v, Span { start: 0, end: 0 }).is_err()) + } + } + + #[test] + fn from_to_tobe_same() { + let span = Span::test_data(); + let values = vec![ + Value::Bool { val: true, span }, + Value::Int { val: 1, span }, + 
Value::Filesize { val: 1, span }, + Value::Duration { val: 1, span }, + Value::String { + val: "1".to_string(), + span, + }, + Value::Binary { val: vec![1], span }, + ]; + for val in values.into_iter() { + let expected_val = val.clone(); + assert_eq!( + HashableValue::from_value(val, Span { start: 0, end: 0 }) + .unwrap() + .into_value(), + expected_val + ); + } + } + + #[test] + fn hashable_value_eq_without_concern_span() { + assert_eq!( + HashableValue::Bool { + val: true, + span: Span { start: 0, end: 1 } + }, + HashableValue::Bool { + val: true, + span: Span { + start: 90, + end: 1000 + } + } + ) + } + + #[test] + fn put_to_hashset() { + let span = Span::test_data(); + let mut set = HashSet::new(); + set.insert(HashableValue::Bool { val: true, span }); + assert!(set.contains(&HashableValue::Bool { val: true, span })); + + // hashable value doesn't care about span. + let diff_span = Span { start: 1, end: 2 }; + set.insert(HashableValue::Bool { + val: true, + span: diff_span, + }); + assert!(set.contains(&HashableValue::Bool { val: true, span })); + assert!(set.contains(&HashableValue::Bool { + val: true, + span: diff_span + })); + assert_eq!(set.len(), 1); + + set.insert(HashableValue::Int { val: 2, span }); + assert_eq!(set.len(), 2); + } +} diff --git a/crates/nu-command/src/charting/histogram.rs b/crates/nu-command/src/charting/histogram.rs new file mode 100644 index 000000000..38bb734be --- /dev/null +++ b/crates/nu-command/src/charting/histogram.rs @@ -0,0 +1,254 @@ +use super::hashable_value::HashableValue; +use nu_engine::CallExt; +use nu_protocol::ast::Call; +use nu_protocol::engine::{Command, EngineState, Stack}; +use nu_protocol::{ + Example, IntoPipelineData, PipelineData, ShellError, Signature, Span, Spanned, SyntaxShape, + Value, +}; +use std::collections::HashMap; +use std::iter; + +#[derive(Clone)] +pub struct Histogram; + +enum PercentageCalcMethod { + Normalize, + Relative, +} + +impl Command for Histogram { + fn name(&self) -> &str { + 
"histogram" + } + + fn signature(&self) -> Signature { + Signature::build("histogram") + .optional("column-name", SyntaxShape::String, "column name to calc frequency, no need to provide if input is just a list") + .optional("frequency-column-name", SyntaxShape::String, "histogram's frequency column, default to be frequency column output") + .named("percentage-type", SyntaxShape::String, "percentage calculate method, can be 'normalize' or 'relative', in 'normalize', defaults to be 'normalize'", Some('t')) + } + + fn usage(&self) -> &str { + "Creates a new table with a histogram based on the column name passed in." + } + + fn examples(&self) -> Vec { + vec![ + Example { + description: "Get a histogram for the types of files", + example: "ls | histogram type", + result: None, + }, + Example { + description: + "Get a histogram for the types of files, with frequency column named freq", + example: "ls | histogram type freq", + result: None, + }, + Example { + description: "Get a histogram for a list of numbers", + example: "echo [1 2 3 1 1 1 2 2 1 1] | histogram", + result: None, + }, + Example { + description: "Get a histogram for a list of numbers, and percentage is based on the maximum value", + example: "echo [1 2 3 1 1 1 2 2 1 1] | histogram --percentage-type relative", + result: None, + } + ] + } + + fn run( + &self, + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, + ) -> Result { + // input check. 
+ let column_name: Option> = call.opt(engine_state, stack, 0)?; + let frequency_name_arg = call.opt::>(engine_state, stack, 1)?; + let frequency_column_name = match frequency_name_arg { + Some(inner) => { + let span = inner.span; + if ["value", "count", "percentage"].contains(&inner.item.as_str()) { + return Err(ShellError::UnsupportedInput( + "frequency-column-name can't be 'value', 'count' or 'percentage'" + .to_string(), + span, + )); + } + inner.item + } + None => "frequency".to_string(), + }; + + let calc_method: Option> = + call.get_flag(engine_state, stack, "percentage-type")?; + let calc_method = match calc_method { + None => PercentageCalcMethod::Normalize, + Some(inner) => match inner.item.as_str() { + "normalize" => PercentageCalcMethod::Normalize, + "relative" => PercentageCalcMethod::Relative, + _ => { + return Err(ShellError::UnsupportedInput( + "calc method can only be 'normalize' or 'relative'".to_string(), + inner.span, + )) + } + }, + }; + + let span = call.head; + let data_as_value = input.into_value(span); + // `input` is not a list, here we can return an error. + match data_as_value.as_list() { + Ok(list_value) => run_histogram( + list_value.to_vec(), + column_name, + frequency_column_name, + calc_method, + span, + ), + Err(e) => Err(e), + } + } +} + +fn run_histogram( + values: Vec, + column_name: Option>, + freq_column: String, + calc_method: PercentageCalcMethod, + head_span: Span, +) -> Result { + let mut inputs = vec![]; + // convert from inputs to hashable values. + match column_name { + None => { + // some invalid input scenario needs to handle: + // Expect input is a list of hashable value, if one value is not hashable, throw out error. + for v in values { + let current_span = v.span().unwrap_or(head_span); + inputs.push(HashableValue::from_value(v, head_span).map_err(|_| { + ShellError::UnsupportedInput( + "--column-name is not provided, can only support a list of simple value." 
+ .to_string(), + current_span, + ) + })?); + } + } + Some(ref col) => { + // some invalid input scenario needs to handle: + // * item in `input` is not a record, just skip it. + // * a record doesn't contain specific column, just skip it. + // * all records don't contain specific column, throw out error, indicate at least one row should contains specific column. + // * a record contain a value which can't be hashed, skip it. + let col_name = &col.item; + for v in values { + match v { + // parse record, and fill valid value to actual input. + Value::Record { cols, vals, .. } => { + for (c, v) in iter::zip(cols, vals) { + if &c == col_name { + if let Ok(v) = HashableValue::from_value(v, head_span) { + inputs.push(v); + } + } + } + } + _ => continue, + } + } + + if inputs.is_empty() { + return Err(ShellError::UnsupportedInput( + format!("expect input is table, and inputs doesn't contain any value which has {col_name} column"), + head_span, + )); + } + } + } + + let value_column_name = column_name + .map(|x| x.item) + .unwrap_or_else(|| "value".to_string()); + Ok(histogram_impl( + inputs, + &value_column_name, + calc_method, + &freq_column, + head_span, + )) +} + +fn histogram_impl( + inputs: Vec, + value_column_name: &str, + calc_method: PercentageCalcMethod, + freq_column: &str, + span: Span, +) -> PipelineData { + // here we can make sure that inputs is not empty, and every elements + // is a simple val and ok to make count. 
+ let mut counter = HashMap::new(); + let mut max_cnt = 0; + let total_cnt = inputs.len(); + for i in inputs { + let new_cnt = *counter.get(&i).unwrap_or(&0) + 1; + counter.insert(i, new_cnt); + if new_cnt > max_cnt { + max_cnt = new_cnt; + } + } + + let mut result = vec![]; + let result_cols = vec![ + value_column_name.to_string(), + "count".to_string(), + "percentage".to_string(), + freq_column.to_string(), + ]; + const MAX_FREQ_COUNT: f64 = 100.0; + for (val, count) in counter.into_iter() { + let (percentage, freq) = { + let percentage = match calc_method { + PercentageCalcMethod::Normalize => (count as f64 / total_cnt as f64), + PercentageCalcMethod::Relative => (count as f64 / max_cnt as f64), + }; + ( + format!("{:.2}%", percentage * 100_f64), + "*".repeat((MAX_FREQ_COUNT * percentage).floor() as usize), + ) + }; + + result.push(Value::Record { + cols: result_cols.clone(), + vals: vec![ + val.into_value(), + Value::Int { val: count, span }, + Value::String { + val: percentage, + span, + }, + Value::String { val: freq, span }, + ], + span, + }); + } + Value::List { vals: result, span }.into_pipeline_data() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_examples() { + use crate::test_examples; + + test_examples(Histogram) + } +} diff --git a/crates/nu-command/src/charting/mod.rs b/crates/nu-command/src/charting/mod.rs new file mode 100644 index 000000000..eed4cc652 --- /dev/null +++ b/crates/nu-command/src/charting/mod.rs @@ -0,0 +1,4 @@ +mod hashable_value; +mod histogram; + +pub use histogram::Histogram; diff --git a/crates/nu-command/src/default_context.rs b/crates/nu-command/src/default_context.rs index a193c67cc..bbf1fb086 100644 --- a/crates/nu-command/src/default_context.rs +++ b/crates/nu-command/src/default_context.rs @@ -65,6 +65,11 @@ pub fn create_default_context(cwd: impl AsRef) -> EngineState { Version, }; + // Charts + bind_command! { + Histogram + } + // Filters bind_command! 
{ All, diff --git a/crates/nu-command/src/lib.rs b/crates/nu-command/src/lib.rs index 74954c446..f16778a71 100644 --- a/crates/nu-command/src/lib.rs +++ b/crates/nu-command/src/lib.rs @@ -1,3 +1,4 @@ +mod charting; mod conversions; mod core_commands; mod date; @@ -21,6 +22,7 @@ mod strings; mod system; mod viewers; +pub use charting::*; pub use conversions::*; pub use core_commands::*; pub use date::*; diff --git a/crates/nu-command/tests/commands/histogram.rs b/crates/nu-command/tests/commands/histogram.rs index 1b799f0f3..fc3722790 100644 --- a/crates/nu-command/tests/commands/histogram.rs +++ b/crates/nu-command/tests/commands/histogram.rs @@ -2,10 +2,40 @@ use nu_test_support::fs::Stub::FileWithContentToBeTrimmed; use nu_test_support::playground::Playground; use nu_test_support::{nu, pipeline}; -// FIXME: jt: needs more work -#[ignore] #[test] fn summarizes_by_column_given() { + Playground::setup("histogram_test_1", |dirs, sandbox| { + sandbox.with_files(vec![FileWithContentToBeTrimmed( + "los_tres_caballeros.csv", + r#" + first_name,last_name,rusty_at + Andrés,Robalino,Ecuador + Jonathan,Turner,Estados Unidos + Yehuda,Katz,Estados Unidos + "#, + )]); + + let actual = nu!( + cwd: dirs.test(), pipeline( + r#" + open los_tres_caballeros.csv + | histogram rusty_at countries --percentage-type relative + | where rusty_at == "Ecuador" + | get countries + | get 0 + "# + )); + + assert_eq!( + actual.out, + "**************************************************" + ); + // 50% + }) +} + +#[test] +fn summarizes_by_column_given_with_normalize_percentage() { Playground::setup("histogram_test_1", |dirs, sandbox| { sandbox.with_files(vec![FileWithContentToBeTrimmed( "los_tres_caballeros.csv", @@ -24,19 +54,15 @@ fn summarizes_by_column_given() { | histogram rusty_at countries | where rusty_at == "Ecuador" | get countries + | get 0 "# )); - assert_eq!( - actual.out, - "**************************************************" - ); - // 50% + assert_eq!(actual.out, 
"*********************************"); + // 33% }) } -// FIXME: jt: needs more work -#[ignore] #[test] fn summarizes_by_values() { Playground::setup("histogram_test_2", |dirs, sandbox| { @@ -58,6 +84,7 @@ fn summarizes_by_values() { | histogram | where value == "Estados Unidos" | get count + | get 0 "# )); @@ -65,8 +92,6 @@ fn summarizes_by_values() { }) } -// FIXME: jt: needs more work -#[ignore] #[test] fn help() { Playground::setup("histogram_test_3", |dirs, _sandbox| { @@ -96,22 +121,38 @@ fn help() { }) } -// FIXME: jt: needs more work -#[ignore] #[test] fn count() { let actual = nu!( cwd: ".", pipeline( r#" echo [[bit]; [1] [0] [0] [0] [0] [0] [0] [1]] - | histogram bit + | histogram bit --percentage-type relative | sort-by count | reject frequency | to json "# )); - let bit_json = r#"[{"bit":"1","count":2,"percentage":"33.33%"},{"bit":"0","count":6,"percentage":"100.00%"}]"#; + let bit_json = r#"[ { "bit": 1, "count": 2, "percentage": "33.33%" }, { "bit": 0, "count": 6, "percentage": "100.00%" }]"#; + + assert_eq!(actual.out, bit_json); +} + +#[test] +fn count_with_normalize_percentage() { + let actual = nu!( + cwd: ".", pipeline( + r#" + echo [[bit]; [1] [0] [0] [0] [0] [0] [0] [1]] + | histogram bit --percentage-type normalize + | sort-by count + | reject frequency + | to json + "# + )); + + let bit_json = r#"[ { "bit": 1, "count": 2, "percentage": "25.00%" }, { "bit": 0, "count": 6, "percentage": "75.00%" }]"#; assert_eq!(actual.out, bit_json); } diff --git a/crates/nu-protocol/src/signature.rs b/crates/nu-protocol/src/signature.rs index 63649b732..49e14b035 100644 --- a/crates/nu-protocol/src/signature.rs +++ b/crates/nu-protocol/src/signature.rs @@ -57,6 +57,7 @@ pub enum Category { Viewers, Hash, Generators, + Chart, Custom(String), Deprecated, } @@ -83,6 +84,7 @@ impl std::fmt::Display for Category { Category::Viewers => "viewers", Category::Hash => "hash", Category::Generators => "generators", + Category::Chart => "chart", Category::Custom(name) => 
name, Category::Deprecated => "deprecated", };