diff --git a/crates/nu-command/src/commands/dataframe/describe.rs b/crates/nu-command/src/commands/dataframe/describe.rs
new file mode 100644
index 0000000000..ff452d728d
--- /dev/null
+++ b/crates/nu-command/src/commands/dataframe/describe.rs
@@ -0,0 +1,200 @@
+use crate::prelude::*;
+use nu_engine::WholeStreamCommand;
+use nu_errors::ShellError;
+use nu_protocol::{
+    dataframe::{Column, NuDataFrame},
+    Signature, UntaggedValue,
+};
+use polars::{
+    chunked_array::ChunkedArray,
+    prelude::{
+        AnyValue, DataFrame as PolarsDF, DataType, Float64Type, IntoSeries, NewChunkedArray,
+        Series, Utf8Type,
+    },
+};
+
+use super::utils::parse_polars_error;
+
+/// Implements `dataframe describe`, which summarises every column of a dataframe.
+pub struct DataFrame;
+
+impl WholeStreamCommand for DataFrame {
+    fn name(&self) -> &str {
+        "dataframe describe"
+    }
+
+    fn usage(&self) -> &str {
+        "[DataFrame] Describes dataframes numeric columns"
+    }
+
+    fn signature(&self) -> Signature {
+        Signature::build("dataframe describe")
+    }
+
+    fn run(&self, args: CommandArgs) -> Result<OutputStream, ShellError> {
+        command(args)
+    }
+
+    fn examples(&self) -> Vec<Example> {
+        vec![Example {
+            description: "Describes dataframe",
+            example: "[[a b]; [1 1] [1 1]] | dataframe to-df | dataframe describe",
+            result: Some(vec![NuDataFrame::try_from_columns(
+                vec![
+                    Column::new(
+                        "descriptor".to_string(),
+                        vec![
+                            UntaggedValue::string("count").into(),
+                            UntaggedValue::string("sum").into(),
+                            UntaggedValue::string("mean").into(),
+                            UntaggedValue::string("median").into(),
+                            UntaggedValue::string("std").into(),
+                            UntaggedValue::string("min").into(),
+                            UntaggedValue::string("25%").into(),
+                            UntaggedValue::string("50%").into(),
+                            UntaggedValue::string("75%").into(),
+                            UntaggedValue::string("max").into(),
+                        ],
+                    ),
+                    Column::new(
+                        "a (i64)".to_string(),
+                        vec![
+                            UntaggedValue::decimal_from_float(2.0, Span::default()).into(),
+                            UntaggedValue::decimal_from_float(2.0, Span::default()).into(),
+                            UntaggedValue::decimal_from_float(1.0, Span::default()).into(),
+                            UntaggedValue::decimal_from_float(1.0, Span::default()).into(),
+                            UntaggedValue::decimal_from_float(0.0, Span::default()).into(),
+                            UntaggedValue::decimal_from_float(1.0, Span::default()).into(),
+                            UntaggedValue::decimal_from_float(1.0, Span::default()).into(),
+                            UntaggedValue::decimal_from_float(1.0, Span::default()).into(),
+                            UntaggedValue::decimal_from_float(1.0, Span::default()).into(),
+                            UntaggedValue::decimal_from_float(1.0, Span::default()).into(),
+                        ],
+                    ),
+                    Column::new(
+                        "b (i64)".to_string(),
+                        vec![
+                            UntaggedValue::decimal_from_float(2.0, Span::default()).into(),
+                            UntaggedValue::decimal_from_float(2.0, Span::default()).into(),
+                            UntaggedValue::decimal_from_float(1.0, Span::default()).into(),
+                            UntaggedValue::decimal_from_float(1.0, Span::default()).into(),
+                            UntaggedValue::decimal_from_float(0.0, Span::default()).into(),
+                            UntaggedValue::decimal_from_float(1.0, Span::default()).into(),
+                            UntaggedValue::decimal_from_float(1.0, Span::default()).into(),
+                            UntaggedValue::decimal_from_float(1.0, Span::default()).into(),
+                            UntaggedValue::decimal_from_float(1.0, Span::default()).into(),
+                            UntaggedValue::decimal_from_float(1.0, Span::default()).into(),
+                        ],
+                    ),
+                ],
+                &Span::default(),
+            )
+            .expect("simple df for test should not fail")
+            .into_value(Tag::default())]),
+        }]
+    }
+}
+
+/// Returns the first value of `series` when it is a `Float64`, `None` otherwise.
+fn first_as_f64(series: &Series) -> Option<f64> {
+    match series.get(0) {
+        AnyValue::Float64(value) => Some(value),
+        _ => None,
+    }
+}
+
+/// Casts `series` to `Float64` and returns its first value.
+///
+/// Yields `None` when the cast fails or the first value is not a `Float64`.
+fn cast_first_as_f64(series: &Series) -> Option<f64> {
+    series
+        .cast_with_dtype(&DataType::Float64)
+        .ok()
+        .and_then(|casted| first_as_f64(&casted))
+}
+
+/// Builds the summary dataframe for the dataframe found in the input stream.
+///
+/// The result has a `descriptor` column naming each statistic, followed by one `Float64`
+/// column per input column named `<column> (<dtype>)`. A statistic that cannot be read back
+/// as a float is stored as `None`.
+fn command(mut args: CommandArgs) -> Result<OutputStream, ShellError> {
+    let tag = args.call_info.name_tag.clone();
+
+    let (df, _) = NuDataFrame::try_from_stream(&mut args.input, &tag.span)?;
+
+    let names = ChunkedArray::<Utf8Type>::new_from_opt_slice(
+        "descriptor",
+        &[
+            Some("count"),
+            Some("sum"),
+            Some("mean"),
+            Some("median"),
+            Some("std"),
+            Some("min"),
+            Some("25%"),
+            Some("50%"),
+            Some("75%"),
+            Some("max"),
+        ],
+    )
+    .into_series();
+
+    let head = std::iter::once(names);
+
+    let tail = df.as_ref().get_columns().iter().map(|col| {
+        // A failed quantile computation yields `None` for that row instead of an error.
+        let quantile = |q: f64| {
+            col.quantile_as_series(q)
+                .ok()
+                .and_then(|series| cast_first_as_f64(&series))
+        };
+
+        let count = col.len() as f64;
+        let sum = cast_first_as_f64(&col.sum_as_series());
+        let mean = first_as_f64(&col.mean_as_series());
+        let median = first_as_f64(&col.median_as_series());
+        let std = first_as_f64(&col.std_as_series());
+        let min = cast_first_as_f64(&col.min_as_series());
+        let q_25 = quantile(0.25);
+        let q_50 = quantile(0.50);
+        let q_75 = quantile(0.75);
+        let max = cast_first_as_f64(&col.max_as_series());
+
+        let name = format!("{} ({})", col.name(), col.dtype());
+        ChunkedArray::<Float64Type>::new_from_opt_slice(
+            name.as_str(),
+            &[
+                Some(count),
+                sum,
+                mean,
+                median,
+                std,
+                min,
+                q_25,
+                q_50,
+                q_75,
+                max,
+            ],
+        )
+        .into_series()
+    });
+
+    let res = head.chain(tail).collect::<Vec<Series>>();
+    let df = PolarsDF::new(res).map_err(|e| parse_polars_error::<&str>(&e, &tag.span, None))?;
+    let df = NuDataFrame::dataframe_to_value(df, tag);
+    Ok(OutputStream::one(df))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::DataFrame;
+    use super::ShellError;
+
+    #[test]
+    fn examples_work_as_expected() -> Result<(), ShellError> {
+        use crate::examples::test_dataframe as test_examples;
+
+        test_examples(DataFrame {})
+    }
+}
diff --git a/crates/nu-command/src/commands/dataframe/mod.rs b/crates/nu-command/src/commands/dataframe/mod.rs
index 43c34b05d5..07d53f271a 100644
--- a/crates/nu-command/src/commands/dataframe/mod.rs
+++ b/crates/nu-command/src/commands/dataframe/mod.rs
@@ -2,6 +2,7 @@ pub mod aggregate;
 pub mod append;
 pub mod column;
 pub mod command;
+pub mod describe;
 pub mod drop;
 pub mod drop_duplicates;
 pub mod drop_nulls;
@@ -35,6 +36,7 @@ pub use aggregate::DataFrame as DataFrameAggregate;
 pub use append::DataFrame as DataFrameAppend;
 pub use column::DataFrame as DataFrameColumn;
 pub use command::Command as DataFrame;
+pub use describe::DataFrame as DataFrameDescribe;
 pub use drop::DataFrame as DataFrameDrop;
 pub use drop_duplicates::DataFrame as DataFrameDropDuplicates;
 pub use drop_nulls::DataFrame as DataFrameDropNulls;
diff --git a/crates/nu-command/src/commands/mod.rs b/crates/nu-command/src/commands/mod.rs
index b1a5244e88..c2a8ac4aa5 100644
--- a/crates/nu-command/src/commands/mod.rs
+++ b/crates/nu-command/src/commands/mod.rs
@@ -27,19 +27,19 @@ pub use core_commands::*;
 pub use dataframe::{
     DataFrame, DataFrameAggregate, DataFrameAllFalse, DataFrameAllTrue, DataFrameAppend,
     DataFrameArgMax, DataFrameArgMin, DataFrameArgSort, DataFrameArgTrue, DataFrameArgUnique,
-    DataFrameColumn, DataFrameConcatenate, DataFrameContains, DataFrameDTypes, DataFrameDrop,
-    DataFrameDropDuplicates, DataFrameDropNulls, DataFrameDummies, DataFrameFilter, DataFrameFirst,
-    DataFrameGet, DataFrameGetDay, DataFrameGetHour, DataFrameGetMinute, DataFrameGetMonth,
-    DataFrameGetNanoSecond, DataFrameGetOrdinal, DataFrameGetSecond, DataFrameGetWeek,
-    DataFrameGetWeekDay, DataFrameGetYear, DataFrameGroupBy, DataFrameIsDuplicated, DataFrameIsIn,
-    DataFrameIsNotNull, DataFrameIsNull, DataFrameIsUnique, DataFrameJoin, DataFrameLast,
-    DataFrameList, DataFrameMelt, DataFrameNNull, DataFrameNUnique, DataFrameNot, DataFrameOpen,
-    DataFramePivot, DataFrameReplace, DataFrameReplaceAll, DataFrameSample, DataFrameSelect,
-    DataFrameSeriesRename, DataFrameSet, DataFrameSetWithIdx, DataFrameShape, DataFrameShift,
-    DataFrameShow, DataFrameSlice, DataFrameSort, DataFrameStrFTime, DataFrameStringLengths,
-    DataFrameStringSlice, DataFrameTake, DataFrameToCsv, DataFrameToDF, DataFrameToLowercase,
-    DataFrameToParquet, DataFrameToUppercase, DataFrameUnique, DataFrameValueCounts,
-    DataFrameWhere, DataFrameWithColumn,
+    DataFrameColumn, DataFrameConcatenate, DataFrameContains, DataFrameDTypes, DataFrameDescribe,
+    DataFrameDrop, DataFrameDropDuplicates, DataFrameDropNulls, DataFrameDummies, DataFrameFilter,
+    DataFrameFirst, DataFrameGet, DataFrameGetDay, DataFrameGetHour, DataFrameGetMinute,
+    DataFrameGetMonth, DataFrameGetNanoSecond, DataFrameGetOrdinal, DataFrameGetSecond,
+    DataFrameGetWeek, DataFrameGetWeekDay, DataFrameGetYear, DataFrameGroupBy,
+    DataFrameIsDuplicated, DataFrameIsIn, DataFrameIsNotNull, DataFrameIsNull, DataFrameIsUnique,
+    DataFrameJoin, DataFrameLast, DataFrameList, DataFrameMelt, DataFrameNNull, DataFrameNUnique,
+    DataFrameNot, DataFrameOpen, DataFramePivot, DataFrameReplace, DataFrameReplaceAll,
+    DataFrameSample, DataFrameSelect, DataFrameSeriesRename, DataFrameSet, DataFrameSetWithIdx,
+    DataFrameShape, DataFrameShift, DataFrameShow, DataFrameSlice, DataFrameSort,
+    DataFrameStrFTime, DataFrameStringLengths, DataFrameStringSlice, DataFrameTake, DataFrameToCsv,
+    DataFrameToDF, DataFrameToLowercase, DataFrameToParquet, DataFrameToUppercase, DataFrameUnique,
+    DataFrameValueCounts, DataFrameWhere, DataFrameWithColumn,
 };
 pub use env::*;
 pub use filesystem::*;
diff --git a/crates/nu-command/src/default_context.rs b/crates/nu-command/src/default_context.rs
index 7026da4386..795feb6d43 100644
--- a/crates/nu-command/src/default_context.rs
+++ b/crates/nu-command/src/default_context.rs
@@ -340,6 +340,7 @@ pub fn
create_default_context(interactive: bool) -> Result