describe command (#3907)

This commit is contained in:
Fernando Herrera 2021-08-07 18:48:54 +01:00 committed by GitHub
parent b6728efcd4
commit 38848082ae
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 248 additions and 13 deletions

View File

@ -0,0 +1,232 @@
use crate::prelude::*;
use nu_engine::WholeStreamCommand;
use nu_errors::ShellError;
use nu_protocol::{
dataframe::{Column, NuDataFrame},
Signature, UntaggedValue,
};
use polars::{
chunked_array::ChunkedArray,
prelude::{
AnyValue, DataFrame as PolarsDF, DataType, Float64Type, IntoSeries, NewChunkedArray,
Series, Utf8Type,
},
};
use super::utils::parse_polars_error;
pub struct DataFrame;
impl WholeStreamCommand for DataFrame {
fn name(&self) -> &str {
"dataframe describe"
}
fn usage(&self) -> &str {
"[DataFrame] Describes dataframes numeric columns"
}
fn signature(&self) -> Signature {
Signature::build("dataframe describe")
}
fn run(&self, args: CommandArgs) -> Result<OutputStream, ShellError> {
command(args)
}
fn examples(&self) -> Vec<Example> {
vec![Example {
description: "Describes dataframe",
example: "[[a b]; [1 1] [1 1]] | dataframe to-df | dataframe describe",
result: Some(vec![NuDataFrame::try_from_columns(
vec![
Column::new(
"descriptor".to_string(),
vec![
UntaggedValue::string("count").into(),
UntaggedValue::string("sum").into(),
UntaggedValue::string("mean").into(),
UntaggedValue::string("median").into(),
UntaggedValue::string("std").into(),
UntaggedValue::string("min").into(),
UntaggedValue::string("25%").into(),
UntaggedValue::string("50%").into(),
UntaggedValue::string("75%").into(),
UntaggedValue::string("max").into(),
],
),
Column::new(
"a (i64)".to_string(),
vec![
UntaggedValue::decimal_from_float(2.0, Span::default()).into(),
UntaggedValue::decimal_from_float(2.0, Span::default()).into(),
UntaggedValue::decimal_from_float(1.0, Span::default()).into(),
UntaggedValue::decimal_from_float(1.0, Span::default()).into(),
UntaggedValue::decimal_from_float(0.0, Span::default()).into(),
UntaggedValue::decimal_from_float(1.0, Span::default()).into(),
UntaggedValue::decimal_from_float(1.0, Span::default()).into(),
UntaggedValue::decimal_from_float(1.0, Span::default()).into(),
UntaggedValue::decimal_from_float(1.0, Span::default()).into(),
UntaggedValue::decimal_from_float(1.0, Span::default()).into(),
],
),
Column::new(
"b (i64)".to_string(),
vec![
UntaggedValue::decimal_from_float(2.0, Span::default()).into(),
UntaggedValue::decimal_from_float(2.0, Span::default()).into(),
UntaggedValue::decimal_from_float(1.0, Span::default()).into(),
UntaggedValue::decimal_from_float(1.0, Span::default()).into(),
UntaggedValue::decimal_from_float(0.0, Span::default()).into(),
UntaggedValue::decimal_from_float(1.0, Span::default()).into(),
UntaggedValue::decimal_from_float(1.0, Span::default()).into(),
UntaggedValue::decimal_from_float(1.0, Span::default()).into(),
UntaggedValue::decimal_from_float(1.0, Span::default()).into(),
UntaggedValue::decimal_from_float(1.0, Span::default()).into(),
],
),
],
&Span::default(),
)
.expect("simple df for test should not fail")
.into_value(Tag::default())]),
}]
}
}
fn command(mut args: CommandArgs) -> Result<OutputStream, ShellError> {
let tag = args.call_info.name_tag.clone();
let (df, _) = NuDataFrame::try_from_stream(&mut args.input, &tag.span)?;
let names = ChunkedArray::<Utf8Type>::new_from_opt_slice(
"descriptor",
&[
Some("count"),
Some("sum"),
Some("mean"),
Some("median"),
Some("std"),
Some("min"),
Some("25%"),
Some("50%"),
Some("75%"),
Some("max"),
],
)
.into_series();
let head = std::iter::once(names);
let tail = df.as_ref().get_columns().iter().map(|col| {
let count = col.len() as f64;
let sum = match col.sum_as_series().cast_with_dtype(&DataType::Float64) {
Ok(ca) => match ca.get(0) {
AnyValue::Float64(v) => Some(v),
_ => None,
},
Err(_) => None,
};
let mean = match col.mean_as_series().get(0) {
AnyValue::Float64(v) => Some(v),
_ => None,
};
let median = match col.median_as_series().get(0) {
AnyValue::Float64(v) => Some(v),
_ => None,
};
let std = match col.std_as_series().get(0) {
AnyValue::Float64(v) => Some(v),
_ => None,
};
let min = match col.min_as_series().cast_with_dtype(&DataType::Float64) {
Ok(ca) => match ca.get(0) {
AnyValue::Float64(v) => Some(v),
_ => None,
},
Err(_) => None,
};
let q_25 = match col.quantile_as_series(0.25) {
Ok(ca) => match ca.cast_with_dtype(&DataType::Float64) {
Ok(ca) => match ca.get(0) {
AnyValue::Float64(v) => Some(v),
_ => None,
},
Err(_) => None,
},
Err(_) => None,
};
let q_50 = match col.quantile_as_series(0.50) {
Ok(ca) => match ca.cast_with_dtype(&DataType::Float64) {
Ok(ca) => match ca.get(0) {
AnyValue::Float64(v) => Some(v),
_ => None,
},
Err(_) => None,
},
Err(_) => None,
};
let q_75 = match col.quantile_as_series(0.75) {
Ok(ca) => match ca.cast_with_dtype(&DataType::Float64) {
Ok(ca) => match ca.get(0) {
AnyValue::Float64(v) => Some(v),
_ => None,
},
Err(_) => None,
},
Err(_) => None,
};
let max = match col.max_as_series().cast_with_dtype(&DataType::Float64) {
Ok(ca) => match ca.get(0) {
AnyValue::Float64(v) => Some(v),
_ => None,
},
Err(_) => None,
};
let name = format!("{} ({})", col.name(), col.dtype());
ChunkedArray::<Float64Type>::new_from_opt_slice(
name.as_str(),
&[
Some(count),
sum,
mean,
median,
std,
min,
q_25,
q_50,
q_75,
max,
],
)
.into_series()
});
let res = head.chain(tail).collect::<Vec<Series>>();
let df = PolarsDF::new(res).map_err(|e| parse_polars_error::<&str>(&e, &tag.span, None))?;
let df = NuDataFrame::dataframe_to_value(df, tag);
Ok(OutputStream::one(df))
}
#[cfg(test)]
mod tests {
use super::DataFrame;
use super::ShellError;
#[test]
fn examples_work_as_expected() -> Result<(), ShellError> {
use crate::examples::test_dataframe as test_examples;
test_examples(DataFrame {})
}
}

View File

@ -2,6 +2,7 @@ pub mod aggregate;
pub mod append; pub mod append;
pub mod column; pub mod column;
pub mod command; pub mod command;
pub mod describe;
pub mod drop; pub mod drop;
pub mod drop_duplicates; pub mod drop_duplicates;
pub mod drop_nulls; pub mod drop_nulls;
@ -35,6 +36,7 @@ pub use aggregate::DataFrame as DataFrameAggregate;
pub use append::DataFrame as DataFrameAppend; pub use append::DataFrame as DataFrameAppend;
pub use column::DataFrame as DataFrameColumn; pub use column::DataFrame as DataFrameColumn;
pub use command::Command as DataFrame; pub use command::Command as DataFrame;
pub use describe::DataFrame as DataFrameDescribe;
pub use drop::DataFrame as DataFrameDrop; pub use drop::DataFrame as DataFrameDrop;
pub use drop_duplicates::DataFrame as DataFrameDropDuplicates; pub use drop_duplicates::DataFrame as DataFrameDropDuplicates;
pub use drop_nulls::DataFrame as DataFrameDropNulls; pub use drop_nulls::DataFrame as DataFrameDropNulls;

View File

@ -27,19 +27,19 @@ pub use core_commands::*;
pub use dataframe::{ pub use dataframe::{
DataFrame, DataFrameAggregate, DataFrameAllFalse, DataFrameAllTrue, DataFrameAppend, DataFrame, DataFrameAggregate, DataFrameAllFalse, DataFrameAllTrue, DataFrameAppend,
DataFrameArgMax, DataFrameArgMin, DataFrameArgSort, DataFrameArgTrue, DataFrameArgUnique, DataFrameArgMax, DataFrameArgMin, DataFrameArgSort, DataFrameArgTrue, DataFrameArgUnique,
DataFrameColumn, DataFrameConcatenate, DataFrameContains, DataFrameDTypes, DataFrameDrop, DataFrameColumn, DataFrameConcatenate, DataFrameContains, DataFrameDTypes, DataFrameDescribe,
DataFrameDropDuplicates, DataFrameDropNulls, DataFrameDummies, DataFrameFilter, DataFrameFirst, DataFrameDrop, DataFrameDropDuplicates, DataFrameDropNulls, DataFrameDummies, DataFrameFilter,
DataFrameGet, DataFrameGetDay, DataFrameGetHour, DataFrameGetMinute, DataFrameGetMonth, DataFrameFirst, DataFrameGet, DataFrameGetDay, DataFrameGetHour, DataFrameGetMinute,
DataFrameGetNanoSecond, DataFrameGetOrdinal, DataFrameGetSecond, DataFrameGetWeek, DataFrameGetMonth, DataFrameGetNanoSecond, DataFrameGetOrdinal, DataFrameGetSecond,
DataFrameGetWeekDay, DataFrameGetYear, DataFrameGroupBy, DataFrameIsDuplicated, DataFrameIsIn, DataFrameGetWeek, DataFrameGetWeekDay, DataFrameGetYear, DataFrameGroupBy,
DataFrameIsNotNull, DataFrameIsNull, DataFrameIsUnique, DataFrameJoin, DataFrameLast, DataFrameIsDuplicated, DataFrameIsIn, DataFrameIsNotNull, DataFrameIsNull, DataFrameIsUnique,
DataFrameList, DataFrameMelt, DataFrameNNull, DataFrameNUnique, DataFrameNot, DataFrameOpen, DataFrameJoin, DataFrameLast, DataFrameList, DataFrameMelt, DataFrameNNull, DataFrameNUnique,
DataFramePivot, DataFrameReplace, DataFrameReplaceAll, DataFrameSample, DataFrameSelect, DataFrameNot, DataFrameOpen, DataFramePivot, DataFrameReplace, DataFrameReplaceAll,
DataFrameSeriesRename, DataFrameSet, DataFrameSetWithIdx, DataFrameShape, DataFrameShift, DataFrameSample, DataFrameSelect, DataFrameSeriesRename, DataFrameSet, DataFrameSetWithIdx,
DataFrameShow, DataFrameSlice, DataFrameSort, DataFrameStrFTime, DataFrameStringLengths, DataFrameShape, DataFrameShift, DataFrameShow, DataFrameSlice, DataFrameSort,
DataFrameStringSlice, DataFrameTake, DataFrameToCsv, DataFrameToDF, DataFrameToLowercase, DataFrameStrFTime, DataFrameStringLengths, DataFrameStringSlice, DataFrameTake, DataFrameToCsv,
DataFrameToParquet, DataFrameToUppercase, DataFrameUnique, DataFrameValueCounts, DataFrameToDF, DataFrameToLowercase, DataFrameToParquet, DataFrameToUppercase, DataFrameUnique,
DataFrameWhere, DataFrameWithColumn, DataFrameValueCounts, DataFrameWhere, DataFrameWithColumn,
}; };
pub use env::*; pub use env::*;
pub use filesystem::*; pub use filesystem::*;

View File

@ -340,6 +340,7 @@ pub fn create_default_context(interactive: bool) -> Result<EvaluationContext, Bo
whole_stream_command(DataFrameGetOrdinal), whole_stream_command(DataFrameGetOrdinal),
whole_stream_command(DataFrameGetNanoSecond), whole_stream_command(DataFrameGetNanoSecond),
whole_stream_command(DataFrameStrFTime), whole_stream_command(DataFrameStrFTime),
whole_stream_command(DataFrameDescribe),
]); ]);
#[cfg(feature = "clipboard-cli")] #[cfg(feature = "clipboard-cli")]