Dataframe commands (#542)

* groupby object

* aggregate command

* eager commands

* rest of dataframe commands
This commit is contained in:
Fernando Herrera
2021-12-21 18:32:09 +00:00
committed by GitHub
parent c3a16902fe
commit 6a35e6b7b6
44 changed files with 2936 additions and 130 deletions

View File

@ -0,0 +1,375 @@
use nu_engine::CallExt;
use nu_protocol::{
ast::Call,
did_you_mean,
engine::{Command, EngineState, Stack},
Category, Example, PipelineData, ShellError, Signature, Span, Spanned, SyntaxShape, Value,
};
use polars::{frame::groupby::GroupBy, prelude::PolarsError};
use crate::dataframe::values::NuGroupBy;
use super::super::values::{Column, NuDataFrame};
enum Operation {
Mean,
Sum,
Min,
Max,
First,
Last,
Nunique,
Quantile(f64),
Median,
Var,
Std,
Count,
}
impl Operation {
fn from_tagged(
name: &Spanned<String>,
quantile: Option<Spanned<f64>>,
) -> Result<Operation, ShellError> {
match name.item.as_ref() {
"mean" => Ok(Operation::Mean),
"sum" => Ok(Operation::Sum),
"min" => Ok(Operation::Min),
"max" => Ok(Operation::Max),
"first" => Ok(Operation::First),
"last" => Ok(Operation::Last),
"nunique" => Ok(Operation::Nunique),
"quantile" => match quantile {
None => Err(ShellError::SpannedLabeledError(
"Quantile value not fount".into(),
"Quantile operation requires quantile value".into(),
name.span,
)),
Some(value) => {
if (value.item < 0.0) | (value.item > 1.0) {
Err(ShellError::SpannedLabeledError(
"Inappropriate quantile".into(),
"Quantile value should be between 0.0 and 1.0".into(),
value.span,
))
} else {
Ok(Operation::Quantile(value.item))
}
}
},
"median" => Ok(Operation::Median),
"var" => Ok(Operation::Var),
"std" => Ok(Operation::Std),
"count" => Ok(Operation::Count),
selection => {
let possibilities = [
"mean".to_string(),
"sum".to_string(),
"min".to_string(),
"max".to_string(),
"first".to_string(),
"last".to_string(),
"nunique".to_string(),
"quantile".to_string(),
"median".to_string(),
"var".to_string(),
"std".to_string(),
"count".to_string(),
];
match did_you_mean(&possibilities, selection) {
Some(suggestion) => Err(ShellError::DidYouMean(suggestion, name.span)),
None => Err(ShellError::SpannedLabeledErrorHelp(
"Operation not fount".into(),
"Operation does not exist".into(),
name.span,
"Perhaps you want: mean, sum, min, max, first, last, nunique, quantile, median, var, std, or count".into(),
))
}
}
}
}
fn to_str(&self) -> &'static str {
match self {
Self::Mean => "mean",
Self::Sum => "sum",
Self::Min => "min",
Self::Max => "max",
Self::First => "first",
Self::Last => "last",
Self::Nunique => "nunique",
Self::Quantile(_) => "quantile",
Self::Median => "median",
Self::Var => "var",
Self::Std => "std",
Self::Count => "count",
}
}
}
#[derive(Clone)]
pub struct Aggregate;
impl Command for Aggregate {
fn name(&self) -> &str {
"dfr aggregate"
}
fn usage(&self) -> &str {
"Performs an aggregation operation on a dataframe and groupby object"
}
fn signature(&self) -> Signature {
Signature::build(self.name())
.required(
"operation-name",
SyntaxShape::String,
"\n\tDataframes: mean, sum, min, max, quantile, median, var, std
\tGroupBy: mean, sum, min, max, first, last, nunique, quantile, median, var, std, count",
)
.named(
"quantile",
SyntaxShape::Number,
"quantile value for quantile operation",
Some('q'),
)
.switch(
"explicit",
"returns explicit names for groupby aggregations",
Some('e'),
)
.category(Category::Custom("dataframe".into()))
}
fn examples(&self) -> Vec<Example> {
vec![
Example {
description: "Aggregate sum by grouping by column a and summing on col b",
example:
"[[a b]; [one 1] [one 2]] | dfr to-df | dfr group-by a | dfr aggregate sum",
result: Some(
NuDataFrame::try_from_columns(vec![
Column::new("a".to_string(), vec![Value::test_string("one")]),
Column::new("b".to_string(), vec![Value::test_int(3)]),
])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
},
Example {
description: "Aggregate sum in dataframe columns",
example: "[[a b]; [4 1] [5 2]] | dfr to-df | dfr aggregate sum",
result: Some(
NuDataFrame::try_from_columns(vec![
Column::new("a".to_string(), vec![Value::test_int(9)]),
Column::new("b".to_string(), vec![Value::test_int(3)]),
])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
},
Example {
description: "Aggregate sum in series",
example: "[4 1 5 6] | dfr to-df | dfr aggregate sum",
result: Some(
NuDataFrame::try_from_columns(vec![Column::new(
"0".to_string(),
vec![Value::test_int(16)],
)])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
},
]
}
fn run(
&self,
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
command(engine_state, stack, call, input)
}
}
fn command(
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
let operation: Spanned<String> = call.req(engine_state, stack, 0)?;
let quantile: Option<Spanned<f64>> = call.get_flag(engine_state, stack, "quantile")?;
let op = Operation::from_tagged(&operation, quantile)?;
match input {
PipelineData::Value(Value::CustomValue { val, span }, _) => {
let df = val.as_any().downcast_ref::<NuDataFrame>();
let groupby = val.as_any().downcast_ref::<NuGroupBy>();
match (df, groupby) {
(Some(df), None) => {
let df = df.as_ref();
let res = perform_dataframe_aggregation(df, op, operation.span)?;
Ok(PipelineData::Value(
NuDataFrame::dataframe_into_value(res, span),
None,
))
}
(None, Some(nu_groupby)) => {
let groupby = nu_groupby.to_groupby()?;
let res = perform_groupby_aggregation(
groupby,
op,
operation.span,
call.head,
call.has_flag("explicit"),
)?;
Ok(PipelineData::Value(
NuDataFrame::dataframe_into_value(res, span),
None,
))
}
_ => Err(ShellError::SpannedLabeledError(
"Incorrect datatype".into(),
"no groupby or dataframe found in input stream".into(),
call.head,
)),
}
}
_ => Err(ShellError::SpannedLabeledError(
"Incorrect datatype".into(),
"no groupby or dataframe found in input stream".into(),
call.head,
)),
}
}
fn perform_groupby_aggregation(
groupby: GroupBy,
operation: Operation,
operation_span: Span,
agg_span: Span,
explicit: bool,
) -> Result<polars::prelude::DataFrame, ShellError> {
let mut res = match operation {
Operation::Mean => groupby.mean(),
Operation::Sum => groupby.sum(),
Operation::Min => groupby.min(),
Operation::Max => groupby.max(),
Operation::First => groupby.first(),
Operation::Last => groupby.last(),
Operation::Nunique => groupby.n_unique(),
Operation::Quantile(quantile) => groupby.quantile(quantile),
Operation::Median => groupby.median(),
Operation::Var => groupby.var(),
Operation::Std => groupby.std(),
Operation::Count => groupby.count(),
}
.map_err(|e| {
let span = match &e {
PolarsError::NotFound(_) => agg_span,
_ => operation_span,
};
ShellError::SpannedLabeledError("Error calculating aggregation".into(), e.to_string(), span)
})?;
if !explicit {
let col_names = res
.get_column_names()
.iter()
.map(|name| name.to_string())
.collect::<Vec<String>>();
for col in col_names {
let from = match operation {
Operation::Mean => "_mean",
Operation::Sum => "_sum",
Operation::Min => "_min",
Operation::Max => "_max",
Operation::First => "_first",
Operation::Last => "_last",
Operation::Nunique => "_n_unique",
Operation::Quantile(_) => "_quantile",
Operation::Median => "_median",
Operation::Var => "_agg_var",
Operation::Std => "_agg_std",
Operation::Count => "_count",
};
let new_col = match col.find(from) {
Some(index) => &col[..index],
None => &col[..],
};
res.rename(&col, new_col)
.expect("Column is always there. Looping with known names");
}
}
Ok(res)
}
fn perform_dataframe_aggregation(
dataframe: &polars::prelude::DataFrame,
operation: Operation,
operation_span: Span,
) -> Result<polars::prelude::DataFrame, ShellError> {
match operation {
Operation::Mean => Ok(dataframe.mean()),
Operation::Sum => Ok(dataframe.sum()),
Operation::Min => Ok(dataframe.min()),
Operation::Max => Ok(dataframe.max()),
Operation::Quantile(quantile) => dataframe.quantile(quantile).map_err(|e| {
ShellError::SpannedLabeledError(
"Error calculating quantile".into(),
e.to_string(),
operation_span,
)
}),
Operation::Median => Ok(dataframe.median()),
Operation::Var => Ok(dataframe.var()),
Operation::Std => Ok(dataframe.std()),
operation => {
let possibilities = [
"mean".to_string(),
"sum".to_string(),
"min".to_string(),
"max".to_string(),
"quantile".to_string(),
"median".to_string(),
"var".to_string(),
"std".to_string(),
];
match did_you_mean(&possibilities, operation.to_str()) {
Some(suggestion) => Err(ShellError::DidYouMean(suggestion, operation_span)),
None => Err(ShellError::SpannedLabeledErrorHelp(
"Operation not fount".into(),
"Operation does not exist".into(),
operation_span,
"Perhaps you want: mean, sum, min, max, quantile, median, var, or std".into(),
)),
}
}
}
}
#[cfg(test)]
mod test {
use super::super::super::test_dataframe::test_dataframe;
use super::super::CreateGroupBy;
use super::*;
#[test]
fn test_examples() {
test_dataframe(vec![Box::new(Aggregate {}), Box::new(CreateGroupBy {})])
}
}

View File

@ -0,0 +1,130 @@
use nu_engine::CallExt;
use nu_protocol::{
ast::Call,
engine::{Command, EngineState, Stack},
Category, Example, PipelineData, ShellError, Signature, Span, SyntaxShape, Value,
};
use super::super::values::{Axis, Column, NuDataFrame};
#[derive(Clone)]
pub struct AppendDF;
impl Command for AppendDF {
fn name(&self) -> &str {
"dfr append"
}
fn usage(&self) -> &str {
"Appends a new dataframe"
}
fn signature(&self) -> Signature {
Signature::build(self.name())
.required("other", SyntaxShape::Any, "dataframe to be appended")
.switch("col", "appends in col orientation", Some('c'))
.category(Category::Custom("dataframe".into()))
}
fn examples(&self) -> Vec<Example> {
vec![
Example {
description: "Appends a dataframe as new columns",
example: r#"let a = ([[a b]; [1 2] [3 4]] | dfr to-df);
$a | dfr append $a"#,
result: Some(
NuDataFrame::try_from_columns(vec![
Column::new(
"a".to_string(),
vec![Value::test_int(1), Value::test_int(3)],
),
Column::new(
"b".to_string(),
vec![Value::test_int(2), Value::test_int(4)],
),
Column::new(
"a_x".to_string(),
vec![Value::test_int(1), Value::test_int(3)],
),
Column::new(
"b_x".to_string(),
vec![Value::test_int(2), Value::test_int(4)],
),
])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
},
Example {
description: "Appends a dataframe merging at the end of columns",
example: r#"let a = ([[a b]; [1 2] [3 4]] | dfr to-df);
$a | dfr append $a --col"#,
result: Some(
NuDataFrame::try_from_columns(vec![
Column::new(
"a".to_string(),
vec![
Value::test_int(1),
Value::test_int(3),
Value::test_int(1),
Value::test_int(3),
],
),
Column::new(
"b".to_string(),
vec![
Value::test_int(2),
Value::test_int(4),
Value::test_int(2),
Value::test_int(4),
],
),
])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
},
]
}
fn run(
&self,
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
command(engine_state, stack, call, input)
}
}
fn command(
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
let other: Value = call.req(engine_state, stack, 0)?;
let axis = if call.has_flag("col") {
Axis::Column
} else {
Axis::Row
};
let df_other = NuDataFrame::try_from_value(other)?;
let df = NuDataFrame::try_from_pipeline(input, call.head)?;
df.append_df(&df_other, axis, call.head)
.map(|df| PipelineData::Value(NuDataFrame::into_value(df, call.head), None))
}
#[cfg(test)]
mod test {
use super::super::super::test_dataframe::test_dataframe;
use super::*;
#[test]
fn test_examples() {
test_dataframe(vec![Box::new(AppendDF {})])
}
}

View File

@ -0,0 +1,81 @@
use nu_engine::CallExt;
use nu_protocol::{
ast::Call,
engine::{Command, EngineState, Stack},
Category, Example, PipelineData, ShellError, Signature, Span, Spanned, SyntaxShape, Value,
};
use super::super::values::{Column, NuDataFrame};
#[derive(Clone)]
pub struct ColumnDF;
impl Command for ColumnDF {
fn name(&self) -> &str {
"dfr column"
}
fn usage(&self) -> &str {
"Returns the selected column"
}
fn signature(&self) -> Signature {
Signature::build(self.name())
.required("column", SyntaxShape::String, "column name")
.category(Category::Custom("dataframe".into()))
}
fn examples(&self) -> Vec<Example> {
vec![Example {
description: "Returns the selected column as series",
example: "[[a b]; [1 2] [3 4]] | dfr to-df | dfr column a",
result: Some(
NuDataFrame::try_from_columns(vec![Column::new(
"a".to_string(),
vec![Value::test_int(1), Value::test_int(3)],
)])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
}]
}
fn run(
&self,
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
command(engine_state, stack, call, input)
}
}
fn command(
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
let column: Spanned<String> = call.req(engine_state, stack, 0)?;
let df = NuDataFrame::try_from_pipeline(input, call.head)?;
let res = df.as_ref().column(&column.item).map_err(|e| {
ShellError::SpannedLabeledError("Error selecting column".into(), e.to_string(), column.span)
})?;
NuDataFrame::try_from_series(vec![res.clone()], call.head)
.map(|df| PipelineData::Value(NuDataFrame::into_value(df, call.head), None))
}
#[cfg(test)]
mod test {
use super::super::super::test_dataframe::test_dataframe;
use super::*;
#[test]
fn test_examples() {
test_dataframe(vec![Box::new(ColumnDF {})])
}
}

View File

@ -0,0 +1,37 @@
use nu_engine::get_full_help;
use nu_protocol::{
ast::Call,
engine::{Command, EngineState, Stack},
Category, IntoPipelineData, PipelineData, ShellError, Signature, Value,
};
#[derive(Clone)]
pub struct Dataframe;
impl Command for Dataframe {
fn name(&self) -> &str {
"dfr"
}
fn usage(&self) -> &str {
"Dataframe commands"
}
fn signature(&self) -> Signature {
Signature::build(self.name()).category(Category::Custom("dataframe".into()))
}
fn run(
&self,
engine_state: &EngineState,
_stack: &mut Stack,
call: &Call,
_input: PipelineData,
) -> Result<PipelineData, ShellError> {
Ok(Value::String {
val: get_full_help(&Dataframe.signature(), &Dataframe.examples(), engine_state),
span: call.head,
}
.into_pipeline_data())
}
}

View File

@ -0,0 +1,241 @@
use super::super::values::{Column, NuDataFrame};
use nu_protocol::{
ast::Call,
engine::{Command, EngineState, Stack},
Category, Example, PipelineData, ShellError, Signature, Span, Value,
};
use polars::{
chunked_array::ChunkedArray,
prelude::{
AnyValue, DataFrame, DataType, Float64Type, IntoSeries, NewChunkedArray, Series, Utf8Type,
},
};
#[derive(Clone)]
pub struct DescribeDF;
impl Command for DescribeDF {
fn name(&self) -> &str {
"dfr describe"
}
fn usage(&self) -> &str {
"Describes dataframes numeric columns"
}
fn signature(&self) -> Signature {
Signature::build(self.name()).category(Category::Custom("dataframe".into()))
}
fn examples(&self) -> Vec<Example> {
vec![Example {
description: "dataframe description",
example: "[[a b]; [1 1] [1 1]] | dfr to-df | dfr describe",
result: Some(
NuDataFrame::try_from_columns(vec![
Column::new(
"descriptor".to_string(),
vec![
Value::test_string("count"),
Value::test_string("sum"),
Value::test_string("mean"),
Value::test_string("median"),
Value::test_string("std"),
Value::test_string("min"),
Value::test_string("25%"),
Value::test_string("50%"),
Value::test_string("75%"),
Value::test_string("max"),
],
),
Column::new(
"a (i64)".to_string(),
vec![
Value::test_float(2.0),
Value::test_float(2.0),
Value::test_float(1.0),
Value::test_float(1.0),
Value::test_float(0.0),
Value::test_float(1.0),
Value::test_float(1.0),
Value::test_float(1.0),
Value::test_float(1.0),
Value::test_float(1.0),
],
),
Column::new(
"b (i64)".to_string(),
vec![
Value::test_float(2.0),
Value::test_float(2.0),
Value::test_float(1.0),
Value::test_float(1.0),
Value::test_float(0.0),
Value::test_float(1.0),
Value::test_float(1.0),
Value::test_float(1.0),
Value::test_float(1.0),
Value::test_float(1.0),
],
),
])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
}]
}
fn run(
&self,
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
command(engine_state, stack, call, input)
}
}
fn command(
_engine_state: &EngineState,
_stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
let df = NuDataFrame::try_from_pipeline(input, call.head)?;
let names = ChunkedArray::<Utf8Type>::new_from_opt_slice(
"descriptor",
&[
Some("count"),
Some("sum"),
Some("mean"),
Some("median"),
Some("std"),
Some("min"),
Some("25%"),
Some("50%"),
Some("75%"),
Some("max"),
],
)
.into_series();
let head = std::iter::once(names);
let tail = df
.as_ref()
.get_columns()
.iter()
.filter(|col| col.dtype() != &DataType::Object("object"))
.map(|col| {
let count = col.len() as f64;
let sum = col
.sum_as_series()
.cast(&DataType::Float64)
.ok()
.and_then(|ca| match ca.get(0) {
AnyValue::Float64(v) => Some(v),
_ => None,
});
let mean = match col.mean_as_series().get(0) {
AnyValue::Float64(v) => Some(v),
_ => None,
};
let median = match col.median_as_series().get(0) {
AnyValue::Float64(v) => Some(v),
_ => None,
};
let std = match col.std_as_series().get(0) {
AnyValue::Float64(v) => Some(v),
_ => None,
};
let min = col
.min_as_series()
.cast(&DataType::Float64)
.ok()
.and_then(|ca| match ca.get(0) {
AnyValue::Float64(v) => Some(v),
_ => None,
});
let q_25 = col
.quantile_as_series(0.25)
.ok()
.and_then(|ca| ca.cast(&DataType::Float64).ok())
.and_then(|ca| match ca.get(0) {
AnyValue::Float64(v) => Some(v),
_ => None,
});
let q_50 = col
.quantile_as_series(0.50)
.ok()
.and_then(|ca| ca.cast(&DataType::Float64).ok())
.and_then(|ca| match ca.get(0) {
AnyValue::Float64(v) => Some(v),
_ => None,
});
let q_75 = col
.quantile_as_series(0.75)
.ok()
.and_then(|ca| ca.cast(&DataType::Float64).ok())
.and_then(|ca| match ca.get(0) {
AnyValue::Float64(v) => Some(v),
_ => None,
});
let max = col
.max_as_series()
.cast(&DataType::Float64)
.ok()
.and_then(|ca| match ca.get(0) {
AnyValue::Float64(v) => Some(v),
_ => None,
});
let name = format!("{} ({})", col.name(), col.dtype());
ChunkedArray::<Float64Type>::new_from_opt_slice(
&name,
&[
Some(count),
sum,
mean,
median,
std,
min,
q_25,
q_50,
q_75,
max,
],
)
.into_series()
});
let res = head.chain(tail).collect::<Vec<Series>>();
DataFrame::new(res)
.map_err(|e| {
ShellError::SpannedLabeledError("Dataframe Error".into(), e.to_string(), call.head)
})
.map(|df| PipelineData::Value(NuDataFrame::dataframe_into_value(df, call.head), None))
}
#[cfg(test)]
mod test {
use super::super::super::test_dataframe::test_dataframe;
use super::*;
#[test]
fn test_examples() {
test_dataframe(vec![Box::new(DescribeDF {})])
}
}

View File

@ -0,0 +1,111 @@
use nu_engine::CallExt;
use nu_protocol::{
ast::Call,
engine::{Command, EngineState, Stack},
Category, Example, PipelineData, ShellError, Signature, Span, SyntaxShape, Value,
};
use super::super::values::utils::convert_columns;
use super::super::values::{Column, NuDataFrame};
#[derive(Clone)]
pub struct DropDF;
impl Command for DropDF {
fn name(&self) -> &str {
"dfr drop"
}
fn usage(&self) -> &str {
"Creates a new dataframe by dropping the selected columns"
}
fn signature(&self) -> Signature {
Signature::build(self.name())
.rest("rest", SyntaxShape::Any, "column names to be dropped")
.category(Category::Custom("dataframe".into()))
}
fn examples(&self) -> Vec<Example> {
vec![Example {
description: "drop column a",
example: "[[a b]; [1 2] [3 4]] | dfr to-df | dfr drop a",
result: Some(
NuDataFrame::try_from_columns(vec![Column::new(
"b".to_string(),
vec![Value::test_int(2), Value::test_int(4)],
)])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
}]
}
fn run(
&self,
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
command(engine_state, stack, call, input)
}
}
fn command(
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
let columns: Vec<Value> = call.rest(engine_state, stack, 0)?;
let (col_string, col_span) = convert_columns(columns, call.head)?;
let df = NuDataFrame::try_from_pipeline(input, call.head)?;
let new_df = col_string
.get(0)
.ok_or_else(|| {
ShellError::SpannedLabeledError(
"Empty names list".into(),
"No column names where found".into(),
col_span,
)
})
.and_then(|col| {
df.as_ref().drop(&col.item).map_err(|e| {
ShellError::SpannedLabeledError(
"Error dropping column".into(),
e.to_string(),
col.span,
)
})
})?;
// If there are more columns in the drop selection list, these
// are added from the resulting dataframe
col_string
.iter()
.skip(1)
.try_fold(new_df, |new_df, col| {
new_df.drop(&col.item).map_err(|e| {
ShellError::SpannedLabeledError(
"Error dropping column".into(),
e.to_string(),
col.span,
)
})
})
.map(|df| PipelineData::Value(NuDataFrame::dataframe_into_value(df, call.head), None))
}
#[cfg(test)]
mod test {
use super::super::super::test_dataframe::test_dataframe;
use super::*;
#[test]
fn test_examples() {
test_dataframe(vec![Box::new(DropDF {})])
}
}

View File

@ -0,0 +1,130 @@
use nu_engine::CallExt;
use nu_protocol::{
ast::Call,
engine::{Command, EngineState, Stack},
Category, Example, PipelineData, ShellError, Signature, Span, SyntaxShape, Value,
};
use super::super::values::utils::convert_columns_string;
use super::super::values::{Column, NuDataFrame};
#[derive(Clone)]
pub struct DropNulls;
impl Command for DropNulls {
fn name(&self) -> &str {
"dfr drop-nulls"
}
fn usage(&self) -> &str {
"Drops null values in dataframe"
}
fn signature(&self) -> Signature {
Signature::build(self.name())
.optional(
"subset",
SyntaxShape::Table,
"subset of columns to drop nulls",
)
.category(Category::Custom("dataframe".into()))
}
fn examples(&self) -> Vec<Example> {
vec![
Example {
description: "drop null values in dataframe",
example: r#"let df = ([[a b]; [1 2] [3 0] [1 2]] | dfr to-df);
let res = ($df.b / $df.b);
let a = ($df | dfr with-column $res --name res);
$a | dfr drop-nulls"#,
result: Some(
NuDataFrame::try_from_columns(vec![
Column::new(
"a".to_string(),
vec![Value::test_int(1), Value::test_int(1)],
),
Column::new(
"b".to_string(),
vec![Value::test_int(2), Value::test_int(2)],
),
Column::new(
"res".to_string(),
vec![Value::test_int(1), Value::test_int(1)],
),
])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
},
Example {
description: "drop null values in dataframe",
example: r#"let s = ([1 2 0 0 3 4] | dfr to-df);
($s / $s) | dfr drop-nulls"#,
result: Some(
NuDataFrame::try_from_columns(vec![Column::new(
"div_0_0".to_string(),
vec![
Value::test_int(1),
Value::test_int(1),
Value::test_int(1),
Value::test_int(1),
],
)])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
},
]
}
fn run(
&self,
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
command(engine_state, stack, call, input)
}
}
fn command(
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
let df = NuDataFrame::try_from_pipeline(input, call.head)?;
let columns: Option<Vec<Value>> = call.opt(engine_state, stack, 0)?;
let (subset, col_span) = match columns {
Some(cols) => {
let (agg_string, col_span) = convert_columns_string(cols, call.head)?;
(Some(agg_string), col_span)
}
None => (None, call.head),
};
let subset_slice = subset.as_ref().map(|cols| &cols[..]);
df.as_ref()
.drop_nulls(subset_slice)
.map_err(|e| {
ShellError::SpannedLabeledError("Error dropping nulls".into(), e.to_string(), col_span)
})
.map(|df| PipelineData::Value(NuDataFrame::dataframe_into_value(df, call.head), None))
}
#[cfg(test)]
mod test {
use super::super::super::test_dataframe::test_dataframe;
use super::super::WithColumn;
use super::*;
#[test]
fn test_examples() {
test_dataframe(vec![Box::new(DropNulls {}), Box::new(WithColumn {})])
}
}

View File

@ -0,0 +1,106 @@
use super::super::values::{Column, NuDataFrame};
use nu_protocol::{
ast::Call,
engine::{Command, EngineState, Stack},
Category, Example, PipelineData, ShellError, Signature, Span, Value,
};
#[derive(Clone)]
pub struct DataTypes;
impl Command for DataTypes {
fn name(&self) -> &str {
"dfr dtypes"
}
fn usage(&self) -> &str {
"Show dataframe data types"
}
fn signature(&self) -> Signature {
Signature::build(self.name()).category(Category::Custom("dataframe".into()))
}
fn examples(&self) -> Vec<Example> {
vec![Example {
description: "Dataframe dtypes",
example: "[[a b]; [1 2] [3 4]] | dfr to-df | dfr dtypes",
result: Some(
NuDataFrame::try_from_columns(vec![
Column::new(
"column".to_string(),
vec![Value::test_string("a"), Value::test_string("b")],
),
Column::new(
"dtype".to_string(),
vec![Value::test_string("i64"), Value::test_string("i64")],
),
])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
}]
}
fn run(
&self,
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
command(engine_state, stack, call, input)
}
}
#[allow(clippy::needless_collect)]
fn command(
_engine_state: &EngineState,
_stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
let df = NuDataFrame::try_from_pipeline(input, call.head)?;
let mut dtypes: Vec<Value> = Vec::new();
let names: Vec<Value> = df
.as_ref()
.get_column_names()
.iter()
.map(|v| {
let dtype = df
.as_ref()
.column(v)
.expect("using name from list of names from dataframe")
.dtype();
let dtype_str = dtype.to_string();
dtypes.push(Value::String {
val: dtype_str,
span: call.head,
});
Value::String {
val: v.to_string(),
span: call.head,
}
})
.collect();
let names_col = Column::new("column".to_string(), names);
let dtypes_col = Column::new("dtype".to_string(), dtypes);
NuDataFrame::try_from_columns(vec![names_col, dtypes_col])
.map(|df| PipelineData::Value(df.into_value(call.head), None))
}
#[cfg(test)]
mod test {
use super::super::super::test_dataframe::test_dataframe;
use super::*;
#[test]
fn test_examples() {
test_dataframe(vec![Box::new(DataTypes {})])
}
}

View File

@ -0,0 +1,137 @@
use nu_protocol::{
ast::Call,
engine::{Command, EngineState, Stack},
Category, Example, PipelineData, ShellError, Signature, Span, Value,
};
use super::super::values::{Column, NuDataFrame};
#[derive(Clone)]
pub struct Dummies;
impl Command for Dummies {
fn name(&self) -> &str {
"dfr to-dummies"
}
fn usage(&self) -> &str {
"Creates a new dataframe with dummy variables"
}
fn signature(&self) -> Signature {
Signature::build(self.name()).category(Category::Custom("dataframe".into()))
}
fn examples(&self) -> Vec<Example> {
vec![
Example {
description: "Create new dataframe with dummy variables from a dataframe",
example: "[[a b]; [1 2] [3 4]] | dfr to-df | dfr to-dummies",
result: Some(
NuDataFrame::try_from_columns(vec![
Column::new(
"a_1".to_string(),
vec![Value::test_int(1), Value::test_int(0)],
),
Column::new(
"a_3".to_string(),
vec![Value::test_int(0), Value::test_int(1)],
),
Column::new(
"b_2".to_string(),
vec![Value::test_int(1), Value::test_int(0)],
),
Column::new(
"b_4".to_string(),
vec![Value::test_int(0), Value::test_int(1)],
),
])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
},
Example {
description: "Create new dataframe with dummy variables from a series",
example: "[1 2 2 3 3] | dfr to-df | dfr to-dummies",
result: Some(
NuDataFrame::try_from_columns(vec![
Column::new(
"0_1".to_string(),
vec![
Value::test_int(1),
Value::test_int(0),
Value::test_int(0),
Value::test_int(0),
Value::test_int(0),
],
),
Column::new(
"0_2".to_string(),
vec![
Value::test_int(0),
Value::test_int(1),
Value::test_int(1),
Value::test_int(0),
Value::test_int(0),
],
),
Column::new(
"0_3".to_string(),
vec![
Value::test_int(0),
Value::test_int(0),
Value::test_int(0),
Value::test_int(1),
Value::test_int(1),
],
),
])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
},
]
}
fn run(
&self,
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
command(engine_state, stack, call, input)
}
}
fn command(
_engine_state: &EngineState,
_stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
let df = NuDataFrame::try_from_pipeline(input, call.head)?;
df.as_ref()
.to_dummies()
.map_err(|e| {
ShellError::SpannedLabeledErrorHelp(
"Error calculating dummies".into(),
e.to_string(),
call.head,
"The only allowed column types for dummies are String or Int".into(),
)
})
.map(|df| PipelineData::Value(NuDataFrame::dataframe_into_value(df, call.head), None))
}
#[cfg(test)]
mod test {
use super::super::super::test_dataframe::test_dataframe;
use super::*;
#[test]
fn test_examples() {
test_dataframe(vec![Box::new(Dummies {})])
}
}

View File

@ -0,0 +1,98 @@
use nu_engine::CallExt;
use nu_protocol::{
ast::Call,
engine::{Command, EngineState, Stack},
Category, Example, PipelineData, ShellError, Signature, Span, SyntaxShape, Value,
};
use super::super::values::{Column, NuDataFrame};
#[derive(Clone)]
pub struct FilterWith;
impl Command for FilterWith {
fn name(&self) -> &str {
"dfr filter-with"
}
fn usage(&self) -> &str {
"Filters dataframe using a mask as reference"
}
fn signature(&self) -> Signature {
Signature::build(self.name())
.required("mask", SyntaxShape::Any, "boolean mask used to filter data")
.category(Category::Custom("dataframe".into()))
}
fn examples(&self) -> Vec<Example> {
vec![Example {
description: "Filter dataframe using a bool mask",
example: r#"let mask = ([$true $false] | dfr to-df);
[[a b]; [1 2] [3 4]] | dfr to-df | dfr filter-with $mask"#,
result: Some(
NuDataFrame::try_from_columns(vec![
Column::new("a".to_string(), vec![Value::test_int(1)]),
Column::new("b".to_string(), vec![Value::test_int(2)]),
])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
}]
}
fn run(
&self,
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
command(engine_state, stack, call, input)
}
}
fn command(
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
let mask_value: Value = call.req(engine_state, stack, 0)?;
let mask_span = mask_value.span()?;
let mask = NuDataFrame::try_from_value(mask_value)?.as_series(mask_span)?;
let mask = mask.bool().map_err(|e| {
ShellError::SpannedLabeledErrorHelp(
"Error casting to bool".into(),
e.to_string(),
mask_span,
"Perhaps you want to use a series with booleans as mask".into(),
)
})?;
let df = NuDataFrame::try_from_pipeline(input, call.head)?;
df.as_ref()
.filter(mask)
.map_err(|e| {
ShellError::SpannedLabeledErrorHelp(
"Error calculating dummies".into(),
e.to_string(),
call.head,
"The only allowed column types for dummies are String or Int".into(),
)
})
.map(|df| PipelineData::Value(NuDataFrame::dataframe_into_value(df, call.head), None))
}
#[cfg(test)]
mod test {
use super::super::super::test_dataframe::test_dataframe;
use super::*;
#[test]
fn test_examples() {
test_dataframe(vec![Box::new(FilterWith {})])
}
}

View File

@ -0,0 +1,80 @@
use nu_engine::CallExt;
use nu_protocol::{
ast::Call,
engine::{Command, EngineState, Stack},
Category, Example, PipelineData, ShellError, Signature, Span, SyntaxShape, Value,
};
use super::super::values::{utils::DEFAULT_ROWS, Column, NuDataFrame};
#[derive(Clone)]
pub struct FirstDF;
impl Command for FirstDF {
fn name(&self) -> &str {
"dfr first"
}
fn usage(&self) -> &str {
"Creates new dataframe with first rows"
}
fn signature(&self) -> Signature {
Signature::build(self.name())
.optional("rows", SyntaxShape::Int, "Number of rows for head")
.category(Category::Custom("dataframe".into()))
}
fn examples(&self) -> Vec<Example> {
vec![Example {
description: "Create new dataframe with head rows",
example: "[[a b]; [1 2] [3 4]] | dfr to-df | dfr first 1",
result: Some(
NuDataFrame::try_from_columns(vec![
Column::new("a".to_string(), vec![Value::test_int(1)]),
Column::new("b".to_string(), vec![Value::test_int(2)]),
])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
}]
}
fn run(
&self,
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
command(engine_state, stack, call, input)
}
}
fn command(
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
let rows: Option<usize> = call.opt(engine_state, stack, 0)?;
let rows = rows.unwrap_or(DEFAULT_ROWS);
let df = NuDataFrame::try_from_pipeline(input, call.head)?;
let res = df.as_ref().head(Some(rows));
Ok(PipelineData::Value(
NuDataFrame::dataframe_into_value(res, call.head),
None,
))
}
#[cfg(test)]
mod test {
use super::super::super::test_dataframe::test_dataframe;
use super::*;
#[test]
fn test_examples() {
test_dataframe(vec![Box::new(FirstDF {})])
}
}

View File

@ -0,0 +1,88 @@
use nu_engine::CallExt;
use nu_protocol::{
ast::Call,
engine::{Command, EngineState, Stack},
Category, Example, PipelineData, ShellError, Signature, Span, SyntaxShape, Value,
};
use crate::dataframe::values::utils::convert_columns_string;
use super::super::values::{Column, NuDataFrame};
#[derive(Clone)]
pub struct GetDF;
impl Command for GetDF {
fn name(&self) -> &str {
"dfr get"
}
fn usage(&self) -> &str {
"Creates dataframe with the selected columns"
}
fn signature(&self) -> Signature {
Signature::build(self.name())
.rest("rest", SyntaxShape::Any, "column names to sort dataframe")
.category(Category::Custom("dataframe".into()))
}
fn examples(&self) -> Vec<Example> {
vec![Example {
description: "Creates dataframe with selected columns",
example: "[[a b]; [1 2] [3 4]] | dfr to-df | dfr get a",
result: Some(
NuDataFrame::try_from_columns(vec![Column::new(
"a".to_string(),
vec![Value::test_int(1), Value::test_int(3)],
)])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
}]
}
fn run(
&self,
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
command(engine_state, stack, call, input)
}
}
fn command(
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
let columns: Vec<Value> = call.rest(engine_state, stack, 0)?;
let (col_string, col_span) = convert_columns_string(columns, call.head)?;
let df = NuDataFrame::try_from_pipeline(input, call.head)?;
df.as_ref()
.select(&col_string)
.map_err(|e| {
ShellError::SpannedLabeledError(
"Error selecting columns".into(),
e.to_string(),
col_span,
)
})
.map(|df| PipelineData::Value(NuDataFrame::dataframe_into_value(df, call.head), None))
}
#[cfg(test)]
mod test {
use super::super::super::test_dataframe::test_dataframe;
use super::*;
#[test]
fn test_examples() {
test_dataframe(vec![Box::new(GetDF {})])
}
}

View File

@ -0,0 +1,71 @@
use nu_engine::CallExt;
use nu_protocol::{
ast::Call,
engine::{Command, EngineState, Stack},
Category, Example, PipelineData, ShellError, Signature, SyntaxShape, Value,
};
use super::super::values::{utils::convert_columns_string, NuDataFrame, NuGroupBy};
#[derive(Clone)]
pub struct CreateGroupBy;
impl Command for CreateGroupBy {
fn name(&self) -> &str {
"dfr group-by"
}
fn usage(&self) -> &str {
"Creates a groupby object that can be used for other aggregations"
}
fn signature(&self) -> Signature {
Signature::build(self.name())
.rest("rest", SyntaxShape::Any, "groupby columns")
.category(Category::Custom("dataframe".into()))
}
fn examples(&self) -> Vec<Example> {
vec![Example {
description: "Grouping by column a",
example: "[[a b]; [one 1] [one 2]] | dfr to-df | dfr group-by a",
result: None,
}]
}
fn run(
&self,
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
command(engine_state, stack, call, input)
}
}
fn command(
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
// Extracting the names of the columns to perform the groupby
let columns: Vec<Value> = call.rest(engine_state, stack, 0)?;
let (col_string, col_span) = convert_columns_string(columns, call.head)?;
let df = NuDataFrame::try_from_pipeline(input, call.head)?;
// This is the expensive part of the groupby; to create the
// groups that will be used for grouping the data in the
// dataframe. Once it has been done these values can be stored
// in a NuGroupBy
let groupby = df.as_ref().groupby(&col_string).map_err(|e| {
ShellError::SpannedLabeledError("Error creating groupby".into(), e.to_string(), col_span)
})?;
let groups = groupby.get_groups().to_vec();
let groupby = NuGroupBy::new(df.as_ref().clone(), col_string, groups);
Ok(PipelineData::Value(groupby.into_value(call.head), None))
}

View File

@ -0,0 +1,226 @@
use nu_engine::CallExt;
use nu_protocol::{
ast::Call,
engine::{Command, EngineState, Stack},
Category, Example, PipelineData, ShellError, Signature, Span, Spanned, SyntaxShape, Value,
};
use polars::prelude::JoinType;
use crate::dataframe::values::utils::convert_columns_string;
use super::super::values::{Column, NuDataFrame};
#[derive(Clone)]
pub struct JoinDF;
impl Command for JoinDF {
fn name(&self) -> &str {
"dfr join"
}
fn usage(&self) -> &str {
"Joins a dataframe using columns as reference"
}
fn signature(&self) -> Signature {
Signature::build(self.name())
.required("dataframe", SyntaxShape::Any, "right dataframe to join")
.required_named(
"left",
SyntaxShape::Table,
"left column names to perform join",
Some('l'),
)
.required_named(
"right",
SyntaxShape::Table,
"right column names to perform join",
Some('r'),
)
.named(
"type",
SyntaxShape::String,
"type of join. Inner by default",
Some('t'),
)
.named(
"suffix",
SyntaxShape::String,
"suffix for the columns of the right dataframe",
Some('s'),
)
.category(Category::Custom("dataframe".into()))
}
fn examples(&self) -> Vec<Example> {
vec![Example {
description: "inner join dataframe",
example: r#"let right = ([[a b c]; [1 2 5] [3 4 5] [5 6 6]] | dfr to-df);
$right | dfr join $right -l [a b] -r [a b]"#,
result: Some(
NuDataFrame::try_from_columns(vec![
Column::new(
"a".to_string(),
vec![Value::test_int(1), Value::test_int(3), Value::test_int(5)],
),
Column::new(
"b".to_string(),
vec![Value::test_int(2), Value::test_int(4), Value::test_int(6)],
),
Column::new(
"c".to_string(),
vec![Value::test_int(5), Value::test_int(5), Value::test_int(6)],
),
Column::new(
"c_right".to_string(),
vec![Value::test_int(5), Value::test_int(5), Value::test_int(6)],
),
])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
}]
}
fn run(
&self,
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
command(engine_state, stack, call, input)
}
}
fn command(
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
let r_df: Value = call.req(engine_state, stack, 0)?;
let l_col: Vec<Value> = call
.get_flag(engine_state, stack, "left")?
.expect("required value in syntax");
let r_col: Vec<Value> = call
.get_flag(engine_state, stack, "right")?
.expect("required value in syntax");
let suffix: Option<String> = call.get_flag(engine_state, stack, "suffix")?;
let join_type_op: Option<Spanned<String>> = call.get_flag(engine_state, stack, "type")?;
let join_type = match join_type_op {
None => JoinType::Inner,
Some(val) => match val.item.as_ref() {
"inner" => JoinType::Inner,
"outer" => JoinType::Outer,
"left" => JoinType::Left,
_ => {
return Err(ShellError::SpannedLabeledErrorHelp(
"Incorrect join type".into(),
"Invalid join type".into(),
val.span,
"Options: inner, outer or left".into(),
))
}
},
};
let (l_col_string, l_col_span) = convert_columns_string(l_col, call.head)?;
let (r_col_string, r_col_span) = convert_columns_string(r_col, call.head)?;
let df = NuDataFrame::try_from_pipeline(input, call.head)?;
let r_df = NuDataFrame::try_from_value(r_df)?;
check_column_datatypes(
df.as_ref(),
r_df.as_ref(),
&l_col_string,
l_col_span,
&r_col_string,
r_col_span,
)?;
df.as_ref()
.join(
r_df.as_ref(),
&l_col_string,
&r_col_string,
join_type,
suffix,
)
.map_err(|e| {
ShellError::SpannedLabeledError(
"Error joining dataframes".into(),
e.to_string(),
l_col_span,
)
})
.map(|df| PipelineData::Value(NuDataFrame::dataframe_into_value(df, call.head), None))
}
fn check_column_datatypes<T: AsRef<str>>(
df_l: &polars::prelude::DataFrame,
df_r: &polars::prelude::DataFrame,
l_cols: &[T],
l_col_span: Span,
r_cols: &[T],
r_col_span: Span,
) -> Result<(), ShellError> {
if l_cols.len() != r_cols.len() {
return Err(ShellError::SpannedLabeledErrorHelp(
"Mismatched number of column names".into(),
format!(
"found {} left names vs {} right names",
l_cols.len(),
r_cols.len()
),
l_col_span,
"perhaps you need to change the number of columns to join".into(),
));
}
for (l, r) in l_cols.iter().zip(r_cols) {
let l_series = df_l.column(l.as_ref()).map_err(|e| {
ShellError::SpannedLabeledError(
"Error selecting the columns".into(),
e.to_string(),
l_col_span,
)
})?;
let r_series = df_r.column(r.as_ref()).map_err(|e| {
ShellError::SpannedLabeledError(
"Error selecting the columns".into(),
e.to_string(),
r_col_span,
)
})?;
if l_series.dtype() != r_series.dtype() {
return Err(ShellError::SpannedLabeledErrorHelp(
"Mismatched datatypes".into(),
format!(
"left column type '{}' doesn't match '{}' right column match",
l_series.dtype(),
r_series.dtype()
),
l_col_span,
"perhaps you need to select other column to match".into(),
));
}
}
Ok(())
}
#[cfg(test)]
mod test {
use super::super::super::test_dataframe::test_dataframe;
use super::*;
#[test]
fn test_examples() {
test_dataframe(vec![Box::new(JoinDF {})])
}
}

View File

@ -0,0 +1,80 @@
use nu_engine::CallExt;
use nu_protocol::{
ast::Call,
engine::{Command, EngineState, Stack},
Category, Example, PipelineData, ShellError, Signature, Span, SyntaxShape, Value,
};
use super::super::values::{utils::DEFAULT_ROWS, Column, NuDataFrame};
#[derive(Clone)]
pub struct LastDF;
impl Command for LastDF {
fn name(&self) -> &str {
"dfr last"
}
fn usage(&self) -> &str {
"Creates new dataframe with tail rows"
}
fn signature(&self) -> Signature {
Signature::build(self.name())
.optional("rows", SyntaxShape::Int, "Number of rows for tail")
.category(Category::Custom("dataframe".into()))
}
fn examples(&self) -> Vec<Example> {
vec![Example {
description: "Create new dataframe with last rows",
example: "[[a b]; [1 2] [3 4]] | dfr to-df | dfr last 1",
result: Some(
NuDataFrame::try_from_columns(vec![
Column::new("a".to_string(), vec![Value::test_int(3)]),
Column::new("b".to_string(), vec![Value::test_int(4)]),
])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
}]
}
fn run(
&self,
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
command(engine_state, stack, call, input)
}
}
fn command(
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
let rows: Option<usize> = call.opt(engine_state, stack, 0)?;
let rows = rows.unwrap_or(DEFAULT_ROWS);
let df = NuDataFrame::try_from_pipeline(input, call.head)?;
let res = df.as_ref().tail(Some(rows));
Ok(PipelineData::Value(
NuDataFrame::dataframe_into_value(res, call.head),
None,
))
}
#[cfg(test)]
mod test {
use super::super::super::test_dataframe::test_dataframe;
use super::*;
#[test]
fn test_examples() {
test_dataframe(vec![Box::new(LastDF {})])
}
}

View File

@ -0,0 +1,243 @@
use nu_engine::CallExt;
use nu_protocol::{
ast::Call,
engine::{Command, EngineState, Stack},
Category, Example, PipelineData, ShellError, Signature, Span, Spanned, SyntaxShape, Value,
};
use crate::dataframe::values::utils::convert_columns_string;
use super::super::values::{Column, NuDataFrame};
#[derive(Clone)]
pub struct MeltDF;
impl Command for MeltDF {
fn name(&self) -> &str {
"dfr melt"
}
fn usage(&self) -> &str {
"Unpivot a DataFrame from wide to long format"
}
fn signature(&self) -> Signature {
Signature::build(self.name())
.required_named(
"columns",
SyntaxShape::Table,
"column names for melting",
Some('c'),
)
.required_named(
"values",
SyntaxShape::Table,
"column names used as value columns",
Some('v'),
)
.named(
"variable-name",
SyntaxShape::String,
"optional name for variable column",
Some('r'),
)
.named(
"value-name",
SyntaxShape::String,
"optional name for value column",
Some('l'),
)
.category(Category::Custom("dataframe".into()))
}
fn examples(&self) -> Vec<Example> {
vec![Example {
description: "melt dataframe",
example:
"[[a b c d]; [x 1 4 a] [y 2 5 b] [z 3 6 c]] | dfr to-df | dfr melt -c [b c] -v [a d]",
result: Some(
NuDataFrame::try_from_columns(vec![
Column::new(
"b".to_string(),
vec![
Value::test_int(1),
Value::test_int(2),
Value::test_int(3),
Value::test_int(1),
Value::test_int(2),
Value::test_int(3),
],
),
Column::new(
"c".to_string(),
vec![
Value::test_int(4),
Value::test_int(5),
Value::test_int(6),
Value::test_int(4),
Value::test_int(5),
Value::test_int(6),
],
),
Column::new(
"variable".to_string(),
vec![
Value::test_string("a"),
Value::test_string("a"),
Value::test_string("a"),
Value::test_string("d"),
Value::test_string("d"),
Value::test_string("d"),
],
),
Column::new(
"value".to_string(),
vec![
Value::test_string("x"),
Value::test_string("y"),
Value::test_string("z"),
Value::test_string("a"),
Value::test_string("b"),
Value::test_string("c"),
],
),
])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
}]
}
fn run(
&self,
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
command(engine_state, stack, call, input)
}
}
fn command(
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
let id_col: Vec<Value> = call
.get_flag(engine_state, stack, "columns")?
.expect("required value");
let val_col: Vec<Value> = call
.get_flag(engine_state, stack, "values")?
.expect("required value");
let value_name: Option<Spanned<String>> = call.get_flag(engine_state, stack, "value-name")?;
let variable_name: Option<Spanned<String>> =
call.get_flag(engine_state, stack, "variable-name")?;
let (id_col_string, id_col_span) = convert_columns_string(id_col, call.head)?;
let (val_col_string, val_col_span) = convert_columns_string(val_col, call.head)?;
let df = NuDataFrame::try_from_pipeline(input, call.head)?;
check_column_datatypes(df.as_ref(), &id_col_string, id_col_span)?;
check_column_datatypes(df.as_ref(), &val_col_string, val_col_span)?;
let mut res = df
.as_ref()
.melt(&id_col_string, &val_col_string)
.map_err(|e| {
ShellError::SpannedLabeledError(
"Error calculating melt".into(),
e.to_string(),
call.head,
)
})?;
if let Some(name) = &variable_name {
res.rename("variable", &name.item).map_err(|e| {
ShellError::SpannedLabeledError(
"Error renaming column".into(),
e.to_string(),
name.span,
)
})?;
}
if let Some(name) = &value_name {
res.rename("value", &name.item).map_err(|e| {
ShellError::SpannedLabeledError(
"Error renaming column".into(),
e.to_string(),
name.span,
)
})?;
}
Ok(PipelineData::Value(
NuDataFrame::dataframe_into_value(res, call.head),
None,
))
}
fn check_column_datatypes<T: AsRef<str>>(
df: &polars::prelude::DataFrame,
cols: &[T],
col_span: Span,
) -> Result<(), ShellError> {
if cols.is_empty() {
return Err(ShellError::SpannedLabeledError(
"Merge error".into(),
"empty column list".into(),
col_span,
));
}
// Checking if they are same type
if cols.len() > 1 {
for w in cols.windows(2) {
let l_series = df.column(w[0].as_ref()).map_err(|e| {
ShellError::SpannedLabeledError(
"Error selecting columns".into(),
e.to_string(),
col_span,
)
})?;
let r_series = df.column(w[1].as_ref()).map_err(|e| {
ShellError::SpannedLabeledError(
"Error selecting columns".into(),
e.to_string(),
col_span,
)
})?;
if l_series.dtype() != r_series.dtype() {
return Err(ShellError::SpannedLabeledErrorHelp(
"Merge error".into(),
"found different column types in list".into(),
col_span,
format!(
"datatypes {} and {} are incompatible",
l_series.dtype(),
r_series.dtype()
),
));
}
}
}
Ok(())
}
#[cfg(test)]
mod test {
use super::super::super::test_dataframe::test_dataframe;
use super::*;
#[test]
fn test_examples() {
test_dataframe(vec![Box::new(MeltDF {})])
}
}

View File

@ -0,0 +1,105 @@
mod aggregate;
mod append;
mod column;
mod command;
mod describe;
mod drop;
mod drop_nulls;
mod dtypes;
mod dummies;
mod filter_with;
mod first;
mod get;
mod groupby;
mod join;
mod last;
mod melt;
mod open;
mod pivot;
mod rename;
mod sample;
mod shape;
mod slice;
mod sort;
mod take;
mod to_csv;
mod to_df;
mod to_nu;
mod to_parquet;
mod with_column;
use nu_protocol::engine::StateWorkingSet;
pub use aggregate::Aggregate;
pub use append::AppendDF;
pub use column::ColumnDF;
pub use command::Dataframe;
pub use describe::DescribeDF;
pub use drop::DropDF;
pub use drop_nulls::DropNulls;
pub use dtypes::DataTypes;
pub use dummies::Dummies;
pub use filter_with::FilterWith;
pub use first::FirstDF;
pub use get::GetDF;
pub use groupby::CreateGroupBy;
pub use join::JoinDF;
pub use last::LastDF;
pub use melt::MeltDF;
pub use open::OpenDataFrame;
pub use pivot::PivotDF;
pub use rename::RenameDF;
pub use sample::SampleDF;
pub use shape::ShapeDF;
pub use slice::SliceDF;
pub use sort::SortDF;
pub use take::TakeDF;
pub use to_csv::ToCSV;
pub use to_df::ToDataFrame;
pub use to_nu::ToNu;
pub use to_parquet::ToParquet;
pub use with_column::WithColumn;
pub fn add_eager_decls(working_set: &mut StateWorkingSet) {
macro_rules! bind_command {
( $command:expr ) => {
working_set.add_decl(Box::new($command));
};
( $( $command:expr ),* ) => {
$( working_set.add_decl(Box::new($command)); )*
};
}
// Dataframe commands
bind_command!(
Aggregate,
AppendDF,
ColumnDF,
CreateGroupBy,
Dataframe,
DataTypes,
DescribeDF,
DropDF,
DropNulls,
Dummies,
FilterWith,
FirstDF,
GetDF,
JoinDF,
LastDF,
MeltDF,
OpenDataFrame,
PivotDF,
RenameDF,
SampleDF,
ShapeDF,
SliceDF,
SortDF,
TakeDF,
ToCSV,
ToDataFrame,
ToNu,
ToParquet,
WithColumn
);
}

View File

@ -0,0 +1,218 @@
use super::super::values::NuDataFrame;
use nu_engine::CallExt;
use nu_protocol::{
ast::Call,
engine::{Command, EngineState, Stack},
Category, Example, PipelineData, ShellError, Signature, Spanned, SyntaxShape,
};
use std::{fs::File, path::PathBuf};
use polars::prelude::{CsvEncoding, CsvReader, JsonReader, ParquetReader, SerReader};
#[derive(Clone)]
pub struct OpenDataFrame;
impl Command for OpenDataFrame {
fn name(&self) -> &str {
"dfr open"
}
fn usage(&self) -> &str {
"Opens csv, json or parquet file to create dataframe"
}
fn signature(&self) -> Signature {
Signature::build(self.name())
.required(
"file",
SyntaxShape::Filepath,
"file path to load values from",
)
.named(
"delimiter",
SyntaxShape::String,
"file delimiter character. CSV file",
Some('d'),
)
.switch(
"no-header",
"Indicates if file doesn't have header. CSV file",
None,
)
.named(
"infer-schema",
SyntaxShape::Number,
"Number of rows to infer the schema of the file. CSV file",
None,
)
.named(
"skip-rows",
SyntaxShape::Number,
"Number of rows to skip from file. CSV file",
None,
)
.named(
"columns",
SyntaxShape::List(Box::new(SyntaxShape::String)),
"Columns to be selected from csv file. CSV and Parquet file",
None,
)
.category(Category::Custom("dataframe".into()))
}
fn examples(&self) -> Vec<Example> {
vec![Example {
description: "Takes a file name and creates a dataframe",
example: "dfr open test.csv",
result: None,
}]
}
fn run(
&self,
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
_input: PipelineData,
) -> Result<PipelineData, ShellError> {
command(engine_state, stack, call)
}
}
fn command(
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
) -> Result<PipelineData, ShellError> {
let span = call.head;
let file: Spanned<PathBuf> = call.req(engine_state, stack, 0)?;
match file.item.extension() {
Some(e) => match e.to_str() {
Some("csv") => from_csv(engine_state, stack, call),
Some("parquet") => from_parquet(engine_state, stack, call),
Some("json") => from_json(engine_state, stack, call),
_ => Err(ShellError::FileNotFoundCustom(
"Not a csv, parquet or json file".into(),
file.span,
)),
},
None => Err(ShellError::FileNotFoundCustom(
"File without extension".into(),
file.span,
)),
}
.map(|df| PipelineData::Value(NuDataFrame::dataframe_into_value(df, span), None))
}
fn from_parquet(
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
) -> Result<polars::prelude::DataFrame, ShellError> {
let file: Spanned<PathBuf> = call.req(engine_state, stack, 0)?;
let columns: Option<Vec<String>> = call.get_flag(engine_state, stack, "columns")?;
let r = File::open(&file.item).map_err(|e| {
ShellError::SpannedLabeledError("Error opening file".into(), e.to_string(), file.span)
})?;
let reader = ParquetReader::new(r);
let reader = match columns {
None => reader,
Some(columns) => reader.with_columns(Some(columns)),
};
reader.finish().map_err(|e| {
ShellError::SpannedLabeledError(
"Parquet reader error".into(),
format!("{:?}", e),
call.head,
)
})
}
fn from_json(
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
) -> Result<polars::prelude::DataFrame, ShellError> {
let file: Spanned<PathBuf> = call.req(engine_state, stack, 0)?;
let r = File::open(&file.item).map_err(|e| {
ShellError::SpannedLabeledError("Error opening file".into(), e.to_string(), file.span)
})?;
let reader = JsonReader::new(r);
reader.finish().map_err(|e| {
ShellError::SpannedLabeledError("Json reader error".into(), format!("{:?}", e), call.head)
})
}
fn from_csv(
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
) -> Result<polars::prelude::DataFrame, ShellError> {
let file: Spanned<PathBuf> = call.req(engine_state, stack, 0)?;
let delimiter: Option<Spanned<String>> = call.get_flag(engine_state, stack, "delimiter")?;
let no_header: bool = call.has_flag("no_header");
let infer_schema: Option<usize> = call.get_flag(engine_state, stack, "infer_schema")?;
let skip_rows: Option<usize> = call.get_flag(engine_state, stack, "skip_rows")?;
let columns: Option<Vec<String>> = call.get_flag(engine_state, stack, "columns")?;
let csv_reader = CsvReader::from_path(&file.item)
.map_err(|e| {
ShellError::SpannedLabeledError(
"Error creating CSV reader".into(),
e.to_string(),
file.span,
)
})?
.with_encoding(CsvEncoding::LossyUtf8);
let csv_reader = match delimiter {
None => csv_reader,
Some(d) => {
if d.item.len() != 1 {
return Err(ShellError::SpannedLabeledError(
"Incorrect delimiter".into(),
"Delimiter has to be one character".into(),
d.span,
));
} else {
let delimiter = match d.item.chars().next() {
Some(d) => d as u8,
None => unreachable!(),
};
csv_reader.with_delimiter(delimiter)
}
}
};
let csv_reader = csv_reader.has_header(!no_header);
let csv_reader = match infer_schema {
None => csv_reader,
Some(r) => csv_reader.infer_schema(Some(r)),
};
let csv_reader = match skip_rows {
None => csv_reader,
Some(r) => csv_reader.with_skip_rows(r),
};
let csv_reader = match columns {
None => csv_reader,
Some(columns) => csv_reader.with_columns(Some(columns)),
};
csv_reader.finish().map_err(|e| {
ShellError::SpannedLabeledError(
"Parquet reader error".into(),
format!("{:?}", e),
call.head,
)
})
}

View File

@ -0,0 +1,175 @@
use nu_engine::CallExt;
use nu_protocol::{
ast::Call,
engine::{Command, EngineState, Stack},
Category, Example, PipelineData, ShellError, Signature, Spanned, SyntaxShape,
};
use polars::prelude::DataType;
use crate::dataframe::values::NuGroupBy;
use super::super::values::NuDataFrame;
enum Operation {
First,
Sum,
Min,
Max,
Mean,
Median,
}
impl Operation {
fn from_tagged(name: Spanned<String>) -> Result<Operation, ShellError> {
match name.item.as_ref() {
"first" => Ok(Operation::First),
"sum" => Ok(Operation::Sum),
"min" => Ok(Operation::Min),
"max" => Ok(Operation::Max),
"mean" => Ok(Operation::Mean),
"median" => Ok(Operation::Median),
_ => Err(ShellError::SpannedLabeledErrorHelp(
"Operation not fount".into(),
"Operation does not exist for pivot".into(),
name.span,
"Options: first, sum, min, max, mean, median".into(),
)),
}
}
}
#[derive(Clone)]
pub struct PivotDF;
impl Command for PivotDF {
fn name(&self) -> &str {
"dfr pivot"
}
fn usage(&self) -> &str {
"Performs a pivot operation on a groupby object"
}
fn signature(&self) -> Signature {
Signature::build(self.name())
.required(
"pivot-column",
SyntaxShape::String,
"pivot column to perform pivot",
)
.required(
"value-column",
SyntaxShape::String,
"value column to perform pivot",
)
.required("operation", SyntaxShape::String, "aggregate operation")
.category(Category::Custom("dataframe".into()))
}
fn examples(&self) -> Vec<Example> {
vec![Example {
description: "Pivot a dataframe on b and aggregation on col c",
example:
"[[a b c]; [one x 1] [two y 2]] | dfr to-df | dfr group-by a | dfr pivot b c sum",
result: None,
}]
}
fn run(
&self,
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
command(engine_state, stack, call, input)
}
}
fn command(
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
let pivot_col: Spanned<String> = call.req(engine_state, stack, 0)?;
let value_col: Spanned<String> = call.req(engine_state, stack, 1)?;
let operation: Spanned<String> = call.req(engine_state, stack, 2)?;
let op = Operation::from_tagged(operation)?;
let nu_groupby = NuGroupBy::try_from_pipeline(input, call.head)?;
let df_ref = nu_groupby.as_ref();
check_pivot_column(df_ref, &pivot_col)?;
check_value_column(df_ref, &value_col)?;
let mut groupby = nu_groupby.to_groupby()?;
let pivot = groupby.pivot(&pivot_col.item, &value_col.item);
match op {
Operation::Mean => pivot.mean(),
Operation::Sum => pivot.sum(),
Operation::Min => pivot.min(),
Operation::Max => pivot.max(),
Operation::First => pivot.first(),
Operation::Median => pivot.median(),
}
.map_err(|e| {
ShellError::SpannedLabeledError("Error creating pivot".into(), e.to_string(), call.head)
})
.map(|df| PipelineData::Value(NuDataFrame::dataframe_into_value(df, call.head), None))
}
fn check_pivot_column(
df: &polars::prelude::DataFrame,
col: &Spanned<String>,
) -> Result<(), ShellError> {
let series = df.column(&col.item).map_err(|e| {
ShellError::SpannedLabeledError("Column not found".into(), e.to_string(), col.span)
})?;
match series.dtype() {
DataType::UInt8
| DataType::UInt16
| DataType::UInt32
| DataType::UInt64
| DataType::Int8
| DataType::Int16
| DataType::Int32
| DataType::Int64
| DataType::Utf8 => Ok(()),
_ => Err(ShellError::SpannedLabeledError(
"Pivot error".into(),
format!("Unsupported datatype {}", series.dtype()),
col.span,
)),
}
}
fn check_value_column(
df: &polars::prelude::DataFrame,
col: &Spanned<String>,
) -> Result<(), ShellError> {
let series = df.column(&col.item).map_err(|e| {
ShellError::SpannedLabeledError("Column not found".into(), e.to_string(), col.span)
})?;
match series.dtype() {
DataType::UInt8
| DataType::UInt16
| DataType::UInt32
| DataType::UInt64
| DataType::Int8
| DataType::Int16
| DataType::Int32
| DataType::Int64
| DataType::Float32
| DataType::Float64 => Ok(()),
_ => Err(ShellError::SpannedLabeledError(
"Pivot error".into(),
format!("Unsupported datatype {}", series.dtype()),
col.span,
)),
}
}

View File

@ -0,0 +1,94 @@
use nu_engine::CallExt;
use nu_protocol::{
ast::Call,
engine::{Command, EngineState, Stack},
Category, Example, PipelineData, ShellError, Signature, Span, SyntaxShape, Value,
};
use super::super::values::{Column, NuDataFrame};
#[derive(Clone)]
pub struct RenameDF;
impl Command for RenameDF {
fn name(&self) -> &str {
"dfr rename-col"
}
fn usage(&self) -> &str {
"rename a dataframe column"
}
fn signature(&self) -> Signature {
Signature::build(self.name())
.required("from", SyntaxShape::String, "column name to be renamed")
.required("to", SyntaxShape::String, "new column name")
.category(Category::Custom("dataframe".into()))
}
fn examples(&self) -> Vec<Example> {
vec![Example {
description: "Renames a dataframe column",
example: "[[a b]; [1 2] [3 4]] | dfr to-df | dfr rename-col a a_new",
result: Some(
NuDataFrame::try_from_columns(vec![
Column::new(
"a_new".to_string(),
vec![Value::test_int(1), Value::test_int(3)],
),
Column::new(
"b".to_string(),
vec![Value::test_int(2), Value::test_int(4)],
),
])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
}]
}
fn run(
&self,
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
command(engine_state, stack, call, input)
}
}
fn command(
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
let from: String = call.req(engine_state, stack, 0)?;
let to: String = call.req(engine_state, stack, 1)?;
let mut df = NuDataFrame::try_from_pipeline(input, call.head)?;
df.as_mut()
.rename(&from, &to)
.map_err(|e| {
ShellError::SpannedLabeledError("Error renaming".into(), e.to_string(), call.head)
})
.map(|df| {
PipelineData::Value(
NuDataFrame::dataframe_into_value(df.clone(), call.head),
None,
)
})
}
#[cfg(test)]
mod test {
use super::super::super::test_dataframe::test_dataframe;
use super::*;
#[test]
fn test_examples() {
test_dataframe(vec![Box::new(RenameDF {})])
}
}

View File

@ -0,0 +1,106 @@
use nu_engine::CallExt;
use nu_protocol::{
ast::Call,
engine::{Command, EngineState, Stack},
Category, Example, PipelineData, ShellError, Signature, Spanned, SyntaxShape,
};
use super::super::values::NuDataFrame;
#[derive(Clone)]
pub struct SampleDF;
impl Command for SampleDF {
fn name(&self) -> &str {
"dfr sample"
}
fn usage(&self) -> &str {
"Create sample dataframe"
}
fn signature(&self) -> Signature {
Signature::build(self.name())
.named(
"n-rows",
SyntaxShape::Int,
"number of rows to be taken from dataframe",
Some('n'),
)
.named(
"fraction",
SyntaxShape::Number,
"fraction of dataframe to be taken",
Some('f'),
)
.switch("replace", "sample with replace", Some('e'))
.category(Category::Custom("dataframe".into()))
}
fn examples(&self) -> Vec<Example> {
vec![
Example {
description: "Sample rows from dataframe",
example: "[[a b]; [1 2] [3 4]] | dfr to-df | dfr sample -n 1",
result: None, // No expected value because sampling is random
},
Example {
description: "Shows sample row using fraction and replace",
example: "[[a b]; [1 2] [3 4] [5 6]] | dfr to-df | dfr sample -f 0.5 -e",
result: None, // No expected value because sampling is random
},
]
}
fn run(
&self,
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
command(engine_state, stack, call, input)
}
}
fn command(
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
let rows: Option<Spanned<usize>> = call.get_flag(engine_state, stack, "n-rows")?;
let fraction: Option<Spanned<f64>> = call.get_flag(engine_state, stack, "fraction")?;
let replace: bool = call.has_flag("replace");
let df = NuDataFrame::try_from_pipeline(input, call.head)?;
match (rows, fraction) {
(Some(rows), None) => df.as_ref().sample_n(rows.item, replace).map_err(|e| {
ShellError::SpannedLabeledError(
"Error creating sample".into(),
e.to_string(),
rows.span,
)
}),
(None, Some(frac)) => df.as_ref().sample_frac(frac.item, replace).map_err(|e| {
ShellError::SpannedLabeledError(
"Error creating sample".into(),
e.to_string(),
frac.span,
)
}),
(Some(_), Some(_)) => Err(ShellError::SpannedLabeledError(
"Incompatible flags".into(),
"Only one selection criterion allowed".into(),
call.head,
)),
(None, None) => Err(ShellError::SpannedLabeledErrorHelp(
"No selection".into(),
"No selection criterion was found".into(),
call.head,
"Perhaps you want to use the flag -n or -f".into(),
)),
}
.map(|df| PipelineData::Value(NuDataFrame::dataframe_into_value(df, call.head), None))
}

View File

@ -0,0 +1,87 @@
use nu_protocol::{
ast::Call,
engine::{Command, EngineState, Stack},
Category, Example, PipelineData, ShellError, Signature, Span, Value,
};
use crate::dataframe::values::Column;
use super::super::values::NuDataFrame;
#[derive(Clone)]
pub struct ShapeDF;
impl Command for ShapeDF {
fn name(&self) -> &str {
"dfr shape"
}
fn usage(&self) -> &str {
"Shows column and row size for a dataframe"
}
fn signature(&self) -> Signature {
Signature::build(self.name()).category(Category::Custom("dataframe".into()))
}
fn examples(&self) -> Vec<Example> {
vec![Example {
description: "Shows row and column shape",
example: "[[a b]; [1 2] [3 4]] | dfr to-df | dfr shape",
result: Some(
NuDataFrame::try_from_columns(vec![
Column::new("rows".to_string(), vec![Value::test_int(2)]),
Column::new("columns".to_string(), vec![Value::test_int(2)]),
])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
}]
}
fn run(
&self,
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
command(engine_state, stack, call, input)
}
}
fn command(
_engine_state: &EngineState,
_stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
let df = NuDataFrame::try_from_pipeline(input, call.head)?;
let rows = Value::Int {
val: df.as_ref().height() as i64,
span: call.head,
};
let cols = Value::Int {
val: df.as_ref().width() as i64,
span: call.head,
};
let rows_col = Column::new("rows".to_string(), vec![rows]);
let cols_col = Column::new("columns".to_string(), vec![cols]);
NuDataFrame::try_from_columns(vec![rows_col, cols_col])
.map(|df| PipelineData::Value(df.into_value(call.head), None))
}
#[cfg(test)]
mod test {
use super::super::super::test_dataframe::test_dataframe;
use super::*;
#[test]
fn test_examples() {
test_dataframe(vec![Box::new(ShapeDF {})])
}
}

View File

@ -0,0 +1,85 @@
use nu_engine::CallExt;
use nu_protocol::{
ast::Call,
engine::{Command, EngineState, Stack},
Category, Example, PipelineData, ShellError, Signature, Span, SyntaxShape, Value,
};
use crate::dataframe::values::Column;
use super::super::values::NuDataFrame;
#[derive(Clone)]
pub struct SliceDF;
impl Command for SliceDF {
fn name(&self) -> &str {
"dfr slice"
}
fn usage(&self) -> &str {
"Creates new dataframe from a slice of rows"
}
fn signature(&self) -> Signature {
Signature::build(self.name())
.required("offset", SyntaxShape::Int, "start of slice")
.required("size", SyntaxShape::Int, "size of slice")
.category(Category::Custom("dataframe".into()))
}
fn examples(&self) -> Vec<Example> {
vec![Example {
description: "Create new dataframe from a slice of the rows",
example: "[[a b]; [1 2] [3 4]] | dfr to-df | dfr slice 0 1",
result: Some(
NuDataFrame::try_from_columns(vec![
Column::new("a".to_string(), vec![Value::test_int(1)]),
Column::new("b".to_string(), vec![Value::test_int(2)]),
])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
}]
}
fn run(
&self,
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
command(engine_state, stack, call, input)
}
}
fn command(
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
let offset: i64 = call.req(engine_state, stack, 0)?;
let size: usize = call.req(engine_state, stack, 1)?;
let df = NuDataFrame::try_from_pipeline(input, call.head)?;
let res = df.as_ref().slice(offset, size);
Ok(PipelineData::Value(
NuDataFrame::dataframe_into_value(res, call.head),
None,
))
}
#[cfg(test)]
mod test {
use super::super::super::test_dataframe::test_dataframe;
use super::*;
#[test]
fn test_examples() {
test_dataframe(vec![Box::new(SliceDF {})])
}
}

View File

@ -0,0 +1,142 @@
use nu_engine::CallExt;
use nu_protocol::{
ast::Call,
engine::{Command, EngineState, Stack},
Category, Example, PipelineData, ShellError, Signature, Span, SyntaxShape, Value,
};
use crate::dataframe::values::{utils::convert_columns_string, Column};
use super::super::values::NuDataFrame;
#[derive(Clone)]
pub struct SortDF;
impl Command for SortDF {
fn name(&self) -> &str {
"dfr sort"
}
fn usage(&self) -> &str {
"Creates new sorted dataframe or series"
}
fn signature(&self) -> Signature {
Signature::build(self.name())
.switch("reverse", "invert sort", Some('r'))
.rest("rest", SyntaxShape::Any, "column names to sort dataframe")
.category(Category::Custom("dataframe".into()))
}
fn examples(&self) -> Vec<Example> {
vec![
Example {
description: "Create new sorted dataframe",
example: "[[a b]; [3 4] [1 2]] | dfr to-df | dfr sort a",
result: Some(
NuDataFrame::try_from_columns(vec![
Column::new(
"a".to_string(),
vec![Value::test_int(1), Value::test_int(3)],
),
Column::new(
"b".to_string(),
vec![Value::test_int(2), Value::test_int(4)],
),
])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
},
Example {
description: "Create new sorted series",
example: "[3 4 1 2] | dfr to-df | dfr sort",
result: Some(
NuDataFrame::try_from_columns(vec![Column::new(
"0".to_string(),
vec![
Value::test_int(1),
Value::test_int(2),
Value::test_int(3),
Value::test_int(4),
],
)])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
},
]
}
fn run(
&self,
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
command(engine_state, stack, call, input)
}
}
fn command(
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
let reverse = call.has_flag("reverse");
let df = NuDataFrame::try_from_pipeline(input, call.head)?;
if df.is_series() {
let columns = df.as_ref().get_column_names();
df.as_ref()
.sort(columns, reverse)
.map_err(|e| {
ShellError::SpannedLabeledError(
"Error sorting dataframe".into(),
e.to_string(),
call.head,
)
})
.map(|df| PipelineData::Value(NuDataFrame::dataframe_into_value(df, call.head), None))
} else {
let columns: Vec<Value> = call.rest(engine_state, stack, 0)?;
if !columns.is_empty() {
let (col_string, col_span) = convert_columns_string(columns, call.head)?;
df.as_ref()
.sort(&col_string, reverse)
.map_err(|e| {
ShellError::SpannedLabeledError(
"Error sorting dataframe".into(),
e.to_string(),
col_span,
)
})
.map(|df| {
PipelineData::Value(NuDataFrame::dataframe_into_value(df, call.head), None)
})
} else {
Err(ShellError::SpannedLabeledError(
"Missing columns".into(),
"missing column name to perform sort".into(),
call.head,
))
}
}
}
#[cfg(test)]
mod test {
use super::super::super::test_dataframe::test_dataframe;
use super::*;
#[test]
fn test_examples() {
test_dataframe(vec![Box::new(SortDF {})])
}
}

View File

@ -0,0 +1,144 @@
use nu_engine::CallExt;
use nu_protocol::{
ast::Call,
engine::{Command, EngineState, Stack},
Category, Example, PipelineData, ShellError, Signature, Span, SyntaxShape, Value,
};
use polars::prelude::DataType;
use crate::dataframe::values::Column;
use super::super::values::NuDataFrame;
#[derive(Clone)]
pub struct TakeDF;
impl Command for TakeDF {
fn name(&self) -> &str {
"dfr take"
}
fn usage(&self) -> &str {
"Creates new dataframe using the given indices"
}
fn signature(&self) -> Signature {
Signature::build(self.name())
.required(
"indices",
SyntaxShape::Any,
"list of indices used to take data",
)
.category(Category::Custom("dataframe".into()))
}
fn examples(&self) -> Vec<Example> {
vec![
Example {
description: "Takes selected rows from dataframe",
example: r#"let df = ([[a b]; [4 1] [5 2] [4 3]] | dfr to-df);
let indices = ([0 2] | dfr to-df);
$df | dfr take $indices"#,
result: Some(
NuDataFrame::try_from_columns(vec![
Column::new(
"a".to_string(),
vec![Value::test_int(4), Value::test_int(4)],
),
Column::new(
"b".to_string(),
vec![Value::test_int(1), Value::test_int(3)],
),
])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
},
Example {
description: "Takes selected rows from series",
example: r#"let series = ([4 1 5 2 4 3] | dfr to-df);
let indices = ([0 2] | dfr to-df);
$series | dfr take $indices"#,
result: Some(
NuDataFrame::try_from_columns(vec![Column::new(
"0".to_string(),
vec![Value::test_int(4), Value::test_int(5)],
)])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
},
]
}
fn run(
&self,
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
command(engine_state, stack, call, input)
}
}
fn command(
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
let index_value: Value = call.req(engine_state, stack, 0)?;
let index_span = index_value.span()?;
let index = NuDataFrame::try_from_value(index_value)?.as_series(index_span)?;
let casted = match index.dtype() {
DataType::UInt32 | DataType::UInt64 | DataType::Int32 | DataType::Int64 => {
index.cast(&DataType::UInt32).map_err(|e| {
ShellError::SpannedLabeledError(
"Error casting index list".into(),
e.to_string(),
index_span,
)
})
}
_ => Err(ShellError::SpannedLabeledErrorHelp(
"Incorrect type".into(),
"Series with incorrect type".into(),
call.head,
"Consider using a Series with type int type".into(),
)),
}?;
let indices = casted.u32().map_err(|e| {
ShellError::SpannedLabeledError(
"Error casting index list".into(),
e.to_string(),
index_span,
)
})?;
NuDataFrame::try_from_pipeline(input, call.head).and_then(|df| {
df.as_ref()
.take(indices)
.map_err(|e| {
ShellError::SpannedLabeledError(
"Error taking values".into(),
e.to_string(),
call.head,
)
})
.map(|df| PipelineData::Value(NuDataFrame::dataframe_into_value(df, call.head), None))
})
}
#[cfg(test)]
mod test {
use super::super::super::test_dataframe::test_dataframe;
use super::*;
#[test]
fn test_examples() {
test_dataframe(vec![Box::new(TakeDF {})])
}
}

View File

@ -0,0 +1,132 @@
use std::{fs::File, path::PathBuf};
use nu_engine::CallExt;
use nu_protocol::{
ast::Call,
engine::{Command, EngineState, Stack},
Category, Example, PipelineData, ShellError, Signature, Spanned, SyntaxShape, Value,
};
use polars::prelude::{CsvWriter, SerWriter};
use super::super::values::NuDataFrame;
#[derive(Clone)]
pub struct ToCSV;
impl Command for ToCSV {
fn name(&self) -> &str {
"dfr to-csv"
}
fn usage(&self) -> &str {
"Saves dataframe to csv file"
}
fn signature(&self) -> Signature {
Signature::build(self.name())
.required("file", SyntaxShape::Filepath, "file path to save dataframe")
.named(
"delimiter",
SyntaxShape::String,
"file delimiter character",
Some('d'),
)
.switch("no-header", "Indicates if file doesn't have header", None)
.category(Category::Custom("dataframe".into()))
}
fn examples(&self) -> Vec<Example> {
vec![
Example {
description: "Saves dataframe to csv file",
example: "[[a b]; [1 2] [3 4]] | dfr to-df | dfr to-csv test.csv",
result: None,
},
Example {
description: "Saves dataframe to csv file using other delimiter",
example: "[[a b]; [1 2] [3 4]] | dfr to-df | dfr to-csv test.csv -d '|'",
result: None,
},
]
}
fn run(
&self,
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
command(engine_state, stack, call, input)
}
}
fn command(
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
let file_name: Spanned<PathBuf> = call.req(engine_state, stack, 0)?;
let delimiter: Option<Spanned<String>> = call.get_flag(engine_state, stack, "delimiter")?;
let no_header: bool = call.has_flag("no_header");
let df = NuDataFrame::try_from_pipeline(input, call.head)?;
let mut file = File::create(&file_name.item).map_err(|e| {
ShellError::SpannedLabeledError(
"Error with file name".into(),
e.to_string(),
file_name.span,
)
})?;
let writer = CsvWriter::new(&mut file);
let writer = if no_header {
writer.has_header(false)
} else {
writer.has_header(true)
};
let writer = match delimiter {
None => writer,
Some(d) => {
if d.item.len() != 1 {
return Err(ShellError::SpannedLabeledError(
"Incorrect delimiter".into(),
"Delimiter has to be one char".into(),
d.span,
));
} else {
let delimiter = match d.item.chars().next() {
Some(d) => d as u8,
None => unreachable!(),
};
writer.with_delimiter(delimiter)
}
}
};
writer.finish(df.as_ref()).map_err(|e| {
ShellError::SpannedLabeledError(
"Error writing to file".into(),
e.to_string(),
file_name.span,
)
})?;
let file_value = Value::String {
val: format!("saved {:?}", &file_name.item),
span: file_name.span,
};
Ok(PipelineData::Value(
Value::List {
vals: vec![file_value],
span: call.head,
},
None,
))
}

View File

@ -0,0 +1,127 @@
use super::super::values::{Column, NuDataFrame};
use nu_protocol::{
ast::Call,
engine::{Command, EngineState, Stack},
Category, Example, PipelineData, ShellError, Signature, Span, Value,
};
#[derive(Clone)]
pub struct ToDataFrame;
impl Command for ToDataFrame {
fn name(&self) -> &str {
"dfr to-df"
}
fn usage(&self) -> &str {
"Converts a List, Table or Dictionary into a dataframe"
}
fn signature(&self) -> Signature {
Signature::build(self.name()).category(Category::Custom("dataframe".into()))
}
fn examples(&self) -> Vec<Example> {
vec![
Example {
description: "Takes a dictionary and creates a dataframe",
example: "[[a b];[1 2] [3 4]] | dfr to-df",
result: Some(
NuDataFrame::try_from_columns(vec![
Column::new(
"a".to_string(),
vec![Value::test_int(1), Value::test_int(3)],
),
Column::new(
"b".to_string(),
vec![Value::test_int(2), Value::test_int(4)],
),
])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
},
Example {
description: "Takes a list of tables and creates a dataframe",
example: "[[1 2 a] [3 4 b] [5 6 c]] | dfr to-df",
result: Some(
NuDataFrame::try_from_columns(vec![
Column::new(
"0".to_string(),
vec![Value::test_int(1), Value::test_int(3), Value::test_int(5)],
),
Column::new(
"1".to_string(),
vec![Value::test_int(2), Value::test_int(4), Value::test_int(6)],
),
Column::new(
"2".to_string(),
vec![
Value::test_string("a"),
Value::test_string("b"),
Value::test_string("c"),
],
),
])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
},
Example {
description: "Takes a list and creates a dataframe",
example: "[a b c] | dfr to-df",
result: Some(
NuDataFrame::try_from_columns(vec![Column::new(
"0".to_string(),
vec![
Value::test_string("a"),
Value::test_string("b"),
Value::test_string("c"),
],
)])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
},
Example {
description: "Takes a list of booleans and creates a dataframe",
example: "[$true $true $false] | dfr to-df",
result: Some(
NuDataFrame::try_from_columns(vec![Column::new(
"0".to_string(),
vec![
Value::test_bool(true),
Value::test_bool(true),
Value::test_bool(false),
],
)])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
},
]
}
fn run(
&self,
_engine_state: &EngineState,
_stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
NuDataFrame::try_from_iter(input.into_iter())
.map(|df| PipelineData::Value(NuDataFrame::into_value(df, call.head), None))
}
}
#[cfg(test)]
mod test {
use super::super::super::test_dataframe::test_dataframe;
use super::*;
#[test]
fn test_examples() {
test_dataframe(vec![Box::new(ToDataFrame {})])
}
}

View File

@ -0,0 +1,83 @@
use nu_engine::CallExt;
use nu_protocol::{
ast::Call,
engine::{Command, EngineState, Stack},
Category, Example, PipelineData, ShellError, Signature, SyntaxShape, Value,
};
use super::super::values::NuDataFrame;
#[derive(Clone)]
pub struct ToNu;
impl Command for ToNu {
fn name(&self) -> &str {
"dfr to-nu"
}
fn usage(&self) -> &str {
"Converts a section of the dataframe to Nushell Table"
}
fn signature(&self) -> Signature {
Signature::build(self.name())
.named(
"n-rows",
SyntaxShape::Number,
"number of rows to be shown",
Some('n'),
)
.switch("tail", "shows tail rows", Some('t'))
.category(Category::Custom("dataframe".into()))
}
fn examples(&self) -> Vec<Example> {
vec![
Example {
description: "Shows head rows from dataframe",
example: "[[a b]; [1 2] [3 4]] | dfr to-df | dfr to-nu",
result: None,
},
Example {
description: "Shows tail rows from dataframe",
example: "[[a b]; [1 2] [3 4] [5 6]] | dfr to-df | dfr to-nu -t -n 1",
result: None,
},
]
}
fn run(
&self,
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
command(engine_state, stack, call, input)
}
}
fn command(
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
let rows: Option<usize> = call.get_flag(engine_state, stack, "n-rows")?;
let tail: bool = call.has_flag("tail");
let df = NuDataFrame::try_from_pipeline(input, call.head)?;
let values = if tail {
df.tail(rows, call.head)?
} else {
df.head(rows, call.head)?
};
let value = Value::List {
vals: values,
span: call.head,
};
Ok(PipelineData::Value(value, None))
}

View File

@ -0,0 +1,84 @@
use std::{fs::File, path::PathBuf};
use nu_engine::CallExt;
use nu_protocol::{
ast::Call,
engine::{Command, EngineState, Stack},
Category, Example, PipelineData, ShellError, Signature, Spanned, SyntaxShape, Value,
};
use polars::prelude::ParquetWriter;
use super::super::values::NuDataFrame;
#[derive(Clone)]
pub struct ToParquet;
impl Command for ToParquet {
fn name(&self) -> &str {
"dfr to-parquet"
}
fn usage(&self) -> &str {
"Saves dataframe to parquet file"
}
fn signature(&self) -> Signature {
Signature::build(self.name())
.required("file", SyntaxShape::Filepath, "file path to save dataframe")
.category(Category::Custom("dataframe".into()))
}
fn examples(&self) -> Vec<Example> {
vec![Example {
description: "Saves dataframe to csv file",
example: "[[a b]; [1 2] [3 4]] | dfr to-df | dfr to-parquet test.parquet",
result: None,
}]
}
fn run(
&self,
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
command(engine_state, stack, call, input)
}
}
fn command(
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
let file_name: Spanned<PathBuf> = call.req(engine_state, stack, 0)?;
let df = NuDataFrame::try_from_pipeline(input, call.head)?;
let file = File::create(&file_name.item).map_err(|e| {
ShellError::SpannedLabeledError(
"Error with file name".into(),
e.to_string(),
file_name.span,
)
})?;
ParquetWriter::new(file).finish(df.as_ref()).map_err(|e| {
ShellError::SpannedLabeledError("Error saving file".into(), e.to_string(), file_name.span)
})?;
let file_value = Value::String {
val: format!("saved {:?}", &file_name.item),
span: file_name.span,
};
Ok(PipelineData::Value(
Value::List {
vals: vec![file_value],
span: call.head,
},
None,
))
}

View File

@ -0,0 +1,109 @@
use nu_engine::CallExt;
use nu_protocol::{
ast::Call,
engine::{Command, EngineState, Stack},
Category, Example, PipelineData, ShellError, Signature, Span, Spanned, SyntaxShape, Value,
};
use super::super::values::{Column, NuDataFrame};
#[derive(Clone)]
pub struct WithColumn;
impl Command for WithColumn {
fn name(&self) -> &str {
"dfr with-column"
}
fn usage(&self) -> &str {
"Adds a series to the dataframe"
}
fn signature(&self) -> Signature {
Signature::build(self.name())
.required("series", SyntaxShape::Any, "series to be added")
.required_named("name", SyntaxShape::String, "column name", Some('n'))
.category(Category::Custom("dataframe".into()))
}
fn examples(&self) -> Vec<Example> {
vec![Example {
description: "Adds a series to the dataframe",
example:
"[[a b]; [1 2] [3 4]] | dfr to-df | dfr with-column ([5 6] | dfr to-df) --name c",
result: Some(
NuDataFrame::try_from_columns(vec![
Column::new(
"a".to_string(),
vec![Value::test_int(1), Value::test_int(3)],
),
Column::new(
"b".to_string(),
vec![Value::test_int(2), Value::test_int(4)],
),
Column::new(
"c".to_string(),
vec![Value::test_int(5), Value::test_int(6)],
),
])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
}]
}
fn run(
&self,
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
command(engine_state, stack, call, input)
}
}
fn command(
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
let name: Spanned<String> = call
.get_flag(engine_state, stack, "name")?
.expect("required named value");
let other_value: Value = call.req(engine_state, stack, 0)?;
let other_span = other_value.span()?;
let mut other = NuDataFrame::try_from_value(other_value)?.as_series(other_span)?;
let series = other.rename(&name.item).clone();
let mut df = NuDataFrame::try_from_pipeline(input, call.head)?;
df.as_mut()
.with_column(series)
.map_err(|e| {
ShellError::SpannedLabeledError(
"Error adding column to dataframe".into(),
e.to_string(),
other_span,
)
})
.map(|df| {
PipelineData::Value(
NuDataFrame::dataframe_into_value(df.clone(), call.head),
None,
)
})
}
#[cfg(test)]
mod test {
use super::super::super::test_dataframe::test_dataframe;
use super::*;
#[test]
fn test_examples() {
test_dataframe(vec![Box::new(WithColumn {})])
}
}