Lazy dataframes (#5687)

* change between lazy and eager

* when expressions

* examples for aggregations

* more examples for agg

* examples for dataframes

* checked examples

* cargo fmt
This commit is contained in:
Fernando Herrera
2022-05-31 07:29:55 +01:00
committed by GitHub
parent 0769e9b750
commit 997d56a288
50 changed files with 1781 additions and 810 deletions

View File

@@ -29,23 +29,37 @@ impl Command for FilterWith {
SyntaxShape::Any,
"boolean mask used to filter data",
)
.category(Category::Custom("dataframe".into()))
.category(Category::Custom("dataframe or lazyframe".into()))
}
fn examples(&self) -> Vec<Example> {
vec![Example {
description: "Filter dataframe using a bool mask",
example: r#"let mask = ([true false] | dfr to-df);
vec![
Example {
description: "Filter dataframe using a bool mask",
example: r#"let mask = ([true false] | dfr to-df);
[[a b]; [1 2] [3 4]] | dfr to-df | dfr filter-with $mask"#,
result: Some(
NuDataFrame::try_from_columns(vec![
Column::new("a".to_string(), vec![Value::test_int(1)]),
Column::new("b".to_string(), vec![Value::test_int(2)]),
])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
}]
result: Some(
NuDataFrame::try_from_columns(vec![
Column::new("a".to_string(), vec![Value::test_int(1)]),
Column::new("b".to_string(), vec![Value::test_int(2)]),
])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
},
Example {
description: "Filter dataframe using an expression",
example: "[[a b]; [1 2] [3 4]] | dfr to-df | dfr filter-with ((dfr col a) > 1)",
result: Some(
NuDataFrame::try_from_columns(vec![
Column::new("a".to_string(), vec![Value::test_int(3)]),
Column::new("b".to_string(), vec![Value::test_int(4)]),
])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
},
]
}
fn run(
@@ -81,31 +95,42 @@ fn command_eager(
df: NuDataFrame,
) -> Result<PipelineData, ShellError> {
let mask_value: Value = call.req(engine_state, stack, 0)?;
let mask_span = mask_value.span()?;
let mask = NuDataFrame::try_from_value(mask_value)?.as_series(mask_span)?;
let mask = mask.bool().map_err(|e| {
ShellError::GenericError(
"Error casting to bool".into(),
e.to_string(),
Some(mask_span),
Some("Perhaps you want to use a series with booleans as mask".into()),
Vec::new(),
)
})?;
df.as_ref()
.filter(mask)
.map_err(|e| {
if NuExpression::can_downcast(&mask_value) {
let expression = NuExpression::try_from_value(mask_value)?;
let lazy = NuLazyFrame::new(true, df.lazy());
let lazy = lazy.apply_with_expr(expression, LazyFrame::filter);
Ok(PipelineData::Value(
NuLazyFrame::into_value(lazy, call.head)?,
None,
))
} else {
let mask = NuDataFrame::try_from_value(mask_value)?.as_series(mask_span)?;
let mask = mask.bool().map_err(|e| {
ShellError::GenericError(
"Error calculating dummies".into(),
"Error casting to bool".into(),
e.to_string(),
Some(call.head),
Some("The only allowed column types for dummies are String or Int".into()),
Some(mask_span),
Some("Perhaps you want to use a series with booleans as mask".into()),
Vec::new(),
)
})
.map(|df| PipelineData::Value(NuDataFrame::dataframe_into_value(df, call.head), None))
})?;
df.as_ref()
.filter(mask)
.map_err(|e| {
ShellError::GenericError(
"Error filtering dataframe".into(),
e.to_string(),
Some(call.head),
Some("The only allowed column types for dummies are String or Int".into()),
Vec::new(),
)
})
.map(|df| PipelineData::Value(NuDataFrame::dataframe_into_value(df, call.head), None))
}
}
fn command_lazy(
@@ -120,7 +145,7 @@ fn command_lazy(
let lazy = lazy.apply_with_expr(expr, LazyFrame::filter);
Ok(PipelineData::Value(
NuLazyFrame::into_value(lazy, call.head),
NuLazyFrame::into_value(lazy, call.head)?,
None,
))
}
@@ -129,9 +154,10 @@ fn command_lazy(
mod test {
use super::super::super::test_dataframe::test_dataframe;
use super::*;
use crate::dataframe::expressions::ExprCol;
#[test]
fn test_examples() {
test_dataframe(vec![Box::new(FilterWith {})])
test_dataframe(vec![Box::new(FilterWith {}), Box::new(ExprCol {})])
}
}

View File

@@ -22,7 +22,7 @@ impl Command for FirstDF {
fn signature(&self) -> Signature {
Signature::build(self.name())
.optional("rows", SyntaxShape::Int, "Number of rows for head")
.category(Category::Custom("dataframe".into()))
.category(Category::Custom("dataframe or expression".into()))
}
fn examples(&self) -> Vec<Example> {

View File

@@ -22,7 +22,7 @@ impl Command for LastDF {
fn signature(&self) -> Signature {
Signature::build(self.name())
.optional("rows", SyntaxShape::Int, "Number of rows for tail")
.category(Category::Custom("dataframe".into()))
.category(Category::Custom("dataframe or lazyframe".into()))
}
fn examples(&self) -> Vec<Example> {

View File

@@ -18,7 +18,6 @@ mod rename;
mod sample;
mod shape;
mod slice;
mod sort;
mod take;
mod to_csv;
mod to_df;
@@ -48,7 +47,6 @@ pub use rename::RenameDF;
pub use sample::SampleDF;
pub use shape::ShapeDF;
pub use slice::SliceDF;
pub use sort::SortDF;
pub use take::TakeDF;
pub use to_csv::ToCSV;
pub use to_df::ToDataFrame;
@@ -88,7 +86,6 @@ pub fn add_eager_decls(working_set: &mut StateWorkingSet) {
SampleDF,
ShapeDF,
SliceDF,
SortDF,
TakeDF,
ToCSV,
ToDataFrame,

View File

@@ -14,7 +14,7 @@ pub struct RenameDF;
impl Command for RenameDF {
fn name(&self) -> &str {
"dfr rename-col"
"dfr rename"
}
fn usage(&self) -> &str {
@@ -33,28 +33,65 @@ impl Command for RenameDF {
SyntaxShape::Any,
"New names for the selected column(s). A string or list of strings",
)
.category(Category::Custom("dataframe".into()))
.category(Category::Custom("dataframe or lazyframe".into()))
}
fn examples(&self) -> Vec<Example> {
vec![Example {
description: "Renames a dataframe column",
example: "[[a b]; [1 2] [3 4]] | dfr to-df | dfr rename-col a a_new",
result: Some(
NuDataFrame::try_from_columns(vec![
Column::new(
"a_new".to_string(),
vec![Value::test_int(1), Value::test_int(3)],
),
Column::new(
"b".to_string(),
vec![Value::test_int(2), Value::test_int(4)],
),
])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
}]
vec![
Example {
description: "Renames a series",
example: "[5 6 7 8] | dfr to-df | dfr rename '0' new_name",
result: Some(
NuDataFrame::try_from_columns(vec![Column::new(
"new_name".to_string(),
vec![
Value::test_int(5),
Value::test_int(6),
Value::test_int(7),
Value::test_int(8),
],
)])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
},
Example {
description: "Renames a dataframe column",
example: "[[a b]; [1 2] [3 4]] | dfr to-df | dfr rename a a_new",
result: Some(
NuDataFrame::try_from_columns(vec![
Column::new(
"a_new".to_string(),
vec![Value::test_int(1), Value::test_int(3)],
),
Column::new(
"b".to_string(),
vec![Value::test_int(2), Value::test_int(4)],
),
])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
},
Example {
description: "Renames two dataframe columns",
example: "[[a b]; [1 2] [3 4]] | dfr to-df | dfr rename [a b] [a_new b_new]",
result: Some(
NuDataFrame::try_from_columns(vec![
Column::new(
"a_new".to_string(),
vec![Value::test_int(1), Value::test_int(3)],
),
Column::new(
"b_new".to_string(),
vec![Value::test_int(2), Value::test_int(4)],
),
])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
},
]
}
fn run(
@@ -133,7 +170,7 @@ fn command_lazy(
let lazy = lazy.into_polars();
let lazy: NuLazyFrame = lazy.rename(&columns, &new_names).into();
Ok(PipelineData::Value(lazy.into_value(call.head), None))
Ok(PipelineData::Value(lazy.into_value(call.head)?, None))
}
#[cfg(test)]

View File

@@ -1,148 +0,0 @@
use nu_engine::CallExt;
use nu_protocol::{
ast::Call,
engine::{Command, EngineState, Stack},
Category, Example, PipelineData, ShellError, Signature, Span, SyntaxShape, Value,
};
use crate::dataframe::values::{utils::convert_columns_string, Column};
use super::super::values::NuDataFrame;
#[derive(Clone)]
pub struct SortDF;
impl Command for SortDF {
fn name(&self) -> &str {
"dfr sort"
}
fn usage(&self) -> &str {
"Creates new sorted dataframe or series"
}
fn signature(&self) -> Signature {
Signature::build(self.name())
.switch("reverse", "invert sort", Some('r'))
.rest("rest", SyntaxShape::Any, "column names to sort dataframe")
.category(Category::Custom("dataframe".into()))
}
fn examples(&self) -> Vec<Example> {
vec![
Example {
description: "Create new sorted dataframe",
example: "[[a b]; [3 4] [1 2]] | dfr to-df | dfr sort a",
result: Some(
NuDataFrame::try_from_columns(vec![
Column::new(
"a".to_string(),
vec![Value::test_int(1), Value::test_int(3)],
),
Column::new(
"b".to_string(),
vec![Value::test_int(2), Value::test_int(4)],
),
])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
},
Example {
description: "Create new sorted series",
example: "[3 4 1 2] | dfr to-df | dfr sort",
result: Some(
NuDataFrame::try_from_columns(vec![Column::new(
"0".to_string(),
vec![
Value::test_int(1),
Value::test_int(2),
Value::test_int(3),
Value::test_int(4),
],
)])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
},
]
}
fn run(
&self,
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
command(engine_state, stack, call, input)
}
}
fn command(
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
let reverse = call.has_flag("reverse");
let df = NuDataFrame::try_from_pipeline(input, call.head)?;
if df.is_series() {
let columns = df.as_ref().get_column_names();
df.as_ref()
.sort(columns, reverse)
.map_err(|e| {
ShellError::GenericError(
"Error sorting dataframe".into(),
e.to_string(),
Some(call.head),
None,
Vec::new(),
)
})
.map(|df| PipelineData::Value(NuDataFrame::dataframe_into_value(df, call.head), None))
} else {
let columns: Vec<Value> = call.rest(engine_state, stack, 0)?;
if !columns.is_empty() {
let (col_string, col_span) = convert_columns_string(columns, call.head)?;
df.as_ref()
.sort(&col_string, reverse)
.map_err(|e| {
ShellError::GenericError(
"Error sorting dataframe".into(),
e.to_string(),
Some(col_span),
None,
Vec::new(),
)
})
.map(|df| {
PipelineData::Value(NuDataFrame::dataframe_into_value(df, call.head), None)
})
} else {
Err(ShellError::GenericError(
"Missing columns".into(),
"missing column name to perform sort".into(),
Some(call.head),
None,
Vec::new(),
))
}
}
}
#[cfg(test)]
mod test {
use super::super::super::test_dataframe::test_dataframe;
use super::*;
#[test]
fn test_examples() {
test_dataframe(vec![Box::new(SortDF {})])
}
}

View File

@@ -27,7 +27,7 @@ impl Command for WithColumn {
SyntaxShape::Any,
"series to be added or expressions used to define the new columns",
)
.category(Category::Custom("dataframe".into()))
.category(Category::Custom("dataframe or lazyframe".into()))
}
fn examples(&self) -> Vec<Example> {
@@ -59,11 +59,34 @@ impl Command for WithColumn {
Example {
description: "Adds a series to the dataframe",
example: r#"[[a b]; [1 2] [3 4]]
| dfr to-df
| dfr to-lazy
| dfr with-column ((dfr col a) * 2 | dfr as "c")
| dfr with-column [
((dfr col a) * 2 | dfr as "c")
((dfr col a) * 3 | dfr as "d")
]
| dfr collect"#,
result: None,
result: Some(
NuDataFrame::try_from_columns(vec![
Column::new(
"a".to_string(),
vec![Value::test_int(1), Value::test_int(3)],
),
Column::new(
"b".to_string(),
vec![Value::test_int(2), Value::test_int(4)],
),
Column::new(
"c".to_string(),
vec![Value::test_int(2), Value::test_int(6)],
),
Column::new(
"d".to_string(),
vec![Value::test_int(3), Value::test_int(9)],
),
])
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
},
]
}
@@ -85,7 +108,7 @@ impl Command for WithColumn {
command_eager(engine_state, stack, call, df)
} else {
Err(ShellError::CantConvert(
"expression or query".into(),
"lazy or eager dataframe".into(),
value.get_type().to_string(),
value.span()?,
None,
@@ -100,34 +123,49 @@ fn command_eager(
call: &Call,
mut df: NuDataFrame,
) -> Result<PipelineData, ShellError> {
let other_value: Value = call.req(engine_state, stack, 0)?;
let other_span = other_value.span()?;
let mut other = NuDataFrame::try_from_value(other_value)?.as_series(other_span)?;
let new_column: Value = call.req(engine_state, stack, 0)?;
let column_span = new_column.span()?;
let name = match call.get_flag::<String>(engine_state, stack, "name")? {
Some(name) => name,
None => other.name().to_string(),
};
if NuExpression::can_downcast(&new_column) {
let vals: Vec<Value> = call.rest(engine_state, stack, 0)?;
let value = Value::List {
vals,
span: call.head,
};
let expressions = NuExpression::extract_exprs(value)?;
let lazy = NuLazyFrame::new(true, df.lazy().with_columns(&expressions));
let series = other.rename(&name).clone();
let df = lazy.collect(call.head)?;
df.as_mut()
.with_column(series)
.map_err(|e| {
ShellError::GenericError(
"Error adding column to dataframe".into(),
e.to_string(),
Some(other_span),
None,
Vec::new(),
)
})
.map(|df| {
PipelineData::Value(
NuDataFrame::dataframe_into_value(df.clone(), call.head),
None,
)
})
Ok(PipelineData::Value(df.into_value(call.head), None))
} else {
let mut other = NuDataFrame::try_from_value(new_column)?.as_series(column_span)?;
let name = match call.get_flag::<String>(engine_state, stack, "name")? {
Some(name) => name,
None => other.name().to_string(),
};
let series = other.rename(&name).clone();
df.as_mut()
.with_column(series)
.map_err(|e| {
ShellError::GenericError(
"Error adding column to dataframe".into(),
e.to_string(),
Some(column_span),
None,
Vec::new(),
)
})
.map(|df| {
PipelineData::Value(
NuDataFrame::dataframe_into_value(df.clone(), call.head),
None,
)
})
}
}
fn command_lazy(
@@ -146,7 +184,7 @@ fn command_lazy(
let lazy: NuLazyFrame = lazy.into_polars().with_columns(&expressions).into();
Ok(PipelineData::Value(
NuLazyFrame::into_value(lazy, call.head),
NuLazyFrame::into_value(lazy, call.head)?,
None,
))
}
@@ -155,9 +193,15 @@ fn command_lazy(
mod test {
use super::super::super::test_dataframe::test_dataframe;
use super::*;
use crate::dataframe::expressions::ExprAlias;
use crate::dataframe::expressions::ExprCol;
#[test]
fn test_examples() {
test_dataframe(vec![Box::new(WithColumn {})])
test_dataframe(vec![
Box::new(WithColumn {}),
Box::new(ExprAlias {}),
Box::new(ExprCol {}),
])
}
}