From ae64c58f597817581c36b1963ae406fed12cb13e Mon Sep 17 00:00:00 2001 From: Fernando Herrera Date: Fri, 12 Aug 2022 14:10:36 +0200 Subject: [PATCH] Polars upgrade 0.23 (#6303) * more lazy expressions * upgrade polars and correct functions * arg-where example * cargo clippy * restore modified filter files * correct string addition with str * correct string addition with str * correct message in test --- Cargo.lock | 77 ++++++++---- crates/nu-command/Cargo.toml | 5 +- .../nu-command/src/dataframe/eager/columns.rs | 94 +++++++++++++++ crates/nu-command/src/dataframe/eager/mod.rs | 3 + .../src/dataframe/expressions/arg_where.rs | 76 ++++++++++++ .../src/dataframe/expressions/is_in.rs | 110 ++++++++++++++++++ .../src/dataframe/expressions/mod.rs | 6 + .../nu-command/src/dataframe/lazy/filter.rs | 84 +++++++++++++ crates/nu-command/src/dataframe/lazy/mod.rs | 8 +- .../nu-command/src/dataframe/lazy/select.rs | 13 --- .../src/dataframe/lazy/sort_by_expr.rs | 9 +- .../src/dataframe/series/date/get_weekday.rs | 2 +- .../src/dataframe/series/indexes/arg_true.rs | 44 ++++--- .../dataframe/series/indexes/set_with_idx.rs | 4 +- .../dataframe/series/masks/is_duplicated.rs | 64 ++++++---- .../src/dataframe/series/masks/is_unique.rs | 85 +++++++++----- .../src/dataframe/series/masks/set.rs | 4 +- .../src/dataframe/series/value_counts.rs | 4 +- crates/nu-command/src/filters/find.rs | 2 +- crates/nu-command/src/filters/lines.rs | 2 +- crates/nu-command/src/formats/from/yaml.rs | 2 +- crates/nu-command/tests/commands/source.rs | 6 +- 22 files changed, 583 insertions(+), 121 deletions(-) create mode 100644 crates/nu-command/src/dataframe/eager/columns.rs create mode 100644 crates/nu-command/src/dataframe/expressions/arg_where.rs create mode 100644 crates/nu-command/src/dataframe/expressions/is_in.rs create mode 100644 crates/nu-command/src/dataframe/lazy/filter.rs diff --git a/Cargo.lock b/Cargo.lock index eafc147ed2..bf0208c9f4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -130,9 
+130,9 @@ checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6" [[package]] name = "arrow-format" -version = "0.6.0" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "216249afef413d7e9e9b4b543e73b3e371ace3a812380af98f1c871521572cdd" +checksum = "8df5d25bc6d676271277120c41ef28760fe0a9f070677a58db621c0f983f9c20" dependencies = [ "planus", "serde", @@ -140,16 +140,19 @@ dependencies = [ [[package]] name = "arrow2" -version = "0.12.0" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5feafd6df4e3f577529e6aa2b9b7cdb3c9fe8e8f66ebc8dc29abbe71a7e968f0" +checksum = "afc54f0b14083abaf6bc71cf1aeccd7831a24b1e29d07683ba9a4a0f6c5d9326" dependencies = [ + "ahash", "arrow-format", "base64", "bytemuck", "chrono", + "dyn-clone", "either", "fallible-streaming-iterator", + "foreign_vec", "futures", "hash_hasher", "indexmap", @@ -1024,6 +1027,12 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "453440c271cf5577fd2a40e4942540cb7d0d2f85e27c8d07dd0023c925a67541" +[[package]] +name = "dyn-clone" +version = "1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d07a982d1fb29db01e5a59b1918e03da4df7297eaeee7686ac45542fd4e59c8" + [[package]] name = "ego-tree" version = "0.6.2" @@ -1204,6 +1213,12 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" +[[package]] +name = "foreign_vec" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee1b05cbd864bcaecbd3455d6d967862d446e4ebfc3c2e5e5b9841e53cba6673" + [[package]] name = "form_urlencoded" version = "1.0.1" @@ -3172,9 +3187,9 @@ dependencies = [ [[package]] name = "parquet2" -version = "0.13.2" +version = "0.14.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"73fd2690ad041f9296876daef1f2706f6347073bdbcc719090887f1691e4a09d" +checksum = "33e434af3293ba384075a56d4b400ce659868ca7823142194ef204f01ab35e50" dependencies = [ "async-stream", "bitpacking", @@ -3381,18 +3396,18 @@ checksum = "1df8c4ec4b0627e53bdf214615ad287367e482558cf84b109250b37464dc03ae" [[package]] name = "planus" -version = "0.2.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bffebaf174d6cad46a5f0f1bb1c45c6eb509571688bcb18dfab217f3c9f9b151" +checksum = "fc1691dd09e82f428ce8d6310bd6d5da2557c82ff17694d2a32cad7242aea89f" dependencies = [ "array-init-cursor", ] [[package]] name = "polars" -version = "0.22.8" +version = "0.23.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d175c67e80ceaef7219258cfc3a8686531d9510875b0cefa25404e5b80a7933" +checksum = "a75b1077fda63c0f67acc1cdc8586e7afce419be1e85bf7dfa8935e0e266d6b3" dependencies = [ "polars-core", "polars-io", @@ -3403,9 +3418,9 @@ dependencies = [ [[package]] name = "polars-arrow" -version = "0.22.7" +version = "0.23.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f66c7d3da2c10a09131294dbe7802fac792f570be639dc6ebf207bfc3e144287" +checksum = "f7b28f858b252436550679609a23be34d62705faf783887f172f845eb58bcb8b" dependencies = [ "arrow2", "hashbrown", @@ -3416,13 +3431,14 @@ dependencies = [ [[package]] name = "polars-core" -version = "0.22.7" +version = "0.23.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7f15f443a90d5367c4fbbb151e203f03b5b96055c8b928c6bc30655a3644f13" +checksum = "eeaec1ca3ac4829ca24b33743adeeb323a43b5a85515bfce20c2c81799c82790" dependencies = [ "ahash", "anyhow", "arrow2", + "bitflags", "chrono", "comfy-table", "hashbrown", @@ -3437,14 +3453,15 @@ dependencies = [ "regex", "serde", "serde_json", + "smartstring", "thiserror", ] [[package]] name = "polars-io" -version = "0.22.7" +version = "0.23.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "058d0a847ce5009b974c69ec878ed416e306436f21b626543019f738cee12315" +checksum = "51405e46f93e306a3c9280c60ba1101c662e8a6dab33344680d31c3161045f1c" dependencies = [ "ahash", "anyhow", @@ -3470,11 +3487,12 @@ dependencies = [ [[package]] name = "polars-lazy" -version = "0.22.7" +version = "0.23.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dad86a4ce7e32540ff12089bce6f77270fd133a5b263328a92be61defdd6b151" +checksum = "1340af778bc8124180d8ca1a566f076a5339566a207a42130796048b087fe977" dependencies = [ "ahash", + "bitflags", "glob", "parking_lot", "polars-arrow", @@ -3489,9 +3507,9 @@ dependencies = [ [[package]] name = "polars-ops" -version = "0.22.7" +version = "0.23.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "030ecd473be113cd0264f1bc19de39a844fa12fa565db9dc52c859cbc292cf04" +checksum = "4a1812e5d5e589d5bd23f8d89dcd8bd4508082c50d055b8ff5fafb6f2a519c9a" dependencies = [ "polars-arrow", "polars-core", @@ -3499,9 +3517,9 @@ dependencies = [ [[package]] name = "polars-time" -version = "0.22.7" +version = "0.23.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94047b20d2da3bcc55c421be187a0c6f316cf1eea7fe7ed7347c1160a32d017c" +checksum = "fc4ebe97d601a4b443337df71d0b7e673fce953654871c3311850ea394d48297" dependencies = [ "chrono", "lexical", @@ -3513,9 +3531,9 @@ dependencies = [ [[package]] name = "polars-utils" -version = "0.22.7" +version = "0.23.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcd3d0238462d5d9f7fbeaaea46e73ed4d58f6fae8b70d53cbe51d7538cc43f5" +checksum = "4ea836afadcddee3f1a513dae7624f6d7d0d64abb129063ec7476b8347c8725b" dependencies = [ "parking_lot", "rayon", @@ -4479,6 +4497,17 @@ version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2fd0db749597d91ff862fd1d55ea87f7855a744a8425a64695b6fca237d1dad1" +[[package]] 
+name = "smartstring" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fb72c633efbaa2dd666986505016c32c3044395ceaf881518399d2f4127ee29" +dependencies = [ + "autocfg", + "static_assertions", + "version_check", +] + [[package]] name = "smawk" version = "0.3.1" diff --git a/crates/nu-command/Cargo.toml b/crates/nu-command/Cargo.toml index a10c80fe82..ec3890edb8 100644 --- a/crates/nu-command/Cargo.toml +++ b/crates/nu-command/Cargo.toml @@ -99,15 +99,14 @@ version = "2.1.3" optional = true [dependencies.polars] -version = "0.22.8" -# path = "../../../../polars/polars" +version = "0.23.2" optional = true features = [ "default", "to_dummies", "parquet", "json", "serde", "serde-lazy", "object", "checked_arithmetic", "strings", "cum_agg", "is_in", "rolling_window", "strings", "rows", "random", "dtype-datetime", "dtype-struct", "lazy", "cross_join", - "dynamic_groupby", "dtype-categorical", "concat_str" + "dynamic_groupby", "dtype-categorical", "concat_str", "arg_where" ] [target.'cfg(windows)'.dependencies.windows] diff --git a/crates/nu-command/src/dataframe/eager/columns.rs b/crates/nu-command/src/dataframe/eager/columns.rs new file mode 100644 index 0000000000..0005d9311e --- /dev/null +++ b/crates/nu-command/src/dataframe/eager/columns.rs @@ -0,0 +1,94 @@ +use super::super::values::NuDataFrame; +use nu_protocol::{ + ast::Call, + engine::{Command, EngineState, Stack}, + Category, Example, PipelineData, ShellError, Signature, Span, Type, Value, +}; + +#[derive(Clone)] +pub struct ColumnsDF; + +impl Command for ColumnsDF { + fn name(&self) -> &str { + "columns" + } + + fn usage(&self) -> &str { + "Show dataframe columns" + } + + fn signature(&self) -> Signature { + Signature::build(self.name()) + .input_type(Type::Custom("dataframe".into())) + .output_type(Type::Any) + .category(Category::Custom("dataframe".into())) + } + + fn examples(&self) -> Vec { + vec![Example { + description: "Dataframe columns", + example: "[[a 
b]; [1 2] [3 4]] | into df | columns", + result: Some(Value::List { + vals: vec![ + Value::String { + val: "a".into(), + span: Span::test_data(), + }, + Value::String { + val: "b".into(), + span: Span::test_data(), + }, + ], + span: Span::test_data(), + }), + }] + } + + fn run( + &self, + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, + ) -> Result { + command(engine_state, stack, call, input) + } +} + +#[allow(clippy::needless_collect)] +fn command( + _engine_state: &EngineState, + _stack: &mut Stack, + call: &Call, + input: PipelineData, +) -> Result { + let df = NuDataFrame::try_from_pipeline(input, call.head)?; + + let names: Vec = df + .as_ref() + .get_column_names() + .iter() + .map(|v| Value::String { + val: v.to_string(), + span: call.head, + }) + .collect(); + + let names = Value::List { + vals: names, + span: call.head, + }; + + Ok(PipelineData::Value(names, None)) +} + +#[cfg(test)] +mod test { + use super::super::super::test_dataframe::test_dataframe; + use super::*; + + #[test] + fn test_examples() { + test_dataframe(vec![Box::new(ColumnsDF {})]) + } +} diff --git a/crates/nu-command/src/dataframe/eager/mod.rs b/crates/nu-command/src/dataframe/eager/mod.rs index da3c077f12..1b7442ac3b 100644 --- a/crates/nu-command/src/dataframe/eager/mod.rs +++ b/crates/nu-command/src/dataframe/eager/mod.rs @@ -1,4 +1,5 @@ mod append; +mod columns; mod describe; mod drop; mod drop_duplicates; @@ -26,6 +27,7 @@ mod with_column; use nu_protocol::engine::StateWorkingSet; pub use append::AppendDF; +pub use columns::ColumnsDF; pub use describe::DescribeDF; pub use drop::DropDF; pub use drop_duplicates::DropDuplicates; @@ -63,6 +65,7 @@ pub fn add_eager_decls(working_set: &mut StateWorkingSet) { // Dataframe commands bind_command!( AppendDF, + ColumnsDF, DataTypes, DescribeDF, DropDF, diff --git a/crates/nu-command/src/dataframe/expressions/arg_where.rs b/crates/nu-command/src/dataframe/expressions/arg_where.rs new file mode 100644 
index 0000000000..90f4d2b953 --- /dev/null +++ b/crates/nu-command/src/dataframe/expressions/arg_where.rs @@ -0,0 +1,76 @@ +use crate::dataframe::values::{Column, NuDataFrame, NuExpression}; +use nu_engine::CallExt; +use nu_protocol::{ + ast::Call, + engine::{Command, EngineState, Stack}, + Category, Example, PipelineData, ShellError, Signature, Span, SyntaxShape, Type, Value, +}; +use polars::prelude::arg_where; + +#[derive(Clone)] +pub struct ExprArgWhere; + +impl Command for ExprArgWhere { + fn name(&self) -> &str { + "arg-where" + } + + fn usage(&self) -> &str { + "Creates an expression that returns the arguments where expression is true" + } + + fn signature(&self) -> Signature { + Signature::build(self.name()) + .required("column name", SyntaxShape::Any, "Expression to evaluate") + .input_type(Type::Any) + .output_type(Type::Custom("expression".into())) + .category(Category::Custom("expression".into())) + } + + fn examples(&self) -> Vec { + vec![Example { + description: "Return a dataframe where the value match the expression", + example: "let df = ([[a b]; [one 1] [two 2] [three 3]] | into df); + $df | select (arg-where ((col b) >= 2) | as b_arg)", + result: Some( + NuDataFrame::try_from_columns(vec![Column::new( + "b_arg".to_string(), + vec![Value::test_int(1), Value::test_int(2)], + )]) + .expect("simple df for test should not fail") + .into_value(Span::test_data()), + ), + }] + } + + fn run( + &self, + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + _input: PipelineData, + ) -> Result { + let value: Value = call.req(engine_state, stack, 0)?; + let expr = NuExpression::try_from_value(value)?; + let expr: NuExpression = arg_where(expr.into_polars()).into(); + + Ok(PipelineData::Value(expr.into_value(call.head), None)) + } +} + +#[cfg(test)] +mod test { + use super::super::super::test_dataframe::test_dataframe; + use super::*; + use crate::dataframe::expressions::ExprAlias; + use crate::dataframe::lazy::LazySelect; + + #[test] + fn 
test_examples() { + test_dataframe(vec![ + Box::new(ExprArgWhere {}), + Box::new(ExprAlias {}), + Box::new(LazySelect {}), + ]) + } +} diff --git a/crates/nu-command/src/dataframe/expressions/is_in.rs b/crates/nu-command/src/dataframe/expressions/is_in.rs new file mode 100644 index 0000000000..9051b4aa23 --- /dev/null +++ b/crates/nu-command/src/dataframe/expressions/is_in.rs @@ -0,0 +1,110 @@ +use crate::dataframe::values::{Column, NuDataFrame, NuExpression}; +use nu_engine::CallExt; +use nu_protocol::{ + ast::Call, + engine::{Command, EngineState, Stack}, + Category, Example, PipelineData, ShellError, Signature, Span, SyntaxShape, Type, Value, +}; +use polars::prelude::{lit, DataType}; + +#[derive(Clone)] +pub struct ExprIsIn; + +impl Command for ExprIsIn { + fn name(&self) -> &str { + "is-in" + } + + fn usage(&self) -> &str { + "Creates an is-in expression" + } + + fn signature(&self) -> Signature { + Signature::build(self.name()) + .required( + "list", + SyntaxShape::List(Box::new(SyntaxShape::Any)), + "List to check if values are in", + ) + .input_type(Type::Custom("expression".into())) + .output_type(Type::Custom("expression".into())) + .category(Category::Custom("expression".into())) + } + + fn examples(&self) -> Vec { + vec![Example { + description: "Creates a is-in expression", + example: r#"let df = ([[a b]; [one 1] [two 2] [three 3]] | into df); + $df | with-column (col a | is-in [one two] | as a_in)"#, + result: Some( + NuDataFrame::try_from_columns(vec![ + Column::new( + "a".to_string(), + vec![ + Value::test_string("one"), + Value::test_string("two"), + Value::test_string("three"), + ], + ), + Column::new( + "b".to_string(), + vec![Value::test_int(1), Value::test_int(2), Value::test_int(3)], + ), + Column::new( + "a_in".to_string(), + vec![ + Value::test_bool(true), + Value::test_bool(true), + Value::test_bool(false), + ], + ), + ]) + .expect("simple df for test should not fail") + .into_value(Span::test_data()), + ), + }] + } + + fn run( + &self, + 
engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, + ) -> Result { + let list: Vec = call.req(engine_state, stack, 0)?; + let expr = NuExpression::try_from_pipeline(input, call.head)?; + + let values = NuDataFrame::try_from_columns(vec![Column::new("list".to_string(), list)])?; + let list = values.as_series(call.head)?; + + if matches!(list.dtype(), DataType::Object(..)) { + return Err(ShellError::IncompatibleParametersSingle( + "Cannot use a mixed list as argument".into(), + call.head, + )); + } + + let expr: NuExpression = expr.into_polars().is_in(lit(list)).into(); + Ok(PipelineData::Value(expr.into_value(call.head), None)) + } +} + +#[cfg(test)] +mod test { + use super::super::super::test_dataframe::test_dataframe; + use super::*; + use crate::dataframe::eager::WithColumn; + use crate::dataframe::expressions::alias::ExprAlias; + use crate::dataframe::expressions::col::ExprCol; + + #[test] + fn test_examples() { + test_dataframe(vec![ + Box::new(ExprIsIn {}), + Box::new(ExprAlias {}), + Box::new(ExprCol {}), + Box::new(WithColumn {}), + ]) + } +} diff --git a/crates/nu-command/src/dataframe/expressions/mod.rs b/crates/nu-command/src/dataframe/expressions/mod.rs index 2133dcf0e9..25bfc4d419 100644 --- a/crates/nu-command/src/dataframe/expressions/mod.rs +++ b/crates/nu-command/src/dataframe/expressions/mod.rs @@ -1,8 +1,10 @@ mod alias; +mod arg_where; mod as_nu; mod col; mod concat_str; mod expressions_macro; +mod is_in; mod lit; mod otherwise; mod quantile; @@ -11,10 +13,12 @@ mod when; use nu_protocol::engine::StateWorkingSet; pub(crate) use crate::dataframe::expressions::alias::ExprAlias; +use crate::dataframe::expressions::arg_where::ExprArgWhere; use crate::dataframe::expressions::as_nu::ExprAsNu; pub(super) use crate::dataframe::expressions::col::ExprCol; pub(super) use crate::dataframe::expressions::concat_str::ExprConcatStr; pub(crate) use crate::dataframe::expressions::expressions_macro::*; +pub(super) use 
crate::dataframe::expressions::is_in::ExprIsIn; pub(super) use crate::dataframe::expressions::lit::ExprLit; pub(super) use crate::dataframe::expressions::otherwise::ExprOtherwise; pub(super) use crate::dataframe::expressions::quantile::ExprQuantile; @@ -33,6 +37,7 @@ pub fn add_expressions(working_set: &mut StateWorkingSet) { // Dataframe commands bind_command!( ExprAlias, + ExprArgWhere, ExprCol, ExprConcatStr, ExprCount, @@ -49,6 +54,7 @@ pub fn add_expressions(working_set: &mut StateWorkingSet) { ExprFirst, ExprLast, ExprNUnique, + ExprIsIn, ExprIsNotNull, ExprIsNull, ExprNot, diff --git a/crates/nu-command/src/dataframe/lazy/filter.rs b/crates/nu-command/src/dataframe/lazy/filter.rs new file mode 100644 index 0000000000..a952ed6350 --- /dev/null +++ b/crates/nu-command/src/dataframe/lazy/filter.rs @@ -0,0 +1,84 @@ +use crate::dataframe::values::{Column, NuDataFrame, NuExpression, NuLazyFrame}; + +use nu_engine::CallExt; +use nu_protocol::{ + ast::Call, + engine::{Command, EngineState, Stack}, + Category, Example, PipelineData, ShellError, Signature, Span, SyntaxShape, Type, Value, +}; + +#[derive(Clone)] +pub struct LazyFilter; + +impl Command for LazyFilter { + fn name(&self) -> &str { + "filter" + } + + fn usage(&self) -> &str { + "Filter dataframe based in expression" + } + + fn signature(&self) -> Signature { + Signature::build(self.name()) + .required( + "filter expression", + SyntaxShape::Any, + "Expression that define the column selection", + ) + .input_type(Type::Custom("dataframe".into())) + .output_type(Type::Custom("dataframe".into())) + .category(Category::Custom("lazyframe".into())) + } + + fn examples(&self) -> Vec { + vec![Example { + description: "Filter dataframe using an expression", + example: "[[a b]; [6 2] [4 2] [2 2]] | into df | filter ((col a) >= 4)", + result: Some( + NuDataFrame::try_from_columns(vec![ + Column::new( + "a".to_string(), + vec![Value::test_int(6), Value::test_int(4)], + ), + Column::new( + "b".to_string(), + 
vec![Value::test_int(2), Value::test_int(2)], + ), + ]) + .expect("simple df for test should not fail") + .into_value(Span::test_data()), + ), + }] + } + + fn run( + &self, + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, + ) -> Result { + let value: Value = call.req(engine_state, stack, 0)?; + let expression = NuExpression::try_from_value(value)?; + + let lazy = NuLazyFrame::try_from_pipeline(input, call.head)?; + let lazy = NuLazyFrame::new( + lazy.from_eager, + lazy.into_polars().filter(expression.into_polars()), + ); + + Ok(PipelineData::Value(lazy.into_value(call.head)?, None)) + } +} + +#[cfg(test)] +mod test { + use super::super::super::test_dataframe::test_dataframe; + use super::*; + + #[test] + fn test_examples() { + test_dataframe(vec![Box::new(LazyFilter {})]) + } +} diff --git a/crates/nu-command/src/dataframe/lazy/mod.rs b/crates/nu-command/src/dataframe/lazy/mod.rs index 50c77f279f..2c45861d09 100644 --- a/crates/nu-command/src/dataframe/lazy/mod.rs +++ b/crates/nu-command/src/dataframe/lazy/mod.rs @@ -3,6 +3,7 @@ mod collect; mod fetch; mod fill_na; mod fill_null; +mod filter; pub mod groupby; mod join; mod macro_commands; @@ -13,17 +14,17 @@ mod to_lazy; use nu_protocol::engine::StateWorkingSet; -pub(crate) use crate::dataframe::lazy::macro_commands::*; - use crate::dataframe::lazy::aggregate::LazyAggregate; pub use crate::dataframe::lazy::collect::LazyCollect; use crate::dataframe::lazy::fetch::LazyFetch; use crate::dataframe::lazy::fill_na::LazyFillNA; use crate::dataframe::lazy::fill_null::LazyFillNull; +use crate::dataframe::lazy::filter::LazyFilter; use crate::dataframe::lazy::groupby::ToLazyGroupBy; use crate::dataframe::lazy::join::LazyJoin; +pub(crate) use crate::dataframe::lazy::macro_commands::*; use crate::dataframe::lazy::quantile::LazyQuantile; -use crate::dataframe::lazy::select::LazySelect; +pub(crate) use crate::dataframe::lazy::select::LazySelect; use 
crate::dataframe::lazy::sort_by_expr::LazySortBy; pub use crate::dataframe::lazy::to_lazy::ToLazyFrame; @@ -45,6 +46,7 @@ pub fn add_lazy_decls(working_set: &mut StateWorkingSet) { LazyFetch, LazyFillNA, LazyFillNull, + LazyFilter, LazyJoin, LazyQuantile, LazyMax, diff --git a/crates/nu-command/src/dataframe/lazy/select.rs b/crates/nu-command/src/dataframe/lazy/select.rs index 9e9c26d6eb..2981c7d7b8 100644 --- a/crates/nu-command/src/dataframe/lazy/select.rs +++ b/crates/nu-command/src/dataframe/lazy/select.rs @@ -6,8 +6,6 @@ use nu_protocol::{ engine::{Command, EngineState, Stack}, Category, Example, PipelineData, ShellError, Signature, Span, SyntaxShape, Type, Value, }; -use polars::prelude::Expr; - #[derive(Clone)] pub struct LazySelect; @@ -61,17 +59,6 @@ impl Command for LazySelect { }; let expressions = NuExpression::extract_exprs(value)?; - if expressions - .iter() - .any(|expr| !matches!(expr, Expr::Column(..))) - { - let value: Value = call.req(engine_state, stack, 0)?; - return Err(ShellError::IncompatibleParametersSingle( - "Expected only Col expressions".into(), - value.span()?, - )); - } - let lazy = NuLazyFrame::try_from_pipeline(input, call.head)?; let lazy = NuLazyFrame::new(lazy.from_eager, lazy.into_polars().select(&expressions)); diff --git a/crates/nu-command/src/dataframe/lazy/sort_by_expr.rs b/crates/nu-command/src/dataframe/lazy/sort_by_expr.rs index 3d0c0c6cb4..2d8928f78f 100644 --- a/crates/nu-command/src/dataframe/lazy/sort_by_expr.rs +++ b/crates/nu-command/src/dataframe/lazy/sort_by_expr.rs @@ -32,6 +32,11 @@ impl Command for LazySortBy { "Reverse sorting. 
Default is false", Some('r'), ) + .switch( + "nulls-last", + "nulls are shown last in the dataframe", + Some('n'), + ) .input_type(Type::Custom("dataframe".into())) .output_type(Type::Custom("dataframe".into())) .category(Category::Custom("lazyframe".into())) @@ -102,6 +107,7 @@ impl Command for LazySortBy { span: call.head, }; let expressions = NuExpression::extract_exprs(value)?; + let nulls_last = call.has_flag("nulls-last"); let reverse: Option> = call.get_flag(engine_state, stack, "reverse")?; let reverse = match reverse { @@ -128,7 +134,8 @@ impl Command for LazySortBy { let lazy = NuLazyFrame::try_from_pipeline(input, call.head)?; let lazy = NuLazyFrame::new( lazy.from_eager, - lazy.into_polars().sort_by_exprs(&expressions, reverse), + lazy.into_polars() + .sort_by_exprs(&expressions, reverse, nulls_last), ); Ok(PipelineData::Value( diff --git a/crates/nu-command/src/dataframe/series/date/get_weekday.rs b/crates/nu-command/src/dataframe/series/date/get_weekday.rs index 6d4c076d2d..49ead1bd94 100644 --- a/crates/nu-command/src/dataframe/series/date/get_weekday.rs +++ b/crates/nu-command/src/dataframe/series/date/get_weekday.rs @@ -35,7 +35,7 @@ impl Command for GetWeekDay { result: Some( NuDataFrame::try_from_columns(vec![Column::new( "0".to_string(), - vec![Value::test_int(2), Value::test_int(2)], + vec![Value::test_int(1), Value::test_int(1)], )]) .expect("simple df for test should not fail") .into_value(Span::test_data()), diff --git a/crates/nu-command/src/dataframe/series/indexes/arg_true.rs b/crates/nu-command/src/dataframe/series/indexes/arg_true.rs index 890413ceda..44a3737dd7 100644 --- a/crates/nu-command/src/dataframe/series/indexes/arg_true.rs +++ b/crates/nu-command/src/dataframe/series/indexes/arg_true.rs @@ -5,7 +5,7 @@ use nu_protocol::{ engine::{Command, EngineState, Stack}, Category, Example, PipelineData, ShellError, Signature, Span, Type, Value, }; -use polars::prelude::IntoSeries; +use polars::prelude::{arg_where, col, IntoLazy}; 
#[derive(Clone)] pub struct ArgTrue; @@ -59,23 +59,41 @@ fn command( input: PipelineData, ) -> Result { let df = NuDataFrame::try_from_pipeline(input, call.head)?; - - let series = df.as_series(call.head)?; - let bool = series.bool().map_err(|_| { - ShellError::GenericError( - "Error converting to bool".into(), - "all-false only works with series of type bool".into(), + let columns = df.as_ref().get_column_names(); + if columns.len() > 1 { + return Err(ShellError::GenericError( + "Error using as series".into(), + "dataframe has more than one column".into(), Some(call.head), None, Vec::new(), - ) - })?; + )); + } - let mut res = bool.arg_true().into_series(); - res.rename("arg_true"); + match columns.first() { + Some(column) => { + let expression = arg_where(col(column).eq(true)).alias("arg_true"); + let res = df + .as_ref() + .clone() + .lazy() + .select(&[expression]) + .collect() + .map_err(|err| { + ShellError::GenericError( + "Error creating index column".into(), + err.to_string(), + Some(call.head), + None, + Vec::new(), + ) + })?; - NuDataFrame::try_from_series(vec![res], call.head) - .map(|df| PipelineData::Value(NuDataFrame::into_value(df, call.head), None)) + let value = NuDataFrame::dataframe_into_value(res, call.head); + Ok(PipelineData::Value(value, None)) + } + _ => todo!(), + } } #[cfg(test)] diff --git a/crates/nu-command/src/dataframe/series/indexes/set_with_idx.rs b/crates/nu-command/src/dataframe/series/indexes/set_with_idx.rs index 76b6106538..18ba0ce411 100644 --- a/crates/nu-command/src/dataframe/series/indexes/set_with_idx.rs +++ b/crates/nu-command/src/dataframe/series/indexes/set_with_idx.rs @@ -146,7 +146,7 @@ fn command( NuDataFrame::try_from_series(vec![res.into_series()], call.head) } Value::Float { val, span } => { - let chunked = series.as_ref().f64().map_err(|e| { + let chunked = series.f64().map_err(|e| { ShellError::GenericError( "Error casting to f64".into(), e.to_string(), @@ -169,7 +169,7 @@ fn command( 
NuDataFrame::try_from_series(vec![res.into_series()], call.head) } Value::String { val, span } => { - let chunked = series.as_ref().utf8().map_err(|e| { + let chunked = series.utf8().map_err(|e| { ShellError::GenericError( "Error casting to string".into(), e.to_string(), diff --git a/crates/nu-command/src/dataframe/series/masks/is_duplicated.rs b/crates/nu-command/src/dataframe/series/masks/is_duplicated.rs index 533556a579..90299bb4d0 100644 --- a/crates/nu-command/src/dataframe/series/masks/is_duplicated.rs +++ b/crates/nu-command/src/dataframe/series/masks/is_duplicated.rs @@ -27,26 +27,46 @@ impl Command for IsDuplicated { } fn examples(&self) -> Vec { - vec![Example { - description: "Create mask indicating duplicated values", - example: "[5 6 6 6 8 8 8] | into df | is-duplicated", - result: Some( - NuDataFrame::try_from_columns(vec![Column::new( - "is_duplicated".to_string(), - vec![ - Value::test_bool(false), - Value::test_bool(true), - Value::test_bool(true), - Value::test_bool(true), - Value::test_bool(true), - Value::test_bool(true), - Value::test_bool(true), - ], - )]) - .expect("simple df for test should not fail") - .into_value(Span::test_data()), - ), - }] + vec![ + Example { + description: "Create mask indicating duplicated values", + example: "[5 6 6 6 8 8 8] | into df | is-duplicated", + result: Some( + NuDataFrame::try_from_columns(vec![Column::new( + "is_duplicated".to_string(), + vec![ + Value::test_bool(false), + Value::test_bool(true), + Value::test_bool(true), + Value::test_bool(true), + Value::test_bool(true), + Value::test_bool(true), + Value::test_bool(true), + ], + )]) + .expect("simple df for test should not fail") + .into_value(Span::test_data()), + ), + }, + Example { + description: "Create mask indicating duplicated rows in a dataframe", + example: "[[a, b]; [1 2] [1 2] [3 3] [3 3] [1 1]] | into df | is-duplicated", + result: Some( + NuDataFrame::try_from_columns(vec![Column::new( + "is_duplicated".to_string(), + vec![ + 
Value::test_bool(true), + Value::test_bool(true), + Value::test_bool(true), + Value::test_bool(true), + Value::test_bool(false), + ], + )]) + .expect("simple df for test should not fail") + .into_value(Span::test_data()), + ), + }, + ] } fn run( @@ -69,7 +89,7 @@ fn command( let df = NuDataFrame::try_from_pipeline(input, call.head)?; let mut res = df - .as_series(call.head)? + .as_ref() .is_duplicated() .map_err(|e| { ShellError::GenericError( @@ -84,7 +104,7 @@ fn command( res.rename("is_duplicated"); - NuDataFrame::try_from_series(vec![res.into_series()], call.head) + NuDataFrame::try_from_series(vec![res], call.head) .map(|df| PipelineData::Value(NuDataFrame::into_value(df, call.head), None)) } diff --git a/crates/nu-command/src/dataframe/series/masks/is_unique.rs b/crates/nu-command/src/dataframe/series/masks/is_unique.rs index 279f58769d..4fab38e697 100644 --- a/crates/nu-command/src/dataframe/series/masks/is_unique.rs +++ b/crates/nu-command/src/dataframe/series/masks/is_unique.rs @@ -27,26 +27,46 @@ impl Command for IsUnique { } fn examples(&self) -> Vec { - vec![Example { - description: "Create mask indicating unique values", - example: "[5 6 6 6 8 8 8] | into df | is-unique", - result: Some( - NuDataFrame::try_from_columns(vec![Column::new( - "is_unique".to_string(), - vec![ - Value::test_bool(true), - Value::test_bool(false), - Value::test_bool(false), - Value::test_bool(false), - Value::test_bool(false), - Value::test_bool(false), - Value::test_bool(false), - ], - )]) - .expect("simple df for test should not fail") - .into_value(Span::test_data()), - ), - }] + vec![ + Example { + description: "Create mask indicating unique values", + example: "[5 6 6 6 8 8 8] | into df | is-unique", + result: Some( + NuDataFrame::try_from_columns(vec![Column::new( + "is_unique".to_string(), + vec![ + Value::test_bool(true), + Value::test_bool(false), + Value::test_bool(false), + Value::test_bool(false), + Value::test_bool(false), + Value::test_bool(false), + 
Value::test_bool(false), + ], + )]) + .expect("simple df for test should not fail") + .into_value(Span::test_data()), + ), + }, + Example { + description: "Create mask indicating duplicated rows in a dataframe", + example: "[[a, b]; [1 2] [1 2] [3 3] [3 3] [1 1]] | into df | is-unique", + result: Some( + NuDataFrame::try_from_columns(vec![Column::new( + "is_unique".to_string(), + vec![ + Value::test_bool(false), + Value::test_bool(false), + Value::test_bool(false), + Value::test_bool(false), + Value::test_bool(true), + ], + )]) + .expect("simple df for test should not fail") + .into_value(Span::test_data()), + ), + }, + ] } fn run( @@ -68,18 +88,23 @@ fn command( ) -> Result { let df = NuDataFrame::try_from_pipeline(input, call.head)?; - let mut res = df.as_series(call.head)?.is_unique().map_err(|e| { - ShellError::GenericError( - "Error finding unique values".into(), - e.to_string(), - Some(call.head), - None, - Vec::new(), - ) - })?; + let mut res = df + .as_ref() + .is_unique() + .map_err(|e| { + ShellError::GenericError( + "Error finding unique values".into(), + e.to_string(), + Some(call.head), + None, + Vec::new(), + ) + })? 
+ .into_series(); + res.rename("is_unique"); - NuDataFrame::try_from_series(vec![res.into_series()], call.head) + NuDataFrame::try_from_series(vec![res], call.head) .map(|df| PipelineData::Value(NuDataFrame::into_value(df, call.head), None)) } diff --git a/crates/nu-command/src/dataframe/series/masks/set.rs b/crates/nu-command/src/dataframe/series/masks/set.rs index 0edd389f01..27c44ae77d 100644 --- a/crates/nu-command/src/dataframe/series/masks/set.rs +++ b/crates/nu-command/src/dataframe/series/masks/set.rs @@ -129,7 +129,7 @@ fn command( NuDataFrame::try_from_series(vec![res.into_series()], call.head) } Value::Float { val, span } => { - let chunked = series.as_ref().f64().map_err(|e| { + let chunked = series.f64().map_err(|e| { ShellError::GenericError( "Error casting to f64".into(), e.to_string(), @@ -152,7 +152,7 @@ fn command( NuDataFrame::try_from_series(vec![res.into_series()], call.head) } Value::String { val, span } => { - let chunked = series.as_ref().utf8().map_err(|e| { + let chunked = series.utf8().map_err(|e| { ShellError::GenericError( "Error casting to string".into(), e.to_string(), diff --git a/crates/nu-command/src/dataframe/series/value_counts.rs b/crates/nu-command/src/dataframe/series/value_counts.rs index 3242b2530a..4f19a495c8 100644 --- a/crates/nu-command/src/dataframe/series/value_counts.rs +++ b/crates/nu-command/src/dataframe/series/value_counts.rs @@ -6,6 +6,8 @@ use nu_protocol::{ Category, Example, PipelineData, ShellError, Signature, Span, Type, Value, }; +use polars::prelude::SeriesMethods; + #[derive(Clone)] pub struct ValueCount; @@ -66,7 +68,7 @@ fn command( let df = NuDataFrame::try_from_pipeline(input, call.head)?; let series = df.as_series(call.head)?; - let res = series.value_counts(false).map_err(|e| { + let res = series.value_counts(false, false).map_err(|e| { ShellError::GenericError( "Error calculating value counts values".into(), e.to_string(), diff --git a/crates/nu-command/src/filters/find.rs 
b/crates/nu-command/src/filters/find.rs index ae8a502187..3fe81b121c 100644 --- a/crates/nu-command/src/filters/find.rs +++ b/crates/nu-command/src/filters/find.rs @@ -190,7 +190,7 @@ fn find_with_regex( (true, true, true) => "(?ims)", }; - let regex = flags.to_string() + &regex; + let regex = flags.to_string() + regex.as_str(); let re = Regex::new(regex.as_str()) .map_err(|e| ShellError::UnsupportedInput(format!("incorrect regex: {}", e), span))?; diff --git a/crates/nu-command/src/filters/lines.rs b/crates/nu-command/src/filters/lines.rs index 1ec062d182..b4a00f3fbf 100644 --- a/crates/nu-command/src/filters/lines.rs +++ b/crates/nu-command/src/filters/lines.rs @@ -199,7 +199,7 @@ impl Iterator for RawStreamLinesAdapter { if !self.incomplete_line.is_empty() { if let Some(first) = lines.first() { let new_incomplete_line = - self.incomplete_line.to_string() + first; + self.incomplete_line.to_string() + first.as_str(); lines.splice(0..1, vec![new_incomplete_line]); self.incomplete_line = String::new(); } diff --git a/crates/nu-command/src/formats/from/yaml.rs b/crates/nu-command/src/formats/from/yaml.rs index abab7cdaa2..d7242f3ef8 100644 --- a/crates/nu-command/src/formats/from/yaml.rs +++ b/crates/nu-command/src/formats/from/yaml.rs @@ -133,7 +133,7 @@ fn convert_yaml_value_to_nu_value(v: &serde_yaml::Value, span: Span) -> Result { Some(Value::String { - val: "{{ ".to_owned() + s + " }}", + val: "{{ ".to_owned() + s.as_str() + " }}", span, }) } diff --git a/crates/nu-command/tests/commands/source.rs b/crates/nu-command/tests/commands/source.rs index b663f1cfba..baf659e1d6 100644 --- a/crates/nu-command/tests/commands/source.rs +++ b/crates/nu-command/tests/commands/source.rs @@ -59,7 +59,7 @@ fn try_source_foo_with_double_quotes_in(testdir: &str, playdir: &str) { sandbox.mkdir(&testdir); sandbox.with_files(vec![FileWithContent(&foo_file, "echo foo")]); - let cmd = String::from("source ") + r#"""# + &foo_file + r#"""#; + let cmd = String::from("source ") + r#"""# +
foo_file.as_str() + r#"""#; let actual = nu!(cwd: dirs.test(), &cmd); @@ -76,7 +76,7 @@ fn try_source_foo_with_single_quotes_in(testdir: &str, playdir: &str) { sandbox.mkdir(&testdir); sandbox.with_files(vec![FileWithContent(&foo_file, "echo foo")]); - let cmd = String::from("source ") + r#"'"# + &foo_file + r#"'"#; + let cmd = String::from("source ") + r#"'"# + foo_file.as_str() + r#"'"#; let actual = nu!(cwd: dirs.test(), &cmd); @@ -93,7 +93,7 @@ fn try_source_foo_without_quotes_in(testdir: &str, playdir: &str) { sandbox.mkdir(&testdir); sandbox.with_files(vec![FileWithContent(&foo_file, "echo foo")]); - let cmd = String::from("source ") + &foo_file; + let cmd = String::from("source ") + foo_file.as_str(); let actual = nu!(cwd: dirs.test(), &cmd);