From a2dc3e3b33e49a63d208fd387430d2c278debebb Mon Sep 17 00:00:00 2001 From: pyz4 <42039243+pyz4@users.noreply.github.com> Date: Fri, 18 Apr 2025 16:50:36 -0400 Subject: [PATCH] feat(polars): enable `as_date` and `as_datetime` to handle expressions as inputs (#15590) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Description This PR is a follow-up to the previous PR #15557 and part of a wider campaign to enable certain polars commands that only operated on the entire dataframe to also operate on expressions. Here, we enable two commands `polars as-date` and `polars as-datetime` to receive expressions as inputs so that they may be used on specific columns in a dataframe with multiple columns of different types. See examples below. ```nushell > [[a b]; ["2025-04-01" 1] ["2025-04-02" 2] ["2025-04-03" 3]] | polars into-df | polars select (polars col a | polars as-date %Y-%m-%d) b | polars collect ╭───┬───────────────────────┬───╮ │ # │ a │ b │ ├───┼───────────────────────┼───┤ │ 0 │ 04/01/2025 12:00:00AM │ 1 │ │ 1 │ 04/02/2025 12:00:00AM │ 2 │ │ 2 │ 04/03/2025 12:00:00AM │ 3 │ ╰───┴───────────────────────┴───╯ > seq date -b 2025-04-01 --periods 4 --increment 25min -o "%Y-%m-%d %H:%M:%S" | polars into-df | polars select (polars col 0 | polars as-datetime "%Y-%m-%d %H:%M:%S") | polars collect ╭───┬───────────────────────╮ │ # │ 0 │ ├───┼───────────────────────┤ │ 0 │ 04/01/2025 12:00:00AM │ │ 1 │ 04/01/2025 12:25:00AM │ │ 2 │ 04/01/2025 12:50:00AM │ │ 3 │ 04/01/2025 01:15:00AM │ ╰───┴───────────────────────╯ ``` # User-Facing Changes No breaking changes. Users have the additional option to use `polars as-date` and `polars as-datetime` in expressions that operate on specific columns. # Tests + Formatting Examples have been added to `polars as-date` and `polars as-datetime`. # After Submitting --- .../src/dataframe/command/datetime/as_date.rs | 185 ++++++++++++++++- .../dataframe/command/datetime/as_datetime.rs | 195 ++++++++++++++++-- 2 files changed, 357 insertions(+), 23 deletions(-) diff --git a/crates/nu_plugin_polars/src/dataframe/command/datetime/as_date.rs b/crates/nu_plugin_polars/src/dataframe/command/datetime/as_date.rs index aa36e44ece..eb9c056309 100644 --- a/crates/nu_plugin_polars/src/dataframe/command/datetime/as_date.rs +++ b/crates/nu_plugin_polars/src/dataframe/command/datetime/as_date.rs @@ -1,13 +1,19 @@ -use crate::{values::CustomValueSupport, PolarsPlugin}; - -use super::super::super::values::NuDataFrame; +use crate::{ + values::{ + cant_convert_err, Column, CustomValueSupport, NuDataFrame, NuExpression, NuLazyFrame, + NuSchema, PolarsPluginObject, PolarsPluginType, + }, + PolarsPlugin, +}; +use chrono::DateTime; +use std::sync::Arc; use nu_plugin::{EngineInterface, EvaluatedCall, PluginCommand}; use nu_protocol::{ record, Category, Example, LabeledError, PipelineData, ShellError, Signature, Span, SyntaxShape, Type, Value, }; -use polars::prelude::{IntoSeries, StringMethods}; +use polars::prelude::{col, DataType, Field, IntoSeries, Schema, StringMethods, StrptimeOptions}; #[derive(Clone)] pub struct AsDate; @@ -34,10 +40,16 @@ impl PluginCommand for AsDate { Signature::build(self.name()) .required("format", SyntaxShape::String, "formatting date string") .switch("not-exact", "the format string may be contained in the date (e.g. foo-2021-01-01-bar could match 2021-01-01)", Some('n')) - .input_output_type( - Type::Custom("dataframe".into()), - Type::Custom("dataframe".into()), - ) + .input_output_types(vec![ + ( + Type::Custom("dataframe".into()), + Type::Custom("dataframe".into()), + ), + ( + Type::Custom("expression".into()), + Type::Custom("expression".into()), + ), + ]) .category(Category::Custom("dataframe".into())) } @@ -46,12 +58,110 @@ impl PluginCommand for AsDate { Example { description: "Converts string to date", example: r#"["2021-12-30" "2021-12-31"] | polars into-df | polars as-date "%Y-%m-%d""#, - result: None, // help is needed on how to provide results + result: Some( + NuDataFrame::try_from_columns( + vec![Column::new( + "date".to_string(), + vec![ + // Nushell's Value::date only maps to DataType::Datetime and not DataType::Date + // We therefore force the type to be DataType::Date in the schema + Value::date( + DateTime::parse_from_str( + "2021-12-30 00:00:00 +0000", + "%Y-%m-%d %H:%M:%S %z", + ) + .expect("date calculation should not fail in test"), + Span::test_data(), + ), + Value::date( + DateTime::parse_from_str( + "2021-12-31 00:00:00 +0000", + "%Y-%m-%d %H:%M:%S %z", + ) + .expect("date calculation should not fail in test"), + Span::test_data(), + ), + ], + )], + Some(NuSchema::new(Arc::new(Schema::from_iter(vec![ + Field::new("date".into(), DataType::Date), + ])))), + ) + .expect("simple df for test should not fail") + .into_value(Span::test_data()), + ), }, Example { description: "Converts string to date", example: r#"["2021-12-30" "2021-12-31 21:00:00"] | polars into-df | polars as-date "%Y-%m-%d" --not-exact"#, - result: None, + result: Some( + NuDataFrame::try_from_columns( + vec![Column::new( + "date".to_string(), + vec![ + // Nushell's Value::date only maps to DataType::Datetime and not DataType::Date + // We therefore force the type to be DataType::Date in the schema + Value::date( + DateTime::parse_from_str( + "2021-12-30 00:00:00 +0000", + "%Y-%m-%d %H:%M:%S %z", + ) + .expect("date calculation should not fail in test"), + Span::test_data(), + ), + Value::date( + DateTime::parse_from_str( + "2021-12-31 00:00:00 +0000", + "%Y-%m-%d %H:%M:%S %z", + ) + .expect("date calculation should not fail in test"), + Span::test_data(), + ), + ], + )], + Some(NuSchema::new(Arc::new(Schema::from_iter(vec![ + Field::new("date".into(), DataType::Date), + ])))), + ) + .expect("simple df for test should not fail") + .into_value(Span::test_data()), + ), + }, + Example { + description: "Converts string to date in an expression", + example: r#"["2021-12-30" "2021-12-31 21:00:00"] | polars into-lazy | polars select (polars col 0 | polars as-date "%Y-%m-%d" --not-exact)"#, + result: Some( + NuDataFrame::try_from_columns( + vec![Column::new( + "date".to_string(), + vec![ + // Nushell's Value::date only maps to DataType::Datetime and not DataType::Date + // We therefore force the type to be DataType::Date in the schema + Value::date( + DateTime::parse_from_str( + "2021-12-30 00:00:00 +0000", + "%Y-%m-%d %H:%M:%S %z", + ) + .expect("date calculation should not fail in test"), + Span::test_data(), + ), + Value::date( + DateTime::parse_from_str( + "2021-12-31 00:00:00 +0000", + "%Y-%m-%d %H:%M:%S %z", + ) + .expect("date calculation should not fail in test"), + Span::test_data(), + ), + ], + )], + Some(NuSchema::new(Arc::new(Schema::from_iter(vec![ + Field::new("date".into(), DataType::Date), + ])))), + ) + .expect("simple df for test should not fail") + .into_value(Span::test_data()), + ), }, Example { description: "Output is of date type", @@ -85,8 +195,61 @@ fn command( ) -> Result { let format: String = call.req(0)?; let not_exact = call.has_flag("not-exact")?; + let value = input.into_value(call.head)?; + + let options = StrptimeOptions { + format: Some(format.into()), + strict: true, + exact: !not_exact, + cache: Default::default(), + }; + + match PolarsPluginObject::try_from_value(plugin, &value)? { + PolarsPluginObject::NuLazyFrame(lazy) => command_lazy(plugin, engine, call, lazy, options), + PolarsPluginObject::NuDataFrame(df) => command_eager(plugin, engine, call, df, options), + PolarsPluginObject::NuExpression(expr) => { + let res: NuExpression = expr.into_polars().str().to_date(options).into(); + res.to_pipeline_data(plugin, engine, call.head) + } + _ => Err(cant_convert_err( + &value, + &[ + PolarsPluginType::NuDataFrame, + PolarsPluginType::NuLazyFrame, + PolarsPluginType::NuExpression, + ], + )), + } +} + +fn command_lazy( + plugin: &PolarsPlugin, + engine: &EngineInterface, + call: &EvaluatedCall, + lazy: NuLazyFrame, + options: StrptimeOptions, +) -> Result { + NuLazyFrame::new( + false, + lazy.to_polars().select([col("*").str().to_date(options)]), + ) + .to_pipeline_data(plugin, engine, call.head) +} + +fn command_eager( + plugin: &PolarsPlugin, + engine: &EngineInterface, + call: &EvaluatedCall, + df: NuDataFrame, + options: StrptimeOptions, +) -> Result { + let format = if let Some(format) = options.format { + format.to_string() + } else { + unreachable!("`format` will never be None") + }; + let not_exact = !options.exact; - let df = NuDataFrame::try_from_pipeline_coerce(plugin, input, call.head)?; let series = df.as_series(call.head)?; let casted = series.str().map_err(|e| ShellError::GenericError { error: "Error casting to string".into(), diff --git a/crates/nu_plugin_polars/src/dataframe/command/datetime/as_datetime.rs b/crates/nu_plugin_polars/src/dataframe/command/datetime/as_datetime.rs index 3a246cc58c..c06a349496 100644 --- a/crates/nu_plugin_polars/src/dataframe/command/datetime/as_datetime.rs +++ b/crates/nu_plugin_polars/src/dataframe/command/datetime/as_datetime.rs @@ -1,15 +1,22 @@ -use crate::{values::CustomValueSupport, PolarsPlugin}; +use crate::{ + values::{ + cant_convert_err, Column, CustomValueSupport, NuDataFrame, NuExpression, NuLazyFrame, + NuSchema, PolarsPluginObject, PolarsPluginType, + }, + PolarsPlugin, +}; +use chrono::DateTime; use std::sync::Arc; -use super::super::super::values::{Column, NuDataFrame, NuSchema}; - -use chrono::DateTime; use nu_plugin::{EngineInterface, EvaluatedCall, PluginCommand}; use nu_protocol::{ Category, Example, LabeledError, PipelineData, ShellError, Signature, Span, SyntaxShape, Type, Value, }; -use polars::prelude::{DataType, Field, IntoSeries, Schema, StringMethods, TimeUnit}; +use polars::prelude::{ + col, DataType, Expr, Field, IntoSeries, LiteralValue, PlSmallStr, Schema, StringMethods, + StrptimeOptions, TimeUnit, +}; #[derive(Clone)] pub struct AsDateTime; @@ -42,14 +49,30 @@ impl PluginCommand for AsDateTime { fn signature(&self) -> Signature { Signature::build(self.name()) + .input_output_types(vec![ + ( + Type::Custom("dataframe".into()), + Type::Custom("dataframe".into()), + ), + ( + Type::Custom("expression".into()), + Type::Custom("expression".into()), + ), + ]) .required("format", SyntaxShape::String, "formatting date time string") .switch("not-exact", "the format string may be contained in the date (e.g. foo-2021-01-01-bar could match 2021-01-01)", Some('n')) - .switch("naive", "the input datetimes should be parsed as naive (i.e., not timezone-aware)", None) - .input_output_type( - Type::Custom("dataframe".into()), - Type::Custom("dataframe".into()), - ) - .category(Category::Custom("dataframe".into())) + .switch("naive", "the input datetimes should be parsed as naive (i.e., not timezone-aware). Ignored if input is an expression.", None) + .named( + "ambiguous", + SyntaxShape::OneOf(vec![SyntaxShape::String, SyntaxShape::Nothing]), + r#"Determine how to deal with ambiguous datetimes: + `raise` (default): raise error + `earliest`: use the earliest datetime + `latest`: use the latest datetime + `null`: set to null + Used only when input is a lazyframe or expression and ignored otherwise"#, + Some('a'), + ) .category(Category::Custom("dataframe".into())) } fn examples(&self) -> Vec { @@ -158,6 +181,63 @@ impl PluginCommand for AsDateTime { .into_value(Span::test_data()), ), }, + Example { + description: "Converts string to datetime using the `--not-exact` flag even with excessive symbols in an expression", + example: r#"["2025-11-02 00:00:00", "2025-11-02 01:00:00", "2025-11-02 02:00:00", "2025-11-02 03:00:00"] | polars into-df | polars select (polars col 0 | polars as-datetime "%Y-%m-%d %H:%M:%S")"#, + result: Some( + NuDataFrame::try_from_columns( + vec![Column::new( + "datetime".to_string(), + vec![ + Value::date( + DateTime::parse_from_str( + "2025-11-02 00:00:00 +0000", + "%Y-%m-%d %H:%M:%S %z", + ) + .expect("date calculation should not fail in test"), + Span::test_data(), + ), + + Value::date( + DateTime::parse_from_str( + "2025-11-02 01:00:00 +0000", + "%Y-%m-%d %H:%M:%S %z", + ) + .expect("date calculation should not fail in test"), + Span::test_data(), + ), + Value::date( + DateTime::parse_from_str( + "2025-11-02 02:00:00 +0000", + "%Y-%m-%d %H:%M:%S %z", + ) + .expect("date calculation should not fail in test"), + Span::test_data(), + ), + Value::date( + DateTime::parse_from_str( + "2025-11-02 03:00:00 +0000", + "%Y-%m-%d %H:%M:%S %z", + ) + .expect("date calculation should not fail in test"), + Span::test_data(), + ), + ], + )], + Some(NuSchema::new(Arc::new(Schema::from_iter(vec![ + Field::new( + "datetime".into(), + DataType::Datetime( + TimeUnit::Nanoseconds, + None + ), + ), + ])))), + ) + .expect("simple df for test should not fail") + .into_value(Span::test_data()), + ), + }, ] } @@ -182,7 +262,98 @@ fn command( let not_exact = call.has_flag("not-exact")?; let tz_aware = !call.has_flag("naive")?; - let df = NuDataFrame::try_from_pipeline_coerce(plugin, input, call.head)?; + let value = input.into_value(call.head)?; + + let options = StrptimeOptions { + format: Some(format.into()), + strict: true, + exact: !not_exact, + cache: Default::default(), + }; + + let ambiguous = match call.get_flag::("ambiguous")? { + Some(Value::String { val, internal_span }) => match val.as_str() { + "raise" | "earliest" | "latest" => Ok(val), + _ => Err(ShellError::GenericError { + error: "Invalid argument value".into(), + msg: "`ambiguous` must be one of raise, earliest, latest, or null".into(), + span: Some(internal_span), + help: None, + inner: vec![], + }), + }, + Some(Value::Nothing { .. }) => Ok("null".into()), + Some(_) => unreachable!("Argument only accepts string or null."), + None => Ok("raise".into()), + } + .map_err(LabeledError::from)?; + + match PolarsPluginObject::try_from_value(plugin, &value)? { + PolarsPluginObject::NuLazyFrame(lazy) => { + command_lazy(plugin, engine, call, lazy, options, ambiguous) + } + PolarsPluginObject::NuDataFrame(df) => { + command_eager(plugin, engine, call, df, options, tz_aware) + } + PolarsPluginObject::NuExpression(expr) => { + let res: NuExpression = expr + .into_polars() + .str() + .to_datetime( + None, + None, + options, + Expr::Literal(LiteralValue::String(PlSmallStr::from_string(ambiguous))), + ) + .into(); + res.to_pipeline_data(plugin, engine, call.head) + } + _ => Err(cant_convert_err( + &value, + &[ + PolarsPluginType::NuDataFrame, + PolarsPluginType::NuLazyFrame, + PolarsPluginType::NuExpression, + ], + )), + } +} + +fn command_lazy( + plugin: &PolarsPlugin, + engine: &EngineInterface, + call: &EvaluatedCall, + lazy: NuLazyFrame, + options: StrptimeOptions, + ambiguous: String, +) -> Result { + NuLazyFrame::new( + false, + lazy.to_polars().select([col("*").str().to_datetime( + None, + None, + options, + Expr::Literal(LiteralValue::String(PlSmallStr::from_string(ambiguous))), + )]), + ) + .to_pipeline_data(plugin, engine, call.head) +} + +fn command_eager( + plugin: &PolarsPlugin, + engine: &EngineInterface, + call: &EvaluatedCall, + df: NuDataFrame, + options: StrptimeOptions, + tz_aware: bool, +) -> Result { + let format = if let Some(format) = options.format { + format.to_string() + } else { + unreachable!("`format` will never be None") + }; + let not_exact = !options.exact; + let series = df.as_series(call.head)?; let casted = series.str().map_err(|e| ShellError::GenericError { error: "Error casting to string".into(),