From 7ca2a6f8ace1f5d8e2e9d91df77abe7d1942ef33 Mon Sep 17 00:00:00 2001 From: pyz4 <42039243+pyz4@users.noreply.github.com> Date: Fri, 4 Apr 2025 12:43:21 -0400 Subject: [PATCH] FIX `polars as-datetime`: ignores timezone information on conversion (#15490) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Description This PR seeks to fix an error in `polars as-datetime` where timezone information is entirely ignored. This behavior raises a host of silent errors when dealing with datetime conversions (see example below). ## Current Implementation Timezones are entirely ignored and datetimes with different timezones are converted to the same naive datetimes even when the user specifically indicates that the timezone should be parsed. For example, "2021-12-30 00:00:00 +0000" and "2021-12-30 00:00:00 -0400" will both be parsed to "2021-12-30 00:00:00" even when the format string specifically includes "%z". ``` $ ["2021-12-30 00:00:00 +0000" "2021-12-30 00:00:00 -0400"] | polars into-df | polars as-datetime "%Y-%m-%d %H:%M:%S %z" ╭───┬───────────────────────╮ │ # │ datetime │ ├───┼───────────────────────┤ │ 0 │ 12/30/2021 12:00:00AM │ │ 1 │ 12/30/2021 12:00:00AM │ <-- Same datetime even though the first is +0000 and second is -0400 ╰───┴───────────────────────╯ $ ["2021-12-30 00:00:00 +0000" "2021-12-30 00:00:00 -0400"] | polars into-df | polars as-datetime "%Y-%m-%d %H:%M:%S %z" | polars schema ╭──────────┬──────────────╮ │ datetime │ datetime │ ╰──────────┴──────────────╯ ``` ## New Implementation Datetimes are converted to UTC and timezone information is retained. ``` $ "2021-12-30 00:00:00 +0000" "2021-12-30 00:00:00 -0400"] | polars into-df | polars as-datetime "%Y-%m-%d %H:%M:%S %z" ╭───┬───────────────────────╮ │ # │ datetime │ ├───┼───────────────────────┤ │ 0 │ 12/30/2021 12:00:00AM │ │ 1 │ 12/30/2021 04:00:00AM │ <-- Converted to UTC ╰───┴───────────────────────╯ $ ["2021-12-30 00:00:00 +0000" "2021-12-30 00:00:00 -0400"] | polars into-df | polars as-datetime "%Y-%m-%d %H:%M:%S %z" | polars schema ╭──────────┬───────────────────╮ │ datetime │ datetime │ ╰──────────┴───────────────────╯ ``` The user may intentionally ignore timezone information by setting the `--naive` flag. ``` $ ["2021-12-30 00:00:00 +0000" "2021-12-30 00:00:00 -0400"] | polars into-df | polars as-datetime "%Y-%m-%d %H:%M:%S %z" --naive ╭───┬───────────────────────╮ │ # │ datetime │ ├───┼───────────────────────┤ │ 0 │ 12/30/2021 12:00:00AM │ │ 1 │ 12/30/2021 12:00:00AM │ <-- the -0400 offset is ignored when --naive is set ╰───┴───────────────────────╯ $ ["2021-12-30 00:00:00 +0000" "2021-12-30 00:00:00 -0400"] | polars into-df | polars as-datetime "%Y-%m-%d %H:%M:%S %z" --naive | polars schema ╭──────────┬──────────────╮ │ datetime │ datetime │ ╰──────────┴──────────────╯ ``` # User-Facing Changes `polars as-datetime` will now account for timezone information and return type `datetime` rather than `datetime` by default. The user can replicate the previous behavior by setting `--naive`. # Tests + Formatting Tests that incorporated `polars as-datetime` had to be tweaked to include `--naive` flag to replicate previous behavior. # After Submitting --- .../src/dataframe/command/core/to_repr.rs | 34 +++++++------- .../dataframe/command/datetime/as_datetime.rs | 41 ++++++++++++----- .../dataframe/command/datetime/datepart.rs | 17 ++++--- .../values/nu_dataframe/conversion.rs | 45 +++++++++++-------- 4 files changed, 85 insertions(+), 52 deletions(-) diff --git a/crates/nu_plugin_polars/src/dataframe/command/core/to_repr.rs b/crates/nu_plugin_polars/src/dataframe/command/core/to_repr.rs index d867f320cb..417a37cd2e 100644 --- a/crates/nu_plugin_polars/src/dataframe/command/core/to_repr.rs +++ b/crates/nu_plugin_polars/src/dataframe/command/core/to_repr.rs @@ -39,14 +39,14 @@ impl PluginCommand for ToRepr { result: Some(Value::string( r#" shape: (2, 2) -┌─────────────────────┬─────┐ -│ a ┆ b │ -│ --- ┆ --- │ -│ datetime[ns] ┆ i64 │ -╞═════════════════════╪═════╡ -│ 2025-01-01 00:00:00 ┆ 2 │ -│ 2025-01-02 00:00:00 ┆ 4 │ -└─────────────────────┴─────┘"# +┌─────────────────────────┬─────┐ +│ a ┆ b │ +│ --- ┆ --- │ +│ datetime[ns, UTC] ┆ i64 │ +╞═════════════════════════╪═════╡ +│ 2025-01-01 00:00:00 UTC ┆ 2 │ +│ 2025-01-02 00:00:00 UTC ┆ 4 │ +└─────────────────────────┴─────┘"# .trim(), Span::test_data(), )), @@ -54,18 +54,18 @@ shape: (2, 2) Example { description: "Shows lazy dataframe in repr format", example: - "[[a b]; [2025-01-01 2] [2025-01-02 4]] | polars into-df | polars into-lazy | polars into-repr", + "[[a b]; [2025-01-01 2] [2025-01-02 4]] | polars into-lazy | polars into-repr", result: Some(Value::string( r#" shape: (2, 2) -┌─────────────────────┬─────┐ -│ a ┆ b │ -│ --- ┆ --- │ -│ datetime[ns] ┆ i64 │ -╞═════════════════════╪═════╡ -│ 2025-01-01 00:00:00 ┆ 2 │ -│ 2025-01-02 00:00:00 ┆ 4 │ -└─────────────────────┴─────┘"# +┌─────────────────────────┬─────┐ +│ a ┆ b │ +│ --- ┆ --- │ +│ datetime[ns, UTC] ┆ i64 │ +╞═════════════════════════╪═════╡ +│ 2025-01-01 00:00:00 UTC ┆ 2 │ +│ 2025-01-02 00:00:00 UTC ┆ 4 │ +└─────────────────────────┴─────┘"# .trim(), Span::test_data(), )), diff --git a/crates/nu_plugin_polars/src/dataframe/command/datetime/as_datetime.rs b/crates/nu_plugin_polars/src/dataframe/command/datetime/as_datetime.rs index a38779927f..3a246cc58c 100644 --- a/crates/nu_plugin_polars/src/dataframe/command/datetime/as_datetime.rs +++ b/crates/nu_plugin_polars/src/dataframe/command/datetime/as_datetime.rs @@ -1,6 +1,7 @@ use crate::{values::CustomValueSupport, PolarsPlugin}; +use std::sync::Arc; -use super::super::super::values::{Column, NuDataFrame}; +use super::super::super::values::{Column, NuDataFrame, NuSchema}; use chrono::DateTime; use nu_plugin::{EngineInterface, EvaluatedCall, PluginCommand}; @@ -8,7 +9,7 @@ use nu_protocol::{ Category, Example, LabeledError, PipelineData, ShellError, Signature, Span, SyntaxShape, Type, Value, }; -use polars::prelude::{IntoSeries, StringMethods, TimeUnit}; +use polars::prelude::{DataType, Field, IntoSeries, Schema, StringMethods, TimeUnit}; #[derive(Clone)] pub struct AsDateTime; @@ -43,6 +44,7 @@ impl PluginCommand for AsDateTime { Signature::build(self.name()) .required("format", SyntaxShape::String, "formatting date time string") .switch("not-exact", "the format string may be contained in the date (e.g. foo-2021-01-01-bar could match 2021-01-01)", Some('n')) + .switch("naive", "the input datetimes should be parsed as naive (i.e., not timezone-aware)", None) .input_output_type( Type::Custom("dataframe".into()), Type::Custom("dataframe".into()), @@ -54,7 +56,7 @@ impl PluginCommand for AsDateTime { vec![ Example { description: "Converts string to datetime", - example: r#"["2021-12-30 00:00:00" "2021-12-31 00:00:00"] | polars into-df | polars as-datetime "%Y-%m-%d %H:%M:%S""#, + example: r#"["2021-12-30 00:00:00 -0400" "2021-12-31 00:00:00 -0400"] | polars into-df | polars as-datetime "%Y-%m-%d %H:%M:%S %z""#, result: Some( NuDataFrame::try_from_columns( vec![Column::new( @@ -62,7 +64,7 @@ impl PluginCommand for AsDateTime { vec![ Value::date( DateTime::parse_from_str( - "2021-12-30 00:00:00 +0000", + "2021-12-30 00:00:00 -0400", "%Y-%m-%d %H:%M:%S %z", ) .expect("date calculation should not fail in test"), @@ -70,7 +72,7 @@ impl PluginCommand for AsDateTime { ), Value::date( DateTime::parse_from_str( - "2021-12-31 00:00:00 +0000", + "2021-12-31 00:00:00 -0400", "%Y-%m-%d %H:%M:%S %z", ) .expect("date calculation should not fail in test"), @@ -86,7 +88,7 @@ impl PluginCommand for AsDateTime { }, Example { description: "Converts string to datetime with high resolutions", - example: r#"["2021-12-30 00:00:00.123456789" "2021-12-31 00:00:00.123456789"] | polars into-df | polars as-datetime "%Y-%m-%d %H:%M:%S.%9f""#, + example: r#"["2021-12-30 00:00:00.123456789" "2021-12-31 00:00:00.123456789"] | polars into-df | polars as-datetime "%Y-%m-%d %H:%M:%S.%9f" --naive"#, result: Some( NuDataFrame::try_from_columns( vec![Column::new( @@ -110,7 +112,15 @@ impl PluginCommand for AsDateTime { ), ], )], - None, + Some(NuSchema::new(Arc::new(Schema::from_iter(vec![ + Field::new( + "datetime".into(), + DataType::Datetime( + TimeUnit::Nanoseconds, + None + ), + ), + ])))), ) .expect("simple df for test should not fail") .into_value(Span::test_data()), @@ -118,7 +128,7 @@ impl PluginCommand for AsDateTime { }, Example { description: "Converts string to datetime using the `--not-exact` flag even with excessive symbols", - example: r#"["2021-12-30 00:00:00 GMT+4"] | polars into-df | polars as-datetime "%Y-%m-%d %H:%M:%S" --not-exact"#, + example: r#"["2021-12-30 00:00:00 GMT+4"] | polars into-df | polars as-datetime "%Y-%m-%d %H:%M:%S" --not-exact --naive"#, result: Some( NuDataFrame::try_from_columns( vec![Column::new( @@ -134,7 +144,15 @@ impl PluginCommand for AsDateTime { ), ], )], - None, + Some(NuSchema::new(Arc::new(Schema::from_iter(vec![ + Field::new( + "datetime".into(), + DataType::Datetime( + TimeUnit::Nanoseconds, + None + ), + ), + ])))), ) .expect("simple df for test should not fail") .into_value(Span::test_data()), @@ -162,6 +180,7 @@ fn command( ) -> Result { let format: String = call.req(0)?; let not_exact = call.has_flag("not-exact")?; + let tz_aware = !call.has_flag("naive")?; let df = NuDataFrame::try_from_pipeline_coerce(plugin, input, call.head)?; let series = df.as_series(call.head)?; @@ -177,7 +196,7 @@ fn command( casted.as_datetime_not_exact( Some(format.as_str()), TimeUnit::Nanoseconds, - false, + tz_aware, None, &Default::default(), ) @@ -186,7 +205,7 @@ fn command( Some(format.as_str()), TimeUnit::Nanoseconds, false, - false, + tz_aware, None, &Default::default(), ) diff --git a/crates/nu_plugin_polars/src/dataframe/command/datetime/datepart.rs b/crates/nu_plugin_polars/src/dataframe/command/datetime/datepart.rs index c374ac82f5..1e18639d06 100644 --- a/crates/nu_plugin_polars/src/dataframe/command/datetime/datepart.rs +++ b/crates/nu_plugin_polars/src/dataframe/command/datetime/datepart.rs @@ -1,7 +1,8 @@ use crate::values::NuExpression; +use std::sync::Arc; use crate::{ - dataframe::values::{Column, NuDataFrame}, + dataframe::values::{Column, NuDataFrame, NuSchema}, values::CustomValueSupport, PolarsPlugin, }; @@ -13,7 +14,7 @@ use nu_protocol::{ }; use polars::{ datatypes::{DataType, TimeUnit}, - prelude::NamedFrom, + prelude::{Field, NamedFrom, Schema}, series::Series, }; @@ -54,14 +55,20 @@ impl PluginCommand for ExprDatePart { vec![ Example { description: "Creates an expression to capture the year date part", - example: r#"[["2021-12-30T01:02:03.123456789"]] | polars into-df | polars as-datetime "%Y-%m-%dT%H:%M:%S.%9f" | polars with-column [(polars col datetime | polars datepart year | polars as datetime_year )]"#, + example: r#"[["2021-12-30T01:02:03.123456789"]] | polars into-df | polars as-datetime "%Y-%m-%dT%H:%M:%S.%9f" --naive | polars with-column [(polars col datetime | polars datepart year | polars as datetime_year )]"#, result: Some( NuDataFrame::try_from_columns( vec![ Column::new("datetime".to_string(), vec![Value::test_date(dt)]), Column::new("datetime_year".to_string(), vec![Value::test_int(2021)]), ], - None, + Some(NuSchema::new(Arc::new(Schema::from_iter(vec![ + Field::new( + "datetime".into(), + DataType::Datetime(TimeUnit::Nanoseconds, None), + ), + Field::new("datetime_year".into(), DataType::Int64), + ])))), ) .expect("simple df for test should not fail") .into_value(Span::test_data()), @@ -69,7 +76,7 @@ impl PluginCommand for ExprDatePart { }, Example { description: "Creates an expression to capture multiple date parts", - example: r#"[["2021-12-30T01:02:03.123456789"]] | polars into-df | polars as-datetime "%Y-%m-%dT%H:%M:%S.%9f" | + example: r#"[["2021-12-30T01:02:03.123456789"]] | polars into-df | polars as-datetime "%Y-%m-%dT%H:%M:%S.%9f" --naive | polars with-column [ (polars col datetime | polars datepart year | polars as datetime_year ), (polars col datetime | polars datepart month | polars as datetime_month ), (polars col datetime | polars datepart day | polars as datetime_day ), diff --git a/crates/nu_plugin_polars/src/dataframe/values/nu_dataframe/conversion.rs b/crates/nu_plugin_polars/src/dataframe/values/nu_dataframe/conversion.rs index 82051f7b1b..8b898e449a 100644 --- a/crates/nu_plugin_polars/src/dataframe/values/nu_dataframe/conversion.rs +++ b/crates/nu_plugin_polars/src/dataframe/values/nu_dataframe/conversion.rs @@ -245,7 +245,10 @@ fn value_to_data_type(value: &Value) -> Option { Value::Float { .. } => Some(DataType::Float64), Value::String { .. } => Some(DataType::String), Value::Bool { .. } => Some(DataType::Boolean), - Value::Date { .. } => Some(DataType::Date), + Value::Date { .. } => Some(DataType::Datetime( + TimeUnit::Nanoseconds, + Some(PlSmallStr::from_static("UTC")), + )), Value::Duration { .. } => Some(DataType::Duration(TimeUnit::Nanoseconds)), Value::Filesize { .. } => Some(DataType::Int64), Value::Binary { .. } => Some(DataType::Binary), @@ -447,24 +450,28 @@ fn typed_column_to_series(name: PlSmallStr, column: TypedColumn) -> Result().map(|tz| val.with_timezone(&tz))) - .transpose() - .map_err(|e| ShellError::GenericError { - error: "Error parsing timezone".into(), - msg: "".into(), - span: None, - help: Some(e.to_string()), - inner: vec![], - })? - .and_then(|dt| dt.timestamp_nanos_opt()) - .map(|nanos| nanos_from_timeunit(nanos, *tu))) - } else { - Ok(None) + match (maybe_tz, &v) { + (Some(tz), Value::Date { val, .. }) => { + // If there is a timezone specified, make sure + // the value is converted to it + Ok(tz + .parse::() + .map(|tz| val.with_timezone(&tz)) + .map_err(|e| ShellError::GenericError { + error: "Error parsing timezone".into(), + msg: "".into(), + span: None, + help: Some(e.to_string()), + inner: vec![], + })? + .timestamp_nanos_opt() + .map(|nanos| nanos_from_timeunit(nanos, *tu))) + } + (None, Value::Date { val, .. }) => Ok(val + .timestamp_nanos_opt() + .map(|nanos| nanos_from_timeunit(nanos, *tu))), + + _ => Ok(None), } }) .collect::>, ShellError>>()?;