FIX polars as-datetime: ignores timezone information on conversion (#15490)

# Description
This PR seeks to fix an error in `polars as-datetime` where timezone
information is entirely ignored. This behavior raises a host of silent
errors when dealing with datetime conversions (see example below).

## Current Implementation
Timezones are entirely ignored and datetimes with different timezones
are converted to the same naive datetimes even when the user
specifically indicates that the timezone should be parsed. For example,
"2021-12-30 00:00:00 +0000" and "2021-12-30 00:00:00 -0400" will both be
parsed to "2021-12-30 00:00:00" even when the format string specifically
includes "%z".

```
$ ["2021-12-30 00:00:00 +0000" "2021-12-30 00:00:00 -0400"] | polars into-df | polars as-datetime "%Y-%m-%d %H:%M:%S %z"
╭───┬───────────────────────╮
│ # │       datetime        │
├───┼───────────────────────┤
│ 0 │ 12/30/2021 12:00:00AM │ 
│ 1 │ 12/30/2021 12:00:00AM │ <-- Same datetime even though the first is +0000 and second is -0400
╰───┴───────────────────────╯

$ ["2021-12-30 00:00:00 +0000" "2021-12-30 00:00:00 -0400"] | polars into-df | polars as-datetime "%Y-%m-%d %H:%M:%S %z" | polars schema
╭──────────┬──────────────╮
│ datetime │ datetime<ns> │
╰──────────┴──────────────╯
```

## New Implementation
Datetimes are converted to UTC and timezone information is retained.

```
$ "2021-12-30 00:00:00 +0000" "2021-12-30 00:00:00 -0400"] | polars into-df | polars as-datetime "%Y-%m-%d %H:%M:%S %z"
╭───┬───────────────────────╮
│ # │       datetime        │
├───┼───────────────────────┤
│ 0 │ 12/30/2021 12:00:00AM │
│ 1 │ 12/30/2021 04:00:00AM │ <-- Converted to UTC
╰───┴───────────────────────╯

$ ["2021-12-30 00:00:00 +0000" "2021-12-30 00:00:00 -0400"] | polars into-df | polars as-datetime "%Y-%m-%d %H:%M:%S %z" | polars schema
╭──────────┬───────────────────╮
│ datetime │ datetime<ns, UTC> │
╰──────────┴───────────────────╯
```

The user may intentionally ignore timezone information by setting the
`--naive` flag.
```
$ ["2021-12-30 00:00:00 +0000" "2021-12-30 00:00:00 -0400"] | polars into-df | polars as-datetime "%Y-%m-%d %H:%M:%S %z" --naive
╭───┬───────────────────────╮
│ # │       datetime        │
├───┼───────────────────────┤
│ 0 │ 12/30/2021 12:00:00AM │
│ 1 │ 12/30/2021 12:00:00AM │ <-- the -0400 offset is ignored when --naive is set
╰───┴───────────────────────╯

$ ["2021-12-30 00:00:00 +0000" "2021-12-30 00:00:00 -0400"] | polars into-df | polars as-datetime "%Y-%m-%d %H:%M:%S %z" --naive | polars schema
╭──────────┬──────────────╮
│ datetime │ datetime<ns> │
╰──────────┴──────────────╯
```

# User-Facing Changes
<!-- List of all changes that impact the user experience here. This
helps us keep track of breaking changes. -->
`polars as-datetime` will now account for timezone information and
return type `datetime<ns,UTC>` rather than `datetime<ns>` by default.
The user can replicate the previous behavior by setting `--naive`.

# Tests + Formatting
<!--
Don't forget to add tests that cover your changes.

Make sure you've run and fixed any issues with these commands:

- `cargo fmt --all -- --check` to check standard code formatting (`cargo
fmt --all` applies these changes)
- `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used` to
check that you're using the standard code style
- `cargo test --workspace` to check that all tests pass (on Windows make
sure to [enable developer
mode](https://learn.microsoft.com/en-us/windows/apps/get-started/developer-mode-features-and-debugging))
- `cargo run -- -c "use toolkit.nu; toolkit test stdlib"` to run the
tests for the standard library

> **Note**
> from `nushell` you can also use the `toolkit` as follows
> ```bash
> use toolkit.nu # or use an `env_change` hook to activate it
automatically
> toolkit check pr
> ```
-->
Tests that incorporated `polars as-datetime` had to be tweaked to
include `--naive` flag to replicate previous behavior.

# After Submitting
<!-- If your PR had any user-facing changes, update [the
documentation](https://github.com/nushell/nushell.github.io) after the
PR is merged, if necessary. This will help us keep the docs up to date.
-->
This commit is contained in:
pyz4 2025-04-04 12:43:21 -04:00 committed by GitHub
parent 237a685605
commit 7ca2a6f8ac
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 85 additions and 52 deletions

View File

@ -39,14 +39,14 @@ impl PluginCommand for ToRepr {
result: Some(Value::string(
r#"
shape: (2, 2)
a b
--- ---
datetime[ns] i64
2025-01-01 00:00:00 2
2025-01-02 00:00:00 4
"#
a b
--- ---
datetime[ns, UTC] i64
2025-01-01 00:00:00 UTC 2
2025-01-02 00:00:00 UTC 4
"#
.trim(),
Span::test_data(),
)),
@ -54,18 +54,18 @@ shape: (2, 2)
Example {
description: "Shows lazy dataframe in repr format",
example:
"[[a b]; [2025-01-01 2] [2025-01-02 4]] | polars into-df | polars into-lazy | polars into-repr",
"[[a b]; [2025-01-01 2] [2025-01-02 4]] | polars into-lazy | polars into-repr",
result: Some(Value::string(
r#"
shape: (2, 2)
a b
--- ---
datetime[ns] i64
2025-01-01 00:00:00 2
2025-01-02 00:00:00 4
"#
a b
--- ---
datetime[ns, UTC] i64
2025-01-01 00:00:00 UTC 2
2025-01-02 00:00:00 UTC 4
"#
.trim(),
Span::test_data(),
)),

View File

@ -1,6 +1,7 @@
use crate::{values::CustomValueSupport, PolarsPlugin};
use std::sync::Arc;
use super::super::super::values::{Column, NuDataFrame};
use super::super::super::values::{Column, NuDataFrame, NuSchema};
use chrono::DateTime;
use nu_plugin::{EngineInterface, EvaluatedCall, PluginCommand};
@ -8,7 +9,7 @@ use nu_protocol::{
Category, Example, LabeledError, PipelineData, ShellError, Signature, Span, SyntaxShape, Type,
Value,
};
use polars::prelude::{IntoSeries, StringMethods, TimeUnit};
use polars::prelude::{DataType, Field, IntoSeries, Schema, StringMethods, TimeUnit};
#[derive(Clone)]
pub struct AsDateTime;
@ -43,6 +44,7 @@ impl PluginCommand for AsDateTime {
Signature::build(self.name())
.required("format", SyntaxShape::String, "formatting date time string")
.switch("not-exact", "the format string may be contained in the date (e.g. foo-2021-01-01-bar could match 2021-01-01)", Some('n'))
.switch("naive", "the input datetimes should be parsed as naive (i.e., not timezone-aware)", None)
.input_output_type(
Type::Custom("dataframe".into()),
Type::Custom("dataframe".into()),
@ -54,7 +56,7 @@ impl PluginCommand for AsDateTime {
vec![
Example {
description: "Converts string to datetime",
example: r#"["2021-12-30 00:00:00" "2021-12-31 00:00:00"] | polars into-df | polars as-datetime "%Y-%m-%d %H:%M:%S""#,
example: r#"["2021-12-30 00:00:00 -0400" "2021-12-31 00:00:00 -0400"] | polars into-df | polars as-datetime "%Y-%m-%d %H:%M:%S %z""#,
result: Some(
NuDataFrame::try_from_columns(
vec![Column::new(
@ -62,7 +64,7 @@ impl PluginCommand for AsDateTime {
vec![
Value::date(
DateTime::parse_from_str(
"2021-12-30 00:00:00 +0000",
"2021-12-30 00:00:00 -0400",
"%Y-%m-%d %H:%M:%S %z",
)
.expect("date calculation should not fail in test"),
@ -70,7 +72,7 @@ impl PluginCommand for AsDateTime {
),
Value::date(
DateTime::parse_from_str(
"2021-12-31 00:00:00 +0000",
"2021-12-31 00:00:00 -0400",
"%Y-%m-%d %H:%M:%S %z",
)
.expect("date calculation should not fail in test"),
@ -86,7 +88,7 @@ impl PluginCommand for AsDateTime {
},
Example {
description: "Converts string to datetime with high resolutions",
example: r#"["2021-12-30 00:00:00.123456789" "2021-12-31 00:00:00.123456789"] | polars into-df | polars as-datetime "%Y-%m-%d %H:%M:%S.%9f""#,
example: r#"["2021-12-30 00:00:00.123456789" "2021-12-31 00:00:00.123456789"] | polars into-df | polars as-datetime "%Y-%m-%d %H:%M:%S.%9f" --naive"#,
result: Some(
NuDataFrame::try_from_columns(
vec![Column::new(
@ -110,7 +112,15 @@ impl PluginCommand for AsDateTime {
),
],
)],
None,
Some(NuSchema::new(Arc::new(Schema::from_iter(vec![
Field::new(
"datetime".into(),
DataType::Datetime(
TimeUnit::Nanoseconds,
None
),
),
])))),
)
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
@ -118,7 +128,7 @@ impl PluginCommand for AsDateTime {
},
Example {
description: "Converts string to datetime using the `--not-exact` flag even with excessive symbols",
example: r#"["2021-12-30 00:00:00 GMT+4"] | polars into-df | polars as-datetime "%Y-%m-%d %H:%M:%S" --not-exact"#,
example: r#"["2021-12-30 00:00:00 GMT+4"] | polars into-df | polars as-datetime "%Y-%m-%d %H:%M:%S" --not-exact --naive"#,
result: Some(
NuDataFrame::try_from_columns(
vec![Column::new(
@ -134,7 +144,15 @@ impl PluginCommand for AsDateTime {
),
],
)],
None,
Some(NuSchema::new(Arc::new(Schema::from_iter(vec![
Field::new(
"datetime".into(),
DataType::Datetime(
TimeUnit::Nanoseconds,
None
),
),
])))),
)
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
@ -162,6 +180,7 @@ fn command(
) -> Result<PipelineData, ShellError> {
let format: String = call.req(0)?;
let not_exact = call.has_flag("not-exact")?;
let tz_aware = !call.has_flag("naive")?;
let df = NuDataFrame::try_from_pipeline_coerce(plugin, input, call.head)?;
let series = df.as_series(call.head)?;
@ -177,7 +196,7 @@ fn command(
casted.as_datetime_not_exact(
Some(format.as_str()),
TimeUnit::Nanoseconds,
false,
tz_aware,
None,
&Default::default(),
)
@ -186,7 +205,7 @@ fn command(
Some(format.as_str()),
TimeUnit::Nanoseconds,
false,
false,
tz_aware,
None,
&Default::default(),
)

View File

@ -1,7 +1,8 @@
use crate::values::NuExpression;
use std::sync::Arc;
use crate::{
dataframe::values::{Column, NuDataFrame},
dataframe::values::{Column, NuDataFrame, NuSchema},
values::CustomValueSupport,
PolarsPlugin,
};
@ -13,7 +14,7 @@ use nu_protocol::{
};
use polars::{
datatypes::{DataType, TimeUnit},
prelude::NamedFrom,
prelude::{Field, NamedFrom, Schema},
series::Series,
};
@ -54,14 +55,20 @@ impl PluginCommand for ExprDatePart {
vec![
Example {
description: "Creates an expression to capture the year date part",
example: r#"[["2021-12-30T01:02:03.123456789"]] | polars into-df | polars as-datetime "%Y-%m-%dT%H:%M:%S.%9f" | polars with-column [(polars col datetime | polars datepart year | polars as datetime_year )]"#,
example: r#"[["2021-12-30T01:02:03.123456789"]] | polars into-df | polars as-datetime "%Y-%m-%dT%H:%M:%S.%9f" --naive | polars with-column [(polars col datetime | polars datepart year | polars as datetime_year )]"#,
result: Some(
NuDataFrame::try_from_columns(
vec![
Column::new("datetime".to_string(), vec![Value::test_date(dt)]),
Column::new("datetime_year".to_string(), vec![Value::test_int(2021)]),
],
None,
Some(NuSchema::new(Arc::new(Schema::from_iter(vec![
Field::new(
"datetime".into(),
DataType::Datetime(TimeUnit::Nanoseconds, None),
),
Field::new("datetime_year".into(), DataType::Int64),
])))),
)
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
@ -69,7 +76,7 @@ impl PluginCommand for ExprDatePart {
},
Example {
description: "Creates an expression to capture multiple date parts",
example: r#"[["2021-12-30T01:02:03.123456789"]] | polars into-df | polars as-datetime "%Y-%m-%dT%H:%M:%S.%9f" |
example: r#"[["2021-12-30T01:02:03.123456789"]] | polars into-df | polars as-datetime "%Y-%m-%dT%H:%M:%S.%9f" --naive |
polars with-column [ (polars col datetime | polars datepart year | polars as datetime_year ),
(polars col datetime | polars datepart month | polars as datetime_month ),
(polars col datetime | polars datepart day | polars as datetime_day ),

View File

@ -245,7 +245,10 @@ fn value_to_data_type(value: &Value) -> Option<DataType> {
Value::Float { .. } => Some(DataType::Float64),
Value::String { .. } => Some(DataType::String),
Value::Bool { .. } => Some(DataType::Boolean),
Value::Date { .. } => Some(DataType::Date),
Value::Date { .. } => Some(DataType::Datetime(
TimeUnit::Nanoseconds,
Some(PlSmallStr::from_static("UTC")),
)),
Value::Duration { .. } => Some(DataType::Duration(TimeUnit::Nanoseconds)),
Value::Filesize { .. } => Some(DataType::Int64),
Value::Binary { .. } => Some(DataType::Binary),
@ -447,24 +450,28 @@ fn typed_column_to_series(name: PlSmallStr, column: TypedColumn) -> Result<Serie
.values
.iter()
.map(|v| {
if let Value::Date { val, .. } = &v {
// If there is a timezone specified, make sure
// the value is converted to it
Ok(maybe_tz
.as_ref()
.map(|tz| tz.parse::<Tz>().map(|tz| val.with_timezone(&tz)))
.transpose()
.map_err(|e| ShellError::GenericError {
error: "Error parsing timezone".into(),
msg: "".into(),
span: None,
help: Some(e.to_string()),
inner: vec![],
})?
.and_then(|dt| dt.timestamp_nanos_opt())
.map(|nanos| nanos_from_timeunit(nanos, *tu)))
} else {
Ok(None)
match (maybe_tz, &v) {
(Some(tz), Value::Date { val, .. }) => {
// If there is a timezone specified, make sure
// the value is converted to it
Ok(tz
.parse::<Tz>()
.map(|tz| val.with_timezone(&tz))
.map_err(|e| ShellError::GenericError {
error: "Error parsing timezone".into(),
msg: "".into(),
span: None,
help: Some(e.to_string()),
inner: vec![],
})?
.timestamp_nanos_opt()
.map(|nanos| nanos_from_timeunit(nanos, *tu)))
}
(None, Value::Date { val, .. }) => Ok(val
.timestamp_nanos_opt()
.map(|nanos| nanos_from_timeunit(nanos, *tu))),
_ => Ok(None),
}
})
.collect::<Result<Vec<Option<i64>>, ShellError>>()?;