From d1d6518ece9e96b1c4e367d09b4d537768bd4c68 Mon Sep 17 00:00:00 2001 From: pyz4 <42039243+pyz4@users.noreply.github.com> Date: Sat, 26 Apr 2025 14:47:58 -0400 Subject: [PATCH] feat(polars): enable parsing strings as dates and datetime in polars schema (#15645) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Description This PR adds a quality-of-life feature that enables date and datetime parsing of strings in `polars into-df`, `polars into-lazy`, and `polars open`, avoiding the more verbose approach of casting each column to date/datetime. Currently, setting the schema to `date` on a `str` column silently yields a null column rather than raising an error. See the comparison of the current and proposed implementations below. The proposed implementation assumes a date format of "%Y-%m-%d" and a datetime format of "%Y-%m-%d %H:%M:%S" for naive datetimes and "%Y-%m-%d %H:%M:%S%:z" for timezone-aware datetimes. Other formats must be specified via parsing through `polars as-date` and `polars as-datetime`. ```nushell # Current Implementation > [[a]; ["2025-04-01"]] | polars into-df --schema {a: date} ╭───┬───╮ │ # │ a │ ├───┼───┤ │ 0 │ │ ╰───┴───╯ > [[a]; ["2025-04-01 01:00:00"]] | polars into-df --schema {a: "datetime"} ╭───┬───╮ │ # │ a │ ├───┼───┤ │ 0 │ │ ╰───┴───╯ # Proposed Implementation > [[a]; ["2025-04-01"]] | polars into-df --schema {a: date} ╭───┬─────────────────────╮ │ # │ a │ ├───┼─────────────────────┤ │ 0 │ 04/01/25 12:00:00AM │ ╰───┴─────────────────────╯ > [[a]; ["2025-04-01 01:00:00"]] | polars into-df --schema {a: "datetime"} ╭───┬─────────────────────╮ │ # │ a │ ├───┼─────────────────────┤ │ 0 │ 04/01/25 01:00:00AM │ ╰───┴─────────────────────╯ > [[a]; ["2025-04-01 01:00:00-04:00"]] | polars into-df --schema {a: "datetime"} ╭───┬─────────────────────╮ │ # │ a │ ├───┼─────────────────────┤ │ 0 │ 04/01/25 05:00:00AM │ ╰───┴─────────────────────╯ ``` # User-Facing Changes No breaking changes. 
Users have the added option to parse string columns into date/datetimes. # Tests + Formatting No tests were added to any examples. # After Submitting --- .../values/nu_dataframe/conversion.rs | 70 ++++++++++++++++--- 1 file changed, 61 insertions(+), 9 deletions(-) diff --git a/crates/nu_plugin_polars/src/dataframe/values/nu_dataframe/conversion.rs b/crates/nu_plugin_polars/src/dataframe/values/nu_dataframe/conversion.rs index 7c32ec3e62..0c2e1611fa 100644 --- a/crates/nu_plugin_polars/src/dataframe/values/nu_dataframe/conversion.rs +++ b/crates/nu_plugin_polars/src/dataframe/values/nu_dataframe/conversion.rs @@ -2,7 +2,7 @@ use std::collections::HashMap; use std::ops::{Deref, DerefMut}; use std::sync::Arc; -use chrono::{DateTime, Duration, FixedOffset, NaiveTime, TimeZone, Utc}; +use chrono::{DateTime, Duration, FixedOffset, NaiveDate, NaiveDateTime, NaiveTime, TimeZone, Utc}; use chrono_tz::Tz; use indexmap::map::{Entry, IndexMap}; use polars::chunked_array::builder::AnonymousOwnedListBuilder; @@ -461,15 +461,34 @@ fn typed_column_to_series(name: PlSmallStr, column: TypedColumn) -> Result { - let it = column.values.iter().map(|v| { - if let Value::Date { val, .. } = &v { - Some(val.timestamp_nanos_opt().unwrap_or_default()) - } else { - None - } - }); + let it = column + .values + .iter() + .map(|v| match &v { + Value::Date { val, .. } => { + Ok(Some(val.timestamp_nanos_opt().unwrap_or_default())) + } - ChunkedArray::::from_iter_options(name, it) + Value::String { val, .. } => { + let expected_format = "%Y-%m-%d"; + let nanos = NaiveDate::parse_from_str(val, expected_format) + .map_err(|e| ShellError::GenericError { + error: format!("Error parsing date from string: {e}"), + msg: "".into(), + span: None, + help: Some(format!("Expected format {expected_format}. If you need to parse with another format, please set the schema to `str` and parse with `polars as-date `.")), + inner: vec![], + })? 
+ .and_hms_nano_opt(0, 0, 0, 0) + .and_then(|dt| dt.and_utc().timestamp_nanos_opt()); + Ok(nanos) + } + + _ => Ok(None), + }) + .collect::, ShellError>>()?; + + ChunkedArray::::from_iter_options(name, it.into_iter()) .into_datetime(TimeUnit::Nanoseconds, None) .cast_with_options(&DataType::Date, Default::default()) .map_err(|e| ShellError::GenericError { @@ -507,6 +526,39 @@ fn typed_column_to_series(name: PlSmallStr, column: TypedColumn) -> Result { + // because we're converting to the number of nano seconds since epoch, the timezone is irrelevant + let expected_format = "%Y-%m-%d %H:%M:%S%:z"; + DateTime::parse_from_str(val, expected_format) + .map_err(|e| ShellError::GenericError { + error: format!("Error parsing datetime from string: {e}"), + msg: "".into(), + span: None, + help: Some(format!("Expected format {expected_format}. If you need to parse with another format, please set the schema to `str` and parse with `polars as-datetime `.")), + inner: vec![], + })? + .timestamp_nanos_opt() + .map(|nanos| nanos_to_timeunit(nanos, *tu)) + .transpose() + } + + (None, Value::String { val, .. }) => { + let expected_format = "%Y-%m-%d %H:%M:%S"; + + NaiveDateTime::parse_from_str(val, expected_format) + .map_err(|e| ShellError::GenericError { + error: format!("Error parsing datetime from string: {e}"), + msg: "".into(), + span: None, + help: Some(format!("Expected format {expected_format}. If you need to parse with another format, please set the schema to `str` and parse with `polars as-datetime `.")), + inner: vec![], + })? + .and_utc() + .timestamp_nanos_opt() + .map(|nanos| nanos_to_timeunit(nanos, *tu)) + .transpose() + } + _ => Ok(None), } })