mirror of
https://github.com/nushell/nushell.git
synced 2025-04-30 16:14:27 +02:00
feat(polars): enable parsing strings as dates and datetime in polars schema (#15645)
<!-- if this PR closes one or more issues, you can automatically link the PR with them by using one of the [*linking keywords*](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue#linking-a-pull-request-to-an-issue-using-a-keyword), e.g. - this PR should close #xxxx - fixes #xxxx you can also mention related issues, PRs or discussions! --> # Description <!-- Thank you for improving Nushell. Please, check our [contributing guide](../CONTRIBUTING.md) and talk to the core team before making major changes. Description of your pull request goes here. **Provide examples and/or screenshots** if your changes affect the user experience. --> This PR seeks to add a quality-of-life feature that enables date and datetime parsing of strings in `polars into-df`, `polars into-lazy`, and `polars open`, avoiding the more verbose method of casting each column into date/datetime. Currently, setting the schema to `date` on a `str` column silently fails, producing a column of nulls. See a comparison of the current and proposed implementations below. The proposed implementation assumes a date format "%Y-%m-%d" and a datetime format of "%Y-%m-%d %H:%M:%S" for naive datetimes and "%Y-%m-%d %H:%M:%S%:z" for timezone-aware datetimes. Other formats must be specified via parsing through `polars as-date` and `polars as-datetime`. 
```nushell # Current Implementations > [[a]; ["2025-04-01"]] | polars into-df --schema {a: date} ╭───┬───╮ │ # │ a │ ├───┼───┤ │ 0 │ │ ╰───┴───╯ > [[a]; ["2025-04-01 01:00:00"]] | polars into-df --schema {a: "datetime<ns,*>"} ╭───┬───╮ │ # │ a │ ├───┼───┤ │ 0 │ │ ╰───┴───╯ # Proposed Implementation > [[a]; ["2025-04-01"]] | polars into-df --schema {a: date} ╭───┬─────────────────────╮ │ # │ a │ ├───┼─────────────────────┤ │ 0 │ 04/01/25 12:00:00AM │ ╰───┴─────────────────────╯ > [[a]; ["2025-04-01 01:00:00"]] | polars into-df --schema {a: "datetime<ns,*>"} ╭───┬─────────────────────╮ │ # │ a │ ├───┼─────────────────────┤ │ 0 │ 04/01/25 01:00:00AM │ ╰───┴─────────────────────╯ > [[a]; ["2025-04-01 01:00:00-04:00"]] | polars into-df --schema {a: "datetime<ns,UTC>"} ╭───┬─────────────────────╮ │ # │ a │ ├───┼─────────────────────┤ │ 0 │ 04/01/25 05:00:00AM │ ╰───┴─────────────────────╯ ``` # User-Facing Changes <!-- List of all changes that impact the user experience here. This helps us keep track of breaking changes. --> No breaking changes. Users have the added option to parse string columns into date/datetimes. # Tests + Formatting <!-- Don't forget to add tests that cover your changes. 
Make sure you've run and fixed any issues with these commands: - `cargo fmt --all -- --check` to check standard code formatting (`cargo fmt --all` applies these changes) - `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used` to check that you're using the standard code style - `cargo test --workspace` to check that all tests pass (on Windows make sure to [enable developer mode](https://learn.microsoft.com/en-us/windows/apps/get-started/developer-mode-features-and-debugging)) - `cargo run -- -c "use toolkit.nu; toolkit test stdlib"` to run the tests for the standard library > **Note** > from `nushell` you can also use the `toolkit` as follows > ```bash > use toolkit.nu # or use an `env_change` hook to activate it automatically > toolkit check pr > ``` --> No new tests or examples were added. # After Submitting <!-- If your PR had any user-facing changes, update [the documentation](https://github.com/nushell/nushell.github.io) after the PR is merged, if necessary. This will help us keep the docs up to date. -->
This commit is contained in:
parent
2d868323b6
commit
d1d6518ece
@ -2,7 +2,7 @@ use std::collections::HashMap;
|
|||||||
use std::ops::{Deref, DerefMut};
|
use std::ops::{Deref, DerefMut};
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
use chrono::{DateTime, Duration, FixedOffset, NaiveTime, TimeZone, Utc};
|
use chrono::{DateTime, Duration, FixedOffset, NaiveDate, NaiveDateTime, NaiveTime, TimeZone, Utc};
|
||||||
use chrono_tz::Tz;
|
use chrono_tz::Tz;
|
||||||
use indexmap::map::{Entry, IndexMap};
|
use indexmap::map::{Entry, IndexMap};
|
||||||
use polars::chunked_array::builder::AnonymousOwnedListBuilder;
|
use polars::chunked_array::builder::AnonymousOwnedListBuilder;
|
||||||
@ -461,15 +461,34 @@ fn typed_column_to_series(name: PlSmallStr, column: TypedColumn) -> Result<Serie
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
DataType::Date => {
|
DataType::Date => {
|
||||||
let it = column.values.iter().map(|v| {
|
let it = column
|
||||||
if let Value::Date { val, .. } = &v {
|
.values
|
||||||
Some(val.timestamp_nanos_opt().unwrap_or_default())
|
.iter()
|
||||||
} else {
|
.map(|v| match &v {
|
||||||
None
|
Value::Date { val, .. } => {
|
||||||
}
|
Ok(Some(val.timestamp_nanos_opt().unwrap_or_default()))
|
||||||
});
|
}
|
||||||
|
|
||||||
ChunkedArray::<Int64Type>::from_iter_options(name, it)
|
Value::String { val, .. } => {
|
||||||
|
let expected_format = "%Y-%m-%d";
|
||||||
|
let nanos = NaiveDate::parse_from_str(val, expected_format)
|
||||||
|
.map_err(|e| ShellError::GenericError {
|
||||||
|
error: format!("Error parsing date from string: {e}"),
|
||||||
|
msg: "".into(),
|
||||||
|
span: None,
|
||||||
|
help: Some(format!("Expected format {expected_format}. If you need to parse with another format, please set the schema to `str` and parse with `polars as-date <format>`.")),
|
||||||
|
inner: vec![],
|
||||||
|
})?
|
||||||
|
.and_hms_nano_opt(0, 0, 0, 0)
|
||||||
|
.and_then(|dt| dt.and_utc().timestamp_nanos_opt());
|
||||||
|
Ok(nanos)
|
||||||
|
}
|
||||||
|
|
||||||
|
_ => Ok(None),
|
||||||
|
})
|
||||||
|
.collect::<Result<Vec<_>, ShellError>>()?;
|
||||||
|
|
||||||
|
ChunkedArray::<Int64Type>::from_iter_options(name, it.into_iter())
|
||||||
.into_datetime(TimeUnit::Nanoseconds, None)
|
.into_datetime(TimeUnit::Nanoseconds, None)
|
||||||
.cast_with_options(&DataType::Date, Default::default())
|
.cast_with_options(&DataType::Date, Default::default())
|
||||||
.map_err(|e| ShellError::GenericError {
|
.map_err(|e| ShellError::GenericError {
|
||||||
@ -507,6 +526,39 @@ fn typed_column_to_series(name: PlSmallStr, column: TypedColumn) -> Result<Serie
|
|||||||
.map(|nanos| nanos_to_timeunit(nanos, *tu))
|
.map(|nanos| nanos_to_timeunit(nanos, *tu))
|
||||||
.transpose(),
|
.transpose(),
|
||||||
|
|
||||||
|
(Some(_), Value::String { val, .. }) => {
|
||||||
|
// because we're converting to the number of nano seconds since epoch, the timezone is irrelevant
|
||||||
|
let expected_format = "%Y-%m-%d %H:%M:%S%:z";
|
||||||
|
DateTime::parse_from_str(val, expected_format)
|
||||||
|
.map_err(|e| ShellError::GenericError {
|
||||||
|
error: format!("Error parsing datetime from string: {e}"),
|
||||||
|
msg: "".into(),
|
||||||
|
span: None,
|
||||||
|
help: Some(format!("Expected format {expected_format}. If you need to parse with another format, please set the schema to `str` and parse with `polars as-datetime <format>`.")),
|
||||||
|
inner: vec![],
|
||||||
|
})?
|
||||||
|
.timestamp_nanos_opt()
|
||||||
|
.map(|nanos| nanos_to_timeunit(nanos, *tu))
|
||||||
|
.transpose()
|
||||||
|
}
|
||||||
|
|
||||||
|
(None, Value::String { val, .. }) => {
|
||||||
|
let expected_format = "%Y-%m-%d %H:%M:%S";
|
||||||
|
|
||||||
|
NaiveDateTime::parse_from_str(val, expected_format)
|
||||||
|
.map_err(|e| ShellError::GenericError {
|
||||||
|
error: format!("Error parsing datetime from string: {e}"),
|
||||||
|
msg: "".into(),
|
||||||
|
span: None,
|
||||||
|
help: Some(format!("Expected format {expected_format}. If you need to parse with another format, please set the schema to `str` and parse with `polars as-datetime <format>`.")),
|
||||||
|
inner: vec![],
|
||||||
|
})?
|
||||||
|
.and_utc()
|
||||||
|
.timestamp_nanos_opt()
|
||||||
|
.map(|nanos| nanos_to_timeunit(nanos, *tu))
|
||||||
|
.transpose()
|
||||||
|
}
|
||||||
|
|
||||||
_ => Ok(None),
|
_ => Ok(None),
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
Loading…
Reference in New Issue
Block a user