Merge branch 'main' into polars_categorical_2

This commit is contained in:
Jack Wright
2025-04-04 13:12:44 -07:00
46 changed files with 914 additions and 291 deletions

View File

@ -39,14 +39,14 @@ impl PluginCommand for ToRepr {
result: Some(Value::string(
r#"
shape: (2, 2)
┌─────────────────────┬─────┐
│ a ┆ b │
│ --- ┆ --- │
│ datetime[ns] ┆ i64 │
╞═════════════════════╪═════╡
│ 2025-01-01 00:00:00 ┆ 2 │
│ 2025-01-02 00:00:00 ┆ 4 │
└─────────────────────┴─────┘"#
┌─────────────────────────┬─────┐
│ a ┆ b │
│ --- ┆ --- │
│ datetime[ns, UTC] ┆ i64 │
╞═════════════════════════╪═════╡
│ 2025-01-01 00:00:00 UTC ┆ 2 │
│ 2025-01-02 00:00:00 UTC ┆ 4 │
└─────────────────────────┴─────┘"#
.trim(),
Span::test_data(),
)),
@ -54,18 +54,18 @@ shape: (2, 2)
Example {
description: "Shows lazy dataframe in repr format",
example:
"[[a b]; [2025-01-01 2] [2025-01-02 4]] | polars into-df | polars into-lazy | polars into-repr",
"[[a b]; [2025-01-01 2] [2025-01-02 4]] | polars into-lazy | polars into-repr",
result: Some(Value::string(
r#"
shape: (2, 2)
┌─────────────────────┬─────┐
│ a ┆ b │
│ --- ┆ --- │
│ datetime[ns] ┆ i64 │
╞═════════════════════╪═════╡
│ 2025-01-01 00:00:00 ┆ 2 │
│ 2025-01-02 00:00:00 ┆ 4 │
└─────────────────────┴─────┘"#
┌─────────────────────────┬─────┐
│ a ┆ b │
│ --- ┆ --- │
│ datetime[ns, UTC] ┆ i64 │
╞═════════════════════════╪═════╡
│ 2025-01-01 00:00:00 UTC ┆ 2 │
│ 2025-01-02 00:00:00 UTC ┆ 4 │
└─────────────────────────┴─────┘"#
.trim(),
Span::test_data(),
)),

View File

@ -1,6 +1,7 @@
use crate::{values::CustomValueSupport, PolarsPlugin};
use std::sync::Arc;
use super::super::super::values::{Column, NuDataFrame};
use super::super::super::values::{Column, NuDataFrame, NuSchema};
use chrono::DateTime;
use nu_plugin::{EngineInterface, EvaluatedCall, PluginCommand};
@ -8,7 +9,7 @@ use nu_protocol::{
Category, Example, LabeledError, PipelineData, ShellError, Signature, Span, SyntaxShape, Type,
Value,
};
use polars::prelude::{IntoSeries, StringMethods, TimeUnit};
use polars::prelude::{DataType, Field, IntoSeries, Schema, StringMethods, TimeUnit};
#[derive(Clone)]
pub struct AsDateTime;
@ -43,6 +44,7 @@ impl PluginCommand for AsDateTime {
Signature::build(self.name())
.required("format", SyntaxShape::String, "formatting date time string")
.switch("not-exact", "the format string may be contained in the date (e.g. foo-2021-01-01-bar could match 2021-01-01)", Some('n'))
.switch("naive", "the input datetimes should be parsed as naive (i.e., not timezone-aware)", None)
.input_output_type(
Type::Custom("dataframe".into()),
Type::Custom("dataframe".into()),
@ -54,7 +56,7 @@ impl PluginCommand for AsDateTime {
vec![
Example {
description: "Converts string to datetime",
example: r#"["2021-12-30 00:00:00" "2021-12-31 00:00:00"] | polars into-df | polars as-datetime "%Y-%m-%d %H:%M:%S""#,
example: r#"["2021-12-30 00:00:00 -0400" "2021-12-31 00:00:00 -0400"] | polars into-df | polars as-datetime "%Y-%m-%d %H:%M:%S %z""#,
result: Some(
NuDataFrame::try_from_columns(
vec![Column::new(
@ -62,7 +64,7 @@ impl PluginCommand for AsDateTime {
vec![
Value::date(
DateTime::parse_from_str(
"2021-12-30 00:00:00 +0000",
"2021-12-30 00:00:00 -0400",
"%Y-%m-%d %H:%M:%S %z",
)
.expect("date calculation should not fail in test"),
@ -70,7 +72,7 @@ impl PluginCommand for AsDateTime {
),
Value::date(
DateTime::parse_from_str(
"2021-12-31 00:00:00 +0000",
"2021-12-31 00:00:00 -0400",
"%Y-%m-%d %H:%M:%S %z",
)
.expect("date calculation should not fail in test"),
@ -86,7 +88,7 @@ impl PluginCommand for AsDateTime {
},
Example {
description: "Converts string to datetime with high resolutions",
example: r#"["2021-12-30 00:00:00.123456789" "2021-12-31 00:00:00.123456789"] | polars into-df | polars as-datetime "%Y-%m-%d %H:%M:%S.%9f""#,
example: r#"["2021-12-30 00:00:00.123456789" "2021-12-31 00:00:00.123456789"] | polars into-df | polars as-datetime "%Y-%m-%d %H:%M:%S.%9f" --naive"#,
result: Some(
NuDataFrame::try_from_columns(
vec![Column::new(
@ -110,7 +112,15 @@ impl PluginCommand for AsDateTime {
),
],
)],
None,
Some(NuSchema::new(Arc::new(Schema::from_iter(vec![
Field::new(
"datetime".into(),
DataType::Datetime(
TimeUnit::Nanoseconds,
None
),
),
])))),
)
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
@ -118,7 +128,7 @@ impl PluginCommand for AsDateTime {
},
Example {
description: "Converts string to datetime using the `--not-exact` flag even with excessive symbols",
example: r#"["2021-12-30 00:00:00 GMT+4"] | polars into-df | polars as-datetime "%Y-%m-%d %H:%M:%S" --not-exact"#,
example: r#"["2021-12-30 00:00:00 GMT+4"] | polars into-df | polars as-datetime "%Y-%m-%d %H:%M:%S" --not-exact --naive"#,
result: Some(
NuDataFrame::try_from_columns(
vec![Column::new(
@ -134,7 +144,15 @@ impl PluginCommand for AsDateTime {
),
],
)],
None,
Some(NuSchema::new(Arc::new(Schema::from_iter(vec![
Field::new(
"datetime".into(),
DataType::Datetime(
TimeUnit::Nanoseconds,
None
),
),
])))),
)
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
@ -162,6 +180,7 @@ fn command(
) -> Result<PipelineData, ShellError> {
let format: String = call.req(0)?;
let not_exact = call.has_flag("not-exact")?;
let tz_aware = !call.has_flag("naive")?;
let df = NuDataFrame::try_from_pipeline_coerce(plugin, input, call.head)?;
let series = df.as_series(call.head)?;
@ -177,7 +196,7 @@ fn command(
casted.as_datetime_not_exact(
Some(format.as_str()),
TimeUnit::Nanoseconds,
false,
tz_aware,
None,
&Default::default(),
)
@ -186,7 +205,7 @@ fn command(
Some(format.as_str()),
TimeUnit::Nanoseconds,
false,
false,
tz_aware,
None,
&Default::default(),
)

View File

@ -1,7 +1,8 @@
use crate::values::NuExpression;
use std::sync::Arc;
use crate::{
dataframe::values::{Column, NuDataFrame},
dataframe::values::{Column, NuDataFrame, NuSchema},
values::CustomValueSupport,
PolarsPlugin,
};
@ -13,7 +14,7 @@ use nu_protocol::{
};
use polars::{
datatypes::{DataType, TimeUnit},
prelude::NamedFrom,
prelude::{Field, NamedFrom, Schema},
series::Series,
};
@ -54,14 +55,20 @@ impl PluginCommand for ExprDatePart {
vec![
Example {
description: "Creates an expression to capture the year date part",
example: r#"[["2021-12-30T01:02:03.123456789"]] | polars into-df | polars as-datetime "%Y-%m-%dT%H:%M:%S.%9f" | polars with-column [(polars col datetime | polars datepart year | polars as datetime_year )]"#,
example: r#"[["2021-12-30T01:02:03.123456789"]] | polars into-df | polars as-datetime "%Y-%m-%dT%H:%M:%S.%9f" --naive | polars with-column [(polars col datetime | polars datepart year | polars as datetime_year )]"#,
result: Some(
NuDataFrame::try_from_columns(
vec![
Column::new("datetime".to_string(), vec![Value::test_date(dt)]),
Column::new("datetime_year".to_string(), vec![Value::test_int(2021)]),
],
None,
Some(NuSchema::new(Arc::new(Schema::from_iter(vec![
Field::new(
"datetime".into(),
DataType::Datetime(TimeUnit::Nanoseconds, None),
),
Field::new("datetime_year".into(), DataType::Int64),
])))),
)
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
@ -69,7 +76,7 @@ impl PluginCommand for ExprDatePart {
},
Example {
description: "Creates an expression to capture multiple date parts",
example: r#"[["2021-12-30T01:02:03.123456789"]] | polars into-df | polars as-datetime "%Y-%m-%dT%H:%M:%S.%9f" |
example: r#"[["2021-12-30T01:02:03.123456789"]] | polars into-df | polars as-datetime "%Y-%m-%dT%H:%M:%S.%9f" --naive |
polars with-column [ (polars col datetime | polars datepart year | polars as datetime_year ),
(polars col datetime | polars datepart month | polars as datetime_month ),
(polars col datetime | polars datepart day | polars as datetime_day ),

View File

@ -245,7 +245,10 @@ fn value_to_data_type(value: &Value) -> Option<DataType> {
Value::Float { .. } => Some(DataType::Float64),
Value::String { .. } => Some(DataType::String),
Value::Bool { .. } => Some(DataType::Boolean),
Value::Date { .. } => Some(DataType::Date),
Value::Date { .. } => Some(DataType::Datetime(
TimeUnit::Nanoseconds,
Some(PlSmallStr::from_static("UTC")),
)),
Value::Duration { .. } => Some(DataType::Duration(TimeUnit::Nanoseconds)),
Value::Filesize { .. } => Some(DataType::Int64),
Value::Binary { .. } => Some(DataType::Binary),
@ -447,24 +450,28 @@ fn typed_column_to_series(name: PlSmallStr, column: TypedColumn) -> Result<Serie
.values
.iter()
.map(|v| {
if let Value::Date { val, .. } = &v {
// If there is a timezone specified, make sure
// the value is converted to it
Ok(maybe_tz
.as_ref()
.map(|tz| tz.parse::<Tz>().map(|tz| val.with_timezone(&tz)))
.transpose()
.map_err(|e| ShellError::GenericError {
error: "Error parsing timezone".into(),
msg: "".into(),
span: None,
help: Some(e.to_string()),
inner: vec![],
})?
.and_then(|dt| dt.timestamp_nanos_opt())
.map(|nanos| nanos_from_timeunit(nanos, *tu)))
} else {
Ok(None)
match (maybe_tz, &v) {
(Some(tz), Value::Date { val, .. }) => {
// If there is a timezone specified, make sure
// the value is converted to it
Ok(tz
.parse::<Tz>()
.map(|tz| val.with_timezone(&tz))
.map_err(|e| ShellError::GenericError {
error: "Error parsing timezone".into(),
msg: "".into(),
span: None,
help: Some(e.to_string()),
inner: vec![],
})?
.timestamp_nanos_opt()
.map(|nanos| nanos_from_timeunit(nanos, *tu)))
}
(None, Value::Date { val, .. }) => Ok(val
.timestamp_nanos_opt()
.map(|nanos| nanos_from_timeunit(nanos, *tu))),
_ => Ok(None),
}
})
.collect::<Result<Vec<Option<i64>>, ShellError>>()?;

View File

@ -71,7 +71,7 @@ impl Default for DataFrameValue {
// Equality for `DataFrameValue` is defined through `partial_cmp` on the wrapped
// value: two values are equal only when the comparison yields `Some(ordering)`
// and that ordering `is_eq`. If `partial_cmp` returns `None`, the values are
// treated as not equal.
impl PartialEq for DataFrameValue {
fn eq(&self, other: &Self) -> bool {
// The two lines below are the pre- and post-change spelling of the same check:
// both evaluate to `false` on `None` and to `Ordering::is_eq` on `Some`.
self.0.partial_cmp(&other.0).map_or(false, Ordering::is_eq)
self.0.partial_cmp(&other.0).is_some_and(Ordering::is_eq)
}
}
// Marker impl: `Eq` adds no methods on top of `PartialEq`.
impl Eq for DataFrameValue {}

View File

@ -169,6 +169,67 @@ pub fn str_to_dtype(dtype: &str, span: Span) -> Result<DataType, ShellError> {
let time_unit = str_to_time_unit(next, span)?;
Ok(DataType::Duration(time_unit))
}
// Handles `decimal<precision,scale>`.
// * precision: an unsigned integer, or `*` which maps to `None`.
// * scale: an unsigned integer; `*` is rejected with an error.
// Any failure is reported as a `ShellError::GenericError` carrying `span`.
_ if dtype.starts_with("decimal") => {
    // Strip the `decimal` keyword and the surrounding angle brackets so only
    // the comma-separated `precision,scale` payload remains.
    let dtype = dtype
        .trim_start_matches("decimal")
        .trim_start_matches('<')
        .trim_end_matches('>');
    let mut split = dtype.split(',');

    // First component: precision.
    let next = split
        .next()
        .ok_or_else(|| ShellError::GenericError {
            error: "Invalid polars data type".into(),
            msg: "Missing decimal precision".into(),
            span: Some(span),
            help: None,
            inner: vec![],
        })?
        .trim();
    let precision = match next {
        "*" => None, // infer
        _ => Some(
            next.parse::<usize>()
                .map_err(|e| ShellError::GenericError {
                    error: "Invalid polars data type".into(),
                    msg: format!("Error in parsing decimal precision: {e}"),
                    span: Some(span),
                    help: None,
                    inner: vec![],
                })?,
        ),
    };

    // Second component: scale.
    let next = split
        .next()
        .ok_or_else(|| ShellError::GenericError {
            error: "Invalid polars data type".into(),
            msg: "Missing decimal scale".into(),
            span: Some(span),
            help: None,
            inner: vec![],
        })?
        .trim();
    let scale = match next {
        "*" => Err(ShellError::GenericError {
            error: "Invalid polars data type".into(),
            msg: "`*` is not a permitted value for scale".into(),
            span: Some(span),
            help: None,
            inner: vec![],
        }),
        _ => next
            .parse::<usize>()
            .map(Some)
            .map_err(|e| ShellError::GenericError {
                error: "Invalid polars data type".into(),
                // Name the component that actually failed: this branch parses
                // the scale, not the precision.
                msg: format!("Error in parsing decimal scale: {e}"),
                span: Some(span),
                help: None,
                inner: vec![],
            }),
    }?;
    Ok(DataType::Decimal(precision, scale))
}
_ => Err(ShellError::GenericError {
error: "Invalid polars data type".into(),
msg: format!("Unknown type: {dtype}"),
@ -367,6 +428,24 @@ mod test {
assert_eq!(schema, expected);
}
/// Exercises `str_to_dtype` on `decimal<precision,scale>` strings: explicit
/// precision and scale, a wildcard scale (rejected), and a wildcard precision
/// (accepted, yields `None`).
#[test]
fn test_dtype_str_schema_decimal() {
    // Both precision and scale given explicitly.
    let parsed = str_to_dtype("decimal<7,2>", Span::unknown()).unwrap();
    assert_eq!(parsed, DataType::Decimal(Some(7usize), Some(2usize)));

    // "*" is not a permitted value for scale
    let rejected = str_to_dtype("decimal<7,*>", Span::unknown());
    assert!(matches!(rejected, Err(ShellError::GenericError { .. })));

    // "*" as precision is allowed and produces no fixed precision.
    let inferred = str_to_dtype("decimal<*,2>", Span::unknown()).unwrap();
    assert_eq!(inferred, DataType::Decimal(None, Some(2usize)));
}
#[test]
fn test_dtype_str_to_schema_list_types() {
let dtype = "list<i32>";
@ -383,5 +462,19 @@ mod test {
let schema = str_to_dtype(dtype, Span::unknown()).unwrap();
let expected = DataType::List(Box::new(DataType::Datetime(TimeUnit::Milliseconds, None)));
assert_eq!(schema, expected);
let dtype = "list<decimal<7,2>>";
let schema = str_to_dtype(dtype, Span::unknown()).unwrap();
let expected = DataType::List(Box::new(DataType::Decimal(Some(7usize), Some(2usize))));
assert_eq!(schema, expected);
let dtype = "list<decimal<*,2>>";
let schema = str_to_dtype(dtype, Span::unknown()).unwrap();
let expected = DataType::List(Box::new(DataType::Decimal(None, Some(2usize))));
assert_eq!(schema, expected);
let dtype = "list<decimal<7,*>>";
let schema = str_to_dtype(dtype, Span::unknown());
assert!(matches!(schema, Err(ShellError::GenericError { .. })));
}
}