From b67b6f7fc5925f1cdd37d027901ecc8c490d9a40 Mon Sep 17 00:00:00 2001 From: WMR Date: Tue, 30 May 2023 07:41:18 -0700 Subject: [PATCH] Add a datepart expression for dfr to be used with dfr with-column (#9285) # Description Today, the only way to extract date parts from a dfr series is the dfr get-* set of commands. These create a new dataframe with just the datepart in it, which is almost entirely useless. As far as I can tell, there's no way to append it as a series in the original dataframe. In discussion with fdncred on Discord, we decided the best route was to add an expression for modifying columns created in dfr with-column. Expressions are the way you manipulate series within a dataframe. I'd like feedback on this approach - I think it's a fair way to handle things. An example to test it would be: ```[[ record_time]; [ (date now)]] | dfr into-df | dfr with-column [ ((dfr col record_time) | dfr datepart nanosecond | dfr as "ns" ), (dfr col record_time | dfr datepart second | dfr as "s"), (dfr col record_time | dfr datepart minute | dfr as "m"), (dfr col record_time | dfr datepart hour | dfr as "h") ]``` I'm also proposing we deprecate the dfr get-* commands. I've not been able to figure out any meaningful way they could ever be useful, and this approach makes more sense because it attaches the extracted date part to the row in the original dataframe as a new column. # User-Facing Changes Adds dfr datepart as an expression. # Tests + Formatting Need to add more assertive tests. I'm also not sure how to properly write the test_dataframe at the bottom, but will revisit as part of this PR. Wanted to get feedback early. 
# After Submitting --------- Co-authored-by: Robert Waugh --- .../src/dataframe/eager/sql_expr.rs | 4 +- .../src/dataframe/expressions/datepart.rs | 162 ++++++++++++++++++ .../src/dataframe/expressions/mod.rs | 5 +- .../src/dataframe/series/date/as_datetime.rs | 93 ++++++---- .../values/nu_dataframe/between_values.rs | 7 +- .../values/nu_dataframe/conversion.rs | 4 +- .../src/dataframe/values/nu_expression/mod.rs | 10 +- 7 files changed, 236 insertions(+), 49 deletions(-) create mode 100644 crates/nu-cmd-dataframe/src/dataframe/expressions/datepart.rs diff --git a/crates/nu-cmd-dataframe/src/dataframe/eager/sql_expr.rs b/crates/nu-cmd-dataframe/src/dataframe/eager/sql_expr.rs index b078a39f9..66785ebb9 100644 --- a/crates/nu-cmd-dataframe/src/dataframe/eager/sql_expr.rs +++ b/crates/nu-cmd-dataframe/src/dataframe/eager/sql_expr.rs @@ -29,8 +29,8 @@ fn map_sql_polars_datatype(data_type: &SQLDataType) -> Result { SQLDataType::Boolean => DataType::Boolean, SQLDataType::Date => DataType::Date, SQLDataType::Time(_, _) => DataType::Time, - SQLDataType::Timestamp(_, _) => DataType::Datetime(TimeUnit::Milliseconds, None), - SQLDataType::Interval => DataType::Duration(TimeUnit::Milliseconds), + SQLDataType::Timestamp(_, _) => DataType::Datetime(TimeUnit::Microseconds, None), + SQLDataType::Interval => DataType::Duration(TimeUnit::Microseconds), SQLDataType::Array(inner_type) => match inner_type { Some(inner_type) => DataType::List(Box::new(map_sql_polars_datatype(inner_type)?)), None => { diff --git a/crates/nu-cmd-dataframe/src/dataframe/expressions/datepart.rs b/crates/nu-cmd-dataframe/src/dataframe/expressions/datepart.rs new file mode 100644 index 000000000..bdecd5091 --- /dev/null +++ b/crates/nu-cmd-dataframe/src/dataframe/expressions/datepart.rs @@ -0,0 +1,162 @@ +use super::super::values::NuExpression; + +use crate::dataframe::values::{Column, NuDataFrame}; +use chrono::{DateTime, FixedOffset}; +use nu_engine::CallExt; +use nu_protocol::{ + ast::Call, + 
engine::{Command, EngineState, Stack}, + Category, Example, PipelineData, ShellError, Signature, Span, Spanned, SyntaxShape, Type, + Value, +}; + +#[derive(Clone)] +pub struct ExprDatePart; + +impl Command for ExprDatePart { + fn name(&self) -> &str { + "dfr datepart" + } + + fn usage(&self) -> &str { + "Creates an expression for capturing the specified datepart in a column." + } + + fn signature(&self) -> Signature { + Signature::build(self.name()) + .required( + "Datepart name", + SyntaxShape::String, + "Part of the date to capture. Possible values are year, quarter, month, week, weekday, day, hour, minute, second, millisecond, microsecond, nanosecond", + ) + .input_type(Type::Custom("expression".into())) + .output_type(Type::Custom("expression".into())) + .category(Category::Custom("expression".into())) + } + + fn examples(&self) -> Vec { + let dt = DateTime::::parse_from_str( + "2021-12-30T01:02:03.123456789 +0000", + "%Y-%m-%dT%H:%M:%S.%9f %z", + ) + .expect("date calculation should not fail in test"); + vec![ + Example { + description: "Creates an expression to capture the year date part", + example: r#"[["2021-12-30T01:02:03.123456789"]] | dfr into-df | dfr as-datetime "%Y-%m-%dT%H:%M:%S.%9f" | dfr with-column [(dfr col datetime | dfr datepart year | dfr as datetime_year )]"#, + result: Some( + NuDataFrame::try_from_columns(vec![ + Column::new("datetime".to_string(), vec![Value::test_date(dt)]), + Column::new("datetime_year".to_string(), vec![Value::test_int(2021)]), + ]) + .expect("simple df for test should not fail") + .into_value(Span::test_data()), + ), + }, + Example { + description: "Creates an expression to capture multiple date parts", + example: r#"[["2021-12-30T01:02:03.123456789"]] | dfr into-df | dfr as-datetime "%Y-%m-%dT%H:%M:%S.%9f" | + dfr with-column [ (dfr col datetime | dfr datepart year | dfr as datetime_year ), + (dfr col datetime | dfr datepart month | dfr as datetime_month ), + (dfr col datetime | dfr datepart day | dfr as datetime_day 
), + (dfr col datetime | dfr datepart hour | dfr as datetime_hour ), + (dfr col datetime | dfr datepart minute | dfr as datetime_minute ), + (dfr col datetime | dfr datepart second | dfr as datetime_second ), + (dfr col datetime | dfr datepart nanosecond | dfr as datetime_ns ) ]"#, + result: Some( + NuDataFrame::try_from_columns(vec![ + Column::new("datetime".to_string(), vec![Value::test_date(dt)]), + Column::new("datetime_year".to_string(), vec![Value::test_int(2021)]), + Column::new("datetime_month".to_string(), vec![Value::test_int(12)]), + Column::new("datetime_day".to_string(), vec![Value::test_int(30)]), + Column::new("datetime_hour".to_string(), vec![Value::test_int(1)]), + Column::new("datetime_minute".to_string(), vec![Value::test_int(2)]), + Column::new("datetime_second".to_string(), vec![Value::test_int(3)]), + Column::new("datetime_ns".to_string(), vec![Value::test_int(123456789)]), + ]) + .expect("simple df for test should not fail") + .into_value(Span::test_data()), + ), + }, + ] + } + + fn search_terms(&self) -> Vec<&str> { + vec![ + "year", + "month", + "week", + "weekday", + "quarter", + "day", + "hour", + "minute", + "second", + "millisecond", + "microsecond", + "nanosecond", + ] + } + + fn run( + &self, + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, + ) -> Result { + let part: Spanned = call.req(engine_state, stack, 0)?; + + let expr = NuExpression::try_from_pipeline(input, call.head)?; + let expr_dt = expr.into_polars().dt(); + let expr = match part.item.as_str() { + "year" => expr_dt.year(), + "quarter" => expr_dt.quarter(), + "month" => expr_dt.month(), + "week" => expr_dt.week(), + "day" => expr_dt.day(), + "hour" => expr_dt.hour(), + "minute" => expr_dt.minute(), + "second" => expr_dt.second(), + "millisecond" => expr_dt.millisecond(), + "microsecond" => expr_dt.microsecond(), + "nanosecond" => expr_dt.nanosecond(), + _ => { + return Err(ShellError::UnsupportedInput( + format!("{} is not a valid 
datepart, expected one of year, month, day, hour, minute, second, millisecond, microsecond, nanosecond", part.item), + "value originates from here".to_string(), + call.head, + part.span, + )); + } + }.into(); + + Ok(PipelineData::Value( + NuExpression::into_value(expr, call.head), + None, + )) + } +} + +#[cfg(test)] +mod test { + use super::super::super::test_dataframe::test_dataframe; + use super::*; + use crate::dataframe::eager::WithColumn; + use crate::dataframe::expressions::ExprAlias; + use crate::dataframe::expressions::ExprAsNu; + use crate::dataframe::expressions::ExprCol; + use crate::dataframe::series::AsDateTime; + + #[test] + fn test_examples() { + test_dataframe(vec![ + Box::new(ExprDatePart {}), + Box::new(ExprCol {}), + Box::new(ExprAsNu {}), + Box::new(AsDateTime {}), + Box::new(WithColumn {}), + Box::new(ExprAlias {}), + ]) + } +} diff --git a/crates/nu-cmd-dataframe/src/dataframe/expressions/mod.rs b/crates/nu-cmd-dataframe/src/dataframe/expressions/mod.rs index 25bfc4d41..75f9a007f 100644 --- a/crates/nu-cmd-dataframe/src/dataframe/expressions/mod.rs +++ b/crates/nu-cmd-dataframe/src/dataframe/expressions/mod.rs @@ -3,6 +3,7 @@ mod arg_where; mod as_nu; mod col; mod concat_str; +mod datepart; mod expressions_macro; mod is_in; mod lit; @@ -17,6 +18,7 @@ use crate::dataframe::expressions::arg_where::ExprArgWhere; use crate::dataframe::expressions::as_nu::ExprAsNu; pub(super) use crate::dataframe::expressions::col::ExprCol; pub(super) use crate::dataframe::expressions::concat_str::ExprConcatStr; +pub(crate) use crate::dataframe::expressions::datepart::ExprDatePart; pub(crate) use crate::dataframe::expressions::expressions_macro::*; pub(super) use crate::dataframe::expressions::is_in::ExprIsIn; pub(super) use crate::dataframe::expressions::lit::ExprLit; @@ -64,6 +66,7 @@ pub fn add_expressions(working_set: &mut StateWorkingSet) { ExprMean, ExprMedian, ExprStd, - ExprVar + ExprVar, + ExprDatePart ); } diff --git 
a/crates/nu-cmd-dataframe/src/dataframe/series/date/as_datetime.rs b/crates/nu-cmd-dataframe/src/dataframe/series/date/as_datetime.rs index aaff3bd0a..ba69b5a9a 100644 --- a/crates/nu-cmd-dataframe/src/dataframe/series/date/as_datetime.rs +++ b/crates/nu-cmd-dataframe/src/dataframe/series/date/as_datetime.rs @@ -46,35 +46,66 @@ impl Command for AsDateTime { } fn examples(&self) -> Vec { - vec![Example { - description: "Converts string to datetime", - example: r#"["2021-12-30 00:00:00" "2021-12-31 00:00:00"] | dfr into-df | dfr as-datetime "%Y-%m-%d %H:%M:%S""#, - result: Some( - NuDataFrame::try_from_columns(vec![Column::new( - "datetime".to_string(), - vec![ - Value::Date { - val: DateTime::parse_from_str( - "2021-12-30 00:00:00 +0000", - "%Y-%m-%d %H:%M:%S %z", - ) - .expect("date calculation should not fail in test"), - span: Span::test_data(), - }, - Value::Date { - val: DateTime::parse_from_str( - "2021-12-31 00:00:00 +0000", - "%Y-%m-%d %H:%M:%S %z", - ) - .expect("date calculation should not fail in test"), - span: Span::test_data(), - }, - ], - )]) - .expect("simple df for test should not fail") - .into_value(Span::test_data()), - ), - }] + vec![ + Example { + description: "Converts string to datetime", + example: r#"["2021-12-30 00:00:00" "2021-12-31 00:00:00"] | dfr into-df | dfr as-datetime "%Y-%m-%d %H:%M:%S""#, + result: Some( + NuDataFrame::try_from_columns(vec![Column::new( + "datetime".to_string(), + vec![ + Value::Date { + val: DateTime::parse_from_str( + "2021-12-30 00:00:00 +0000", + "%Y-%m-%d %H:%M:%S %z", + ) + .expect("date calculation should not fail in test"), + span: Span::test_data(), + }, + Value::Date { + val: DateTime::parse_from_str( + "2021-12-31 00:00:00 +0000", + "%Y-%m-%d %H:%M:%S %z", + ) + .expect("date calculation should not fail in test"), + span: Span::test_data(), + }, + ], + )]) + .expect("simple df for test should not fail") + .into_value(Span::test_data()), + ), + }, + Example { + description: "Converts string to datetime 
with high resolutions", + example: r#"["2021-12-30 00:00:00.123456789" "2021-12-31 00:00:00.123456789"] | dfr into-df | dfr as-datetime "%Y-%m-%d %H:%M:%S.%9f""#, + result: Some( + NuDataFrame::try_from_columns(vec![Column::new( + "datetime".to_string(), + vec![ + Value::Date { + val: DateTime::parse_from_str( + "2021-12-30 00:00:00.123456789 +0000", + "%Y-%m-%d %H:%M:%S.%9f %z", + ) + .expect("date calculation should not fail in test"), + span: Span::test_data(), + }, + Value::Date { + val: DateTime::parse_from_str( + "2021-12-31 00:00:00.123456789 +0000", + "%Y-%m-%d %H:%M:%S.%9f %z", + ) + .expect("date calculation should not fail in test"), + span: Span::test_data(), + }, + ], + )]) + .expect("simple df for test should not fail") + .into_value(Span::test_data()), + ), + }, + ] } fn run( @@ -110,11 +141,11 @@ fn command( })?; let res = if not_exact { - casted.as_datetime_not_exact(Some(format.as_str()), TimeUnit::Milliseconds, None) + casted.as_datetime_not_exact(Some(format.as_str()), TimeUnit::Nanoseconds, None) } else { casted.as_datetime( Some(format.as_str()), - TimeUnit::Milliseconds, + TimeUnit::Nanoseconds, false, false, true, diff --git a/crates/nu-cmd-dataframe/src/dataframe/values/nu_dataframe/between_values.rs b/crates/nu-cmd-dataframe/src/dataframe/values/nu_dataframe/between_values.rs index ade5ca59a..88ba19d99 100644 --- a/crates/nu-cmd-dataframe/src/dataframe/values/nu_dataframe/between_values.rs +++ b/crates/nu-cmd-dataframe/src/dataframe/values/nu_dataframe/between_values.rs @@ -6,7 +6,7 @@ use nu_protocol::{ use num::Zero; use polars::prelude::{ BooleanType, ChunkCompare, ChunkedArray, DataType, Float64Type, Int64Type, IntoSeries, - NumOpsDispatchChecked, PolarsError, Series, TimeUnit, Utf8NameSpaceImpl, + NumOpsDispatchChecked, PolarsError, Series, Utf8NameSpaceImpl, }; use std::ops::{Add, BitAnd, BitOr, Div, Mul, Sub}; @@ -580,10 +580,7 @@ where F: Fn(&ChunkedArray, i64) -> ChunkedArray, { match series.dtype() { - DataType::UInt32 - | 
DataType::Int32 - | DataType::UInt64 - | DataType::Datetime(TimeUnit::Milliseconds, _) => { + DataType::UInt32 | DataType::Int32 | DataType::UInt64 | DataType::Datetime(_, _) => { let to_i64 = series.cast(&DataType::Int64); match to_i64 { diff --git a/crates/nu-cmd-dataframe/src/dataframe/values/nu_dataframe/conversion.rs b/crates/nu-cmd-dataframe/src/dataframe/values/nu_dataframe/conversion.rs index 5aa82743a..06c7e81f4 100644 --- a/crates/nu-cmd-dataframe/src/dataframe/values/nu_dataframe/conversion.rs +++ b/crates/nu-cmd-dataframe/src/dataframe/values/nu_dataframe/conversion.rs @@ -749,7 +749,7 @@ pub fn from_parsed_columns(column_values: ColumnMap) -> Result { let it = column.values.iter().map(|v| { if let Value::Date { val, .. } = &v { - Some(val.timestamp_millis()) + Some(val.timestamp_nanos()) } else { None } @@ -757,7 +757,7 @@ pub fn from_parsed_columns(column_values: ColumnMap) -> Result::from_iter_options(&name, it) - .into_datetime(TimeUnit::Milliseconds, None); + .into_datetime(TimeUnit::Nanoseconds, None); df_series.push(res.into_series()) } diff --git a/crates/nu-cmd-dataframe/src/dataframe/values/nu_expression/mod.rs b/crates/nu-cmd-dataframe/src/dataframe/values/nu_expression/mod.rs index 83f931bcb..5743fa841 100644 --- a/crates/nu-cmd-dataframe/src/dataframe/values/nu_expression/mod.rs +++ b/crates/nu-cmd-dataframe/src/dataframe/values/nu_expression/mod.rs @@ -1,6 +1,5 @@ mod custom_value; -use core::fmt; use nu_protocol::{PipelineData, ShellError, Span, Value}; use polars::prelude::{col, AggExpr, Expr, Literal}; use serde::{Deserialize, Deserializer, Serialize, Serializer}; @@ -8,7 +7,7 @@ use serde::{Deserialize, Deserializer, Serialize, Serializer}; // Polars Expression wrapper for Nushell operations // Object is behind and Option to allow easy implementation of // the Deserialize trait -#[derive(Default, Clone)] +#[derive(Default, Clone, Debug)] pub struct NuExpression(Option); // Mocked serialization of the LazyFrame object @@ -31,12 +30,6 @@ 
impl<'de> Deserialize<'de> for NuExpression { } } -impl fmt::Debug for NuExpression { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "NuExpression") - } -} - // Referenced access to the real LazyFrame impl AsRef for NuExpression { fn as_ref(&self) -> &polars::prelude::Expr { @@ -132,6 +125,7 @@ impl NuExpression { } } +#[derive(Debug)] // Enum to represent the parsing of the expressions from Value enum ExtractedExpr { Single(Expr),