From 147009a161487a1b339d7159db232c7a21dbd7c7 Mon Sep 17 00:00:00 2001 From: pyz4 <42039243+pyz4@users.noreply.github.com> Date: Mon, 7 Apr 2025 18:58:37 -0400 Subject: [PATCH] `polars into-df`/`polars into-lazy`: `--schema` will not throw error if only some columns are defined (#15473) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Description The current implementation of `polars into-df` and `polars into-lazy` will throw an error if `--schema` is provided but not all columns are defined. This PR seeks to remove this requirement so that when a partial `--schema` is provided, the types on the defined columns are overridden while the remaining columns take on their default types. **Current Implementation** ``` $ [[a b]; [1 "foo"] [2 "bar"]] | polars into-df -s {a: str} | polars schema Error: × Schema does not contain column: b ╭─[entry #88:1:12] 1 │ [[a b]; [1 "foo"] [2 "bar"]] | polars into-df -s {a: str} | polars schema · ───── ╰──── ``` **New Implementation (no error thrown on partial schema definition)** Column b is not defined in `--schema` ``` $ [[a b]; [1 "foo"] [2 "bar"]] | polars into-df --schema {a: str} | polars schema ╭───┬─────╮ │ a │ str │ │ b │ str │ ╰───┴─────╯ ``` # User-Facing Changes Soft breaking change: The user's previous (erroneous) code that would have thrown an error would no longer throw an error. The user's previous working code will still work. 
# Tests + Formatting # After Submitting --- .../src/dataframe/command/core/to_df.rs | 12 +++++- .../src/dataframe/command/core/to_lazy.rs | 14 +++++-- .../values/nu_dataframe/conversion.rs | 41 ++++++++++--------- 3 files changed, 43 insertions(+), 24 deletions(-) diff --git a/crates/nu_plugin_polars/src/dataframe/command/core/to_df.rs b/crates/nu_plugin_polars/src/dataframe/command/core/to_df.rs index f8fd9f4572..90f96a8a99 100644 --- a/crates/nu_plugin_polars/src/dataframe/command/core/to_df.rs +++ b/crates/nu_plugin_polars/src/dataframe/command/core/to_df.rs @@ -35,7 +35,7 @@ impl PluginCommand for ToDataFrame { .named( "schema", SyntaxShape::Record(vec![]), - r#"Polars Schema in format [{name: str}]. CSV, JSON, and JSONL files"#, + r#"Polars Schema in format [{name: str}]."#, Some('s'), ) .switch( @@ -193,6 +193,16 @@ impl PluginCommand for ToDataFrame { .expect("simple df for test should not fail") .into_value(Span::test_data()), ), + }, + Example { + description: "If a provided schema specifies a subset of columns, only those columns are selected", + example: r#"[[a b]; [1 "foo"] [2 "bar"]] | polars into-df -s {a: str}"#, + result: Some(NuDataFrame::try_from_series_vec(vec![ + Series::new("a".into(), ["1", "2"]), + ], Span::test_data()) + .expect("simple df for test should not fail") + .into_value(Span::test_data()), + ), } ] } diff --git a/crates/nu_plugin_polars/src/dataframe/command/core/to_lazy.rs b/crates/nu_plugin_polars/src/dataframe/command/core/to_lazy.rs index 4d8bf5bfcf..6086925aac 100644 --- a/crates/nu_plugin_polars/src/dataframe/command/core/to_lazy.rs +++ b/crates/nu_plugin_polars/src/dataframe/command/core/to_lazy.rs @@ -3,7 +3,9 @@ use crate::{dataframe::values::NuSchema, values::CustomValueSupport, Cacheable, use crate::values::{NuDataFrame, NuLazyFrame}; use nu_plugin::{EngineInterface, EvaluatedCall, PluginCommand}; -use nu_protocol::{Category, Example, LabeledError, PipelineData, Signature, SyntaxShape, Type}; +use nu_protocol::{ + 
record, Category, Example, LabeledError, PipelineData, Signature, SyntaxShape, Type, Value, +}; #[derive(Clone)] pub struct ToLazyFrame; @@ -24,7 +26,7 @@ impl PluginCommand for ToLazyFrame { .named( "schema", SyntaxShape::Record(vec![]), - r#"Polars Schema in format [{name: str}]. CSV, JSON, and JSONL files"#, + r#"Polars Schema in format [{name: str}]."#, Some('s'), ) .input_output_type(Type::Any, Type::Custom("dataframe".into())) @@ -40,7 +42,7 @@ impl PluginCommand for ToLazyFrame { Example { description: "Takes a table, creates a lazyframe, assigns column 'b' type str, displays the schema", example: "[[a b];[1 2] [3 4]] | polars into-lazy --schema {b: str} | polars schema", - result: None + result: Some(Value::test_record(record! {"b" => Value::test_string("str")})), }, ] } @@ -70,6 +72,7 @@ impl PluginCommand for ToLazyFrame { #[cfg(test)] mod tests { + use crate::test::test_polars_plugin_command; use std::sync::Arc; use nu_plugin_test_support::PluginTest; @@ -87,4 +90,9 @@ mod tests { assert!(!df.from_eager); Ok(()) } + + #[test] + fn test_examples() -> Result<(), ShellError> { + test_polars_plugin_command(&ToLazyFrame) + } } diff --git a/crates/nu_plugin_polars/src/dataframe/values/nu_dataframe/conversion.rs b/crates/nu_plugin_polars/src/dataframe/values/nu_dataframe/conversion.rs index 8b898e449a..b84e85a632 100644 --- a/crates/nu_plugin_polars/src/dataframe/values/nu_dataframe/conversion.rs +++ b/crates/nu_plugin_polars/src/dataframe/values/nu_dataframe/conversion.rs @@ -205,6 +205,13 @@ pub fn insert_value( column_values: &mut ColumnMap, maybe_schema: &Option<NuSchema>, ) -> Result<(), ShellError> { + // If we have a schema but a key is not provided, do not create that column + if let Some(schema) = maybe_schema { + if !schema.schema.contains(&key) { + return Ok(()); + } + } + let col_val = match column_values.entry(key.clone()) { Entry::Vacant(entry) => entry.insert(TypedColumn::new_empty(key.clone())), Entry::Occupied(entry) => entry.into_mut(), @@ -215,28 
+222,22 @@ pub fn insert_value( if let Some(field) = schema.schema.get_field(&key) { col_val.column_type = Some(field.dtype().clone()); col_val.values.push(value); - Ok(()) - } else { - Err(ShellError::GenericError { - error: format!("Schema does not contain column: {key}"), - msg: "".into(), - span: Some(value.span()), - help: None, - inner: vec![], - }) + return Ok(()); } - } else { - let current_data_type = value_to_data_type(&value); - if col_val.column_type.is_none() { - col_val.column_type = value_to_data_type(&value); - } else if let Some(current_data_type) = current_data_type { - if col_val.column_type.as_ref() != Some(&current_data_type) { - col_val.column_type = Some(DataType::Object("Value", None)); - } - } - col_val.values.push(value); - Ok(()) } + + // If we do not have a schema, use defaults specified in `value_to_data_type` + let current_data_type = value_to_data_type(&value); + if col_val.column_type.is_none() { + col_val.column_type = value_to_data_type(&value); + } else if let Some(current_data_type) = current_data_type { + if col_val.column_type.as_ref() != Some(&current_data_type) { + col_val.column_type = Some(DataType::Object("Value", None)); + } + } + col_val.values.push(value); + + Ok(()) } fn value_to_data_type(value: &Value) -> Option<DataType> {