polars into-df/polars into-lazy: --schema will not throw error if only some columns are defined (#15473)

# Description
The current implementation of `polars into-df` and `polars into-lazy`
throws an error if `--schema` is provided but does not define every
column. This PR removes that requirement: when a partial `--schema` is
provided, the types of the defined columns are overridden while the
remaining columns keep their default types.

**Current Implementation**
```
$ [[a b]; [1 "foo"] [2 "bar"]] | polars into-df -s {a: str} | polars schema
Error:   × Schema does not contain column: b
   ╭─[entry #88:1:12]
 1 │ [[a b]; [1 "foo"] [2 "bar"]] | polars into-df -s {a: str} | polars schema
   ·            ─────
   ╰────
```

**New Implementation (no error thrown on partial schema definition)**
Column `b` is not defined in `--schema`, so it keeps its default type:
```
$ [[a b]; [1 "foo"] [2 "bar"]] | polars into-df --schema {a: str} | polars schema
╭───┬─────╮
│ a │ str │
│ b │ str │
╰───┴─────╯
```

# User-Facing Changes
Soft breaking change: code that previously failed because `--schema`
did not define every column no longer throws an error. Code that
previously worked continues to work unchanged.

# Tests + Formatting


# After Submitting
This commit is contained in:
pyz4 2025-04-07 18:58:37 -04:00 committed by GitHub
parent 12a1eefe73
commit 147009a161
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 43 additions and 24 deletions

View File

@ -35,7 +35,7 @@ impl PluginCommand for ToDataFrame {
.named( .named(
"schema", "schema",
SyntaxShape::Record(vec![]), SyntaxShape::Record(vec![]),
r#"Polars Schema in format [{name: str}]. CSV, JSON, and JSONL files"#, r#"Polars Schema in format [{name: str}]."#,
Some('s'), Some('s'),
) )
.switch( .switch(
@ -193,6 +193,16 @@ impl PluginCommand for ToDataFrame {
.expect("simple df for test should not fail") .expect("simple df for test should not fail")
.into_value(Span::test_data()), .into_value(Span::test_data()),
), ),
},
Example {
description: "If a provided schema specifies a subset of columns, only those columns are selected",
example: r#"[[a b]; [1 "foo"] [2 "bar"]] | polars into-df -s {a: str}"#,
result: Some(NuDataFrame::try_from_series_vec(vec![
Series::new("a".into(), ["1", "2"]),
], Span::test_data())
.expect("simple df for test should not fail")
.into_value(Span::test_data()),
),
} }
] ]
} }

View File

@ -3,7 +3,9 @@ use crate::{dataframe::values::NuSchema, values::CustomValueSupport, Cacheable,
use crate::values::{NuDataFrame, NuLazyFrame}; use crate::values::{NuDataFrame, NuLazyFrame};
use nu_plugin::{EngineInterface, EvaluatedCall, PluginCommand}; use nu_plugin::{EngineInterface, EvaluatedCall, PluginCommand};
use nu_protocol::{Category, Example, LabeledError, PipelineData, Signature, SyntaxShape, Type}; use nu_protocol::{
record, Category, Example, LabeledError, PipelineData, Signature, SyntaxShape, Type, Value,
};
#[derive(Clone)] #[derive(Clone)]
pub struct ToLazyFrame; pub struct ToLazyFrame;
@ -24,7 +26,7 @@ impl PluginCommand for ToLazyFrame {
.named( .named(
"schema", "schema",
SyntaxShape::Record(vec![]), SyntaxShape::Record(vec![]),
r#"Polars Schema in format [{name: str}]. CSV, JSON, and JSONL files"#, r#"Polars Schema in format [{name: str}]."#,
Some('s'), Some('s'),
) )
.input_output_type(Type::Any, Type::Custom("dataframe".into())) .input_output_type(Type::Any, Type::Custom("dataframe".into()))
@ -40,7 +42,7 @@ impl PluginCommand for ToLazyFrame {
Example { Example {
description: "Takes a table, creates a lazyframe, assigns column 'b' type str, displays the schema", description: "Takes a table, creates a lazyframe, assigns column 'b' type str, displays the schema",
example: "[[a b];[1 2] [3 4]] | polars into-lazy --schema {b: str} | polars schema", example: "[[a b];[1 2] [3 4]] | polars into-lazy --schema {b: str} | polars schema",
result: None result: Some(Value::test_record(record! {"b" => Value::test_string("str")})),
}, },
] ]
} }
@ -70,6 +72,7 @@ impl PluginCommand for ToLazyFrame {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::test::test_polars_plugin_command;
use std::sync::Arc; use std::sync::Arc;
use nu_plugin_test_support::PluginTest; use nu_plugin_test_support::PluginTest;
@ -87,4 +90,9 @@ mod tests {
assert!(!df.from_eager); assert!(!df.from_eager);
Ok(()) Ok(())
} }
#[test]
fn test_examples() -> Result<(), ShellError> {
test_polars_plugin_command(&ToLazyFrame)
}
} }

View File

@ -205,6 +205,13 @@ pub fn insert_value(
column_values: &mut ColumnMap, column_values: &mut ColumnMap,
maybe_schema: &Option<NuSchema>, maybe_schema: &Option<NuSchema>,
) -> Result<(), ShellError> { ) -> Result<(), ShellError> {
// If we have a schema but a key is not provided, do not create that column
if let Some(schema) = maybe_schema {
if !schema.schema.contains(&key) {
return Ok(());
}
}
let col_val = match column_values.entry(key.clone()) { let col_val = match column_values.entry(key.clone()) {
Entry::Vacant(entry) => entry.insert(TypedColumn::new_empty(key.clone())), Entry::Vacant(entry) => entry.insert(TypedColumn::new_empty(key.clone())),
Entry::Occupied(entry) => entry.into_mut(), Entry::Occupied(entry) => entry.into_mut(),
@ -215,28 +222,22 @@ pub fn insert_value(
if let Some(field) = schema.schema.get_field(&key) { if let Some(field) = schema.schema.get_field(&key) {
col_val.column_type = Some(field.dtype().clone()); col_val.column_type = Some(field.dtype().clone());
col_val.values.push(value); col_val.values.push(value);
Ok(()) return Ok(());
} else {
Err(ShellError::GenericError {
error: format!("Schema does not contain column: {key}"),
msg: "".into(),
span: Some(value.span()),
help: None,
inner: vec![],
})
} }
} else {
let current_data_type = value_to_data_type(&value);
if col_val.column_type.is_none() {
col_val.column_type = value_to_data_type(&value);
} else if let Some(current_data_type) = current_data_type {
if col_val.column_type.as_ref() != Some(&current_data_type) {
col_val.column_type = Some(DataType::Object("Value", None));
}
}
col_val.values.push(value);
Ok(())
} }
// If we do not have a schema, use defaults specified in `value_to_data_type`
let current_data_type = value_to_data_type(&value);
if col_val.column_type.is_none() {
col_val.column_type = value_to_data_type(&value);
} else if let Some(current_data_type) = current_data_type {
if col_val.column_type.as_ref() != Some(&current_data_type) {
col_val.column_type = Some(DataType::Object("Value", None));
}
}
col_val.values.push(value);
Ok(())
} }
fn value_to_data_type(value: &Value) -> Option<DataType> { fn value_to_data_type(value: &Value) -> Option<DataType> {