Improved null handling when converting from nu -> dataframe. (#13855)

# Description
Fixes: #12726 and #13185

Previously converting columns that contained null caused polars to force
a dtype of object even when using a schema.

Now:
1. When using a schema, the type the schema defines for the column will
always be used.
2. When a schema is not used, the previous type is used when a value is
null.

# User-Facing Changes
- The type defined by the schema will be respected when passing in a null
value: `[[a]; [null]] | polars into-df -s {a: str}` will create a df with
a str dtype column with one null value, rather than a column of type object.
- *BREAKING CHANGE* If you define a schema, all columns must be in the
schema.
This commit is contained in:
Jack Wright 2024-09-16 16:07:13 -07:00 committed by GitHub
parent 9ca0fb772d
commit af77bc60e2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 272 additions and 45 deletions

View File

@ -32,12 +32,9 @@ impl PluginCommand for ExprCount {
} }
fn examples(&self) -> Vec<Example> { fn examples(&self) -> Vec<Example> {
// to add an example with a result that contains a null we will need to be able to
// allow null values to be entered into the dataframe from nushell
// and retain the correct dtype. Right now null values cause the dtype to be object
vec![Example { vec![Example {
description: "Count the number of non-null values in a column", description: "Count the number of non-null values in a column",
example: r#"[[a]; ["foo"] ["bar"]] | polars into-df example: r#"[[a]; ["foo"] ["bar"] [null]] | polars into-df
| polars select (polars col a | polars count) | polars select (polars col a | polars count)
| polars collect"#, | polars collect"#,
result: Some( result: Some(

View File

@ -71,8 +71,11 @@ pub struct Column {
} }
impl Column { impl Column {
pub fn new(name: String, values: Vec<Value>) -> Self { pub fn new(name: impl Into<String>, values: Vec<Value>) -> Self {
Self { name, values } Self {
name: name.into(),
values,
}
} }
pub fn new_empty(name: String) -> Self { pub fn new_empty(name: String) -> Self {
@ -149,7 +152,7 @@ pub fn create_column(
) -> Result<Column, ShellError> { ) -> Result<Column, ShellError> {
let size = to_row - from_row; let size = to_row - from_row;
let values = series_to_values(series, Some(from_row), Some(size), span)?; let values = series_to_values(series, Some(from_row), Some(size), span)?;
Ok(Column::new(series.name().into(), values)) Ok(Column::new(series.name(), values))
} }
// Adds a separator to the vector of values using the column names from the // Adds a separator to the vector of values using the column names from the
@ -194,9 +197,25 @@ pub fn insert_value(
Entry::Occupied(entry) => entry.into_mut(), Entry::Occupied(entry) => entry.into_mut(),
}; };
// If we have a schema, use that for determining how things should be added to each column
if let Some(schema) = maybe_schema {
if let Some(field) = schema.schema.get_field(&key) {
col_val.column_type = Some(field.data_type().clone());
col_val.values.push(value);
Ok(())
} else {
Err(ShellError::GenericError {
error: format!("Schema does not contain column: {key}"),
msg: "".into(),
span: Some(value.span()),
help: None,
inner: vec![],
})
}
}
// Checking that the type for the value is the same // Checking that the type for the value is the same
// for the previous value in the column // for the previous value in the column
if col_val.values.is_empty() { else if col_val.values.is_empty() {
if let Some(schema) = maybe_schema { if let Some(schema) = maybe_schema {
if let Some(field) = schema.schema.get_field(&key) { if let Some(field) = schema.schema.get_field(&key) {
col_val.column_type = Some(field.data_type().clone()); col_val.column_type = Some(field.data_type().clone());
@ -206,8 +225,8 @@ pub fn insert_value(
if col_val.column_type.is_none() { if col_val.column_type.is_none() {
col_val.column_type = Some(value_to_data_type(&value)); col_val.column_type = Some(value_to_data_type(&value));
} }
col_val.values.push(value); col_val.values.push(value);
Ok(())
} else { } else {
let prev_value = &col_val.values[col_val.values.len() - 1]; let prev_value = &col_val.values[col_val.values.len() - 1];
@ -219,6 +238,7 @@ pub fn insert_value(
| (Value::Date { .. }, Value::Date { .. }) | (Value::Date { .. }, Value::Date { .. })
| (Value::Filesize { .. }, Value::Filesize { .. }) | (Value::Filesize { .. }, Value::Filesize { .. })
| (Value::Duration { .. }, Value::Duration { .. }) => col_val.values.push(value), | (Value::Duration { .. }, Value::Duration { .. }) => col_val.values.push(value),
(_, Value::Nothing { .. }) => col_val.values.push(value),
(Value::List { .. }, _) => { (Value::List { .. }, _) => {
col_val.column_type = Some(value_to_data_type(&value)); col_val.column_type = Some(value_to_data_type(&value));
col_val.values.push(value); col_val.values.push(value);
@ -228,9 +248,8 @@ pub fn insert_value(
col_val.values.push(value); col_val.values.push(value);
} }
} }
}
Ok(()) Ok(())
}
} }
fn value_to_data_type(value: &Value) -> DataType { fn value_to_data_type(value: &Value) -> DataType {
@ -269,7 +288,8 @@ fn typed_column_to_series(name: &str, column: TypedColumn) -> Result<Series, She
let series_values: Result<Vec<_>, _> = column let series_values: Result<Vec<_>, _> = column
.values .values
.iter() .iter()
.map(|v| match v { .map(|v| {
value_to_option(v, |v| match v {
Value::Float { val, .. } => Ok(*val as f32), Value::Float { val, .. } => Ok(*val as f32),
Value::Int { val, .. } => Ok(*val as f32), Value::Int { val, .. } => Ok(*val as f32),
x => Err(ShellError::GenericError { x => Err(ShellError::GenericError {
@ -280,6 +300,7 @@ fn typed_column_to_series(name: &str, column: TypedColumn) -> Result<Series, She
inner: vec![], inner: vec![],
}), }),
}) })
})
.collect(); .collect();
Ok(Series::new(name, series_values?)) Ok(Series::new(name, series_values?))
} }
@ -287,7 +308,8 @@ fn typed_column_to_series(name: &str, column: TypedColumn) -> Result<Series, She
let series_values: Result<Vec<_>, _> = column let series_values: Result<Vec<_>, _> = column
.values .values
.iter() .iter()
.map(|v| match v { .map(|v| {
value_to_option(v, |v| match v {
Value::Float { val, .. } => Ok(*val), Value::Float { val, .. } => Ok(*val),
Value::Int { val, .. } => Ok(*val as f64), Value::Int { val, .. } => Ok(*val as f64),
x => Err(ShellError::GenericError { x => Err(ShellError::GenericError {
@ -298,6 +320,7 @@ fn typed_column_to_series(name: &str, column: TypedColumn) -> Result<Series, She
inner: vec![], inner: vec![],
}), }),
}) })
})
.collect(); .collect();
Ok(Series::new(name, series_values?)) Ok(Series::new(name, series_values?))
} }
@ -305,7 +328,7 @@ fn typed_column_to_series(name: &str, column: TypedColumn) -> Result<Series, She
let series_values: Result<Vec<_>, _> = column let series_values: Result<Vec<_>, _> = column
.values .values
.iter() .iter()
.map(|v| v.as_i64().map(|v| v as u8)) .map(|v| value_to_option(v, |v| v.as_i64().map(|v| v as u8)))
.collect(); .collect();
Ok(Series::new(name, series_values?)) Ok(Series::new(name, series_values?))
} }
@ -313,7 +336,7 @@ fn typed_column_to_series(name: &str, column: TypedColumn) -> Result<Series, She
let series_values: Result<Vec<_>, _> = column let series_values: Result<Vec<_>, _> = column
.values .values
.iter() .iter()
.map(|v| v.as_i64().map(|v| v as u16)) .map(|v| value_to_option(v, |v| v.as_i64().map(|v| v as u16)))
.collect(); .collect();
Ok(Series::new(name, series_values?)) Ok(Series::new(name, series_values?))
} }
@ -321,7 +344,7 @@ fn typed_column_to_series(name: &str, column: TypedColumn) -> Result<Series, She
let series_values: Result<Vec<_>, _> = column let series_values: Result<Vec<_>, _> = column
.values .values
.iter() .iter()
.map(|v| v.as_i64().map(|v| v as u32)) .map(|v| value_to_option(v, |v| v.as_i64().map(|v| v as u32)))
.collect(); .collect();
Ok(Series::new(name, series_values?)) Ok(Series::new(name, series_values?))
} }
@ -329,7 +352,7 @@ fn typed_column_to_series(name: &str, column: TypedColumn) -> Result<Series, She
let series_values: Result<Vec<_>, _> = column let series_values: Result<Vec<_>, _> = column
.values .values
.iter() .iter()
.map(|v| v.as_i64().map(|v| v as u64)) .map(|v| value_to_option(v, |v| v.as_i64().map(|v| v as u64)))
.collect(); .collect();
Ok(Series::new(name, series_values?)) Ok(Series::new(name, series_values?))
} }
@ -337,7 +360,7 @@ fn typed_column_to_series(name: &str, column: TypedColumn) -> Result<Series, She
let series_values: Result<Vec<_>, _> = column let series_values: Result<Vec<_>, _> = column
.values .values
.iter() .iter()
.map(|v| v.as_i64().map(|v| v as i8)) .map(|v| value_to_option(v, |v| v.as_i64().map(|v| v as i8)))
.collect(); .collect();
Ok(Series::new(name, series_values?)) Ok(Series::new(name, series_values?))
} }
@ -345,7 +368,7 @@ fn typed_column_to_series(name: &str, column: TypedColumn) -> Result<Series, She
let series_values: Result<Vec<_>, _> = column let series_values: Result<Vec<_>, _> = column
.values .values
.iter() .iter()
.map(|v| v.as_i64().map(|v| v as i16)) .map(|v| value_to_option(v, |v| v.as_i64().map(|v| v as i16)))
.collect(); .collect();
Ok(Series::new(name, series_values?)) Ok(Series::new(name, series_values?))
} }
@ -353,23 +376,32 @@ fn typed_column_to_series(name: &str, column: TypedColumn) -> Result<Series, She
let series_values: Result<Vec<_>, _> = column let series_values: Result<Vec<_>, _> = column
.values .values
.iter() .iter()
.map(|v| v.as_i64().map(|v| v as i32)) .map(|v| value_to_option(v, |v| v.as_i64().map(|v| v as i32)))
.collect(); .collect();
Ok(Series::new(name, series_values?)) Ok(Series::new(name, series_values?))
} }
DataType::Int64 => { DataType::Int64 => {
let series_values: Result<Vec<_>, _> = let series_values: Result<Vec<_>, _> = column
column.values.iter().map(|v| v.as_i64()).collect(); .values
.iter()
.map(|v| value_to_option(v, |v| v.as_i64()))
.collect();
Ok(Series::new(name, series_values?)) Ok(Series::new(name, series_values?))
} }
DataType::Boolean => { DataType::Boolean => {
let series_values: Result<Vec<_>, _> = let series_values: Result<Vec<_>, _> = column
column.values.iter().map(|v| v.as_bool()).collect(); .values
.iter()
.map(|v| value_to_option(v, |v| v.as_bool()))
.collect();
Ok(Series::new(name, series_values?)) Ok(Series::new(name, series_values?))
} }
DataType::String => { DataType::String => {
let series_values: Result<Vec<_>, _> = let series_values: Result<Vec<_>, _> = column
column.values.iter().map(|v| v.coerce_string()).collect(); .values
.iter()
.map(|v| value_to_option(v, |v| v.coerce_string()))
.collect();
Ok(Series::new(name, series_values?)) Ok(Series::new(name, series_values?))
} }
DataType::Object(_, _) => value_to_series(name, &column.values), DataType::Object(_, _) => value_to_series(name, &column.values),
@ -377,7 +409,11 @@ fn typed_column_to_series(name: &str, column: TypedColumn) -> Result<Series, She
let series_values: Result<Vec<_>, _> = column let series_values: Result<Vec<_>, _> = column
.values .values
.iter() .iter()
.map(|v| v.as_i64().map(|v| nanos_from_timeunit(v, *time_unit))) .map(|v| {
value_to_option(v, |v| {
v.as_i64().map(|v| nanos_from_timeunit(v, *time_unit))
})
})
.collect(); .collect();
Ok(Series::new(name, series_values?)) Ok(Series::new(name, series_values?))
} }
@ -1241,6 +1277,17 @@ fn time_from_midnight(nanos: i64, span: Span) -> Result<Value, ShellError> {
}) })
} }
/// Converts a single nushell `Value` into an optional, typed element.
///
/// When `value.is_nothing()` is true the result is `Ok(None)` and `func` is
/// never invoked. Otherwise `func` is applied to `value` and its successful
/// output is wrapped in `Some`.
///
/// # Errors
///
/// Returns whatever error `func` produces for a non-nothing value.
fn value_to_option<T, F>(value: &Value, func: F) -> Result<Option<T>, ShellError>
where
    F: FnOnce(&Value) -> Result<T, ShellError>,
{
    if value.is_nothing() {
        Ok(None)
    } else {
        // `Some` is already a function `T -> Option<T>`; no wrapper closure is needed.
        func(value).map(Some)
    }
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use indexmap::indexmap; use indexmap::indexmap;
@ -1461,4 +1508,187 @@ mod tests {
Ok(()) Ok(())
} }
#[test]
fn test_typed_column_to_series_f32() -> Result<(), Box<dyn std::error::Error>> {
    // A Float32-typed column holding a float, an int and a nothing value:
    // the int is expected as 2.0 and the nothing value as a missing entry.
    let values = vec![
        Value::test_float(1.1),
        Value::test_int(2),
        Value::test_nothing(),
    ];
    let typed = TypedColumn {
        column: Column::new("foo", values),
        column_type: Some(DataType::Float32),
    };

    let actual = typed_column_to_series("foo", typed)?;
    let expected = Series::new("name", [Some(1.1f32), Some(2.0), None]);
    assert_eq!(actual, expected);
    Ok(())
}
#[test]
fn test_typed_column_to_series_f64() -> Result<(), Box<dyn std::error::Error>> {
    // A Float64-typed column holding a float, an int and a nothing value:
    // the int is expected as 2.0 and the nothing value as a missing entry.
    let values = vec![
        Value::test_float(1.1),
        Value::test_int(2),
        Value::test_nothing(),
    ];
    let typed = TypedColumn {
        column: Column::new("foo", values),
        column_type: Some(DataType::Float64),
    };

    let actual = typed_column_to_series("foo", typed)?;
    let expected = Series::new("name", [Some(1.1f64), Some(2.0), None]);
    assert_eq!(actual, expected);
    Ok(())
}
#[test]
fn test_typed_column_to_series_u8() -> Result<(), Box<dyn std::error::Error>> {
    // A nothing value in a UInt8-typed column is expected as a missing entry.
    let values = vec![Value::test_int(1), Value::test_nothing()];
    let typed = TypedColumn {
        column: Column::new("foo", values),
        column_type: Some(DataType::UInt8),
    };

    let actual = typed_column_to_series("foo", typed)?;
    let expected = Series::new("name", [Some(1u8), None]);
    assert_eq!(actual, expected);
    Ok(())
}
#[test]
fn test_typed_column_to_series_u16() -> Result<(), Box<dyn std::error::Error>> {
    // A nothing value in a UInt16-typed column is expected as a missing entry.
    let values = vec![Value::test_int(1), Value::test_nothing()];
    let typed = TypedColumn {
        column: Column::new("foo", values),
        column_type: Some(DataType::UInt16),
    };

    let actual = typed_column_to_series("foo", typed)?;
    let expected = Series::new("name", [Some(1u16), None]);
    assert_eq!(actual, expected);
    Ok(())
}
#[test]
fn test_typed_column_to_series_u32() -> Result<(), Box<dyn std::error::Error>> {
    // A nothing value in a UInt32-typed column is expected as a missing entry.
    let values = vec![Value::test_int(1), Value::test_nothing()];
    let typed = TypedColumn {
        column: Column::new("foo", values),
        column_type: Some(DataType::UInt32),
    };

    let actual = typed_column_to_series("foo", typed)?;
    let expected = Series::new("name", [Some(1u32), None]);
    assert_eq!(actual, expected);
    Ok(())
}
#[test]
fn test_typed_column_to_series_u64() -> Result<(), Box<dyn std::error::Error>> {
    // A nothing value in a UInt64-typed column is expected as a missing entry.
    let values = vec![Value::test_int(1), Value::test_nothing()];
    let typed = TypedColumn {
        column: Column::new("foo", values),
        column_type: Some(DataType::UInt64),
    };

    let actual = typed_column_to_series("foo", typed)?;
    let expected = Series::new("name", [Some(1u64), None]);
    assert_eq!(actual, expected);
    Ok(())
}
#[test]
fn test_typed_column_to_series_i8() -> Result<(), Box<dyn std::error::Error>> {
    // A nothing value in an Int8-typed column is expected as a missing entry.
    let values = vec![Value::test_int(1), Value::test_nothing()];
    let typed = TypedColumn {
        column: Column::new("foo", values),
        column_type: Some(DataType::Int8),
    };

    let actual = typed_column_to_series("foo", typed)?;
    let expected = Series::new("name", [Some(1i8), None]);
    assert_eq!(actual, expected);
    Ok(())
}
#[test]
fn test_typed_column_to_series_i16() -> Result<(), Box<dyn std::error::Error>> {
    // A nothing value in an Int16-typed column is expected as a missing entry.
    let values = vec![Value::test_int(1), Value::test_nothing()];
    let typed = TypedColumn {
        column: Column::new("foo", values),
        column_type: Some(DataType::Int16),
    };

    let actual = typed_column_to_series("foo", typed)?;
    let expected = Series::new("name", [Some(1i16), None]);
    assert_eq!(actual, expected);
    Ok(())
}
#[test]
fn test_typed_column_to_series_i32() -> Result<(), Box<dyn std::error::Error>> {
    // A nothing value in an Int32-typed column is expected as a missing entry.
    let values = vec![Value::test_int(1), Value::test_nothing()];
    let typed = TypedColumn {
        column: Column::new("foo", values),
        column_type: Some(DataType::Int32),
    };

    let actual = typed_column_to_series("foo", typed)?;
    let expected = Series::new("name", [Some(1i32), None]);
    assert_eq!(actual, expected);
    Ok(())
}
#[test]
fn test_typed_column_to_series_i64() -> Result<(), Box<dyn std::error::Error>> {
    // A nothing value in an Int64-typed column is expected as a missing entry.
    let values = vec![Value::test_int(1), Value::test_nothing()];
    let typed = TypedColumn {
        column: Column::new("foo", values),
        column_type: Some(DataType::Int64),
    };

    let actual = typed_column_to_series("foo", typed)?;
    let expected = Series::new("name", [Some(1i64), None]);
    assert_eq!(actual, expected);
    Ok(())
}
#[test]
fn test_typed_column_to_series_bool() -> Result<(), Box<dyn std::error::Error>> {
    // Two booleans followed by a nothing value in a Boolean-typed column:
    // the nothing value is expected as a missing entry.
    let values = vec![
        Value::test_bool(true),
        Value::test_bool(false),
        Value::test_nothing(),
    ];
    let typed = TypedColumn {
        column: Column::new("foo", values),
        column_type: Some(DataType::Boolean),
    };

    let actual = typed_column_to_series("foo", typed)?;
    let expected = Series::new("name", [Some(true), Some(false), None]);
    assert_eq!(actual, expected);
    Ok(())
}
#[test]
fn test_typed_column_to_series_string() -> Result<(), Box<dyn std::error::Error>> {
    // A string followed by a nothing value in a String-typed column:
    // the nothing value is expected as a missing entry.
    let values = vec![Value::test_string("barbaz"), Value::test_nothing()];
    let typed = TypedColumn {
        column: Column::new("foo", values),
        column_type: Some(DataType::String),
    };

    let actual = typed_column_to_series("foo", typed)?;
    let expected = Series::new("name", [Some(String::from("barbaz")), None]);
    assert_eq!(actual, expected);
    Ok(())
}
} }