nushell/crates/nu-cmd-dataframe/src/dataframe/values/nu_dataframe/mod.rs
JT 1e3e034021
Spanned Value step 1: span all value cases (#10042)
# Description

This doesn't do much that the user can see, but it gets us ready for the
next steps of the refactor: splitting the span off of `Value` so that
values can be spanless. That will let us have top-level values that hold
both a `Value` and a `Span`, without requiring that every value carry
one.

We expect a significant memory reduction from removing so many
unnecessary spans from values. For example, a table of 100,000 rows and
5 columns holds 500,000 spans; at 16 bytes per `Span` (two `usize`
fields on a 64-bit target) that is a savings of roughly 8 MB in spans
that are almost always duplicated.
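
As a rough sketch of where this is headed (illustrative names only, not
the types introduced by this PR):

```rust
// Minimal sketch, assuming the end state described above: value variants
// carry no span, and a top-level wrapper pairs one value with one span.
// `SpannedValue` and these trimmed-down `Value`/`Span` definitions are
// hypothetical, not the actual nu-protocol types.
struct Span {
    start: usize,
    end: usize,
}

enum Value {
    Int { val: i64 },
    String { val: String },
    Nothing,
    // ...other cases, none of them storing a span
}

struct SpannedValue {
    value: Value,
    span: Span,
}
```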

# User-Facing Changes

Nothing yet

# Tests + Formatting
<!--
Don't forget to add tests that cover your changes.

Make sure you've run and fixed any issues with these commands:

- `cargo fmt --all -- --check` to check standard code formatting (`cargo
fmt --all` applies these changes)
- `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used -A
clippy::needless_collect -A clippy::result_large_err` to check that
you're using the standard code style
- `cargo test --workspace` to check that all tests pass
- `cargo run -- -c "use std testing; testing run-tests --path
crates/nu-std"` to run the tests for the standard library

> **Note**
> from `nushell` you can also use the `toolkit` as follows
> ```bash
> use toolkit.nu # or use an `env_change` hook to activate it automatically
> toolkit check pr
> ```
-->

# After Submitting
<!-- If your PR had any user-facing changes, update [the
documentation](https://github.com/nushell/nushell.github.io) after the
PR is merged, if necessary. This will help us keep the docs up to date.
-->
2023-08-25 08:48:05 +12:00

mod between_values;
mod conversion;
mod custom_value;
mod operations;
pub use conversion::{Column, ColumnMap};
pub use operations::Axis;
use indexmap::map::IndexMap;
use nu_protocol::{did_you_mean, PipelineData, Record, ShellError, Span, Value};
use polars::prelude::{DataFrame, DataType, IntoLazy, LazyFrame, PolarsObject, Series};
use serde::{Deserialize, Serialize};
use std::{cmp::Ordering, fmt::Display, hash::Hasher};
use super::{utils::DEFAULT_ROWS, NuLazyFrame};
// DataFrameValue is an encapsulation of a Nushell Value that can be used
// to implement the PolarsObject trait. The PolarsObject trait allows
// creating dataframes with mixed datatypes
#[derive(Clone, Debug)]
pub struct DataFrameValue(Value);
impl DataFrameValue {
fn new(value: Value) -> Self {
Self(value)
}
fn get_value(&self) -> Value {
self.0.clone()
}
}
impl Display for DataFrameValue {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0.get_type())
}
}
impl Default for DataFrameValue {
fn default() -> Self {
Self(Value::nothing(Span::unknown()))
}
}
impl PartialEq for DataFrameValue {
fn eq(&self, other: &Self) -> bool {
self.0.partial_cmp(&other.0).map_or(false, Ordering::is_eq)
}
}
impl Eq for DataFrameValue {}
impl std::hash::Hash for DataFrameValue {
fn hash<H: Hasher>(&self, state: &mut H) {
match &self.0 {
Value::Nothing { .. } => 0.hash(state),
Value::Int { val, .. } => val.hash(state),
Value::String { val, .. } => val.hash(state),
// TODO: Define hash for the rest of the types
_ => {}
}
}
}
impl PolarsObject for DataFrameValue {
fn type_name() -> &'static str {
"object"
}
}
#[derive(Debug, Serialize, Deserialize)]
pub struct NuDataFrame {
pub df: DataFrame,
pub from_lazy: bool,
}
impl AsRef<DataFrame> for NuDataFrame {
fn as_ref(&self) -> &polars::prelude::DataFrame {
&self.df
}
}
impl AsMut<DataFrame> for NuDataFrame {
fn as_mut(&mut self) -> &mut polars::prelude::DataFrame {
&mut self.df
}
}
impl From<DataFrame> for NuDataFrame {
fn from(df: DataFrame) -> Self {
Self {
df,
from_lazy: false,
}
}
}
impl NuDataFrame {
pub fn new(from_lazy: bool, df: DataFrame) -> Self {
Self { df, from_lazy }
}
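// Returns a lazy copy of the underlying dataframe.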
pub fn lazy(&self) -> LazyFrame {
self.df.clone().lazy()
}
fn default_value(span: Span) -> Value {
let dataframe = DataFrame::default();
NuDataFrame::dataframe_into_value(dataframe, span)
}
pub fn dataframe_into_value(dataframe: DataFrame, span: Span) -> Value {
Value::CustomValue {
val: Box::new(Self::new(false, dataframe)),
span,
}
}
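// Converts the dataframe into a Value. A dataframe that came from a lazy
// frame is wrapped back into a lazy custom value.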
pub fn into_value(self, span: Span) -> Value {
if self.from_lazy {
let lazy = NuLazyFrame::from_dataframe(self);
Value::CustomValue {
val: Box::new(lazy),
span,
}
} else {
Value::CustomValue {
val: Box::new(self),
span,
}
}
}
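// Wraps a single Series into a one-column dataframe Value.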
pub fn series_to_value(series: Series, span: Span) -> Result<Value, ShellError> {
match DataFrame::new(vec![series]) {
Ok(dataframe) => Ok(NuDataFrame::dataframe_into_value(dataframe, span)),
Err(e) => Err(ShellError::GenericError(
"Error creating dataframe".into(),
e.to_string(),
Some(span),
None,
Vec::new(),
)),
}
}
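// Builds a dataframe from an iterator of Values. Records and lists are
// treated as rows (lists get positional column names), bare values are
// collected into a single column named "0", and a custom value is
// converted directly.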
pub fn try_from_iter<T>(iter: T) -> Result<Self, ShellError>
where
T: Iterator<Item = Value>,
{
// Dictionary to store the columnar data extracted from
// the input. During the iteration we check whether the values
// have different types
let mut column_values: ColumnMap = IndexMap::new();
for value in iter {
match value {
Value::CustomValue { .. } => return Self::try_from_value(value),
Value::List { vals, .. } => {
let cols = (0..vals.len())
.map(|i| format!("{i}"))
.collect::<Vec<String>>();
conversion::insert_record(&mut column_values, Record { cols, vals })?
}
Value::Record { val: record, .. } => {
conversion::insert_record(&mut column_values, record)?
}
_ => {
let key = "0".to_string();
conversion::insert_value(value, key, &mut column_values)?
}
}
}
conversion::from_parsed_columns(column_values)
}
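// Builds an eager dataframe from a vector of Series.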
pub fn try_from_series(columns: Vec<Series>, span: Span) -> Result<Self, ShellError> {
let dataframe = DataFrame::new(columns).map_err(|e| {
ShellError::GenericError(
"Error creating dataframe".into(),
format!("Unable to create DataFrame: {e}"),
Some(span),
None,
Vec::new(),
)
})?;
Ok(Self::new(false, dataframe))
}
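// Builds a dataframe from a set of Columns, inserting their values one by one.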
pub fn try_from_columns(columns: Vec<Column>) -> Result<Self, ShellError> {
let mut column_values: ColumnMap = IndexMap::new();
for column in columns {
let name = column.name().to_string();
for value in column {
conversion::insert_value(value, name.clone(), &mut column_values)?;
}
}
conversion::from_parsed_columns(column_values)
}
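// Replaces NaN floats in a list Value with the given fill value, recursing
// into nested lists.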
pub fn fill_list_nan(list: Vec<Value>, list_span: Span, fill: Value) -> Value {
let newlist = list
.into_iter()
.map(|value| match value {
Value::Float { val, .. } => {
if val.is_nan() {
fill.clone()
} else {
value
}
}
Value::List { vals, span } => Self::fill_list_nan(vals, span, fill.clone()),
_ => value,
})
.collect::<Vec<Value>>();
Value::list(newlist, list_span)
}
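// Converts every column of the dataframe into a Column spanning the full height.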
pub fn columns(&self, span: Span) -> Result<Vec<Column>, ShellError> {
let height = self.df.height();
self.df
.get_columns()
.iter()
.map(|col| conversion::create_column(col, 0, height, span))
.collect::<Result<Vec<Column>, ShellError>>()
}
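// Extracts a NuDataFrame from a Value, collecting a lazy frame into an
// eager one when needed.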
pub fn try_from_value(value: Value) -> Result<Self, ShellError> {
if Self::can_downcast(&value) {
Ok(Self::get_df(value)?)
} else if NuLazyFrame::can_downcast(&value) {
let span = value.span();
let lazy = NuLazyFrame::try_from_value(value)?;
let df = lazy.collect(span)?;
Ok(df)
} else {
Err(ShellError::CantConvert {
to_type: "lazy or eager dataframe".into(),
from_type: value.get_type().to_string(),
span: value.span(),
help: None,
})
}
}
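// Downcasts a custom value into a NuDataFrame; any other Value is a
// conversion error.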
pub fn get_df(value: Value) -> Result<Self, ShellError> {
match value {
Value::CustomValue { val, span } => match val.as_any().downcast_ref::<Self>() {
Some(df) => Ok(NuDataFrame {
df: df.df.clone(),
from_lazy: false,
}),
None => Err(ShellError::CantConvert {
to_type: "dataframe".into(),
from_type: "non-dataframe".into(),
span,
help: None,
}),
},
x => Err(ShellError::CantConvert {
to_type: "dataframe".into(),
from_type: x.get_type().to_string(),
span: x.span(),
help: None,
}),
}
}
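// Collects the pipeline input into a single Value and converts it.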
pub fn try_from_pipeline(input: PipelineData, span: Span) -> Result<Self, ShellError> {
let value = input.into_value(span);
Self::try_from_value(value)
}
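// Returns true if the Value holds a NuDataFrame custom value.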
pub fn can_downcast(value: &Value) -> bool {
if let Value::CustomValue { val, .. } = value {
val.as_any().downcast_ref::<Self>().is_some()
} else {
false
}
}
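// Returns the named column as a single-column dataframe, suggesting a
// close match when the name is not found.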
pub fn column(&self, column: &str, span: Span) -> Result<Self, ShellError> {
let s = self.df.column(column).map_err(|_| {
let possibilities = self
.df
.get_column_names()
.iter()
.map(|name| name.to_string())
.collect::<Vec<String>>();
let option = did_you_mean(&possibilities, column).unwrap_or_else(|| column.to_string());
ShellError::DidYouMean(option, span)
})?;
let df = DataFrame::new(vec![s.clone()]).map_err(|e| {
ShellError::GenericError(
"Error creating dataframe".into(),
e.to_string(),
Some(span),
None,
Vec::new(),
)
})?;
Ok(Self {
df,
from_lazy: false,
})
}
pub fn is_series(&self) -> bool {
self.df.width() == 1
}
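// Returns the single column as a Series; errors if the dataframe has more
// than one column.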
pub fn as_series(&self, span: Span) -> Result<Series, ShellError> {
if !self.is_series() {
return Err(ShellError::GenericError(
"Error using as series".into(),
"dataframe has more than one column".into(),
Some(span),
None,
Vec::new(),
));
}
let series = self
.df
.get_columns()
.first()
.expect("We have already checked that the width is 1");
Ok(series.clone())
}
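// Returns the value at the given row of a single-column dataframe.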
pub fn get_value(&self, row: usize, span: Span) -> Result<Value, ShellError> {
let series = self.as_series(span)?;
let column = conversion::create_column(&series, row, row + 1, span)?;
if column.len() == 0 {
Err(ShellError::AccessEmptyContent { span })
} else {
let value = column
.into_iter()
.next()
.expect("already checked there is a value");
Ok(value)
}
}
// Print is made out of a head and, if the dataframe is too large, a tail
pub fn print(&self, span: Span) -> Result<Vec<Value>, ShellError> {
let df = &self.df;
let size: usize = 20;
if df.height() > size {
let sample_size = size / 2;
let mut values = self.head(Some(sample_size), span)?;
conversion::add_separator(&mut values, df, span);
let remaining = df.height() - sample_size;
let tail_size = remaining.min(sample_size);
let mut tail_values = self.tail(Some(tail_size), span)?;
values.append(&mut tail_values);
Ok(values)
} else {
Ok(self.head(Some(size), span)?)
}
}
pub fn height(&self) -> usize {
self.df.height()
}
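// Returns the first rows (5 by default) as record Values.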
pub fn head(&self, rows: Option<usize>, span: Span) -> Result<Vec<Value>, ShellError> {
let to_row = rows.unwrap_or(5);
let values = self.to_rows(0, to_row, span)?;
Ok(values)
}
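// Returns the last rows (DEFAULT_ROWS by default) as record Values.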
pub fn tail(&self, rows: Option<usize>, span: Span) -> Result<Vec<Value>, ShellError> {
let df = &self.df;
let to_row = df.height();
let size = rows.unwrap_or(DEFAULT_ROWS);
let from_row = to_row.saturating_sub(size);
let values = self.to_rows(from_row, to_row, span)?;
Ok(values)
}
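// Converts the rows in [from_row, to_row) into record Values, prepending
// an "index" column with the absolute row number.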
pub fn to_rows(
&self,
from_row: usize,
to_row: usize,
span: Span,
) -> Result<Vec<Value>, ShellError> {
let df = &self.df;
let upper_row = to_row.min(df.height());
let mut size: usize = 0;
let columns = self
.df
.get_columns()
.iter()
.map(
|col| match conversion::create_column(col, from_row, upper_row, span) {
Ok(col) => {
size = col.len();
Ok(col)
}
Err(e) => Err(e),
},
)
.collect::<Result<Vec<Column>, ShellError>>()?;
let mut iterators = columns
.into_iter()
.map(|col| (col.name().to_string(), col.into_iter()))
.collect::<Vec<(String, std::vec::IntoIter<Value>)>>();
let values = (0..size)
.map(|i| {
let mut record = Record::new();
record.push("index", Value::int((i + from_row) as i64, span));
for (name, col) in &mut iterators {
record.push(name.clone(), col.next().unwrap_or(Value::nothing(span)));
}
Value::record(record, span)
})
.collect::<Vec<Value>>();
Ok(values)
}
// Dataframes are considered equal if they have the same shape, column names and values
pub fn is_equal(&self, other: &Self) -> Option<Ordering> {
if self.as_ref().width() == 0 {
// checking for empty dataframe
return None;
}
if self.as_ref().get_column_names() != other.as_ref().get_column_names() {
// checking both dataframes share the same names
return None;
}
if self.as_ref().height() != other.as_ref().height() {
// checking both dataframes have the same row size
return None;
}
// sorting dataframe by the first column
let column_names = self.as_ref().get_column_names();
let first_col = column_names
.first()
.expect("already checked that dataframe is different than 0");
// if unable to sort, then unable to compare
let lhs = match self.as_ref().sort(vec![*first_col], false) {
Ok(df) => df,
Err(_) => return None,
};
let rhs = match other.as_ref().sort(vec![*first_col], false) {
Ok(df) => df,
Err(_) => return None,
};
for name in self.as_ref().get_column_names() {
let self_series = lhs.column(name).expect("name from dataframe names");
let other_series = rhs
.column(name)
.expect("already checked that name in other");
let self_series = match self_series.dtype() {
// Casting is needed to compare other numeric types with nushell's numeric
// type. In nushell we only have i64 integers, so any array created from
// nushell untagged primitives will be of type i64
DataType::UInt32 | DataType::Int32 => match self_series.cast(&DataType::Int64) {
Ok(series) => series,
Err(_) => return None,
},
_ => self_series.clone(),
};
if !self_series.series_equal(other_series) {
return None;
}
}
Some(Ordering::Equal)
}
}