mirror of
https://github.com/nushell/nushell.git
synced 2025-04-10 14:08:40 +02:00
# Description This doesn't really do much that the user could see, but it helps get us ready to do the steps of the refactor to split the span off of Value, so that values can be spanless. This allows us to have top-level values that can hold both a Value and a Span, without requiring that all values have them. We expect to see significant memory reduction by removing so many unnecessary spans from values. For example, a table of 100,000 rows and 5 columns would have a savings of ~8megs in just spans that are almost always duplicated. # User-Facing Changes Nothing yet # Tests + Formatting <!-- Don't forget to add tests that cover your changes. Make sure you've run and fixed any issues with these commands: - `cargo fmt --all -- --check` to check standard code formatting (`cargo fmt --all` applies these changes) - `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used -A clippy::needless_collect -A clippy::result_large_err` to check that you're using the standard code style - `cargo test --workspace` to check that all tests pass - `cargo run -- -c "use std testing; testing run-tests --path crates/nu-std"` to run the tests for the standard library > **Note** > from `nushell` you can also use the `toolkit` as follows > ```bash > use toolkit.nu # or use an `env_change` hook to activate it automatically > toolkit check pr > ``` --> # After Submitting <!-- If your PR had any user-facing changes, update [the documentation](https://github.com/nushell/nushell.github.io) after the PR is merged, if necessary. This will help us keep the docs up to date. -->
505 lines
15 KiB
Rust
505 lines
15 KiB
Rust
mod between_values;
|
|
mod conversion;
|
|
mod custom_value;
|
|
mod operations;
|
|
|
|
pub use conversion::{Column, ColumnMap};
|
|
pub use operations::Axis;
|
|
|
|
use indexmap::map::IndexMap;
|
|
use nu_protocol::{did_you_mean, PipelineData, Record, ShellError, Span, Value};
|
|
use polars::prelude::{DataFrame, DataType, IntoLazy, LazyFrame, PolarsObject, Series};
|
|
use serde::{Deserialize, Serialize};
|
|
use std::{cmp::Ordering, fmt::Display, hash::Hasher};
|
|
|
|
use super::{utils::DEFAULT_ROWS, NuLazyFrame};
|
|
|
|
// DataFrameValue is an encapsulation of Nushell Value that can be used
|
|
// to define the PolarsObject Trait. The polars object trait allows to
|
|
// create dataframes with mixed datatypes
|
|
#[derive(Clone, Debug)]
|
|
pub struct DataFrameValue(Value);
|
|
|
|
impl DataFrameValue {
|
|
fn new(value: Value) -> Self {
|
|
Self(value)
|
|
}
|
|
|
|
fn get_value(&self) -> Value {
|
|
self.0.clone()
|
|
}
|
|
}
|
|
|
|
impl Display for DataFrameValue {
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
write!(f, "{}", self.0.get_type())
|
|
}
|
|
}
|
|
|
|
impl Default for DataFrameValue {
|
|
fn default() -> Self {
|
|
Self(Value::Nothing {
|
|
span: Span::unknown(),
|
|
})
|
|
}
|
|
}
|
|
|
|
impl PartialEq for DataFrameValue {
|
|
fn eq(&self, other: &Self) -> bool {
|
|
self.0.partial_cmp(&other.0).map_or(false, Ordering::is_eq)
|
|
}
|
|
}
|
|
impl Eq for DataFrameValue {}
|
|
|
|
impl std::hash::Hash for DataFrameValue {
|
|
fn hash<H: Hasher>(&self, state: &mut H) {
|
|
match &self.0 {
|
|
Value::Nothing { .. } => 0.hash(state),
|
|
Value::Int { val, .. } => val.hash(state),
|
|
Value::String { val, .. } => val.hash(state),
|
|
// TODO. Define hash for the rest of types
|
|
_ => {}
|
|
}
|
|
}
|
|
}
|
|
|
|
impl PolarsObject for DataFrameValue {
|
|
fn type_name() -> &'static str {
|
|
"object"
|
|
}
|
|
}
|
|
|
|
#[derive(Debug, Serialize, Deserialize)]
|
|
pub struct NuDataFrame {
|
|
pub df: DataFrame,
|
|
pub from_lazy: bool,
|
|
}
|
|
|
|
impl AsRef<DataFrame> for NuDataFrame {
|
|
fn as_ref(&self) -> &polars::prelude::DataFrame {
|
|
&self.df
|
|
}
|
|
}
|
|
|
|
impl AsMut<DataFrame> for NuDataFrame {
|
|
fn as_mut(&mut self) -> &mut polars::prelude::DataFrame {
|
|
&mut self.df
|
|
}
|
|
}
|
|
|
|
impl From<DataFrame> for NuDataFrame {
|
|
fn from(df: DataFrame) -> Self {
|
|
Self {
|
|
df,
|
|
from_lazy: false,
|
|
}
|
|
}
|
|
}
|
|
|
|
impl NuDataFrame {
|
|
pub fn new(from_lazy: bool, df: DataFrame) -> Self {
|
|
Self { df, from_lazy }
|
|
}
|
|
|
|
pub fn lazy(&self) -> LazyFrame {
|
|
self.df.clone().lazy()
|
|
}
|
|
|
|
fn default_value(span: Span) -> Value {
|
|
let dataframe = DataFrame::default();
|
|
NuDataFrame::dataframe_into_value(dataframe, span)
|
|
}
|
|
|
|
pub fn dataframe_into_value(dataframe: DataFrame, span: Span) -> Value {
|
|
Value::CustomValue {
|
|
val: Box::new(Self::new(false, dataframe)),
|
|
span,
|
|
}
|
|
}
|
|
|
|
pub fn into_value(self, span: Span) -> Value {
|
|
if self.from_lazy {
|
|
let lazy = NuLazyFrame::from_dataframe(self);
|
|
Value::CustomValue {
|
|
val: Box::new(lazy),
|
|
span,
|
|
}
|
|
} else {
|
|
Value::CustomValue {
|
|
val: Box::new(self),
|
|
span,
|
|
}
|
|
}
|
|
}
|
|
|
|
pub fn series_to_value(series: Series, span: Span) -> Result<Value, ShellError> {
|
|
match DataFrame::new(vec![series]) {
|
|
Ok(dataframe) => Ok(NuDataFrame::dataframe_into_value(dataframe, span)),
|
|
Err(e) => Err(ShellError::GenericError(
|
|
"Error creating dataframe".into(),
|
|
e.to_string(),
|
|
Some(span),
|
|
None,
|
|
Vec::new(),
|
|
)),
|
|
}
|
|
}
|
|
|
|
pub fn try_from_iter<T>(iter: T) -> Result<Self, ShellError>
|
|
where
|
|
T: Iterator<Item = Value>,
|
|
{
|
|
// Dictionary to store the columnar data extracted from
|
|
// the input. During the iteration we check if the values
|
|
// have different type
|
|
let mut column_values: ColumnMap = IndexMap::new();
|
|
|
|
for value in iter {
|
|
match value {
|
|
Value::CustomValue { .. } => return Self::try_from_value(value),
|
|
Value::List { vals, .. } => {
|
|
let cols = (0..vals.len())
|
|
.map(|i| format!("{i}"))
|
|
.collect::<Vec<String>>();
|
|
|
|
conversion::insert_record(&mut column_values, Record { cols, vals })?
|
|
}
|
|
Value::Record { val: record, .. } => {
|
|
conversion::insert_record(&mut column_values, record)?
|
|
}
|
|
_ => {
|
|
let key = "0".to_string();
|
|
conversion::insert_value(value, key, &mut column_values)?
|
|
}
|
|
}
|
|
}
|
|
|
|
conversion::from_parsed_columns(column_values)
|
|
}
|
|
|
|
pub fn try_from_series(columns: Vec<Series>, span: Span) -> Result<Self, ShellError> {
|
|
let dataframe = DataFrame::new(columns).map_err(|e| {
|
|
ShellError::GenericError(
|
|
"Error creating dataframe".into(),
|
|
format!("Unable to create DataFrame: {e}"),
|
|
Some(span),
|
|
None,
|
|
Vec::new(),
|
|
)
|
|
})?;
|
|
|
|
Ok(Self::new(false, dataframe))
|
|
}
|
|
|
|
pub fn try_from_columns(columns: Vec<Column>) -> Result<Self, ShellError> {
|
|
let mut column_values: ColumnMap = IndexMap::new();
|
|
|
|
for column in columns {
|
|
let name = column.name().to_string();
|
|
for value in column {
|
|
conversion::insert_value(value, name.clone(), &mut column_values)?;
|
|
}
|
|
}
|
|
|
|
conversion::from_parsed_columns(column_values)
|
|
}
|
|
|
|
pub fn fill_list_nan(list: Vec<Value>, list_span: Span, fill: Value) -> Value {
|
|
let newlist = list
|
|
.into_iter()
|
|
.map(|value| match value {
|
|
Value::Float { val, .. } => {
|
|
if val.is_nan() {
|
|
fill.clone()
|
|
} else {
|
|
value
|
|
}
|
|
}
|
|
Value::List { vals, span } => Self::fill_list_nan(vals, span, fill.clone()),
|
|
_ => value,
|
|
})
|
|
.collect::<Vec<Value>>();
|
|
Value::list(newlist, list_span)
|
|
}
|
|
|
|
pub fn columns(&self, span: Span) -> Result<Vec<Column>, ShellError> {
|
|
let height = self.df.height();
|
|
self.df
|
|
.get_columns()
|
|
.iter()
|
|
.map(|col| conversion::create_column(col, 0, height, span))
|
|
.collect::<Result<Vec<Column>, ShellError>>()
|
|
}
|
|
|
|
pub fn try_from_value(value: Value) -> Result<Self, ShellError> {
|
|
if Self::can_downcast(&value) {
|
|
Ok(Self::get_df(value)?)
|
|
} else if NuLazyFrame::can_downcast(&value) {
|
|
let span = value.span();
|
|
let lazy = NuLazyFrame::try_from_value(value)?;
|
|
let df = lazy.collect(span)?;
|
|
Ok(df)
|
|
} else {
|
|
Err(ShellError::CantConvert {
|
|
to_type: "lazy or eager dataframe".into(),
|
|
from_type: value.get_type().to_string(),
|
|
span: value.span(),
|
|
help: None,
|
|
})
|
|
}
|
|
}
|
|
|
|
pub fn get_df(value: Value) -> Result<Self, ShellError> {
|
|
match value {
|
|
Value::CustomValue { val, span } => match val.as_any().downcast_ref::<Self>() {
|
|
Some(df) => Ok(NuDataFrame {
|
|
df: df.df.clone(),
|
|
from_lazy: false,
|
|
}),
|
|
None => Err(ShellError::CantConvert {
|
|
to_type: "dataframe".into(),
|
|
from_type: "non-dataframe".into(),
|
|
span,
|
|
help: None,
|
|
}),
|
|
},
|
|
x => Err(ShellError::CantConvert {
|
|
to_type: "dataframe".into(),
|
|
from_type: x.get_type().to_string(),
|
|
span: x.span(),
|
|
help: None,
|
|
}),
|
|
}
|
|
}
|
|
|
|
pub fn try_from_pipeline(input: PipelineData, span: Span) -> Result<Self, ShellError> {
|
|
let value = input.into_value(span);
|
|
Self::try_from_value(value)
|
|
}
|
|
|
|
pub fn can_downcast(value: &Value) -> bool {
|
|
if let Value::CustomValue { val, .. } = value {
|
|
val.as_any().downcast_ref::<Self>().is_some()
|
|
} else {
|
|
false
|
|
}
|
|
}
|
|
|
|
pub fn column(&self, column: &str, span: Span) -> Result<Self, ShellError> {
|
|
let s = self.df.column(column).map_err(|_| {
|
|
let possibilities = self
|
|
.df
|
|
.get_column_names()
|
|
.iter()
|
|
.map(|name| name.to_string())
|
|
.collect::<Vec<String>>();
|
|
|
|
let option = did_you_mean(&possibilities, column).unwrap_or_else(|| column.to_string());
|
|
ShellError::DidYouMean(option, span)
|
|
})?;
|
|
|
|
let df = DataFrame::new(vec![s.clone()]).map_err(|e| {
|
|
ShellError::GenericError(
|
|
"Error creating dataframe".into(),
|
|
e.to_string(),
|
|
Some(span),
|
|
None,
|
|
Vec::new(),
|
|
)
|
|
})?;
|
|
|
|
Ok(Self {
|
|
df,
|
|
from_lazy: false,
|
|
})
|
|
}
|
|
|
|
pub fn is_series(&self) -> bool {
|
|
self.df.width() == 1
|
|
}
|
|
|
|
pub fn as_series(&self, span: Span) -> Result<Series, ShellError> {
|
|
if !self.is_series() {
|
|
return Err(ShellError::GenericError(
|
|
"Error using as series".into(),
|
|
"dataframe has more than one column".into(),
|
|
Some(span),
|
|
None,
|
|
Vec::new(),
|
|
));
|
|
}
|
|
|
|
let series = self
|
|
.df
|
|
.get_columns()
|
|
.get(0)
|
|
.expect("We have already checked that the width is 1");
|
|
|
|
Ok(series.clone())
|
|
}
|
|
|
|
pub fn get_value(&self, row: usize, span: Span) -> Result<Value, ShellError> {
|
|
let series = self.as_series(span)?;
|
|
let column = conversion::create_column(&series, row, row + 1, span)?;
|
|
|
|
if column.len() == 0 {
|
|
Err(ShellError::AccessEmptyContent { span })
|
|
} else {
|
|
let value = column
|
|
.into_iter()
|
|
.next()
|
|
.expect("already checked there is a value");
|
|
Ok(value)
|
|
}
|
|
}
|
|
|
|
// Print is made out a head and if the dataframe is too large, then a tail
|
|
pub fn print(&self, span: Span) -> Result<Vec<Value>, ShellError> {
|
|
let df = &self.df;
|
|
let size: usize = 20;
|
|
|
|
if df.height() > size {
|
|
let sample_size = size / 2;
|
|
let mut values = self.head(Some(sample_size), span)?;
|
|
conversion::add_separator(&mut values, df, span);
|
|
let remaining = df.height() - sample_size;
|
|
let tail_size = remaining.min(sample_size);
|
|
let mut tail_values = self.tail(Some(tail_size), span)?;
|
|
values.append(&mut tail_values);
|
|
|
|
Ok(values)
|
|
} else {
|
|
Ok(self.head(Some(size), span)?)
|
|
}
|
|
}
|
|
|
|
pub fn height(&self) -> usize {
|
|
self.df.height()
|
|
}
|
|
|
|
pub fn head(&self, rows: Option<usize>, span: Span) -> Result<Vec<Value>, ShellError> {
|
|
let to_row = rows.unwrap_or(5);
|
|
let values = self.to_rows(0, to_row, span)?;
|
|
|
|
Ok(values)
|
|
}
|
|
|
|
pub fn tail(&self, rows: Option<usize>, span: Span) -> Result<Vec<Value>, ShellError> {
|
|
let df = &self.df;
|
|
let to_row = df.height();
|
|
let size = rows.unwrap_or(DEFAULT_ROWS);
|
|
let from_row = to_row.saturating_sub(size);
|
|
|
|
let values = self.to_rows(from_row, to_row, span)?;
|
|
|
|
Ok(values)
|
|
}
|
|
|
|
pub fn to_rows(
|
|
&self,
|
|
from_row: usize,
|
|
to_row: usize,
|
|
span: Span,
|
|
) -> Result<Vec<Value>, ShellError> {
|
|
let df = &self.df;
|
|
let upper_row = to_row.min(df.height());
|
|
|
|
let mut size: usize = 0;
|
|
let columns = self
|
|
.df
|
|
.get_columns()
|
|
.iter()
|
|
.map(
|
|
|col| match conversion::create_column(col, from_row, upper_row, span) {
|
|
Ok(col) => {
|
|
size = col.len();
|
|
Ok(col)
|
|
}
|
|
Err(e) => Err(e),
|
|
},
|
|
)
|
|
.collect::<Result<Vec<Column>, ShellError>>()?;
|
|
|
|
let mut iterators = columns
|
|
.into_iter()
|
|
.map(|col| (col.name().to_string(), col.into_iter()))
|
|
.collect::<Vec<(String, std::vec::IntoIter<Value>)>>();
|
|
|
|
let values = (0..size)
|
|
.map(|i| {
|
|
let mut record = Record::new();
|
|
|
|
record.push("index", Value::int((i + from_row) as i64, span));
|
|
|
|
for (name, col) in &mut iterators {
|
|
record.push(name.clone(), col.next().unwrap_or(Value::nothing(span)));
|
|
}
|
|
|
|
Value::record(record, span)
|
|
})
|
|
.collect::<Vec<Value>>();
|
|
|
|
Ok(values)
|
|
}
|
|
|
|
// Dataframes are considered equal if they have the same shape, column name and values
|
|
pub fn is_equal(&self, other: &Self) -> Option<Ordering> {
|
|
if self.as_ref().width() == 0 {
|
|
// checking for empty dataframe
|
|
return None;
|
|
}
|
|
|
|
if self.as_ref().get_column_names() != other.as_ref().get_column_names() {
|
|
// checking both dataframes share the same names
|
|
return None;
|
|
}
|
|
|
|
if self.as_ref().height() != other.as_ref().height() {
|
|
// checking both dataframes have the same row size
|
|
return None;
|
|
}
|
|
|
|
// sorting dataframe by the first column
|
|
let column_names = self.as_ref().get_column_names();
|
|
let first_col = column_names
|
|
.first()
|
|
.expect("already checked that dataframe is different than 0");
|
|
|
|
// if unable to sort, then unable to compare
|
|
let lhs = match self.as_ref().sort(vec![*first_col], false) {
|
|
Ok(df) => df,
|
|
Err(_) => return None,
|
|
};
|
|
|
|
let rhs = match other.as_ref().sort(vec![*first_col], false) {
|
|
Ok(df) => df,
|
|
Err(_) => return None,
|
|
};
|
|
|
|
for name in self.as_ref().get_column_names() {
|
|
let self_series = lhs.column(name).expect("name from dataframe names");
|
|
|
|
let other_series = rhs
|
|
.column(name)
|
|
.expect("already checked that name in other");
|
|
|
|
let self_series = match self_series.dtype() {
|
|
// Casting needed to compare other numeric types with nushell numeric type.
|
|
// In nushell we only have i64 integer numeric types and any array created
|
|
// with nushell untagged primitives will be of type i64
|
|
DataType::UInt32 | DataType::Int32 => match self_series.cast(&DataType::Int64) {
|
|
Ok(series) => series,
|
|
Err(_) => return None,
|
|
},
|
|
_ => self_series.clone(),
|
|
};
|
|
|
|
if !self_series.series_equal(other_series) {
|
|
return None;
|
|
}
|
|
}
|
|
|
|
Some(Ordering::Equal)
|
|
}
|
|
}
|