Complete Dataframe MVP (#3373)

* Dataframe MVP * Removed test csv file * Dataframe MVP * Removed test csv file * New revision polars * New revision polars * csv file reader * argument parser for file reader * Parser from Row primitive * Column conversion * Added as f32 and f64 * Parsing row to dataframe * Removed repeated push to vector * Accept table values to create dataframe * Removed default serde * Dataframe to rows to show data * Save name of file with dataframe * Usage example * Upgrade polars version * Clippy changes * Added print function with head and tail * Move dataframe struct to folder * Lock file after running tests and merge * Optional feature for dataframe * Removed dataframe from plugins * Update primitive.rs Co-authored-by: JT <jonathandturner@users.noreply.github.com>
2021-05-12 02:01:31 +01:00
parent e73491441a
commit c80a9585b0
25 changed files with 1474 additions and 448 deletions
--- a/crates/nu-protocol/src/dataframe/mod.rs
+++ b/crates/nu-protocol/src/dataframe/mod.rs
@ -0,0 +1,3 @@
+pub mod nu_dataframe;
+
+pub use nu_dataframe::NuDataFrame;
--- a/crates/nu-protocol/src/dataframe/nu_dataframe.rs
+++ b/crates/nu-protocol/src/dataframe/nu_dataframe.rs
@ -0,0 +1,432 @@
+use std::hash::{Hash, Hasher};
+use std::{cmp::Ordering, collections::hash_map::Entry, collections::HashMap};
+
+use bigdecimal::FromPrimitive;
+use chrono::{DateTime, FixedOffset, NaiveDateTime};
+use nu_errors::ShellError;
+use nu_source::Tag;
+use num_bigint::BigInt;
+use polars::prelude::{AnyValue, DataFrame, NamedFrom, Series, TimeUnit};
+use serde::de::{Deserialize, Deserializer, Visitor};
+use serde::Serialize;
+
+use std::fmt;
+
+use crate::{Dictionary, Primitive, UntaggedValue, Value};
+
+const SECS_PER_DAY: i64 = 86_400;
+
+/// Kind of nushell primitive collected for a column while the input is parsed.
+///
+/// The variant recorded for a column selects which conversion is applied in
+/// `from_parsed_columns` when the column is turned into a polars `Series`.
+#[derive(Debug)]
+enum InputValue {
+    Integer,
+    Decimal,
+    String,
+}
+
+/// Values gathered for a single column together with the kind of primitive
+/// they hold.
+#[derive(Debug)]
+struct ColumnValues {
+    // Kind of primitive stored in `values`; set by `insert_value` when the
+    // first value of the column is inserted
+    pub value_type: InputValue,
+    // Values of the column in insertion order
+    pub values: Vec<Value>,
+}
+
+impl Default for ColumnValues {
+    /// Creates an empty column. The type starts as `Integer` and is replaced
+    /// by `insert_value` once the first value of the column is seen.
+    fn default() -> Self {
+        let values = Vec::new();
+        let value_type = InputValue::Integer;
+
+        ColumnValues { value_type, values }
+    }
+}
+
+// Column name mapped to the values collected for that column
+type ColumnMap = HashMap<String, ColumnValues>;
+
+// TODO. Using Option to help with deserialization. It will be better to find
+// a way to use serde with dataframes
+/// Wrapper around a polars `DataFrame` so it can be carried inside a nushell
+/// value.
+///
+/// Only `name` is serialized; the `dataframe` field is skipped when
+/// serializing.
+#[derive(Debug, Clone, Serialize)]
+pub struct NuDataFrame {
+    /// The wrapped dataframe. It is `None` for values created with `Default`.
+    #[serde(skip_serializing)]
+    pub dataframe: Option<DataFrame>,
+    /// Name given to the dataframe.
+    pub name: String,
+}
+
+impl Default for NuDataFrame {
+    /// Creates a value without an inner dataframe, named "From Stream".
+    fn default() -> Self {
+        let name = "From Stream".to_string();
+
+        Self {
+            dataframe: None,
+            name,
+        }
+    }
+}
+
+impl NuDataFrame {
+    // Private constructor; it is the same as the `Default` value, which has
+    // no inner dataframe
+    fn new() -> Self {
+        Default::default()
+    }
+}
+
+// TODO. Better definition of equality and comparison for a dataframe.
+// Probably it make sense to have a name field and use it for comparisons
+//
+// NOTE(review): `eq` always returns false, so a dataframe is not even equal
+// to itself, while `Ord::cmp` for the same type always returns
+// `Ordering::Equal`. The two definitions disagree and `Eq` is implemented on
+// top of a non-reflexive `eq` — confirm the intended semantics before relying
+// on comparisons between dataframes.
+impl PartialEq for NuDataFrame {
+    fn eq(&self, _: &Self) -> bool {
+        false
+    }
+}
+
+impl Eq for NuDataFrame {}
+
+impl PartialOrd for NuDataFrame {
+    /// Delegates to `Ord::cmp`, which treats every pair of dataframes as
+    /// `Ordering::Equal`.
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+// Placeholder ordering: the other dataframe is ignored and every comparison
+// yields `Ordering::Equal`
+impl Ord for NuDataFrame {
+    fn cmp(&self, _: &Self) -> Ordering {
+        Ordering::Equal
+    }
+}
+
+// Placeholder hash: nothing is fed to the hasher, so the dataframe adds no
+// data to the hash of the value that contains it
+impl Hash for NuDataFrame {
+    fn hash<H: Hasher>(&self, _: &mut H) {}
+}
+
+// Placeholder visitor used by the `Deserialize` implementation. No `visit_*`
+// method is implemented here, only the description of the expected input that
+// is used in error messages.
+impl<'de> Visitor<'de> for NuDataFrame {
+    type Value = Self;
+
+    fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
+        // The previous text described an i32, which made the resulting error
+        // message misleading when a dataframe failed to deserialize
+        formatter.write_str("a dataframe (deserializing dataframes is not supported)")
+    }
+}
+
+// Placeholder implementation so the types that contain a dataframe can keep
+// deriving `Deserialize`.
+// NOTE(review): the visitor passed to the deserializer only implements
+// `expecting`, so the outcome depends on serde's default `visit_*` methods;
+// presumably this always ends in an error — confirm against serde's docs.
+impl<'de> Deserialize<'de> for NuDataFrame {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: Deserializer<'de>,
+    {
+        deserializer.deserialize_i32(NuDataFrame::new())
+    }
+}
+
+impl NuDataFrame {
+    /// Creates a dataframe from a stream of values.
+    ///
+    /// Every value must be a `Row` (columns are named after the row keys) or
+    /// a `Table` (columns are named after the position of each value).
+    ///
+    /// # Errors
+    /// Returns an error when a value is neither a row nor a table, when a
+    /// column value cannot be inserted, or when the dataframe cannot be
+    /// created from the collected columns.
+    pub fn try_from_iter<T>(iter: T, tag: &Tag) -> Result<Self, ShellError>
+    where
+        T: Iterator<Item = Value>,
+    {
+        // Dictionary to store the columnar data extracted from
+        // the input. During the iteration we will sort if the values
+        // have different type
+        let mut column_values: ColumnMap = HashMap::new();
+
+        for value in iter {
+            match value.value {
+                UntaggedValue::Row(dictionary) => insert_row(&mut column_values, dictionary)?,
+                UntaggedValue::Table(table) => insert_table(&mut column_values, table)?,
+                _ => {
+                    return Err(ShellError::labeled_error(
+                        "Format not supported",
+                        "Value not supported for conversion",
+                        &value.tag,
+                    ));
+                }
+            }
+        }
+
+        from_parsed_columns(column_values, tag)
+    }
+
+    // Borrows the inner dataframe. A value created with `Default` has no
+    // dataframe, so that case is reported as an error instead of a panic
+    fn inner_dataframe(&self) -> Result<&DataFrame, ShellError> {
+        self.dataframe.as_ref().ok_or_else(|| {
+            ShellError::labeled_error(
+                "No dataframe found",
+                "Value does not contain a dataframe",
+                Tag::unknown(),
+            )
+        })
+    }
+
+    /// Rows used to display the dataframe: the first 5 rows and, when the
+    /// dataframe has more than 5 rows, a separator row followed by up to 5
+    /// rows from the end that were not already part of the head.
+    ///
+    /// # Errors
+    /// Returns an error when there is no inner dataframe or when a value
+    /// cannot be converted.
+    pub fn print(&self) -> Result<Vec<Value>, ShellError> {
+        let df = self.inner_dataframe()?;
+
+        let size: usize = 5;
+        let mut values = self.head(Some(size))?;
+
+        if df.height() > size {
+            add_separator(&mut values, df);
+
+            // The tail never repeats rows that are already part of the head
+            let remaining = df.height() - size;
+            let tail_size = remaining.min(size);
+            let mut tail_values = self.tail(Some(tail_size))?;
+
+            values.append(&mut tail_values);
+        }
+
+        Ok(values)
+    }
+
+    /// First `rows` rows of the dataframe (5 when `rows` is `None`).
+    ///
+    /// # Errors
+    /// Returns an error when there is no inner dataframe or when a value
+    /// cannot be converted.
+    pub fn head(&self, rows: Option<usize>) -> Result<Vec<Value>, ShellError> {
+        let to_row = rows.unwrap_or(5);
+
+        self.to_rows(0, to_row)
+    }
+
+    /// Last `rows` rows of the dataframe (5 when `rows` is `None`).
+    ///
+    /// # Errors
+    /// Returns an error when there is no inner dataframe or when a value
+    /// cannot be converted.
+    pub fn tail(&self, rows: Option<usize>) -> Result<Vec<Value>, ShellError> {
+        let df = self.inner_dataframe()?;
+
+        let to_row = df.height();
+        let size = rows.unwrap_or(5);
+        // Saturating so that asking for more rows than available starts at 0
+        let from_row = to_row.saturating_sub(size);
+
+        self.to_rows(from_row, to_row)
+    }
+
+    /// Converts the rows in `from_row..to_row` to `Row` values. `to_row` is
+    /// limited to the height of the dataframe.
+    ///
+    /// # Errors
+    /// Returns an error when there is no inner dataframe or when a value
+    /// cannot be converted.
+    pub fn to_rows(&self, from_row: usize, to_row: usize) -> Result<Vec<Value>, ShellError> {
+        let df = self.inner_dataframe()?;
+        let column_names = df.get_column_names();
+
+        let upper_row = to_row.min(df.height());
+        let mut values: Vec<Value> = Vec::with_capacity(upper_row.saturating_sub(from_row));
+
+        for i in from_row..upper_row {
+            let row = df.get_row(i);
+            let mut dictionary_row = Dictionary::default();
+
+            for (val, name) in row.0.iter().zip(column_names.iter()) {
+                let untagged_val = anyvalue_to_untagged(val)?;
+
+                let dict_val = Value {
+                    value: untagged_val,
+                    tag: Tag::unknown(),
+                };
+
+                dictionary_row.insert(name.to_string(), dict_val);
+            }
+
+            let value = Value {
+                value: UntaggedValue::Row(dictionary_row),
+                tag: Tag::unknown(),
+            };
+
+            values.push(value);
+        }
+
+        Ok(values)
+    }
+}
+
+// Pushes a separator row to the vector of values. The row has one "..."
+// entry for each column name found in the dataframe
+fn add_separator(values: &mut Vec<Value>, df: &DataFrame) {
+    let mut separator_row = Dictionary::default();
+
+    for column in df.get_column_names() {
+        separator_row.insert(
+            column.to_string(),
+            Value {
+                value: UntaggedValue::Primitive(Primitive::String("...".to_string())),
+                tag: Tag::unknown(),
+            },
+        );
+    }
+
+    values.push(Value {
+        value: UntaggedValue::Row(separator_row),
+        tag: Tag::unknown(),
+    });
+}
+
+// Creates a date value from the seconds and nanoseconds elapsed since
+// 1970-01-01 (UTC). A timestamp that cannot be represented by chrono is
+// reported as an error instead of a panic
+fn timestamp_to_date(seconds: i64, nanoseconds: u32) -> Result<UntaggedValue, ShellError> {
+    let naive_datetime =
+        NaiveDateTime::from_timestamp_opt(seconds, nanoseconds).ok_or_else(|| {
+            ShellError::labeled_error(
+                "Date out of range",
+                "Timestamp cannot be represented as a date",
+                Tag::unknown(),
+            )
+        })?;
+
+    // Zero length offset
+    let offset = FixedOffset::east(0);
+    let datetime = DateTime::<FixedOffset>::from_utc(naive_datetime, offset);
+
+    Ok(UntaggedValue::Primitive(Primitive::Date(datetime)))
+}
+
+// Converts a polars AnyValue to an UntaggedValue
+// This is used when printing values coming for polars dataframes
+//
+// An error is returned for list values, for dates that are out of range and
+// for durations that cannot be converted
+fn anyvalue_to_untagged(anyvalue: &AnyValue) -> Result<UntaggedValue, ShellError> {
+    Ok(match anyvalue {
+        AnyValue::Null => UntaggedValue::Primitive(Primitive::Nothing),
+        AnyValue::Utf8(a) => UntaggedValue::Primitive((*a).into()),
+        AnyValue::Boolean(a) => UntaggedValue::Primitive((*a).into()),
+        AnyValue::Float32(a) => UntaggedValue::Primitive((*a).into()),
+        AnyValue::Float64(a) => UntaggedValue::Primitive((*a).into()),
+        AnyValue::Int32(a) => UntaggedValue::Primitive((*a).into()),
+        AnyValue::Int64(a) => UntaggedValue::Primitive((*a).into()),
+        AnyValue::UInt8(a) => UntaggedValue::Primitive((*a).into()),
+        AnyValue::UInt16(a) => UntaggedValue::Primitive((*a).into()),
+        AnyValue::Int8(a) => UntaggedValue::Primitive((*a).into()),
+        AnyValue::Int16(a) => UntaggedValue::Primitive((*a).into()),
+        AnyValue::UInt32(a) => UntaggedValue::Primitive((*a).into()),
+        AnyValue::UInt64(a) => UntaggedValue::Primitive((*a).into()),
+        AnyValue::Date32(a) => {
+            // elapsed time in day since 1970-01-01
+            let seconds = *a as i64 * SECS_PER_DAY;
+
+            timestamp_to_date(seconds, 0)?
+        }
+        AnyValue::Date64(a) => {
+            // elapsed time in milliseconds since 1970-01-01
+            // The euclidean operations keep the remainder in 0..1000 even for
+            // dates before 1970, so the milliseconds are kept and the seconds
+            // are rounded down instead of towards zero
+            let seconds = (*a).div_euclid(1000);
+            // At most 999_000_000, so the cast to u32 cannot truncate
+            let nanoseconds = ((*a).rem_euclid(1000) * 1_000_000) as u32;
+
+            timestamp_to_date(seconds, nanoseconds)?
+        }
+        AnyValue::Time64(a, _) => UntaggedValue::Primitive((*a).into()),
+        AnyValue::Duration(a, unit) => {
+            // Number of nanoseconds in one unit of the duration. The value
+            // has to be multiplied (not divided) to be expressed in
+            // nanoseconds
+            let nanos_per_unit: i64 = match unit {
+                TimeUnit::Second => 1_000_000_000,
+                TimeUnit::Millisecond => 1_000_000,
+                TimeUnit::Microsecond => 1_000,
+                TimeUnit::Nanosecond => 1,
+            };
+
+            // The product is calculated with BigInt so it cannot overflow
+            let nanoseconds = BigInt::from_i64(*a)
+                .map(|value| value * BigInt::from(nanos_per_unit))
+                .ok_or_else(|| {
+                    ShellError::labeled_error(
+                        "Format not supported",
+                        "Duration not supported for conversion",
+                        Tag::unknown(),
+                    )
+                })?;
+
+            UntaggedValue::Primitive(Primitive::Duration(nanoseconds))
+        }
+        AnyValue::List(_) => {
+            return Err(ShellError::labeled_error(
+                "Format not supported",
+                "Value not supported for conversion",
+                Tag::unknown(),
+            ));
+        }
+    })
+}
+
+// Adds every entry of a UntaggedValue::Row to the column that is named after
+// the key of the entry. The insertion stops with the first value that is
+// rejected by insert_value
+fn insert_row(column_values: &mut ColumnMap, dictionary: Dictionary) -> Result<(), ShellError> {
+    dictionary
+        .entries
+        .into_iter()
+        .try_for_each(|(key, value)| insert_value(value, key, column_values))
+}
+
+// Adds every value of a UntaggedValue::Table to the column that is named
+// after the position of the value in the table ("0", "1", ...). The insertion
+// stops with the first value that is rejected by insert_value
+fn insert_table(column_values: &mut ColumnMap, table: Vec<Value>) -> Result<(), ShellError> {
+    table
+        .into_iter()
+        .enumerate()
+        .try_for_each(|(index, value)| insert_value(value, index.to_string(), column_values))
+}
+
+// Appends a value to the column stored under `key`; the column is created
+// when it does not exist.
+// The first value of a column defines the type of the column and it has to be
+// an Int, Decimal or String primitive. Any other value has to be the same
+// kind of primitive as the last value stored in the column
+fn insert_value(
+    value: Value,
+    key: String,
+    column_values: &mut ColumnMap,
+) -> Result<(), ShellError> {
+    let column = column_values.entry(key).or_default();
+
+    match column.values.last() {
+        None => {
+            // First value of the column: it defines the column type
+            column.value_type = match &value.value {
+                UntaggedValue::Primitive(Primitive::Int(_)) => InputValue::Integer,
+                UntaggedValue::Primitive(Primitive::Decimal(_)) => InputValue::Decimal,
+                UntaggedValue::Primitive(Primitive::String(_)) => InputValue::String,
+                _ => {
+                    return Err(ShellError::labeled_error(
+                        "Only primitive values accepted",
+                        "Not a primitive value",
+                        &value.tag,
+                    ));
+                }
+            };
+        }
+        Some(previous) => {
+            let same_type = matches!(
+                (&previous.value, &value.value),
+                (
+                    UntaggedValue::Primitive(Primitive::Int(_)),
+                    UntaggedValue::Primitive(Primitive::Int(_)),
+                ) | (
+                    UntaggedValue::Primitive(Primitive::Decimal(_)),
+                    UntaggedValue::Primitive(Primitive::Decimal(_)),
+                ) | (
+                    UntaggedValue::Primitive(Primitive::String(_)),
+                    UntaggedValue::Primitive(Primitive::String(_)),
+                )
+            );
+
+            if !same_type {
+                return Err(ShellError::labeled_error(
+                    "Different values in column",
+                    "Value with different type",
+                    &value.tag,
+                ));
+            }
+        }
+    }
+
+    column.values.push(value);
+
+    Ok(())
+}
+
+// The ColumnMap has the parsed data from the StreamInput
+// This data can be used to create a Series object that can initialize
+// the dataframe based on the type of data that is found
+//
+// An error is returned when a value cannot be converted to the type of its
+// column or when polars cannot create the dataframe from the series
+fn from_parsed_columns(column_values: ColumnMap, tag: &Tag) -> Result<NuDataFrame, ShellError> {
+    let mut df_series: Vec<Series> = Vec::with_capacity(column_values.len());
+    for (name, column) in column_values {
+        let series = match column.value_type {
+            InputValue::Decimal => {
+                let series_values: Result<Vec<_>, _> =
+                    column.values.iter().map(|v| v.as_f64()).collect();
+                Series::new(&name, series_values?)
+            }
+            InputValue::Integer => {
+                // Integers are stored as i64. Converting them to f32 loses
+                // precision for values above 2^24 and turns the column into
+                // a column of floats
+                let series_values: Result<Vec<_>, _> =
+                    column.values.iter().map(|v| v.as_i64()).collect();
+                Series::new(&name, series_values?)
+            }
+            InputValue::String => {
+                let series_values: Result<Vec<_>, _> =
+                    column.values.iter().map(|v| v.as_string()).collect();
+                Series::new(&name, series_values?)
+            }
+        };
+
+        df_series.push(series);
+    }
+
+    DataFrame::new(df_series)
+        .map(|df| NuDataFrame {
+            dataframe: Some(df),
+            name: "From stream".to_string(),
+        })
+        .map_err(|e| {
+            ShellError::labeled_error("Error while creating dataframe", format!("{}", e), tag)
+        })
+}
--- a/crates/nu-protocol/src/lib.rs
+++ b/crates/nu-protocol/src/lib.rs
@ -12,6 +12,9 @@ mod type_name;
 mod type_shape;
 pub mod value;

+#[cfg(feature = "dataframe")]
+pub mod dataframe;
+
 pub use crate::call_info::{CallInfo, EvaluatedArgs};
 pub use crate::config_path::ConfigPath;
 pub use crate::maybe_owned::MaybeOwned;
--- a/crates/nu-protocol/src/type_shape.rs
+++ b/crates/nu-protocol/src/type_shape.rs
@ -69,6 +69,10 @@ pub enum Type {
    BeginningOfStream,
    /// End of stream marker (used as bookend markers rather than actual values)
    EndOfStream,
+
+    /// Dataframe
+    #[cfg(feature = "dataframe")]
+    Dataframe,
 }

 /// A shape representation of the type of a row
@ -183,6 +187,8 @@ impl Type {
            UntaggedValue::Table(table) => Type::from_table(table.iter()),
            UntaggedValue::Error(_) => Type::Error,
            UntaggedValue::Block(_) => Type::Block,
+            #[cfg(feature = "dataframe")]
+            UntaggedValue::Dataframe(_) => Type::Dataframe,
        }
    }
 }
@ -287,6 +293,8 @@ impl PrettyDebug for Type {
                    })
            }
            Type::Block => ty("block"),
+            #[cfg(feature = "dataframe")]
+            Type::Dataframe => ty("dataframe_pretty_debug_for_Type"),
        }
    }
 }
--- a/crates/nu-protocol/src/value.rs
+++ b/crates/nu-protocol/src/value.rs
@ -30,6 +30,9 @@ use std::hash::{Hash, Hasher};
 use std::path::PathBuf;
 use std::time::SystemTime;

+#[cfg(feature = "dataframe")]
+use crate::dataframe::NuDataFrame;
+
 /// The core structured values that flow through a pipeline
 #[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash, Serialize, Deserialize)]
 pub enum UntaggedValue {
@ -47,6 +50,10 @@ pub enum UntaggedValue {

    /// A block of Nu code, eg `{ ls | get name ; echo "done" }` with its captured values
    Block(Box<hir::CapturedBlock>),
+
+    /// NuDataframe
+    #[cfg(feature = "dataframe")]
+    Dataframe(NuDataFrame),
 }

 impl UntaggedValue {
@ -489,6 +496,22 @@ impl Value {
        }
    }

+    /// View the Value as signed 32-bit float, if possible
+    ///
+    /// Returns a type error when the value is not a primitive; otherwise the
+    /// result of the conversion of the primitive is returned.
+    pub fn as_f32(&self) -> Result<f32, ShellError> {
+        match &self.value {
+            UntaggedValue::Primitive(primitive) => primitive.as_f32(self.tag.span),
+            // A float is requested, so the expected type is reported as
+            // "number", the same name used by the conversion of primitives
+            _ => Err(ShellError::type_error("number", self.spanned_type_name())),
+        }
+    }
+
+    /// View the Value as signed 64-bit float, if possible
+    ///
+    /// Returns a type error when the value is not a primitive; otherwise the
+    /// result of the conversion of the primitive is returned.
+    pub fn as_f64(&self) -> Result<f64, ShellError> {
+        match &self.value {
+            UntaggedValue::Primitive(primitive) => primitive.as_f64(self.tag.span),
+            // A float is requested, so the expected type is reported as
+            // "number" instead of "integer"
+            _ => Err(ShellError::type_error("number", self.spanned_type_name())),
+        }
+    }
+
    /// View the Value as boolean, if possible
    pub fn as_bool(&self) -> Result<bool, ShellError> {
        match &self.value {
@ -634,6 +657,8 @@ impl ShellTypeName for UntaggedValue {
            UntaggedValue::Table(_) => "table",
            UntaggedValue::Error(_) => "error",
            UntaggedValue::Block(_) => "block",
+            #[cfg(feature = "dataframe")]
+            UntaggedValue::Dataframe(_) => "dataframe",
        }
    }
 }
--- a/crates/nu-protocol/src/value/debug.rs
+++ b/crates/nu-protocol/src/value/debug.rs
@ -24,6 +24,8 @@ impl PrettyDebug for Value {
            .nest(),
            UntaggedValue::Error(_) => DbgDocBldr::error("error"),
            UntaggedValue::Block(_) => DbgDocBldr::opaque("block"),
+            #[cfg(feature = "dataframe")]
+            UntaggedValue::Dataframe(_) => DbgDocBldr::opaque("dataframe_prettydebug_for_Value"),
        }
    }
 }
--- a/crates/nu-protocol/src/value/primitive.rs
+++ b/crates/nu-protocol/src/value/primitive.rs
@ -247,6 +247,29 @@ impl Primitive {
        }
    }

+    /// Converts an `Int` or `Decimal` primitive to a 32-bit float.
+    ///
+    /// A range error is returned when the number cannot be converted and a
+    /// type error is returned for any other primitive.
+    pub fn as_f32(&self, span: Span) -> Result<f32, ShellError> {
+        match self {
+            Primitive::Int(int) => match int.to_f32() {
+                Some(float) => Ok(float),
+                None => Err(ShellError::range_error(
+                    ExpectedRange::F32,
+                    &format!("{}", int).spanned(span),
+                    "converting an integer into a signed 32-bit float",
+                )),
+            },
+            Primitive::Decimal(decimal) => match decimal.to_f32() {
+                Some(float) => Ok(float),
+                None => Err(ShellError::range_error(
+                    ExpectedRange::F32,
+                    &format!("{}", decimal).spanned(span),
+                    "converting a decimal into a signed 32-bit float",
+                )),
+            },
+            other => {
+                let found = other.type_name().spanned(span);
+
+                Err(ShellError::type_error("number", found))
+            }
+        }
+    }
+
    // FIXME: This is a bad name, but no other way to differentiate with our own Duration.
    pub fn into_chrono_duration(self, span: Span) -> Result<chrono::Duration, ShellError> {
        match self {
@ -332,17 +355,35 @@ impl From<BigInt> for Primitive {
    }
 }

-impl From<f64> for Primitive {
-    /// Helper to convert from 64-bit float to a Primitive value
-    fn from(float: f64) -> Primitive {
-        if let Some(f) = BigDecimal::from_f64(float) {
-            Primitive::Decimal(f)
-        } else {
-            unreachable!("Internal error: protocol did not use f64-compatible decimal")
+// Macro that implements the From trait to convert a native type into a
+// Primitive. It takes the native type, the Primitive variant that wraps the
+// converted value and the converter that is applied to the native value.
+macro_rules! from_native_to_primitive {
+    ($native_type:ty, $primitive_type:expr, $converter: expr) => {
+        // e.g. from u32 -> Primitive
+        impl From<$native_type> for Primitive {
+            fn from(int: $native_type) -> Primitive {
+                if let Some(i) = $converter(int) {
+                    $primitive_type(i)
+                } else {
+                    unreachable!("Internal error: protocol did not use compatible decimal")
+                }
+            }
        }
-    }
+    };
 }

+from_native_to_primitive!(i8, Primitive::Int, BigInt::from_i8);
+from_native_to_primitive!(i16, Primitive::Int, BigInt::from_i16);
+from_native_to_primitive!(i32, Primitive::Int, BigInt::from_i32);
+from_native_to_primitive!(i64, Primitive::Int, BigInt::from_i64);
+from_native_to_primitive!(u8, Primitive::Int, BigInt::from_u8);
+from_native_to_primitive!(u16, Primitive::Int, BigInt::from_u16);
+from_native_to_primitive!(u32, Primitive::Int, BigInt::from_u32);
+from_native_to_primitive!(u64, Primitive::Int, BigInt::from_u64);
+from_native_to_primitive!(f32, Primitive::Decimal, BigDecimal::from_f32);
+from_native_to_primitive!(f64, Primitive::Decimal, BigDecimal::from_f64);
+
 impl From<chrono::Duration> for Primitive {
    fn from(duration: chrono::Duration) -> Primitive {
        // FIXME: This is a hack since chrono::Duration does not give access to its 'nanos' field.