diff --git a/Cargo.lock b/Cargo.lock index 0aa88021f9..8a7a404814 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -120,6 +120,21 @@ dependencies = [ "memchr", ] +[[package]] +name = "alloc-no-stdlib" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5192ec435945d87bc2f70992b4d818154b5feede43c09fb7592146374eac90a6" + +[[package]] +name = "alloc-stdlib" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "697ed7edc0f1711de49ce108c541623a0af97c6c60b2f6e2b65229847ac843c2" +dependencies = [ + "alloc-no-stdlib", +] + [[package]] name = "ansi_colours" version = "1.0.2" @@ -427,6 +442,12 @@ dependencies = [ "safemem 0.3.3", ] +[[package]] +name = "base64" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3441f0f7b02788e948e47f457ca01f1d7e6d92c693bc132c22b087d3141c03ff" + [[package]] name = "base64" version = "0.13.0" @@ -553,6 +574,27 @@ dependencies = [ "once_cell", ] +[[package]] +name = "brotli" +version = "3.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f29919120f08613aadcd4383764e00526fc9f18b6c0895814faeed0dd78613e" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", + "brotli-decompressor", +] + +[[package]] +name = "brotli-decompressor" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1052e1c3b8d4d80eb84a8b94f0a1498797b5fb96314c001156a1c761940ef4ec" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", +] + [[package]] name = "bson" version = "0.14.1" @@ -2603,6 +2645,12 @@ dependencies = [ "cfg-if 1.0.0", ] +[[package]] +name = "integer-encoding" +version = "1.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48dc51180a9b377fd75814d0cc02199c20f8e99433d6762f650d39cdbbd3b56f" + [[package]] name = "iovec" version = "0.1.4" @@ -2883,6 +2931,26 @@ dependencies = [ "value-bag", ] +[[package]] +name = "lz4" +version = "1.23.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aac20ed6991e01bf6a2e68cc73df2b389707403662a8ba89f68511fb340f724c" +dependencies = [ + "libc", + "lz4-sys", +] + +[[package]] +name = "lz4-sys" +version = "1.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dca79aa95d8b3226213ad454d328369853be3a1382d89532a854f4d69640acae" +dependencies = [ + "cc", + "libc", +] + [[package]] name = "mac" version = "0.1.1" @@ -4387,6 +4455,34 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "parquet" +version = "4.0.0-SNAPSHOT" +source = "git+https://github.com/apache/arrow-rs?rev=d008f31b107c1030a1f5144c164e8ca8bf543576#d008f31b107c1030a1f5144c164e8ca8bf543576" +dependencies = [ + "arrow", + "base64 0.12.3", + "brotli", + "byteorder", + "chrono", + "flate2", + "lz4", + "num-bigint 0.3.2", + "parquet-format", + "snap", + "thrift", + "zstd", +] + +[[package]] +name = "parquet-format" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5bc6b23543b5dedc8f6cce50758a35e5582e148e0cfa26bd0cacd569cda5b71" +dependencies = [ + "thrift", +] + [[package]] name = "parse-zoneinfo" version = "0.3.0" @@ -4632,6 +4728,7 @@ dependencies = [ "lazy_static 1.4.0", "num 0.4.0", "num_cpus", + "parquet", "polars-arrow", "prettytable-rs", "rayon", @@ -4656,6 +4753,7 @@ dependencies = [ "memmap2", "num 0.4.0", "num_cpus", + "parquet", "polars-arrow", "polars-core", "rayon", @@ -5865,6 +5963,12 @@ dependencies = [ "syn 1.0.71", ] +[[package]] +name = "snap" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45456094d1983e2ee2a18fdfebce3189fa451699d0502cb8e3b49dba5ba41451" + [[package]] name = "socket2" version = "0.3.19" @@ -6297,6 +6401,28 @@ dependencies = [ "lazy_static 1.4.0", ] +[[package]] +name = "threadpool" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d050e60b33d41c19108b32cea32164033a9013fe3b46cbd4457559bfbf77afaa" +dependencies = [ + "num_cpus", +] + +[[package]] +name = "thrift" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c6d965454947cc7266d22716ebfd07b18d84ebaf35eec558586bbb2a8cb6b5b" +dependencies = [ + "byteorder", + "integer-encoding", + "log 0.4.14", + "ordered-float", + "threadpool", +] + [[package]] name = "tiff" version = "0.6.1" @@ -7298,3 +7424,32 @@ dependencies = [ "thiserror", "time 0.1.44", ] + +[[package]] +name = "zstd" +version = "0.7.0+zstd.1.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9428752481d8372e15b1bf779ea518a179ad6c771cca2d2c60e4fbff3cc2cd52" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "3.1.0+zstd.1.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aa1926623ad7fe406e090555387daf73db555b948134b4d73eac5eb08fb666d" +dependencies = [ + "libc", + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "1.5.0+zstd.1.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e6c094340240369025fc6b731b054ee2a834328fa584310ac96aa4baebdc465" +dependencies = [ + "cc", + "libc", +] diff --git a/crates/nu-command/Cargo.toml b/crates/nu-command/Cargo.toml index aa509455ea..958ce43c57 100644 --- a/crates/nu-command/Cargo.toml +++ b/crates/nu-command/Cargo.toml @@ -99,7 +99,12 @@ uuid_crate = { package = "uuid", version = "0.8.2", features = ["v4"], optional which = { version = "4.1.0", optional = true } zip = { version = "0.5.9", optional = true } -polars = {version="0.13.1", git = "https://github.com/ritchie46/polars", rev = "3efad9a5c380c64a5eb78b4b7ad257e1e606b9f0", optional = true} +[dependencies.polars] +version = "0.13.1" +git = "https://github.com/ritchie46/polars" +rev = "3efad9a5c380c64a5eb78b4b7ad257e1e606b9f0" +optional = true +features = ["parquet", "json"] [target.'cfg(unix)'.dependencies] umask = "1.0.0" diff --git a/crates/nu-command/src/commands/dataframe/load.rs b/crates/nu-command/src/commands/dataframe/load.rs index 44c10b0b40..49a3c28364 100644 --- a/crates/nu-command/src/commands/dataframe/load.rs +++ b/crates/nu-command/src/commands/dataframe/load.rs @@ -1,12 +1,15 @@ use std::path::PathBuf; use crate::prelude::*; -use nu_engine::WholeStreamCommand; +use nu_engine::{EvaluatedCommandArgs, WholeStreamCommand}; use nu_errors::ShellError; -use nu_protocol::{dataframe::NuDataFrame, Signature, SyntaxShape, UntaggedValue}; +use nu_protocol::{ + dataframe::NuDataFrame, Primitive, Signature, SyntaxShape, UntaggedValue, Value, +}; use nu_source::Tagged; -use polars::prelude::{CsvReader, SerReader}; +use polars::prelude::{CsvReader, JsonReader, ParquetReader, SerReader}; +use std::fs::File; pub struct Dataframe; @@ -16,15 +19,45 @@ impl WholeStreamCommand for Dataframe { } fn usage(&self) -> &str { - "Loads dataframe form csv or parquet file" + "Loads dataframe form csv file" } fn signature(&self) -> Signature { - Signature::build("dataframe load").required( - "file", - SyntaxShape::FilePath, - "the file path to load values from", - ) + Signature::build("dataframe load") + .required( + "file", + SyntaxShape::FilePath, + "the file path to load values from", + ) + .named( + "delimiter", + SyntaxShape::String, + "file delimiter character. CSV file", + Some('d'), + ) + .switch( + "no_header", + "Indicates if file doesn't have header. CSV file", + None, + ) + .named( + "infer_schema", + SyntaxShape::Number, + "Set number of row to infer the schema of the file. CSV file", + None, + ) + .named( + "skip_rows", + SyntaxShape::Number, + "Number of rows to skip from file. CSV file", + None, + ) + .named( + "columns", + SyntaxShape::Table, + "Columns to be selected from csv file. CSV file", + None, + ) } fn run(&self, args: CommandArgs) -> Result { @@ -45,33 +78,23 @@ fn create_from_file(args: CommandArgs) -> Result { let args = args.evaluate_once()?; let file: Tagged = args.req(0)?; - // Needs more detail and arguments while loading the dataframe - // Options: - // - has header - // - infer schema - // - delimiter - // - csv or parquet <- extracted from extension - let csv_reader = match CsvReader::from_path(&file.item) { - Ok(csv_reader) => csv_reader, - Err(e) => { - return Err(ShellError::labeled_error( - "Unable to parse file", - format!("{}", e), + let df = match file.item().extension() { + Some(e) => match e.to_str() { + Some("csv") => from_csv(args), + Some("parquet") => from_parquet(args), + Some("json") => from_json(args), + _ => Err(ShellError::labeled_error( + "Error with file", + "Not a csv or parquet file", &file.tag, - )) - } - }; - - let df = match csv_reader.infer_schema(None).has_header(true).finish() { - Ok(csv_reader) => csv_reader, - Err(e) => { - return Err(ShellError::labeled_error( - "Error while parsing dataframe", - format!("{}", e), - &file.tag, - )) - } - }; + )), + }, + None => Err(ShellError::labeled_error( + "Error with file", + "File without extension", + &file.tag, + )), + }?; let file_name = match file.item.into_os_string().into_string() { Ok(name) => name, @@ -93,3 +116,105 @@ fn create_from_file(args: CommandArgs) -> Result { Ok(init.to_output_stream()) } + +fn from_parquet(args: EvaluatedCommandArgs) -> Result { + let file: Tagged = args.req(0)?; + + let r = File::open(&file.item) + .map_err(|e| ShellError::labeled_error("Error with file", format!("{:?}", e), &file.tag))?; + + let reader = ParquetReader::new(r); + + reader + .finish() + .map_err(|e| ShellError::labeled_error("Error with file", format!("{:?}", e), &file.tag)) +} + +fn from_json(args: EvaluatedCommandArgs) -> Result { + let file: Tagged = args.req(0)?; + + let r = File::open(&file.item) + .map_err(|e| ShellError::labeled_error("Error with file", format!("{:?}", e), &file.tag))?; + + let reader = JsonReader::new(r); + + reader + .finish() + .map_err(|e| ShellError::labeled_error("Error with file", format!("{:?}", e), &file.tag)) +} + +fn from_csv(args: EvaluatedCommandArgs) -> Result { + let file: Tagged = args.req(0)?; + let delimiter: Option> = args.get_flag("delimiter")?; + let no_header: bool = args.has_flag("no_header"); + let infer_schema: Option> = args.get_flag("infer_schema")?; + let skip_rows: Option> = args.get_flag("skip_rows")?; + let columns: Option> = args.get_flag("columns")?; + + let csv_reader = CsvReader::from_path(&file.item).map_err(|e| { + ShellError::labeled_error("Unable to parse file", format!("{}", e), &file.tag) + })?; + + let csv_reader = match delimiter { + None => csv_reader, + Some(d) => { + if d.item.len() != 1 { + return Err(ShellError::labeled_error( + "Incorrect delimiter", + "Delimiter has to be one char", + &d.tag, + )); + } else { + let delimiter = match d.item.chars().nth(0) { + Some(d) => d as u8, + None => unreachable!(), + }; + csv_reader.with_delimiter(delimiter) + } + } + }; + + let csv_reader = if no_header { + csv_reader.has_header(false) + } else { + csv_reader.has_header(true) + }; + + let csv_reader = match infer_schema { + None => csv_reader.infer_schema(None), + Some(r) => csv_reader.infer_schema(Some(r.item)), + }; + + let csv_reader = match skip_rows { + None => csv_reader, + Some(r) => csv_reader.with_skip_rows(r.item), + }; + + let csv_reader = match columns { + None => csv_reader, + Some(c) => { + let columns = c + .into_iter() + .map(|value| match value.value { + UntaggedValue::Primitive(Primitive::String(s)) => Ok(s), + _ => Err(ShellError::labeled_error( + "Incorrect type for column", + "Only string as columns", + &value.tag, + )), + }) + .collect::, ShellError>>(); + + csv_reader.with_columns(Some(columns?)) + } + }; + + match csv_reader.finish() { + Ok(csv_reader) => Ok(csv_reader), + Err(e) => Err(ShellError::labeled_error( + "Error while parsing dataframe", + format!("{}", e), + &file.tag, + )), + } +}