forked from extern/nushell
Load parquet and json files (#3437)
* Load parquet and json files * changed csv file error
This commit is contained in:
parent
0612e5ccfb
commit
d0229cb96e
155
Cargo.lock
generated
155
Cargo.lock
generated
@ -120,6 +120,21 @@ dependencies = [
|
|||||||
"memchr",
|
"memchr",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "alloc-no-stdlib"
|
||||||
|
version = "2.0.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5192ec435945d87bc2f70992b4d818154b5feede43c09fb7592146374eac90a6"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "alloc-stdlib"
|
||||||
|
version = "0.2.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "697ed7edc0f1711de49ce108c541623a0af97c6c60b2f6e2b65229847ac843c2"
|
||||||
|
dependencies = [
|
||||||
|
"alloc-no-stdlib",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "ansi_colours"
|
name = "ansi_colours"
|
||||||
version = "1.0.2"
|
version = "1.0.2"
|
||||||
@ -427,6 +442,12 @@ dependencies = [
|
|||||||
"safemem 0.3.3",
|
"safemem 0.3.3",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "base64"
|
||||||
|
version = "0.12.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "3441f0f7b02788e948e47f457ca01f1d7e6d92c693bc132c22b087d3141c03ff"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "base64"
|
name = "base64"
|
||||||
version = "0.13.0"
|
version = "0.13.0"
|
||||||
@ -553,6 +574,27 @@ dependencies = [
|
|||||||
"once_cell",
|
"once_cell",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "brotli"
|
||||||
|
version = "3.3.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7f29919120f08613aadcd4383764e00526fc9f18b6c0895814faeed0dd78613e"
|
||||||
|
dependencies = [
|
||||||
|
"alloc-no-stdlib",
|
||||||
|
"alloc-stdlib",
|
||||||
|
"brotli-decompressor",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "brotli-decompressor"
|
||||||
|
version = "2.3.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "1052e1c3b8d4d80eb84a8b94f0a1498797b5fb96314c001156a1c761940ef4ec"
|
||||||
|
dependencies = [
|
||||||
|
"alloc-no-stdlib",
|
||||||
|
"alloc-stdlib",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "bson"
|
name = "bson"
|
||||||
version = "0.14.1"
|
version = "0.14.1"
|
||||||
@ -2603,6 +2645,12 @@ dependencies = [
|
|||||||
"cfg-if 1.0.0",
|
"cfg-if 1.0.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "integer-encoding"
|
||||||
|
version = "1.1.7"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "48dc51180a9b377fd75814d0cc02199c20f8e99433d6762f650d39cdbbd3b56f"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "iovec"
|
name = "iovec"
|
||||||
version = "0.1.4"
|
version = "0.1.4"
|
||||||
@ -2883,6 +2931,26 @@ dependencies = [
|
|||||||
"value-bag",
|
"value-bag",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "lz4"
|
||||||
|
version = "1.23.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "aac20ed6991e01bf6a2e68cc73df2b389707403662a8ba89f68511fb340f724c"
|
||||||
|
dependencies = [
|
||||||
|
"libc",
|
||||||
|
"lz4-sys",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "lz4-sys"
|
||||||
|
version = "1.9.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "dca79aa95d8b3226213ad454d328369853be3a1382d89532a854f4d69640acae"
|
||||||
|
dependencies = [
|
||||||
|
"cc",
|
||||||
|
"libc",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "mac"
|
name = "mac"
|
||||||
version = "0.1.1"
|
version = "0.1.1"
|
||||||
@ -4387,6 +4455,34 @@ dependencies = [
|
|||||||
"winapi 0.3.9",
|
"winapi 0.3.9",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "parquet"
|
||||||
|
version = "4.0.0-SNAPSHOT"
|
||||||
|
source = "git+https://github.com/apache/arrow-rs?rev=d008f31b107c1030a1f5144c164e8ca8bf543576#d008f31b107c1030a1f5144c164e8ca8bf543576"
|
||||||
|
dependencies = [
|
||||||
|
"arrow",
|
||||||
|
"base64 0.12.3",
|
||||||
|
"brotli",
|
||||||
|
"byteorder",
|
||||||
|
"chrono",
|
||||||
|
"flate2",
|
||||||
|
"lz4",
|
||||||
|
"num-bigint 0.3.2",
|
||||||
|
"parquet-format",
|
||||||
|
"snap",
|
||||||
|
"thrift",
|
||||||
|
"zstd",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "parquet-format"
|
||||||
|
version = "2.6.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "a5bc6b23543b5dedc8f6cce50758a35e5582e148e0cfa26bd0cacd569cda5b71"
|
||||||
|
dependencies = [
|
||||||
|
"thrift",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "parse-zoneinfo"
|
name = "parse-zoneinfo"
|
||||||
version = "0.3.0"
|
version = "0.3.0"
|
||||||
@ -4632,6 +4728,7 @@ dependencies = [
|
|||||||
"lazy_static 1.4.0",
|
"lazy_static 1.4.0",
|
||||||
"num 0.4.0",
|
"num 0.4.0",
|
||||||
"num_cpus",
|
"num_cpus",
|
||||||
|
"parquet",
|
||||||
"polars-arrow",
|
"polars-arrow",
|
||||||
"prettytable-rs",
|
"prettytable-rs",
|
||||||
"rayon",
|
"rayon",
|
||||||
@ -4656,6 +4753,7 @@ dependencies = [
|
|||||||
"memmap2",
|
"memmap2",
|
||||||
"num 0.4.0",
|
"num 0.4.0",
|
||||||
"num_cpus",
|
"num_cpus",
|
||||||
|
"parquet",
|
||||||
"polars-arrow",
|
"polars-arrow",
|
||||||
"polars-core",
|
"polars-core",
|
||||||
"rayon",
|
"rayon",
|
||||||
@ -5865,6 +5963,12 @@ dependencies = [
|
|||||||
"syn 1.0.71",
|
"syn 1.0.71",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "snap"
|
||||||
|
version = "1.0.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "45456094d1983e2ee2a18fdfebce3189fa451699d0502cb8e3b49dba5ba41451"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "socket2"
|
name = "socket2"
|
||||||
version = "0.3.19"
|
version = "0.3.19"
|
||||||
@ -6297,6 +6401,28 @@ dependencies = [
|
|||||||
"lazy_static 1.4.0",
|
"lazy_static 1.4.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "threadpool"
|
||||||
|
version = "1.8.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d050e60b33d41c19108b32cea32164033a9013fe3b46cbd4457559bfbf77afaa"
|
||||||
|
dependencies = [
|
||||||
|
"num_cpus",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "thrift"
|
||||||
|
version = "0.13.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "0c6d965454947cc7266d22716ebfd07b18d84ebaf35eec558586bbb2a8cb6b5b"
|
||||||
|
dependencies = [
|
||||||
|
"byteorder",
|
||||||
|
"integer-encoding",
|
||||||
|
"log 0.4.14",
|
||||||
|
"ordered-float",
|
||||||
|
"threadpool",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "tiff"
|
name = "tiff"
|
||||||
version = "0.6.1"
|
version = "0.6.1"
|
||||||
@ -7298,3 +7424,32 @@ dependencies = [
|
|||||||
"thiserror",
|
"thiserror",
|
||||||
"time 0.1.44",
|
"time 0.1.44",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "zstd"
|
||||||
|
version = "0.7.0+zstd.1.4.9"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "9428752481d8372e15b1bf779ea518a179ad6c771cca2d2c60e4fbff3cc2cd52"
|
||||||
|
dependencies = [
|
||||||
|
"zstd-safe",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "zstd-safe"
|
||||||
|
version = "3.1.0+zstd.1.4.9"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5aa1926623ad7fe406e090555387daf73db555b948134b4d73eac5eb08fb666d"
|
||||||
|
dependencies = [
|
||||||
|
"libc",
|
||||||
|
"zstd-sys",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "zstd-sys"
|
||||||
|
version = "1.5.0+zstd.1.4.9"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "4e6c094340240369025fc6b731b054ee2a834328fa584310ac96aa4baebdc465"
|
||||||
|
dependencies = [
|
||||||
|
"cc",
|
||||||
|
"libc",
|
||||||
|
]
|
||||||
|
@ -99,7 +99,12 @@ uuid_crate = { package = "uuid", version = "0.8.2", features = ["v4"], optional
|
|||||||
which = { version = "4.1.0", optional = true }
|
which = { version = "4.1.0", optional = true }
|
||||||
zip = { version = "0.5.9", optional = true }
|
zip = { version = "0.5.9", optional = true }
|
||||||
|
|
||||||
polars = {version="0.13.1", git = "https://github.com/ritchie46/polars", rev = "3efad9a5c380c64a5eb78b4b7ad257e1e606b9f0", optional = true}
|
[dependencies.polars]
|
||||||
|
version = "0.13.1"
|
||||||
|
git = "https://github.com/ritchie46/polars"
|
||||||
|
rev = "3efad9a5c380c64a5eb78b4b7ad257e1e606b9f0"
|
||||||
|
optional = true
|
||||||
|
features = ["parquet", "json"]
|
||||||
|
|
||||||
[target.'cfg(unix)'.dependencies]
|
[target.'cfg(unix)'.dependencies]
|
||||||
umask = "1.0.0"
|
umask = "1.0.0"
|
||||||
|
@ -1,12 +1,15 @@
|
|||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
|
|
||||||
use crate::prelude::*;
|
use crate::prelude::*;
|
||||||
use nu_engine::WholeStreamCommand;
|
use nu_engine::{EvaluatedCommandArgs, WholeStreamCommand};
|
||||||
use nu_errors::ShellError;
|
use nu_errors::ShellError;
|
||||||
use nu_protocol::{dataframe::NuDataFrame, Signature, SyntaxShape, UntaggedValue};
|
use nu_protocol::{
|
||||||
|
dataframe::NuDataFrame, Primitive, Signature, SyntaxShape, UntaggedValue, Value,
|
||||||
|
};
|
||||||
|
|
||||||
use nu_source::Tagged;
|
use nu_source::Tagged;
|
||||||
use polars::prelude::{CsvReader, SerReader};
|
use polars::prelude::{CsvReader, JsonReader, ParquetReader, SerReader};
|
||||||
|
use std::fs::File;
|
||||||
|
|
||||||
pub struct Dataframe;
|
pub struct Dataframe;
|
||||||
|
|
||||||
@ -16,15 +19,45 @@ impl WholeStreamCommand for Dataframe {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn usage(&self) -> &str {
|
fn usage(&self) -> &str {
|
||||||
"Loads dataframe form csv or parquet file"
|
"Loads dataframe form csv file"
|
||||||
}
|
}
|
||||||
|
|
||||||
fn signature(&self) -> Signature {
|
fn signature(&self) -> Signature {
|
||||||
Signature::build("dataframe load").required(
|
Signature::build("dataframe load")
|
||||||
|
.required(
|
||||||
"file",
|
"file",
|
||||||
SyntaxShape::FilePath,
|
SyntaxShape::FilePath,
|
||||||
"the file path to load values from",
|
"the file path to load values from",
|
||||||
)
|
)
|
||||||
|
.named(
|
||||||
|
"delimiter",
|
||||||
|
SyntaxShape::String,
|
||||||
|
"file delimiter character. CSV file",
|
||||||
|
Some('d'),
|
||||||
|
)
|
||||||
|
.switch(
|
||||||
|
"no_header",
|
||||||
|
"Indicates if file doesn't have header. CSV file",
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
.named(
|
||||||
|
"infer_schema",
|
||||||
|
SyntaxShape::Number,
|
||||||
|
"Set number of row to infer the schema of the file. CSV file",
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
.named(
|
||||||
|
"skip_rows",
|
||||||
|
SyntaxShape::Number,
|
||||||
|
"Number of rows to skip from file. CSV file",
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
.named(
|
||||||
|
"columns",
|
||||||
|
SyntaxShape::Table,
|
||||||
|
"Columns to be selected from csv file. CSV file",
|
||||||
|
None,
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn run(&self, args: CommandArgs) -> Result<OutputStream, ShellError> {
|
fn run(&self, args: CommandArgs) -> Result<OutputStream, ShellError> {
|
||||||
@ -45,33 +78,23 @@ fn create_from_file(args: CommandArgs) -> Result<OutputStream, ShellError> {
|
|||||||
let args = args.evaluate_once()?;
|
let args = args.evaluate_once()?;
|
||||||
let file: Tagged<PathBuf> = args.req(0)?;
|
let file: Tagged<PathBuf> = args.req(0)?;
|
||||||
|
|
||||||
// Needs more detail and arguments while loading the dataframe
|
let df = match file.item().extension() {
|
||||||
// Options:
|
Some(e) => match e.to_str() {
|
||||||
// - has header
|
Some("csv") => from_csv(args),
|
||||||
// - infer schema
|
Some("parquet") => from_parquet(args),
|
||||||
// - delimiter
|
Some("json") => from_json(args),
|
||||||
// - csv or parquet <- extracted from extension
|
_ => Err(ShellError::labeled_error(
|
||||||
let csv_reader = match CsvReader::from_path(&file.item) {
|
"Error with file",
|
||||||
Ok(csv_reader) => csv_reader,
|
"Not a csv or parquet file",
|
||||||
Err(e) => {
|
|
||||||
return Err(ShellError::labeled_error(
|
|
||||||
"Unable to parse file",
|
|
||||||
format!("{}", e),
|
|
||||||
&file.tag,
|
&file.tag,
|
||||||
))
|
)),
|
||||||
}
|
},
|
||||||
};
|
None => Err(ShellError::labeled_error(
|
||||||
|
"Error with file",
|
||||||
let df = match csv_reader.infer_schema(None).has_header(true).finish() {
|
"File without extension",
|
||||||
Ok(csv_reader) => csv_reader,
|
|
||||||
Err(e) => {
|
|
||||||
return Err(ShellError::labeled_error(
|
|
||||||
"Error while parsing dataframe",
|
|
||||||
format!("{}", e),
|
|
||||||
&file.tag,
|
&file.tag,
|
||||||
))
|
)),
|
||||||
}
|
}?;
|
||||||
};
|
|
||||||
|
|
||||||
let file_name = match file.item.into_os_string().into_string() {
|
let file_name = match file.item.into_os_string().into_string() {
|
||||||
Ok(name) => name,
|
Ok(name) => name,
|
||||||
@ -93,3 +116,105 @@ fn create_from_file(args: CommandArgs) -> Result<OutputStream, ShellError> {
|
|||||||
|
|
||||||
Ok(init.to_output_stream())
|
Ok(init.to_output_stream())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn from_parquet(args: EvaluatedCommandArgs) -> Result<polars::prelude::DataFrame, ShellError> {
|
||||||
|
let file: Tagged<PathBuf> = args.req(0)?;
|
||||||
|
|
||||||
|
let r = File::open(&file.item)
|
||||||
|
.map_err(|e| ShellError::labeled_error("Error with file", format!("{:?}", e), &file.tag))?;
|
||||||
|
|
||||||
|
let reader = ParquetReader::new(r);
|
||||||
|
|
||||||
|
reader
|
||||||
|
.finish()
|
||||||
|
.map_err(|e| ShellError::labeled_error("Error with file", format!("{:?}", e), &file.tag))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn from_json(args: EvaluatedCommandArgs) -> Result<polars::prelude::DataFrame, ShellError> {
|
||||||
|
let file: Tagged<PathBuf> = args.req(0)?;
|
||||||
|
|
||||||
|
let r = File::open(&file.item)
|
||||||
|
.map_err(|e| ShellError::labeled_error("Error with file", format!("{:?}", e), &file.tag))?;
|
||||||
|
|
||||||
|
let reader = JsonReader::new(r);
|
||||||
|
|
||||||
|
reader
|
||||||
|
.finish()
|
||||||
|
.map_err(|e| ShellError::labeled_error("Error with file", format!("{:?}", e), &file.tag))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn from_csv(args: EvaluatedCommandArgs) -> Result<polars::prelude::DataFrame, ShellError> {
|
||||||
|
let file: Tagged<PathBuf> = args.req(0)?;
|
||||||
|
let delimiter: Option<Tagged<String>> = args.get_flag("delimiter")?;
|
||||||
|
let no_header: bool = args.has_flag("no_header");
|
||||||
|
let infer_schema: Option<Tagged<usize>> = args.get_flag("infer_schema")?;
|
||||||
|
let skip_rows: Option<Tagged<usize>> = args.get_flag("skip_rows")?;
|
||||||
|
let columns: Option<Vec<Value>> = args.get_flag("columns")?;
|
||||||
|
|
||||||
|
let csv_reader = CsvReader::from_path(&file.item).map_err(|e| {
|
||||||
|
ShellError::labeled_error("Unable to parse file", format!("{}", e), &file.tag)
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let csv_reader = match delimiter {
|
||||||
|
None => csv_reader,
|
||||||
|
Some(d) => {
|
||||||
|
if d.item.len() != 1 {
|
||||||
|
return Err(ShellError::labeled_error(
|
||||||
|
"Incorrect delimiter",
|
||||||
|
"Delimiter has to be one char",
|
||||||
|
&d.tag,
|
||||||
|
));
|
||||||
|
} else {
|
||||||
|
let delimiter = match d.item.chars().nth(0) {
|
||||||
|
Some(d) => d as u8,
|
||||||
|
None => unreachable!(),
|
||||||
|
};
|
||||||
|
csv_reader.with_delimiter(delimiter)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let csv_reader = if no_header {
|
||||||
|
csv_reader.has_header(false)
|
||||||
|
} else {
|
||||||
|
csv_reader.has_header(true)
|
||||||
|
};
|
||||||
|
|
||||||
|
let csv_reader = match infer_schema {
|
||||||
|
None => csv_reader.infer_schema(None),
|
||||||
|
Some(r) => csv_reader.infer_schema(Some(r.item)),
|
||||||
|
};
|
||||||
|
|
||||||
|
let csv_reader = match skip_rows {
|
||||||
|
None => csv_reader,
|
||||||
|
Some(r) => csv_reader.with_skip_rows(r.item),
|
||||||
|
};
|
||||||
|
|
||||||
|
let csv_reader = match columns {
|
||||||
|
None => csv_reader,
|
||||||
|
Some(c) => {
|
||||||
|
let columns = c
|
||||||
|
.into_iter()
|
||||||
|
.map(|value| match value.value {
|
||||||
|
UntaggedValue::Primitive(Primitive::String(s)) => Ok(s),
|
||||||
|
_ => Err(ShellError::labeled_error(
|
||||||
|
"Incorrect type for column",
|
||||||
|
"Only string as columns",
|
||||||
|
&value.tag,
|
||||||
|
)),
|
||||||
|
})
|
||||||
|
.collect::<Result<Vec<String>, ShellError>>();
|
||||||
|
|
||||||
|
csv_reader.with_columns(Some(columns?))
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
match csv_reader.finish() {
|
||||||
|
Ok(csv_reader) => Ok(csv_reader),
|
||||||
|
Err(e) => Err(ShellError::labeled_error(
|
||||||
|
"Error while parsing dataframe",
|
||||||
|
format!("{}", e),
|
||||||
|
&file.tag,
|
||||||
|
)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user