Expose polars avro support (#10019)

# Description

Exposes polars Avro support via `dfr open` and the new `dfr to-avro` command.

---------

Co-authored-by: Jack Wright <jack.wright@disqo.com>
This commit is contained in:
Jack Wright
2023-08-15 18:31:49 -07:00
committed by GitHub
parent 3ed45c7ba8
commit 7a123d3eb1
5 changed files with 235 additions and 3 deletions

View File

@ -24,6 +24,7 @@ indexmap = { version = "2.0" }
num = { version = "0.4", optional = true }
serde = { version = "1.0", features = ["derive"] }
sqlparser = { version = "0.34", features = ["serde"], optional = true }
polars-io = { version = "0.30.0", features = ["avro"] }
[dependencies.polars]
features = [
@ -50,7 +51,7 @@ features = [
"serde",
"serde-lazy",
"strings",
"to_dummies",
"to_dummies"
]
optional = true
version = "0.30.0"

View File

@ -22,6 +22,7 @@ mod sql_expr;
mod summary;
mod take;
mod to_arrow;
mod to_avro;
mod to_csv;
mod to_df;
mod to_json_lines;
@ -55,6 +56,7 @@ pub use sql_expr::parse_sql_expr;
pub use summary::Summary;
pub use take::TakeDF;
pub use to_arrow::ToArrow;
pub use to_avro::ToAvro;
pub use to_csv::ToCSV;
pub use to_df::ToDataFrame;
pub use to_json_lines::ToJsonLines;
@ -96,6 +98,7 @@ pub fn add_eager_decls(working_set: &mut StateWorkingSet) {
SliceDF,
TakeDF,
ToArrow,
ToAvro,
ToCSV,
ToDataFrame,
ToNu,

View File

@ -13,6 +13,8 @@ use polars::prelude::{
LazyFrame, ParallelStrategy, ParquetReader, ScanArgsIpc, ScanArgsParquet, SerReader,
};
use polars_io::avro::AvroReader;
#[derive(Clone)]
pub struct OpenDataFrame;
@ -22,7 +24,7 @@ impl Command for OpenDataFrame {
}
fn usage(&self) -> &str {
"Opens CSV, JSON, JSON lines, arrow, or parquet file to create dataframe."
"Opens CSV, JSON, JSON lines, arrow, avro, or parquet file to create dataframe."
}
fn signature(&self) -> Signature {
@ -36,7 +38,7 @@ impl Command for OpenDataFrame {
.named(
"type",
SyntaxShape::String,
"File type: csv, tsv, json, parquet, arrow. If omitted, derive from file extension",
"File type: csv, tsv, json, parquet, arrow, avro. If omitted, derive from file extension",
Some('t'),
)
.named(
@ -118,6 +120,7 @@ fn command(
"ipc" | "arrow" => from_ipc(engine_state, stack, call),
"json" => from_json(engine_state, stack, call),
"jsonl" => from_jsonl(engine_state, stack, call),
"avro" => from_avro(engine_state, stack, call),
_ => Err(ShellError::FileNotFoundCustom(
format!("{msg}. Supported values: csv, tsv, parquet, ipc, arrow, json"),
blamed,
@ -199,6 +202,46 @@ fn from_parquet(
}
}
/// Loads an Avro file into a dataframe value.
///
/// The file path is taken from the first positional argument of `call`, and
/// the optional `columns` flag restricts which columns are read.
///
/// # Errors
///
/// Returns a `ShellError::GenericError` pointing at the path argument when the
/// file cannot be opened, or at the call head when the Avro reader fails.
fn from_avro(
    engine_state: &EngineState,
    stack: &mut Stack,
    call: &Call,
) -> Result<Value, ShellError> {
    let path: Spanned<PathBuf> = call.req(engine_state, stack, 0)?;
    let selected: Option<Vec<String>> = call.get_flag(engine_state, stack, "columns")?;

    let handle = match File::open(&path.item) {
        Ok(handle) => handle,
        Err(err) => {
            return Err(ShellError::GenericError(
                "Error opening file".into(),
                err.to_string(),
                Some(path.span),
                None,
                Vec::new(),
            ))
        }
    };

    // Only install a projection when the user actually asked for one.
    let mut reader = AvroReader::new(handle);
    if let Some(names) = selected {
        reader = reader.with_columns(Some(names));
    }

    match reader.finish() {
        Ok(frame) => {
            let df: NuDataFrame = frame.into();
            Ok(df.into_value(call.head))
        }
        Err(err) => Err(ShellError::GenericError(
            "Avro reader error".into(),
            format!("{err:?}"),
            Some(call.head),
            None,
            Vec::new(),
        )),
    }
}
fn from_ipc(
engine_state: &EngineState,
stack: &mut Stack,

View File

@ -0,0 +1,122 @@
use std::{fs::File, path::PathBuf};
use nu_engine::CallExt;
use nu_protocol::{
ast::Call,
engine::{Command, EngineState, Stack},
Category, Example, PipelineData, ShellError, Signature, Spanned, SyntaxShape, Type, Value,
};
use polars_io::avro::{AvroCompression, AvroWriter};
use polars_io::SerWriter;
use super::super::values::NuDataFrame;
/// Command type for `dfr to-avro`, which saves a dataframe to an Avro file.
///
/// This is a stateless unit struct; all behaviour lives in its `Command` impl.
#[derive(Clone)]
pub struct ToAvro;
impl Command for ToAvro {
    /// Name under which the command is invoked.
    fn name(&self) -> &str {
        "dfr to-avro"
    }

    /// One-line description of the command.
    fn usage(&self) -> &str {
        "Saves dataframe to avro file."
    }

    /// Declares the optional `--compression`/`-c` flag and the required
    /// `file` positional; input is a dataframe and output is `Type::Any`.
    fn signature(&self) -> Signature {
        let dataframe_input = Type::Custom("dataframe".into());
        let dataframe_category = Category::Custom("dataframe".into());

        Signature::build(self.name())
            .named(
                "compression",
                SyntaxShape::String,
                "use compression, supports deflate or snappy",
                Some('c'),
            )
            .required("file", SyntaxShape::Filepath, "file path to save dataframe")
            .input_output_type(dataframe_input, Type::Any)
            .category(dataframe_category)
    }

    /// Usage examples; no expected result is recorded because the command
    /// writes to a file.
    fn examples(&self) -> Vec<Example> {
        let save_example = Example {
            description: "Saves dataframe to avro file",
            example: "[[a b]; [1 2] [3 4]] | dfr into-df | dfr to-avro test.avro",
            result: None,
        };

        vec![save_example]
    }

    /// Delegates to the module-level `command` function.
    fn run(
        &self,
        engine_state: &EngineState,
        stack: &mut Stack,
        call: &Call,
        input: PipelineData,
    ) -> Result<PipelineData, ShellError> {
        command(engine_state, stack, call, input)
    }
}
fn get_compression(call: &Call) -> Result<Option<AvroCompression>, ShellError> {
if let Some((compression, span)) = call
.get_flag_expr("compression")
.and_then(|e| e.as_string().map(|s| (s, e.span)))
{
match compression.as_ref() {
"snappy" => Ok(Some(AvroCompression::Snappy)),
"deflate" => Ok(Some(AvroCompression::Deflate)),
_ => Err(ShellError::IncorrectValue {
msg: "compression must be one of deflate or snappy".to_string(),
span,
}),
}
} else {
Ok(None)
}
}
fn command(
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
let file_name: Spanned<PathBuf> = call.req(engine_state, stack, 0)?;
let compression = get_compression(call)?;
let mut df = NuDataFrame::try_from_pipeline(input, call.head)?;
let file = File::create(&file_name.item).map_err(|e| {
ShellError::GenericError(
"Error with file name".into(),
e.to_string(),
Some(file_name.span),
None,
Vec::new(),
)
})?;
AvroWriter::new(file)
.with_compression(compression)
.finish(df.as_mut())
.map_err(|e| {
ShellError::GenericError(
"Error saving file".into(),
e.to_string(),
Some(file_name.span),
None,
Vec::new(),
)
})?;
let file_value = Value::String {
val: format!("saved {:?}", &file_name.item),
span: file_name.span,
};
Ok(PipelineData::Value(
Value::List {
vals: vec![file_value],
span: call.head,
},
None,
))
}