mirror of
https://github.com/nushell/nushell.git
synced 2024-11-28 19:33:47 +01:00
Polars default infer (#13193)
Addresses performance issues that @maxim-uvarov found with CSV and JSON Lines files. This ensures that schema inference follows the polars default of 100 lines. Recent changes caused the default value to be overridden, which in turn caused the entire file to be scanned when inferring the schema.
This commit is contained in:
parent
10e84038af
commit
db86dd9f26
@ -32,6 +32,8 @@ use polars_io::{
|
|||||||
avro::AvroReader, csv::read::CsvReadOptions, prelude::ParallelStrategy, HiveOptions,
|
avro::AvroReader, csv::read::CsvReadOptions, prelude::ParallelStrategy, HiveOptions,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
const DEFAULT_INFER_SCHEMA: usize = 100;
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub struct OpenDataFrame;
|
pub struct OpenDataFrame;
|
||||||
|
|
||||||
@ -374,7 +376,9 @@ fn from_jsonl(
|
|||||||
file_path: &Path,
|
file_path: &Path,
|
||||||
file_span: Span,
|
file_span: Span,
|
||||||
) -> Result<Value, ShellError> {
|
) -> Result<Value, ShellError> {
|
||||||
let infer_schema: Option<usize> = call.get_flag("infer-schema")?;
|
let infer_schema: usize = call
|
||||||
|
.get_flag("infer-schema")?
|
||||||
|
.unwrap_or(DEFAULT_INFER_SCHEMA);
|
||||||
let maybe_schema = call
|
let maybe_schema = call
|
||||||
.get_flag("schema")?
|
.get_flag("schema")?
|
||||||
.map(|schema| NuSchema::try_from(&schema))
|
.map(|schema| NuSchema::try_from(&schema))
|
||||||
@ -384,7 +388,7 @@ fn from_jsonl(
|
|||||||
let start_time = std::time::Instant::now();
|
let start_time = std::time::Instant::now();
|
||||||
|
|
||||||
let df = LazyJsonLineReader::new(file_path)
|
let df = LazyJsonLineReader::new(file_path)
|
||||||
.with_infer_schema_length(infer_schema)
|
.with_infer_schema_length(Some(infer_schema))
|
||||||
.with_schema(maybe_schema.map(|s| s.into()))
|
.with_schema(maybe_schema.map(|s| s.into()))
|
||||||
.finish()
|
.finish()
|
||||||
.map_err(|e| ShellError::GenericError {
|
.map_err(|e| ShellError::GenericError {
|
||||||
@ -417,7 +421,7 @@ fn from_jsonl(
|
|||||||
let buf_reader = BufReader::new(file);
|
let buf_reader = BufReader::new(file);
|
||||||
let reader = JsonReader::new(buf_reader)
|
let reader = JsonReader::new(buf_reader)
|
||||||
.with_json_format(JsonFormat::JsonLines)
|
.with_json_format(JsonFormat::JsonLines)
|
||||||
.infer_schema_len(infer_schema);
|
.infer_schema_len(Some(infer_schema));
|
||||||
|
|
||||||
let reader = match maybe_schema {
|
let reader = match maybe_schema {
|
||||||
Some(schema) => reader.with_schema(schema.into()),
|
Some(schema) => reader.with_schema(schema.into()),
|
||||||
@ -459,7 +463,9 @@ fn from_csv(
|
|||||||
) -> Result<Value, ShellError> {
|
) -> Result<Value, ShellError> {
|
||||||
let delimiter: Option<Spanned<String>> = call.get_flag("delimiter")?;
|
let delimiter: Option<Spanned<String>> = call.get_flag("delimiter")?;
|
||||||
let no_header: bool = call.has_flag("no-header")?;
|
let no_header: bool = call.has_flag("no-header")?;
|
||||||
let infer_schema: Option<usize> = call.get_flag("infer-schema")?;
|
let infer_schema: usize = call
|
||||||
|
.get_flag("infer-schema")?
|
||||||
|
.unwrap_or(DEFAULT_INFER_SCHEMA);
|
||||||
let skip_rows: Option<usize> = call.get_flag("skip-rows")?;
|
let skip_rows: Option<usize> = call.get_flag("skip-rows")?;
|
||||||
let columns: Option<Vec<String>> = call.get_flag("columns")?;
|
let columns: Option<Vec<String>> = call.get_flag("columns")?;
|
||||||
|
|
||||||
@ -499,10 +505,7 @@ fn from_csv(
|
|||||||
None => csv_reader,
|
None => csv_reader,
|
||||||
};
|
};
|
||||||
|
|
||||||
let csv_reader = match infer_schema {
|
let csv_reader = csv_reader.with_infer_schema_length(Some(infer_schema));
|
||||||
None => csv_reader,
|
|
||||||
Some(r) => csv_reader.with_infer_schema_length(Some(r)),
|
|
||||||
};
|
|
||||||
|
|
||||||
let csv_reader = match skip_rows {
|
let csv_reader = match skip_rows {
|
||||||
None => csv_reader,
|
None => csv_reader,
|
||||||
@ -535,7 +538,7 @@ fn from_csv(
|
|||||||
let start_time = std::time::Instant::now();
|
let start_time = std::time::Instant::now();
|
||||||
let df = CsvReadOptions::default()
|
let df = CsvReadOptions::default()
|
||||||
.with_has_header(!no_header)
|
.with_has_header(!no_header)
|
||||||
.with_infer_schema_length(infer_schema)
|
.with_infer_schema_length(Some(infer_schema))
|
||||||
.with_skip_rows(skip_rows.unwrap_or_default())
|
.with_skip_rows(skip_rows.unwrap_or_default())
|
||||||
.with_schema(maybe_schema.map(|s| s.into()))
|
.with_schema(maybe_schema.map(|s| s.into()))
|
||||||
.with_columns(columns.map(Arc::new))
|
.with_columns(columns.map(Arc::new))
|
||||||
|
Loading…
Reference in New Issue
Block a user