Polars default infer (#13193)

Addresses performance issues that @maxim-uvarov found with CSV and JSON
lines.

This ensures that schema inference follows the polars default of
100 rows. Recent changes caused the default value to be overridden,
which caused the entire file to be scanned when inferring the schema.
This commit is contained in:
Jack Wright 2024-06-22 05:23:42 -07:00 committed by GitHub
parent 10e84038af
commit db86dd9f26
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -32,6 +32,8 @@ use polars_io::{
avro::AvroReader, csv::read::CsvReadOptions, prelude::ParallelStrategy, HiveOptions, avro::AvroReader, csv::read::CsvReadOptions, prelude::ParallelStrategy, HiveOptions,
}; };
const DEFAULT_INFER_SCHEMA: usize = 100;
#[derive(Clone)] #[derive(Clone)]
pub struct OpenDataFrame; pub struct OpenDataFrame;
@ -374,7 +376,9 @@ fn from_jsonl(
file_path: &Path, file_path: &Path,
file_span: Span, file_span: Span,
) -> Result<Value, ShellError> { ) -> Result<Value, ShellError> {
let infer_schema: Option<usize> = call.get_flag("infer-schema")?; let infer_schema: usize = call
.get_flag("infer-schema")?
.unwrap_or(DEFAULT_INFER_SCHEMA);
let maybe_schema = call let maybe_schema = call
.get_flag("schema")? .get_flag("schema")?
.map(|schema| NuSchema::try_from(&schema)) .map(|schema| NuSchema::try_from(&schema))
@ -384,7 +388,7 @@ fn from_jsonl(
let start_time = std::time::Instant::now(); let start_time = std::time::Instant::now();
let df = LazyJsonLineReader::new(file_path) let df = LazyJsonLineReader::new(file_path)
.with_infer_schema_length(infer_schema) .with_infer_schema_length(Some(infer_schema))
.with_schema(maybe_schema.map(|s| s.into())) .with_schema(maybe_schema.map(|s| s.into()))
.finish() .finish()
.map_err(|e| ShellError::GenericError { .map_err(|e| ShellError::GenericError {
@ -417,7 +421,7 @@ fn from_jsonl(
let buf_reader = BufReader::new(file); let buf_reader = BufReader::new(file);
let reader = JsonReader::new(buf_reader) let reader = JsonReader::new(buf_reader)
.with_json_format(JsonFormat::JsonLines) .with_json_format(JsonFormat::JsonLines)
.infer_schema_len(infer_schema); .infer_schema_len(Some(infer_schema));
let reader = match maybe_schema { let reader = match maybe_schema {
Some(schema) => reader.with_schema(schema.into()), Some(schema) => reader.with_schema(schema.into()),
@ -459,7 +463,9 @@ fn from_csv(
) -> Result<Value, ShellError> { ) -> Result<Value, ShellError> {
let delimiter: Option<Spanned<String>> = call.get_flag("delimiter")?; let delimiter: Option<Spanned<String>> = call.get_flag("delimiter")?;
let no_header: bool = call.has_flag("no-header")?; let no_header: bool = call.has_flag("no-header")?;
let infer_schema: Option<usize> = call.get_flag("infer-schema")?; let infer_schema: usize = call
.get_flag("infer-schema")?
.unwrap_or(DEFAULT_INFER_SCHEMA);
let skip_rows: Option<usize> = call.get_flag("skip-rows")?; let skip_rows: Option<usize> = call.get_flag("skip-rows")?;
let columns: Option<Vec<String>> = call.get_flag("columns")?; let columns: Option<Vec<String>> = call.get_flag("columns")?;
@ -499,10 +505,7 @@ fn from_csv(
None => csv_reader, None => csv_reader,
}; };
let csv_reader = match infer_schema { let csv_reader = csv_reader.with_infer_schema_length(Some(infer_schema));
None => csv_reader,
Some(r) => csv_reader.with_infer_schema_length(Some(r)),
};
let csv_reader = match skip_rows { let csv_reader = match skip_rows {
None => csv_reader, None => csv_reader,
@ -535,7 +538,7 @@ fn from_csv(
let start_time = std::time::Instant::now(); let start_time = std::time::Instant::now();
let df = CsvReadOptions::default() let df = CsvReadOptions::default()
.with_has_header(!no_header) .with_has_header(!no_header)
.with_infer_schema_length(infer_schema) .with_infer_schema_length(Some(infer_schema))
.with_skip_rows(skip_rows.unwrap_or_default()) .with_skip_rows(skip_rows.unwrap_or_default())
.with_schema(maybe_schema.map(|s| s.into())) .with_schema(maybe_schema.map(|s| s.into()))
.with_columns(columns.map(Arc::new)) .with_columns(columns.map(Arc::new))