Infer types from regular delimited plain text unstructured files. (#1494)

* Infer types from regular delimited plain text unstructured files.

* Nothing resolves to an empty string.
This commit is contained in:
Andrés N. Robalino
2020-03-16 15:50:45 -05:00
committed by GitHub
parent d8c4565413
commit b36d21e76f
20 changed files with 751 additions and 315 deletions

View File

@ -1,42 +1,12 @@
use crate::prelude::*;
use csv::{ErrorKind, ReaderBuilder};
use nu_errors::ShellError;
use nu_protocol::{Primitive, ReturnSuccess, TaggedDictBuilder, UntaggedValue, Value};
use nu_parser::hir::syntax_shape::{ExpandContext, SignatureRegistry};
use nu_parser::utils::{parse_line_with_separator as parse, LineSeparatedShape};
use nu_parser::TokensIterator;
use nu_protocol::{ReturnSuccess, Signature, TaggedDictBuilder, UntaggedValue, Value};
use nu_source::nom_input;
fn from_delimited_string_to_value(
s: String,
headerless: bool,
separator: char,
tag: impl Into<Tag>,
) -> Result<Value, csv::Error> {
let mut reader = ReaderBuilder::new()
.has_headers(!headerless)
.delimiter(separator as u8)
.from_reader(s.as_bytes());
let tag = tag.into();
let headers = if headerless {
(1..=reader.headers()?.len())
.map(|i| format!("Column{}", i))
.collect::<Vec<String>>()
} else {
reader.headers()?.iter().map(String::from).collect()
};
let mut rows = vec![];
for row in reader.records() {
let mut tagged_row = TaggedDictBuilder::new(&tag);
for (value, header) in row?.iter().zip(headers.iter()) {
tagged_row.insert_value(
header,
UntaggedValue::Primitive(Primitive::String(String::from(value))).into_value(&tag),
)
}
rows.push(tagged_row.into_value());
}
Ok(UntaggedValue::Table(rows).into_value(&tag))
}
use derive_new::new;
pub fn from_delimited_data(
headerless: bool,
@ -50,19 +20,20 @@ pub fn from_delimited_data(
let concat_string = input.collect_string(name_tag.clone()).await?;
match from_delimited_string_to_value(concat_string.item, headerless, sep, name_tag.clone()) {
Ok(x) => match x {
Value { value: UntaggedValue::Table(list), .. } => {
for l in list {
yield ReturnSuccess::value(l);
Ok(rows) => {
for row in rows {
match row {
Value { value: UntaggedValue::Table(list), .. } => {
for l in list {
yield ReturnSuccess::value(l);
}
}
x => yield ReturnSuccess::value(x),
}
}
x => yield ReturnSuccess::value(x),
},
Err(err) => {
let line_one = match pretty_csv_error(err) {
Some(pretty) => format!("Could not parse as {} ({})", format_name,pretty),
None => format!("Could not parse as {}", format_name),
};
let line_one = format!("Could not parse as {}", format_name);
let line_two = format!("input cannot be parsed as {}", format_name);
yield Err(ShellError::labeled_error_with_secondary(
line_one,
@ -78,25 +49,121 @@ pub fn from_delimited_data(
Ok(stream.to_output_stream())
}
fn pretty_csv_error(err: csv::Error) -> Option<String> {
match err.kind() {
ErrorKind::UnequalLengths {
pos,
expected_len,
len,
} => {
if let Some(pos) = pos {
Some(format!(
"Line {}: expected {} fields, found {}",
pos.line(),
expected_len,
len
))
} else {
Some(format!("Expected {} fields, found {}", expected_len, len))
}
}
ErrorKind::Seek => Some("Internal error while parsing csv".to_string()),
_ => None,
#[derive(Debug, Clone, new)]
pub struct EmptyRegistry {
#[new(default)]
signatures: indexmap::IndexMap<String, Signature>,
}
impl EmptyRegistry {}
impl SignatureRegistry for EmptyRegistry {
fn has(&self, _name: &str) -> bool {
false
}
fn get(&self, _name: &str) -> Option<Signature> {
None
}
fn clone_box(&self) -> Box<dyn SignatureRegistry> {
Box::new(self.clone())
}
}
fn from_delimited_string_to_value(
s: String,
headerless: bool,
sep: char,
tag: impl Into<Tag>,
) -> Result<Vec<Value>, ShellError> {
let tag = tag.into();
let mut entries = s.lines();
let mut fields = vec![];
let mut out = vec![];
if let Some(first_entry) = entries.next() {
let tokens = match parse(&sep.to_string(), nom_input(first_entry)) {
Ok((_, tokens)) => tokens,
Err(err) => return Err(ShellError::parse_error(err)),
};
let tokens_span = tokens.span;
let source: nu_source::Text = tokens_span.slice(&first_entry).into();
if !headerless {
fields = tokens
.item
.iter()
.filter(|token| !token.is_separator())
.map(|field| field.source(&source).to_string())
.collect::<Vec<_>>();
}
let registry = Box::new(EmptyRegistry::new());
let ctx = ExpandContext::new(registry, &source, None);
let mut iterator = TokensIterator::new(&tokens.item, ctx, tokens_span);
let (results, tokens_identified) = iterator.expand(LineSeparatedShape);
let results = results?;
let mut row = TaggedDictBuilder::new(&tag);
if headerless {
let fallback_columns = (1..=tokens_identified)
.map(|i| format!("Column{}", i))
.collect::<Vec<String>>();
for (idx, field) in results.into_iter().enumerate() {
let key = if headerless {
&fallback_columns[idx]
} else {
&fields[idx]
};
row.insert_value(key, field.into_value(&tag));
}
out.push(row.into_value())
}
}
for entry in entries {
let tokens = match parse(&sep.to_string(), nom_input(entry)) {
Ok((_, tokens)) => tokens,
Err(err) => return Err(ShellError::parse_error(err)),
};
let tokens_span = tokens.span;
let source: nu_source::Text = tokens_span.slice(&entry).into();
let registry = Box::new(EmptyRegistry::new());
let ctx = ExpandContext::new(registry, &source, None);
let mut iterator = TokensIterator::new(&tokens.item, ctx, tokens_span);
let (results, tokens_identified) = iterator.expand(LineSeparatedShape);
let results = results?;
let mut row = TaggedDictBuilder::new(&tag);
let fallback_columns = (1..=tokens_identified)
.map(|i| format!("Column{}", i))
.collect::<Vec<String>>();
for (idx, field) in results.into_iter().enumerate() {
let key = if headerless {
&fallback_columns[idx]
} else {
match fields.get(idx) {
Some(key) => key,
None => &fallback_columns[idx],
}
};
row.insert_value(key, field.into_value(&tag));
}
out.push(row.into_value())
}
Ok(out)
}

View File

@ -140,6 +140,7 @@ fn to_string_tagged_value(v: &Value) -> Result<String, ShellError> {
| UntaggedValue::Primitive(Primitive::Path(_))
| UntaggedValue::Primitive(Primitive::Int(_)) => as_string(v),
UntaggedValue::Primitive(Primitive::Date(d)) => Ok(d.to_string()),
UntaggedValue::Primitive(Primitive::Nothing) => Ok(String::new()),
UntaggedValue::Table(_) => Ok(String::from("[Table]")),
UntaggedValue::Row(_) => Ok(String::from("[Row]")),
_ => Err(ShellError::labeled_error(

View File

@ -73,8 +73,36 @@ fn table_to_csv_text_skipping_headers_after_conversion() {
}
#[test]
fn from_csv_text_to_table() {
fn infers_types() {
Playground::setup("filter_from_csv_test_1", |dirs, sandbox| {
sandbox.with_files(vec![FileWithContentToBeTrimmed(
"los_cuatro_mosqueteros.csv",
r#"
first_name,last_name,rusty_luck
Andrés,Robalino,1,d
Jonathan,Turner,1,d
Yehuda,Katz,1,d
Jason,Gedge,1,d
"#,
)]);
let actual = nu!(
cwd: dirs.test(), pipeline(
r#"
open los_cuatro_mosqueteros.csv
| where rusty_luck > 0
| count
| echo $it
"#
));
assert_eq!(actual, "4");
})
}
#[test]
fn from_csv_text_to_table() {
Playground::setup("filter_from_csv_test_2", |dirs, sandbox| {
sandbox.with_files(vec![FileWithContentToBeTrimmed(
"los_tres_caballeros.txt",
r#"
@ -102,7 +130,7 @@ fn from_csv_text_to_table() {
#[test]
fn from_csv_text_with_separator_to_table() {
Playground::setup("filter_from_csv_test_2", |dirs, sandbox| {
Playground::setup("filter_from_csv_test_3", |dirs, sandbox| {
sandbox.with_files(vec![FileWithContentToBeTrimmed(
"los_tres_caballeros.txt",
r#"
@ -130,7 +158,7 @@ fn from_csv_text_with_separator_to_table() {
#[test]
fn from_csv_text_with_tab_separator_to_table() {
Playground::setup("filter_from_csv_test_3", |dirs, sandbox| {
Playground::setup("filter_from_csv_test_4", |dirs, sandbox| {
sandbox.with_files(vec![FileWithContentToBeTrimmed(
"los_tres_caballeros.txt",
r#"
@ -158,7 +186,7 @@ fn from_csv_text_with_tab_separator_to_table() {
#[test]
fn from_csv_text_skipping_headers_to_table() {
Playground::setup("filter_from_csv_test_4", |dirs, sandbox| {
Playground::setup("filter_from_csv_test_5", |dirs, sandbox| {
sandbox.with_files(vec![FileWithContentToBeTrimmed(
"los_tres_amigos.txt",
r#"