forked from extern/nushell
Infer types from regular delimited plain text unstructured files. (#1494)
* Infer types from regular delimited plain text unstructured files. * Nothing resolves to an empty string.
This commit is contained in:
committed by
GitHub
parent
d8c4565413
commit
b36d21e76f
@ -1,42 +1,12 @@
|
||||
use crate::prelude::*;
|
||||
use csv::{ErrorKind, ReaderBuilder};
|
||||
use nu_errors::ShellError;
|
||||
use nu_protocol::{Primitive, ReturnSuccess, TaggedDictBuilder, UntaggedValue, Value};
|
||||
use nu_parser::hir::syntax_shape::{ExpandContext, SignatureRegistry};
|
||||
use nu_parser::utils::{parse_line_with_separator as parse, LineSeparatedShape};
|
||||
use nu_parser::TokensIterator;
|
||||
use nu_protocol::{ReturnSuccess, Signature, TaggedDictBuilder, UntaggedValue, Value};
|
||||
use nu_source::nom_input;
|
||||
|
||||
fn from_delimited_string_to_value(
|
||||
s: String,
|
||||
headerless: bool,
|
||||
separator: char,
|
||||
tag: impl Into<Tag>,
|
||||
) -> Result<Value, csv::Error> {
|
||||
let mut reader = ReaderBuilder::new()
|
||||
.has_headers(!headerless)
|
||||
.delimiter(separator as u8)
|
||||
.from_reader(s.as_bytes());
|
||||
let tag = tag.into();
|
||||
|
||||
let headers = if headerless {
|
||||
(1..=reader.headers()?.len())
|
||||
.map(|i| format!("Column{}", i))
|
||||
.collect::<Vec<String>>()
|
||||
} else {
|
||||
reader.headers()?.iter().map(String::from).collect()
|
||||
};
|
||||
|
||||
let mut rows = vec![];
|
||||
for row in reader.records() {
|
||||
let mut tagged_row = TaggedDictBuilder::new(&tag);
|
||||
for (value, header) in row?.iter().zip(headers.iter()) {
|
||||
tagged_row.insert_value(
|
||||
header,
|
||||
UntaggedValue::Primitive(Primitive::String(String::from(value))).into_value(&tag),
|
||||
)
|
||||
}
|
||||
rows.push(tagged_row.into_value());
|
||||
}
|
||||
|
||||
Ok(UntaggedValue::Table(rows).into_value(&tag))
|
||||
}
|
||||
use derive_new::new;
|
||||
|
||||
pub fn from_delimited_data(
|
||||
headerless: bool,
|
||||
@ -50,19 +20,20 @@ pub fn from_delimited_data(
|
||||
let concat_string = input.collect_string(name_tag.clone()).await?;
|
||||
|
||||
match from_delimited_string_to_value(concat_string.item, headerless, sep, name_tag.clone()) {
|
||||
Ok(x) => match x {
|
||||
Value { value: UntaggedValue::Table(list), .. } => {
|
||||
for l in list {
|
||||
yield ReturnSuccess::value(l);
|
||||
Ok(rows) => {
|
||||
for row in rows {
|
||||
match row {
|
||||
Value { value: UntaggedValue::Table(list), .. } => {
|
||||
for l in list {
|
||||
yield ReturnSuccess::value(l);
|
||||
}
|
||||
}
|
||||
x => yield ReturnSuccess::value(x),
|
||||
}
|
||||
}
|
||||
x => yield ReturnSuccess::value(x),
|
||||
},
|
||||
Err(err) => {
|
||||
let line_one = match pretty_csv_error(err) {
|
||||
Some(pretty) => format!("Could not parse as {} ({})", format_name,pretty),
|
||||
None => format!("Could not parse as {}", format_name),
|
||||
};
|
||||
let line_one = format!("Could not parse as {}", format_name);
|
||||
let line_two = format!("input cannot be parsed as {}", format_name);
|
||||
yield Err(ShellError::labeled_error_with_secondary(
|
||||
line_one,
|
||||
@ -78,25 +49,121 @@ pub fn from_delimited_data(
|
||||
Ok(stream.to_output_stream())
|
||||
}
|
||||
|
||||
fn pretty_csv_error(err: csv::Error) -> Option<String> {
|
||||
match err.kind() {
|
||||
ErrorKind::UnequalLengths {
|
||||
pos,
|
||||
expected_len,
|
||||
len,
|
||||
} => {
|
||||
if let Some(pos) = pos {
|
||||
Some(format!(
|
||||
"Line {}: expected {} fields, found {}",
|
||||
pos.line(),
|
||||
expected_len,
|
||||
len
|
||||
))
|
||||
} else {
|
||||
Some(format!("Expected {} fields, found {}", expected_len, len))
|
||||
}
|
||||
}
|
||||
ErrorKind::Seek => Some("Internal error while parsing csv".to_string()),
|
||||
_ => None,
|
||||
/// Signature registry used when expanding delimited lines: an `ExpandContext` is built around a
/// boxed `EmptyRegistry::new()` for every line parsed in this file.
///
/// Its `SignatureRegistry` implementation reports that no command is known.
#[derive(Debug, Clone, new)]
|
||||
pub struct EmptyRegistry {
|
||||
// `#[new(default)]` presumably makes the derived `new()` fill this field with its default
// value instead of taking it as an argument (see derive-new docs) — `new()` is called with
// no arguments in this file. No code visible here reads the field.
#[new(default)]
|
||||
signatures: indexmap::IndexMap<String, Signature>,
|
||||
}
|
||||
|
||||
// Intentionally empty: no inherent methods are defined on `EmptyRegistry`.
impl EmptyRegistry {}
|
||||
|
||||
/// `SignatureRegistry` implementation that never knows any command: lookups always miss.
impl SignatureRegistry for EmptyRegistry {
|
||||
/// Always returns `false`; `_name` is ignored.
fn has(&self, _name: &str) -> bool {
|
||||
false
|
||||
}
|
||||
/// Always returns `None`; `_name` is ignored.
fn get(&self, _name: &str) -> Option<Signature> {
|
||||
None
|
||||
}
|
||||
/// Returns a boxed clone of this registry as a `SignatureRegistry` trait object.
fn clone_box(&self) -> Box<dyn SignatureRegistry> {
|
||||
Box::new(self.clone())
|
||||
}
|
||||
}
|
||||
|
||||
fn from_delimited_string_to_value(
|
||||
s: String,
|
||||
headerless: bool,
|
||||
sep: char,
|
||||
tag: impl Into<Tag>,
|
||||
) -> Result<Vec<Value>, ShellError> {
|
||||
let tag = tag.into();
|
||||
|
||||
let mut entries = s.lines();
|
||||
|
||||
let mut fields = vec![];
|
||||
let mut out = vec![];
|
||||
|
||||
if let Some(first_entry) = entries.next() {
|
||||
let tokens = match parse(&sep.to_string(), nom_input(first_entry)) {
|
||||
Ok((_, tokens)) => tokens,
|
||||
Err(err) => return Err(ShellError::parse_error(err)),
|
||||
};
|
||||
|
||||
let tokens_span = tokens.span;
|
||||
let source: nu_source::Text = tokens_span.slice(&first_entry).into();
|
||||
|
||||
if !headerless {
|
||||
fields = tokens
|
||||
.item
|
||||
.iter()
|
||||
.filter(|token| !token.is_separator())
|
||||
.map(|field| field.source(&source).to_string())
|
||||
.collect::<Vec<_>>();
|
||||
}
|
||||
|
||||
let registry = Box::new(EmptyRegistry::new());
|
||||
let ctx = ExpandContext::new(registry, &source, None);
|
||||
|
||||
let mut iterator = TokensIterator::new(&tokens.item, ctx, tokens_span);
|
||||
let (results, tokens_identified) = iterator.expand(LineSeparatedShape);
|
||||
let results = results?;
|
||||
|
||||
let mut row = TaggedDictBuilder::new(&tag);
|
||||
|
||||
if headerless {
|
||||
let fallback_columns = (1..=tokens_identified)
|
||||
.map(|i| format!("Column{}", i))
|
||||
.collect::<Vec<String>>();
|
||||
|
||||
for (idx, field) in results.into_iter().enumerate() {
|
||||
let key = if headerless {
|
||||
&fallback_columns[idx]
|
||||
} else {
|
||||
&fields[idx]
|
||||
};
|
||||
|
||||
row.insert_value(key, field.into_value(&tag));
|
||||
}
|
||||
|
||||
out.push(row.into_value())
|
||||
}
|
||||
}
|
||||
|
||||
for entry in entries {
|
||||
let tokens = match parse(&sep.to_string(), nom_input(entry)) {
|
||||
Ok((_, tokens)) => tokens,
|
||||
Err(err) => return Err(ShellError::parse_error(err)),
|
||||
};
|
||||
let tokens_span = tokens.span;
|
||||
|
||||
let source: nu_source::Text = tokens_span.slice(&entry).into();
|
||||
let registry = Box::new(EmptyRegistry::new());
|
||||
let ctx = ExpandContext::new(registry, &source, None);
|
||||
|
||||
let mut iterator = TokensIterator::new(&tokens.item, ctx, tokens_span);
|
||||
let (results, tokens_identified) = iterator.expand(LineSeparatedShape);
|
||||
let results = results?;
|
||||
|
||||
let mut row = TaggedDictBuilder::new(&tag);
|
||||
|
||||
let fallback_columns = (1..=tokens_identified)
|
||||
.map(|i| format!("Column{}", i))
|
||||
.collect::<Vec<String>>();
|
||||
|
||||
for (idx, field) in results.into_iter().enumerate() {
|
||||
let key = if headerless {
|
||||
&fallback_columns[idx]
|
||||
} else {
|
||||
match fields.get(idx) {
|
||||
Some(key) => key,
|
||||
None => &fallback_columns[idx],
|
||||
}
|
||||
};
|
||||
|
||||
row.insert_value(key, field.into_value(&tag));
|
||||
}
|
||||
|
||||
out.push(row.into_value())
|
||||
}
|
||||
|
||||
Ok(out)
|
||||
}
|
||||
|
@ -140,6 +140,7 @@ fn to_string_tagged_value(v: &Value) -> Result<String, ShellError> {
|
||||
| UntaggedValue::Primitive(Primitive::Path(_))
|
||||
| UntaggedValue::Primitive(Primitive::Int(_)) => as_string(v),
|
||||
UntaggedValue::Primitive(Primitive::Date(d)) => Ok(d.to_string()),
|
||||
UntaggedValue::Primitive(Primitive::Nothing) => Ok(String::new()),
|
||||
UntaggedValue::Table(_) => Ok(String::from("[Table]")),
|
||||
UntaggedValue::Row(_) => Ok(String::from("[Row]")),
|
||||
_ => Err(ShellError::labeled_error(
|
||||
|
Reference in New Issue
Block a user