Add 'detect columns' command (#4127)

* Add 'detect columns' command

* Fix warnings
This commit is contained in:
JT 2021-11-16 11:29:54 +13:00 committed by GitHub
parent f2f01b8a4d
commit df87d90b8c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 326 additions and 128 deletions

165
Cargo.lock generated
View File

@ -109,7 +109,7 @@ checksum = "47044a1809e2953fe6d084312b81dcb7d9ffc24fee45aa39e5b938f66f75b8a8"
dependencies = [ dependencies = [
"clipboard-win", "clipboard-win",
"core-graphics", "core-graphics",
"image 0.23.14", "image",
"log", "log",
"objc", "objc",
"objc-foundation", "objc-foundation",
@ -168,18 +168,6 @@ dependencies = [
"strength_reduce", "strength_reduce",
] ]
[[package]]
name = "as-slice"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "45403b49e3954a4b8428a0ac21a4b7afadccf92bfd96273f1a58cd4812496ae0"
dependencies = [
"generic-array 0.12.4",
"generic-array 0.13.3",
"generic-array 0.14.4",
"stable_deref_trait",
]
[[package]] [[package]]
name = "async-stream" name = "async-stream"
version = "0.3.2" version = "0.3.2"
@ -355,7 +343,7 @@ version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4152116fd6e9dadb291ae18fc1ec3575ed6d84c29642d97890f4b4a3417297e4" checksum = "4152116fd6e9dadb291ae18fc1ec3575ed6d84c29642d97890f4b4a3417297e4"
dependencies = [ dependencies = [
"generic-array 0.14.4", "generic-array",
] ]
[[package]] [[package]]
@ -381,21 +369,21 @@ dependencies = [
[[package]] [[package]]
name = "bson" name = "bson"
version = "0.14.1" version = "2.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c177ed0122f24ce5e0f05bf9b29e79f3ac1a359bc504e0e14c3b34896c71c00" checksum = "ff58d466782b57e0001c8e97c6a70c01c2359d7e13e257a83654c0b783ecc139"
dependencies = [ dependencies = [
"byteorder", "ahash",
"base64",
"chrono", "chrono",
"decimal", "hex",
"hex 0.3.2", "indexmap",
"libc", "lazy_static",
"linked-hash-map", "rand 0.8.4",
"md5 0.6.1",
"rand 0.7.3",
"serde", "serde",
"serde_bytes",
"serde_json", "serde_json",
"time", "uuid",
] ]
[[package]] [[package]]
@ -800,7 +788,7 @@ version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1d1a86f49236c215f271d40892d5fc950490551400b02ef360692c29815c714" checksum = "b1d1a86f49236c215f271d40892d5fc950490551400b02ef360692c29815c714"
dependencies = [ dependencies = [
"generic-array 0.14.4", "generic-array",
"subtle", "subtle",
] ]
@ -879,27 +867,6 @@ version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7313c0d620d0cb4dbd9d019e461a4beb501071ff46ec0ab933efb4daa76d73e3" checksum = "7313c0d620d0cb4dbd9d019e461a4beb501071ff46ec0ab933efb4daa76d73e3"
[[package]]
name = "decimal"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a8ab77e91baeb15034c3be91e87bff4665c9036216148e4996d9a9f5792114d"
dependencies = [
"bitflags",
"cc",
"libc",
]
[[package]]
name = "deflate"
version = "0.7.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "707b6a7b384888a70c8d2e8650b3e60170dfc6a67bb4aa67b6dfca57af4bedb4"
dependencies = [
"adler32",
"byteorder",
]
[[package]] [[package]]
name = "deflate" name = "deflate"
version = "0.8.6" version = "0.8.6"
@ -940,7 +907,7 @@ version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3dd60d1080a57a05ab032377049e0591415d2b31afd7028356dbf3cc6dcb066" checksum = "d3dd60d1080a57a05ab032377049e0591415d2b31afd7028356dbf3cc6dcb066"
dependencies = [ dependencies = [
"generic-array 0.14.4", "generic-array",
] ]
[[package]] [[package]]
@ -1456,24 +1423,6 @@ version = "0.3.55"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f5f3913fa0bfe7ee1fd8248b6b9f42a5af4b9d65ec2dd2c3c26132b950ecfc2" checksum = "8f5f3913fa0bfe7ee1fd8248b6b9f42a5af4b9d65ec2dd2c3c26132b950ecfc2"
[[package]]
name = "generic-array"
version = "0.12.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ffdf9f34f1447443d37393cc6c2b8313aebddcd96906caf34e54c68d8e57d7bd"
dependencies = [
"typenum",
]
[[package]]
name = "generic-array"
version = "0.13.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f797e67af32588215eaaab8327027ee8e71b9dd0b2b26996aedf20c030fce309"
dependencies = [
"typenum",
]
[[package]] [[package]]
name = "generic-array" name = "generic-array"
version = "0.14.4" version = "0.14.4"
@ -1651,9 +1600,9 @@ dependencies = [
[[package]] [[package]]
name = "hash32" name = "hash32"
version = "0.1.1" version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d4041af86e63ac4298ce40e5cca669066e75b6f1aa3390fe2561ffa5e1d9f4cc" checksum = "b0c35f58762feb77d74ebe43bdbc3210f09be9fe6742234d573bacc26ed92b67"
dependencies = [ dependencies = [
"byteorder", "byteorder",
] ]
@ -1685,13 +1634,12 @@ dependencies = [
[[package]] [[package]]
name = "heapless" name = "heapless"
version = "0.6.1" version = "0.7.8"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "634bd4d29cbf24424d0a4bfcbf80c6960129dc24424752a7d1d1390607023422" checksum = "9c1ad878e07405df82b695089e63d278244344f80e764074d0bdfe99b89460f3"
dependencies = [ dependencies = [
"as-slice",
"generic-array 0.14.4",
"hash32", "hash32",
"spin",
"stable_deref_trait", "stable_deref_trait",
] ]
@ -1704,12 +1652,6 @@ dependencies = [
"libc", "libc",
] ]
[[package]]
name = "hex"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "805026a5d0141ffc30abb3be3173848ad46a1b1664fe632428479619a3644d77"
[[package]] [[package]]
name = "hex" name = "hex"
version = "0.4.3" version = "0.4.3"
@ -1855,20 +1797,6 @@ dependencies = [
"unicode-normalization", "unicode-normalization",
] ]
[[package]]
name = "image"
version = "0.22.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08ed2ada878397b045454ac7cfb011d73132c59f31a955d230bd1f1c2e68eb4a"
dependencies = [
"byteorder",
"jpeg-decoder",
"num-iter",
"num-rational 0.2.4",
"num-traits",
"png 0.15.3",
]
[[package]] [[package]]
name = "image" name = "image"
version = "0.23.14" version = "0.23.14"
@ -1878,10 +1806,11 @@ dependencies = [
"bytemuck", "bytemuck",
"byteorder", "byteorder",
"color_quant", "color_quant",
"jpeg-decoder",
"num-iter", "num-iter",
"num-rational 0.3.2", "num-rational 0.3.2",
"num-traits", "num-traits",
"png 0.16.8", "png",
"tiff", "tiff",
] ]
@ -1896,15 +1825,6 @@ dependencies = [
"serde", "serde",
] ]
[[package]]
name = "inflate"
version = "0.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1cdb29978cc5797bd8dcc8e5bf7de604891df2a8dc576973d71a281e916db2ff"
dependencies = [
"adler32",
]
[[package]] [[package]]
name = "insta" name = "insta"
version = "1.7.2" version = "1.7.2"
@ -2237,12 +2157,6 @@ dependencies = [
"opaque-debug", "opaque-debug",
] ]
[[package]]
name = "md5"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7e6bcd6433cff03a4bfc3d9834d504467db1f1cf6d0ea765d37d330249ed629d"
[[package]] [[package]]
name = "md5" name = "md5"
version = "0.7.0" version = "0.7.0"
@ -2342,9 +2256,9 @@ dependencies = [
[[package]] [[package]]
name = "mp4" name = "mp4"
version = "0.8.3" version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51eb18a88129198ca1e8e92f26038ed6814cd0e608fa43215bf57368604bf093" checksum = "85660d4d88b9318d95396943adc4a254b3ed8bf1de917e6f093abda1ccf0bec0"
dependencies = [ dependencies = [
"byteorder", "byteorder",
"bytes 0.5.6", "bytes 0.5.6",
@ -2679,16 +2593,14 @@ dependencies = [
"sha2", "sha2",
"sys-locale", "sys-locale",
"toml", "toml",
"users",
] ]
[[package]] [[package]]
name = "nu-engine" name = "nu-engine"
version = "0.39.0" version = "0.39.0"
dependencies = [ dependencies = [
"ansi_term 0.12.1",
"bigdecimal-rs", "bigdecimal-rs",
"bytes 0.5.6", "bytes 1.1.0",
"chrono", "chrono",
"codespan-reporting", "codespan-reporting",
"derive-new", "derive-new",
@ -2911,7 +2823,7 @@ name = "nu_plugin_binaryview"
version = "0.39.0" version = "0.39.0"
dependencies = [ dependencies = [
"crossterm", "crossterm",
"image 0.22.5", "image",
"neso", "neso",
"nu-ansi-term", "nu-ansi-term",
"nu-errors", "nu-errors",
@ -3075,7 +2987,7 @@ dependencies = [
name = "nu_plugin_to_sqlite" name = "nu_plugin_to_sqlite"
version = "0.39.0" version = "0.39.0"
dependencies = [ dependencies = [
"hex 0.4.3", "hex",
"nu-errors", "nu-errors",
"nu-plugin", "nu-plugin",
"nu-protocol", "nu-protocol",
@ -3597,18 +3509,6 @@ dependencies = [
"xml-rs", "xml-rs",
] ]
[[package]]
name = "png"
version = "0.15.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef859a23054bbfee7811284275ae522f0434a3c8e7f4b74bd4a35ae7e1c4a283"
dependencies = [
"bitflags",
"crc32fast",
"deflate 0.7.20",
"inflate",
]
[[package]] [[package]]
name = "png" name = "png"
version = "0.16.8" version = "0.16.8"
@ -3617,7 +3517,7 @@ checksum = "3c3287920cb847dee3de33d301c463fba14dda99db24214ddf93f83d3021f4c6"
dependencies = [ dependencies = [
"bitflags", "bitflags",
"crc32fast", "crc32fast",
"deflate 0.8.6", "deflate",
"miniz_oxide 0.3.7", "miniz_oxide 0.3.7",
] ]
@ -4351,7 +4251,7 @@ dependencies = [
"hmac", "hmac",
"hmac-sha1", "hmac-sha1",
"log", "log",
"md5 0.7.0", "md5",
"mime_guess", "mime_guess",
"quick-xml 0.22.0", "quick-xml 0.22.0",
"regex", "regex",
@ -4703,6 +4603,15 @@ dependencies = [
"winapi 0.3.9", "winapi 0.3.9",
] ]
[[package]]
name = "spin"
version = "0.9.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "511254be0c5bcf062b019a6c89c01a664aa359ded62f78aa72c6fc137c0590e5"
dependencies = [
"lock_api",
]
[[package]] [[package]]
name = "stable_deref_trait" name = "stable_deref_trait"
version = "1.2.0" version = "1.2.0"

View File

@ -0,0 +1,283 @@
use std::{iter::Peekable, str::CharIndices};
use crate::prelude::*;
use nu_engine::WholeStreamCommand;
use nu_errors::ShellError;
use nu_protocol::{Signature, SyntaxShape, TaggedDictBuilder, UntaggedValue};
use nu_source::Spanned;
type Input<'t> = Peekable<CharIndices<'t>>;
/// The `detect columns` command.
///
/// Takes text input and tries to split every line into columns by looking at
/// whitespace-separated tokens (see `find_columns`). It takes no separator
/// argument.
pub struct DetectColumns;

impl WholeStreamCommand for DetectColumns {
    /// Name under which the command is registered.
    fn name(&self) -> &str {
        "detect columns"
    }

    /// Declares the optional `--skip/-s <int>` flag and the
    /// `--no_headers/-n` switch.
    fn signature(&self) -> Signature {
        Signature::build("detect columns")
            .named(
                "skip",
                SyntaxShape::Int,
                "number of rows to skip before detecting",
                Some('s'),
            )
            .switch("no_headers", "don't detect headers", Some('n'))
    }

    /// One-line help text. The command has no separator parameter, so the
    /// description talks about automatic detection rather than a separator.
    fn usage(&self) -> &str {
        "Attempt to automatically split text into multiple columns."
    }

    /// Entry point: delegates to `detect_columns`.
    fn run(&self, args: CommandArgs) -> Result<OutputStream, ShellError> {
        detect_columns(args)
    }
}
/// Implements `detect columns`.
///
/// The whole input is collected as one string and split into lines. After
/// dropping the first `--skip` lines, the first remaining line is tokenized
/// with `find_columns` and its tokens become the reference columns:
///
/// * without `--no_headers` the tokens are the column names and that line is
///   not emitted as a row;
/// * with `--no_headers` the line is also emitted as the first row, and the
///   columns are named `Column0`, `Column1`, ... by position.
///
/// Every row is tokenized the same way. When a row has exactly as many tokens
/// as there are headers (and headers are in use) tokens are paired with
/// headers in order. Otherwise each header position receives the last token
/// whose span overlaps the header's span, or "nothing" when no token does.
///
/// Returns an empty stream when there is no line left after skipping, and
/// propagates errors from reading the flags or collecting the input.
fn detect_columns(args: CommandArgs) -> Result<OutputStream, ShellError> {
    let name_tag = args.name_tag();
    let num_rows_to_skip: Option<usize> = args.get_flag("skip")?;
    let noheader = args.has_flag("no_headers");
    let input = args.input.collect_string(name_tag.clone())?;

    let input: Vec<_> = input
        .lines()
        .skip(num_rows_to_skip.unwrap_or_default())
        .map(|x| x.to_string())
        .collect();
    let mut input = input.into_iter();

    let headers = input.next();
    if let Some(orig_headers) = headers {
        let headers = find_columns(&orig_headers);

        // With `--no_headers` the first line is data too, so it is put back
        // in front of the remaining lines.
        Ok((if noheader {
            vec![orig_headers].into_iter().chain(input)
        } else {
            vec![].into_iter().chain(input)
        })
        .map(move |x| {
            let row = find_columns(&x);
            let mut dict = TaggedDictBuilder::new(name_tag.clone());

            if headers.len() == row.len() && !noheader {
                // Same number of tokens as headers: pair them up in order.
                for (header, val) in headers.iter().zip(row.iter()) {
                    dict.insert_untagged(&header.item, UntaggedValue::string(&val.item));
                }
            } else {
                // Column counts don't line up (or there are no real headers),
                // so align cells with headers by position in the line.
                //
                // The lookup is done per header *position*, not per header
                // text: with `--no_headers` the reference tokens come from
                // the first data row and may well repeat (e.g. `1 1 2`), and
                // matching on text would hand the same cell to every column
                // sharing that text.
                for (index, header) in headers.iter().enumerate() {
                    // Last cell in the row overlapping this header's span.
                    let value = row
                        .iter()
                        .rev()
                        .find(|cell| {
                            cell.span.start() <= header.span.end()
                                && cell.span.end() > header.span.start()
                        })
                        .map(|cell| UntaggedValue::string(&cell.item))
                        .unwrap_or_else(UntaggedValue::nothing);

                    if noheader {
                        dict.insert_untagged(format!("Column{}", index), value);
                    } else {
                        dict.insert_untagged(&header.item, value);
                    }
                }
            }

            dict.into_value()
        })
        .into_output_stream())
    } else {
        Ok(OutputStream::empty())
    }
}
/// Splits `input` into whitespace-separated tokens.
///
/// Whitespace between tokens is discarded; each token is read by `baseline`,
/// which keeps quoted strings and bracketed groups together. The returned
/// items carry the span that `baseline` computed for them within `input`.
pub fn find_columns(input: &str) -> Vec<Spanned<String>> {
    let mut cursor = input.char_indices().peekable();
    let mut columns = vec![];

    while let Some(&(_, ch)) = cursor.peek() {
        if ch.is_whitespace() {
            // Separator between tokens: drop it and keep scanning.
            let _ = cursor.next();
            continue;
        }

        // Start of a token: let `baseline` consume it.
        columns.push(baseline(&mut cursor));
    }

    columns
}
/// Kind of paired delimiter that `baseline` is currently nested inside.
#[derive(Clone, Copy)]
enum BlockKind {
/// Opened by `(`, closed by `)`.
Paren,
/// Opened by `{`, closed by `}`.
CurlyBracket,
/// Opened by `[`, closed by `]`.
SquareBracket,
}
/// Reads one token from `src`, starting at the current position.
///
/// Characters are accumulated until whitespace is met outside of any quoted
/// string or bracketed group:
///
/// * a `'`, `"` or `` ` `` opens a quoted section that lasts until the same
///   character appears again; everything in between is kept verbatim;
/// * `[`, `{` and `(` open a group; the matching closer only closes it when
///   that kind of group is the innermost open one;
/// * whitespace (newline included) ends the token only when no group is open.
///
/// The delimiters themselves are part of the token. If the input ends while a
/// quote or group is still open, whatever was read so far is returned as is.
/// The span starts at the byte offset of the first character (0 when `src` is
/// already exhausted) and is as long as the token's bytes.
fn baseline(src: &mut Input) -> Spanned<String> {
    let start_offset = src.peek().map_or(0, |&(pos, _)| pos);

    let mut token = String::new();
    // Quote character of the string literal we are inside of, if any.
    let mut open_quote: Option<char> = None;
    // Stack of currently open paired delimiters, innermost last.
    let mut open_blocks: Vec<BlockKind> = vec![];

    while let Some(&(_, ch)) = src.peek() {
        match open_quote {
            // Inside a quoted section only the matching quote is special.
            Some(quote) => {
                if ch == quote {
                    open_quote = None;
                }
            }
            None => match ch {
                '\'' | '"' | '`' => open_quote = Some(ch),
                '[' => open_blocks.push(BlockKind::SquareBracket),
                '{' => open_blocks.push(BlockKind::CurlyBracket),
                '(' => open_blocks.push(BlockKind::Paren),
                ']' => {
                    if matches!(open_blocks.last(), Some(BlockKind::SquareBracket)) {
                        let _ = open_blocks.pop();
                    }
                }
                '}' => {
                    if matches!(open_blocks.last(), Some(BlockKind::CurlyBracket)) {
                        let _ = open_blocks.pop();
                    }
                }
                ')' => {
                    if matches!(open_blocks.last(), Some(BlockKind::Paren)) {
                        let _ = open_blocks.pop();
                    }
                }
                // Whitespace outside of every group terminates the token and
                // is left in `src` for the caller.
                _ if ch.is_whitespace() && open_blocks.is_empty() => break,
                _ => {}
            },
        }

        // Keep the character and advance the input.
        token.push(ch);
        let _ = src.next();
    }

    let span = Span::new(start_offset, start_offset + token.len());
    token.spanned(span)
}
// Unit tests for the `detect columns` command.
#[cfg(test)]
mod tests {
use super::DetectColumns;
use super::ShellError;
// Hands the command to the crate's shared examples test helper.
// NOTE(review): the `WholeStreamCommand` impl in this file defines no
// examples of its own, so this presumably checks little — confirm.
#[test]
fn examples_work_as_expected() -> Result<(), ShellError> {
use crate::examples::test as test_examples;
test_examples(DetectColumns {})
}
}

View File

@ -0,0 +1,3 @@
pub mod columns;
pub use columns::DetectColumns;

View File

@ -1,5 +1,6 @@
mod build_string; mod build_string;
mod char_; mod char_;
mod detect;
mod format; mod format;
mod lines; mod lines;
mod parse; mod parse;
@ -10,6 +11,7 @@ mod str_;
pub use build_string::BuildString; pub use build_string::BuildString;
pub use char_::Char; pub use char_::Char;
pub use detect::DetectColumns;
pub use format::*; pub use format::*;
pub use lines::Lines; pub use lines::Lines;
pub use parse::*; pub use parse::*;

View File

@ -127,6 +127,7 @@ pub fn create_default_context(interactive: bool) -> Result<EvaluationContext, Bo
whole_stream_command(AnsiStrip), whole_stream_command(AnsiStrip),
whole_stream_command(AnsiGradient), whole_stream_command(AnsiGradient),
whole_stream_command(Char), whole_stream_command(Char),
whole_stream_command(DetectColumns),
// Column manipulation // Column manipulation
whole_stream_command(DropColumn), whole_stream_command(DropColumn),
whole_stream_command(MoveColumn), whole_stream_command(MoveColumn),