From 060a4b3f4831186c818e757bc1cf5370e002ce42 Mon Sep 17 00:00:00 2001
From: JT <547158+jntrnr@users.noreply.github.com>
Date: Sun, 30 Jan 2022 07:52:24 -0500
Subject: [PATCH] Port `detect columns` (#892)

---
 crates/nu-command/src/default_context.rs      |   1 +
 crates/nu-command/src/filters/rename.rs       |   2 +-
 .../nu-command/src/strings/detect_columns.rs  | 247 ++++++++++++++++++
 crates/nu-command/src/strings/mod.rs          |   2 +
 4 files changed, 251 insertions(+), 1 deletion(-)
 create mode 100644 crates/nu-command/src/strings/detect_columns.rs

diff --git a/crates/nu-command/src/default_context.rs b/crates/nu-command/src/default_context.rs
index 676cce4468..83bf2e3951 100644
--- a/crates/nu-command/src/default_context.rs
+++ b/crates/nu-command/src/default_context.rs
@@ -133,6 +133,7 @@ pub fn create_default_context(cwd: impl AsRef<Path>) -> EngineState {
             BuildString,
             Char,
             Decode,
+            DetectColumns,
             Format,
             Parse,
             Size,
diff --git a/crates/nu-command/src/filters/rename.rs b/crates/nu-command/src/filters/rename.rs
index 385b2064d0..5abe6c26c9 100644
--- a/crates/nu-command/src/filters/rename.rs
+++ b/crates/nu-command/src/filters/rename.rs
@@ -138,7 +138,7 @@ fn rename(
                             }
                             None => {
                                 for (idx, val) in columns.iter().enumerate() {
-                                    if idx > cols.len() - 1 {
+                                    if idx >= cols.len() {
                                         // skip extra new columns names if we already reached the final column
                                         break;
                                     }
diff --git a/crates/nu-command/src/strings/detect_columns.rs b/crates/nu-command/src/strings/detect_columns.rs
new file mode 100644
index 0000000000..4bba032ec0
--- /dev/null
+++ b/crates/nu-command/src/strings/detect_columns.rs
@@ -0,0 +1,247 @@
+use std::iter::Peekable;
+use std::str::CharIndices;
+
+use nu_engine::CallExt;
+use nu_protocol::ast::Call;
+use nu_protocol::engine::{Command, EngineState, Stack};
+use nu_protocol::{
+    Category, IntoInterruptiblePipelineData, PipelineData, ShellError, Signature, Span, Spanned,
+    SyntaxShape, Value,
+};
+
+/// Peekable `(byte offset, char)` cursor over a single line of input.
+type Input<'t> = Peekable<CharIndices<'t>>;
+
+/// The `detect columns` command.
+#[derive(Clone)]
+pub struct DetectColumns;
+
+impl Command for DetectColumns {
+    fn name(&self) -> &str {
+        "detect columns"
+    }
+
+    fn signature(&self) -> Signature {
+        Signature::build("detect columns")
+            .named(
+                "skip",
+                SyntaxShape::Int,
+                "number of rows to skip before detecting",
+                Some('s'),
+            )
+            .switch("no_headers", "don't detect headers", Some('n'))
+            .category(Category::Strings)
+    }
+
+    fn usage(&self) -> &str {
+        "Attempt to automatically split text into multiple columns."
+    }
+
+    fn run(
+        &self,
+        engine_state: &EngineState,
+        stack: &mut Stack,
+        call: &Call,
+        input: PipelineData,
+    ) -> Result<PipelineData, ShellError> {
+        detect_columns(engine_state, stack, call, input)
+    }
+}
+
+/// Collects the input into a string and turns every line after the optional
+/// `--skip` prefix into a record with one column per detected header.
+///
+/// The first remaining line supplies the column names. With `--no_headers`
+/// the names are `Column0`, `Column1`, ... and that first line is kept as a
+/// data row. If no line remains, `PipelineData::new(name_span)` is returned.
+fn detect_columns(
+    engine_state: &EngineState,
+    stack: &mut Stack,
+    call: &Call,
+    input: PipelineData,
+) -> Result<PipelineData, ShellError> {
+    let name_span = call.head;
+    let num_rows_to_skip: Option<usize> = call.get_flag(engine_state, stack, "skip")?;
+    let noheader = call.has_flag("no_headers");
+    let ctrlc = engine_state.ctrlc.clone();
+    let config = stack.get_config()?;
+    let input = input.collect_string("", &config)?;
+
+    let input: Vec<_> = input
+        .lines()
+        .skip(num_rows_to_skip.unwrap_or_default())
+        .map(|x| x.to_string())
+        .collect();
+
+    let mut input = input.into_iter();
+
+    let orig_headers = match input.next() {
+        Some(line) => line,
+        None => return Ok(PipelineData::new(name_span)),
+    };
+
+    let mut headers = find_columns(&orig_headers);
+
+    if noheader {
+        for (idx, header) in headers.iter_mut().enumerate() {
+            header.item = format!("Column{}", idx);
+        }
+    }
+
+    // Without headers, the first line is ordinary data and must be emitted too.
+    let first_row = if noheader { Some(orig_headers) } else { None };
+
+    Ok(first_row
+        .into_iter()
+        .chain(input)
+        .map(move |x| {
+            let row = find_columns(&x);
+
+            let mut cols = Vec::with_capacity(headers.len());
+            let mut vals = Vec::with_capacity(headers.len());
+
+            if headers.len() == row.len() {
+                for (header, val) in headers.iter().zip(row.iter()) {
+                    cols.push(header.item.clone());
+                    vals.push(Value::String {
+                        val: val.item.clone(),
+                        span: name_span,
+                    });
+                }
+            } else {
+                // Column counts don't line up, so fall back to positions: a
+                // cell belongs to every header whose span it overlaps. Each
+                // header yields exactly one column; several cells under the
+                // same header are joined with a space instead of emitting a
+                // duplicate column name, and a header with no cell is empty.
+                for header in &headers {
+                    let overlapping: Vec<&str> = row
+                        .iter()
+                        .filter(|cell| {
+                            cell.span.start <= header.span.end
+                                && cell.span.end > header.span.start
+                        })
+                        .map(|cell| cell.item.as_str())
+                        .collect();
+
+                    cols.push(header.item.clone());
+                    vals.push(if overlapping.is_empty() {
+                        Value::nothing(name_span)
+                    } else {
+                        Value::string(overlapping.join(" "), name_span)
+                    });
+                }
+            }
+
+            Value::Record {
+                cols,
+                vals,
+                span: name_span,
+            }
+        })
+        .into_pipeline_data(ctrlc))
+}
+
+/// Splits `input` into whitespace-separated tokens, keeping quoted strings
+/// and bracketed groups together, and records each token's byte span.
+pub fn find_columns(input: &str) -> Vec<Spanned<String>> {
+    let mut chars = input.char_indices().peekable();
+    let mut output = vec![];
+
+    while let Some((_, c)) = chars.peek() {
+        if c.is_whitespace() {
+            // Whitespace between tokens carries no data; skip it.
+            let _ = chars.next();
+        } else {
+            // Otherwise, consume one token.
+            output.push(baseline(&mut chars));
+        }
+    }
+
+    output
+}
+
+/// Kind of opening delimiter that is still waiting for its closing partner.
+#[derive(Clone, Copy)]
+enum BlockKind {
+    Paren,
+    CurlyBracket,
+    SquareBracket,
+}
+
+/// Consumes one token from `src` and returns it with its byte span.
+///
+/// The token ends at the first whitespace character that is outside both a
+/// quoted string and a bracketed group, or at the end of the input. An
+/// unterminated quote or bracket therefore swallows the rest of the input.
+fn baseline(src: &mut Input) -> Spanned<String> {
+    let mut token_contents = String::new();
+
+    let start_offset = src.peek().map_or(0, |&(pos, _)| pos);
+
+    // The opening quote of the string literal we are inside of, if any. While
+    // it is set, everything up to the matching closing quote is taken verbatim.
+    let mut quote_start: Option<char> = None;
+
+    // Stack of currently open paired delimiters.
+    let mut block_level: Vec<BlockKind> = vec![];
+
+    while let Some((_, c)) = src.peek() {
+        let c = *c;
+
+        if quote_start.is_some() {
+            // The closing quote must be the same character as the opening one.
+            if Some(c) == quote_start {
+                quote_start = None;
+            }
+        } else if c == '\'' || c == '"' || c == '`' {
+            // Opening quote of a string literal.
+            quote_start = Some(c);
+        } else if c == '[' {
+            block_level.push(BlockKind::SquareBracket);
+        } else if c == ']' {
+            // Only pop when the innermost open delimiter is the matching `[`.
+            if matches!(block_level.last(), Some(BlockKind::SquareBracket)) {
+                block_level.pop();
+            }
+        } else if c == '{' {
+            block_level.push(BlockKind::CurlyBracket);
+        } else if c == '}' {
+            if matches!(block_level.last(), Some(BlockKind::CurlyBracket)) {
+                block_level.pop();
+            }
+        } else if c == '(' {
+            block_level.push(BlockKind::Paren);
+        } else if c == ')' {
+            if matches!(block_level.last(), Some(BlockKind::Paren)) {
+                block_level.pop();
+            }
+        } else if block_level.is_empty() && c.is_whitespace() {
+            // Whitespace outside of any delimiter pair terminates the token.
+            break;
+        }
+
+        // Accumulate the character into the current token and consume it.
+        token_contents.push(c);
+        let _ = src.next();
+    }
+
+    // Offsets come from `char_indices` and `String::len`, so the span is in
+    // bytes relative to the start of the line.
+    let span = Span::new(start_offset, start_offset + token_contents.len());
+
+    Spanned {
+        item: token_contents,
+        span,
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn test_examples() {
+        crate::test_examples(DetectColumns)
+    }
+}
diff --git a/crates/nu-command/src/strings/mod.rs b/crates/nu-command/src/strings/mod.rs
index e30e207a73..1cdcb4a18f 100644
--- a/crates/nu-command/src/strings/mod.rs
+++ b/crates/nu-command/src/strings/mod.rs
@@ -1,6 +1,7 @@
 mod build_string;
 mod char_;
 mod decode;
+mod detect_columns;
 mod format;
 mod parse;
 mod size;
@@ -10,6 +11,7 @@ mod str_;
 pub use build_string::BuildString;
 pub use char_::Char;
 pub use decode::*;
+pub use detect_columns::*;
 pub use format::*;
 pub use parse::*;
 pub use size::Size;