From 060a4b3f4831186c818e757bc1cf5370e002ce42 Mon Sep 17 00:00:00 2001
From: JT <547158+jntrnr@users.noreply.github.com>
Date: Sun, 30 Jan 2022 07:52:24 -0500
Subject: [PATCH] Port `detect columns` (#892)

---
 crates/nu-command/src/default_context.rs      |   1 +
 crates/nu-command/src/filters/rename.rs       |   2 +-
 .../nu-command/src/strings/detect_columns.rs  | 313 ++++++++++++++++++
 crates/nu-command/src/strings/mod.rs          |   2 +
 4 files changed, 317 insertions(+), 1 deletion(-)
 create mode 100644 crates/nu-command/src/strings/detect_columns.rs
diff --git a/crates/nu-command/src/default_context.rs b/crates/nu-command/src/default_context.rs
index 676cce4468..83bf2e3951 100644
--- a/crates/nu-command/src/default_context.rs
+++ b/crates/nu-command/src/default_context.rs
@@ -133,6 +133,7 @@ pub fn create_default_context(cwd: impl AsRef<Path>) -> EngineState {
             BuildString,
             Char,
             Decode,
+            DetectColumns,
             Format,
             Parse,
             Size,
diff --git a/crates/nu-command/src/filters/rename.rs b/crates/nu-command/src/filters/rename.rs
index 385b2064d0..5abe6c26c9 100644
--- a/crates/nu-command/src/filters/rename.rs
+++ b/crates/nu-command/src/filters/rename.rs
@@ -138,7 +138,7 @@ fn rename(
                     }
                     None => {
                         for (idx, val) in columns.iter().enumerate() {
-                            if idx > cols.len() - 1 {
+                            if idx >= cols.len() {
                                 // skip extra new columns names if we already reached the final column
                                 break;
                             }
diff --git a/crates/nu-command/src/strings/detect_columns.rs b/crates/nu-command/src/strings/detect_columns.rs
new file mode 100644
index 0000000000..4bba032ec0
--- /dev/null
+++ b/crates/nu-command/src/strings/detect_columns.rs
@@ -0,0 +1,313 @@
+use std::iter::Peekable;
+use std::str::CharIndices;
+
+use nu_engine::CallExt;
+use nu_protocol::ast::Call;
+use nu_protocol::engine::{Command, EngineState, Stack};
+use nu_protocol::{
+    Category, IntoInterruptiblePipelineData, PipelineData, ShellError, Signature, Span, Spanned,
+    SyntaxShape, Value,
+};
+
+type Input<'t> = Peekable<CharIndices<'t>>;
+
+#[derive(Clone)]
+pub struct DetectColumns;
+
+impl Command for DetectColumns {
+    fn name(&self) -> &str {
+        "detect columns"
+    }
+
+    fn signature(&self) -> Signature {
+        Signature::build("detect columns")
+            .named(
+                "skip",
+                SyntaxShape::Int,
+                "number of rows to skip before detecting",
+                Some('s'),
+            )
+            .switch("no_headers", "don't detect headers", Some('n'))
+            .category(Category::Strings)
+    }
+
+    fn usage(&self) -> &str {
+        "splits contents across multiple columns via the separator."
+    }
+
+    fn run(
+        &self,
+        engine_state: &EngineState,
+        stack: &mut Stack,
+        call: &Call,
+        input: PipelineData,
+    ) -> Result<PipelineData, ShellError> {
+        detect_columns(engine_state, stack, call, input)
+    }
+}
+
+fn detect_columns(
+    engine_state: &EngineState,
+    stack: &mut Stack,
+    call: &Call,
+    input: PipelineData,
+) -> Result<PipelineData, ShellError> {
+    let name_span = call.head;
+    let num_rows_to_skip: Option<usize> = call.get_flag(engine_state, stack, "skip")?;
+    let noheader = call.has_flag("no_headers");
+    let ctrlc = engine_state.ctrlc.clone();
+    let config = stack.get_config()?;
+    let input = input.collect_string("", &config)?;
+
+    let input: Vec<_> = input
+        .lines()
+        .skip(num_rows_to_skip.unwrap_or_default())
+        .map(|x| x.to_string())
+        .collect();
+
+    let mut input = input.into_iter();
+    let headers = input.next();
+
+    if let Some(orig_headers) = headers {
+        let mut headers = find_columns(&orig_headers);
+
+        if noheader {
+            for header in headers.iter_mut().enumerate() {
+                header.1.item = format!("Column{}", header.0);
+            }
+        }
+
+        Ok((if noheader {
+            vec![orig_headers].into_iter().chain(input)
+        } else {
+            vec![].into_iter().chain(input)
+        })
+        .map(move |x| {
+            let row = find_columns(&x);
+
+            let mut cols = vec![];
+            let mut vals = vec![];
+
+            if headers.len() == row.len() {
+                for (header, val) in headers.iter().zip(row.iter()) {
+                    cols.push(header.item.clone());
+                    vals.push(Value::String {
+                        val: val.item.clone(),
+                        span: name_span,
+                    });
+                }
+            } else {
+                let mut pre_output = vec![];
+
+                // column counts don't line up, so see if we can figure out why
+                for cell in row {
+                    for header in &headers {
+                        if cell.span.start <= header.span.end && cell.span.end > header.span.start {
+                            pre_output.push((
+                                header.item.to_string(),
+                                Value::string(&cell.item, name_span),
+                            ));
+                        }
+                    }
+                }
+
+                for header in &headers {
+                    let mut found = false;
+                    for pre_o in &pre_output {
+                        if pre_o.0 == header.item {
+                            found = true;
+                            break;
+                        }
+                    }
+
+                    if !found {
+                        pre_output.push((header.item.to_string(), Value::nothing(name_span)));
+                    }
+                }
+
+                for header in &headers {
+                    for pre_o in &pre_output {
+                        if pre_o.0 == header.item {
+                            cols.push(header.item.clone());
+                            vals.push(pre_o.1.clone())
+                        }
+                    }
+                }
+            }
+
+            Value::Record {
+                cols,
+                vals,
+                span: name_span,
+            }
+        })
+        .into_pipeline_data(ctrlc))
+    } else {
+        Ok(PipelineData::new(name_span))
+    }
+}
+
+pub fn find_columns(input: &str) -> Vec<Spanned<String>> {
+    let mut chars = input.char_indices().peekable();
+    let mut output = vec![];
+
+    while let Some((_, c)) = chars.peek() {
+        if c.is_whitespace() {
+            // If the next character is whitespace of any kind (newlines included), skip it.
+
+            let _ = chars.next();
+        } else {
+            // Otherwise, try to consume an unclassified token.
+
+            let result = baseline(&mut chars);
+
+            output.push(result);
+        }
+    }
+
+    output
+}
+
+#[derive(Clone, Copy)]
+enum BlockKind {
+    Paren,
+    CurlyBracket,
+    SquareBracket,
+}
+
+fn baseline(src: &mut Input) -> Spanned<String> {
+    let mut token_contents = String::new();
+
+    let start_offset = if let Some((pos, _)) = src.peek() {
+        *pos
+    } else {
+        0
+    };
+
+    // This variable tracks the starting character of a string literal, so that
+    // we remain inside the string literal lexer mode until we encounter the
+    // closing quote.
+    let mut quote_start: Option<char> = None;
+
+    // This Vec tracks paired delimiters
+    let mut block_level: Vec<BlockKind> = vec![];
+
+    // A baseline token is terminated if it's not nested inside of a paired
+    // delimiter and the next character is whitespace. `|`, `;` and `#` are
+    // ordinary characters here and do not end a token.
+    fn is_termination(block_level: &[BlockKind], c: char) -> bool {
+        block_level.is_empty() && (c.is_whitespace())
+    }
+
+    // The process of slurping up a baseline token repeats:
+    //
+    // - String literal, which begins with `'`, `"` or `\``, and continues until
+    //   the same character is encountered again.
+    // - Delimiter pair, which begins with `[`, `(`, or `{`, and continues until
+    //   the matching closing delimiter is found, skipping string literals;
+    //   whitespace inside the pair does not end the token.
+    // - When not nested inside of a delimiter pair or a string literal, when
+    //   a whitespace character is encountered, the baseline
+    //   token is done.
+    // - Otherwise, accumulate the character into the current baseline token.
+    while let Some((_, c)) = src.peek() {
+        let c = *c;
+
+        if quote_start.is_some() {
+            // If we encountered the closing quote character for the current
+            // string, we're done with the current string.
+            if Some(c) == quote_start {
+                quote_start = None;
+            }
+        } else if c == '\n' {
+            if is_termination(&block_level, c) {
+                break;
+            }
+        } else if c == '\'' || c == '"' || c == '`' {
+            // We encountered the opening quote of a string literal.
+            quote_start = Some(c);
+        } else if c == '[' {
+            // We encountered an opening `[` delimiter.
+            block_level.push(BlockKind::SquareBracket);
+        } else if c == ']' {
+            // We encountered a closing `]` delimiter. Pop off the opening `[`
+            // delimiter.
+            if let Some(BlockKind::SquareBracket) = block_level.last() {
+                let _ = block_level.pop();
+            }
+        } else if c == '{' {
+            // We encountered an opening `{` delimiter.
+            block_level.push(BlockKind::CurlyBracket);
+        } else if c == '}' {
+            // We encountered a closing `}` delimiter. Pop off the opening `{`.
+            if let Some(BlockKind::CurlyBracket) = block_level.last() {
+                let _ = block_level.pop();
+            }
+        } else if c == '(' {
+            // We encountered an opening `(` delimiter.
+            block_level.push(BlockKind::Paren);
+        } else if c == ')' {
+            // We encountered a closing `)` delimiter. Pop off the opening `(`.
+            if let Some(BlockKind::Paren) = block_level.last() {
+                let _ = block_level.pop();
+            }
+        } else if is_termination(&block_level, c) {
+            break;
+        }
+
+        // Otherwise, accumulate the character into the current token.
+        token_contents.push(c);
+
+        // Consume the character.
+        let _ = src.next();
+    }
+
+    let span = Span::new(start_offset, start_offset + token_contents.len());
+
+    // If there are still unclosed opening delimiters, return the token exactly
+    // as accumulated; no synthetic closing characters are added.
+    if block_level.last().is_some() {
+        // let delim: char = (*block).closing();
+        // let cause = ParseError::unexpected_eof(delim.to_string(), span);
+
+        // while let Some(bk) = block_level.pop() {
+        //     token_contents.push(bk.closing());
+        // }
+
+        return Spanned {
+            item: token_contents,
+            span,
+        };
+    }
+
+    if quote_start.is_some() {
+        // The string literal was never closed. The token is returned exactly as
+        // accumulated: the missing closing quote is not appended (the code that
+        // did so is left commented out below).
+        // token_contents.push(delimiter);
+
+        // return (
+        //     token_contents.spanned(span),
+        //     Some(ParseError::unexpected_eof(delimiter.to_string(), span)),
+        // );
+        return Spanned {
+            item: token_contents,
+            span,
+        };
+    }
+
+    Spanned {
+        item: token_contents,
+        span,
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn test_examples() {
+        crate::test_examples(DetectColumns)
+    }
+}
diff --git a/crates/nu-command/src/strings/mod.rs b/crates/nu-command/src/strings/mod.rs
index e30e207a73..1cdcb4a18f 100644
--- a/crates/nu-command/src/strings/mod.rs
+++ b/crates/nu-command/src/strings/mod.rs
@@ -1,6 +1,7 @@
 mod build_string;
 mod char_;
 mod decode;
+mod detect_columns;
 mod format;
 mod parse;
 mod size;
@@ -10,6 +11,7 @@ mod str_;
 pub use build_string::BuildString;
 pub use char_::Char;
 pub use decode::*;
+pub use detect_columns::*;
 pub use format::*;
 pub use parse::*;
 pub use size::Size;