From 0a6692ac4499f572eabe0d67ed58fcf19433b10c Mon Sep 17 00:00:00 2001 From: Jason Gedge Date: Thu, 28 May 2020 09:58:06 -0400 Subject: [PATCH] Simplify parse plugin code. (#1904) Primarily, instead of building a parse pattern enum, we just build the regex directly, with the appropriate capture group names so that the column name codepaths can be shared between simple and `--regex` patterns. Also removed capture group count compared to column name count. I don't think this codepath can possibly be reached with the regex we now use for the simplified capture form. --- crates/nu-cli/tests/commands/parse.rs | 103 ++++++++++---- crates/nu_plugin_parse/src/lib.rs | 2 +- crates/nu_plugin_parse/src/nu/mod.rs | 193 +++++++++----------------- crates/nu_plugin_parse/src/parse.rs | 6 +- 4 files changed, 147 insertions(+), 157 deletions(-) diff --git a/crates/nu-cli/tests/commands/parse.rs b/crates/nu-cli/tests/commands/parse.rs index 5c07543290..8d53f928be 100644 --- a/crates/nu-cli/tests/commands/parse.rs +++ b/crates/nu-cli/tests/commands/parse.rs @@ -38,10 +38,10 @@ mod simple { let actual = nu!( cwd: dirs.test(), pipeline( r#" - echo "{abc}123" - | parse "{{abc}{name}" - | echo $it.name - "# + echo "{abc}123" + | parse "{{abc}{name}" + | echo $it.name + "# )); assert_eq!(actual.out, "123"); @@ -54,15 +54,48 @@ mod simple { let actual = nu!( cwd: dirs.test(), pipeline( r#" - echo "(abc)123" - | parse "(abc){name}" - | echo $it.name - "# + echo "(abc)123" + | parse "(abc){name}" + | echo $it.name + "# )); assert_eq!(actual.out, "123"); }) } + + #[test] + fn properly_captures_empty_column() { + Playground::setup("parse_test_regex_4", |dirs, _sandbox| { + let actual = nu!( + cwd: dirs.test(), pipeline( + r#" + echo ["1:INFO:component:all is well" "2:ERROR::something bad happened"] + | parse "{timestamp}:{level}:{tag}:{entry}" + | get entry + | nth 1 + "# + )); + + assert_eq!(actual.out, "something bad happened"); + }) + } + + #[test] + fn errors_when_missing_closing_brace() { + Playground::setup("parse_test_regex_5", |dirs, _sandbox| { + let actual = nu!( + cwd: dirs.test(), pipeline( + r#" + echo "(abc)123" + | parse "(abc){name" + | echo $it.name + "# + )); + + assert!(actual.err.contains("invalid parse pattern")); + }) + } } mod regex { @@ -86,12 +119,12 @@ mod regex { let actual = nu!( cwd: dirs.test(), pipeline( r#" - open nushell_git_log_oneline.txt - | parse --regex "(?P\w+) (?P.+) \(#(?P\d+)\)" - | nth 1 - | get PR - | echo $it - "# + open nushell_git_log_oneline.txt + | parse --regex "(?P\w+) (?P.+) \(#(?P\d+)\)" + | nth 1 + | get PR + | echo $it + "# )); assert_eq!(actual.out, "1842"); @@ -106,12 +139,12 @@ mod regex { let actual = nu!( cwd: dirs.test(), pipeline( r#" - open nushell_git_log_oneline.txt - | parse --regex "(\w+) (.+) \(#(\d+)\)" - | nth 1 - | get Capture1 - | echo $it - "# + open nushell_git_log_oneline.txt + | parse --regex "(\w+) (.+) \(#(\d+)\)" + | nth 1 + | get Capture1 + | echo $it + "# )); assert_eq!(actual.out, "b89976da"); @@ -126,15 +159,33 @@ mod regex { let actual = nu!( cwd: dirs.test(), pipeline( r#" - open nushell_git_log_oneline.txt - | parse --regex "(?P\w+) (.+) \(#(?P\d+)\)" - | nth 1 - | get Capture2 - | echo $it - "# + open nushell_git_log_oneline.txt + | parse --regex "(?P\w+) (.+) \(#(?P\d+)\)" + | nth 1 + | get Capture2 + | echo $it + "# )); assert_eq!(actual.out, "let format access variables also"); }) } + + #[test] + fn errors_with_invalid_regex() { + Playground::setup("parse_test_regex_1", |dirs, sandbox| { + sandbox.with_files(nushell_git_log_oneline()); + + let actual = nu!( + cwd: dirs.test(), pipeline( + r#" + open nushell_git_log_oneline.txt + | parse --regex "(?P\w+ unfinished capture group" + | echo $it + "# + )); + + assert!(actual.err.contains("invalid regex")); + }) + } } diff --git a/crates/nu_plugin_parse/src/lib.rs b/crates/nu_plugin_parse/src/lib.rs index d3c29cb0d0..b5bd5842d0 100644 --- a/crates/nu_plugin_parse/src/lib.rs +++ b/crates/nu_plugin_parse/src/lib.rs @@ -1,4 +1,4 @@ mod nu; mod parse; -pub use parse::{ColumnNames, Parse}; +pub use parse::Parse; diff --git a/crates/nu_plugin_parse/src/nu/mod.rs b/crates/nu_plugin_parse/src/nu/mod.rs index b12c505412..357572e7c2 100644 --- a/crates/nu_plugin_parse/src/nu/mod.rs +++ b/crates/nu_plugin_parse/src/nu/mod.rs @@ -1,12 +1,14 @@ +use regex::{self, Regex}; + use nu_errors::ShellError; use nu_plugin::Plugin; use nu_protocol::{ - CallInfo, Primitive, ReturnSuccess, ReturnValue, Signature, SyntaxShape, TaggedDictBuilder, - UntaggedValue, Value, + CallInfo, Primitive, ReturnSuccess, ReturnValue, ShellTypeName, Signature, SyntaxShape, + TaggedDictBuilder, UntaggedValue, Value, }; +use nu_source::Tag; -use crate::{ColumnNames, Parse}; -use regex::{self, Regex}; +use crate::Parse; impl Plugin for Parse { fn config(&mut self) -> Result { @@ -21,114 +23,81 @@ impl Plugin for Parse { } fn begin_filter(&mut self, call_info: CallInfo) -> Result, ShellError> { - if let Some(ref args) = &call_info.args.positional { - match &args[0] { + if let Some(ref args) = call_info.args.positional { + let value = &args[0]; + match value { Value { value: UntaggedValue::Primitive(Primitive::String(s)), tag, } => { self.pattern_tag = tag.clone(); - if call_info.args.has("regex") { - self.regex = Regex::new(&s).map_err(|_| { - ShellError::labeled_error( - "Could not parse regex", - "could not parse regex", - tag.span, - ) - })?; - self.column_names = ColumnNames::from(&self.regex); + self.regex = if call_info.args.has("regex") { + Regex::new(&s).map_err(|_| { + ShellError::labeled_error("Invalid regex", "invalid regex", tag.span) + })? } else { - let parse_pattern = parse(&s); - let parse_regex = build_regex(&parse_pattern); - self.column_names = ColumnNames::from(parse_pattern.as_slice()); - self.regex = Regex::new(&parse_regex).map_err(|_| { + let parse_regex = build_regex(&s, tag.clone())?; + Regex::new(&parse_regex).map_err(|_| { ShellError::labeled_error( - "Could not parse regex", - "could not parse regex", + "Invalid pattern", + "invalid pattern", tag.span, ) - })?; + })? }; + + self.column_names = column_names(&self.regex); } Value { tag, .. } => { return Err(ShellError::labeled_error( - "Unrecognized type in params", - "unexpected value", + format!( + "Unexpected type in params (found `{}`, expected `String`)", + value.type_name() + ), + "unexpected type", tag, )); } } } + Ok(vec![]) } fn filter(&mut self, input: Value) -> Result, ShellError> { - match &input.as_string() { - Ok(s) => { - let mut output = vec![]; - for caps in self.regex.captures_iter(&s) { - let group_count = caps.len() - 1; - - if self.column_names.0.len() != group_count { - return Err(ShellError::labeled_error( - format!( - "There are {} column(s) specified in the pattern, but could only match the first {}: [{}]", - self.column_names.0.len(), - group_count, - caps.iter() - .skip(1) - .map(|m| { - if let Some(m) = m { - let m = m.as_str(); - let mut m = m.replace(",","\\,"); - if m.len() > 20 { - m.truncate(17); - format!("{}...", m) - } else { - m - } - } else { - "".to_string() - } - }) - .collect::>() - .join(", ") - ), - "could not match all columns in pattern", - &self.pattern_tag, - )); - } - + if let Ok(s) = input.as_string() { + Ok(self + .regex + .captures_iter(&s) + .map(|caps| { let mut dict = TaggedDictBuilder::new(&input.tag); - for (idx, column_name) in self.column_names.0.iter().enumerate() { - dict.insert_untagged( - column_name, - UntaggedValue::string(caps[idx + 1].to_string()), - ); + for (column_name, cap) in self.column_names.iter().zip(caps.iter().skip(1)) { + let cap_string = cap.map(|v| v.as_str()).unwrap_or("").to_string(); + dict.insert_untagged(column_name, UntaggedValue::string(cap_string)); } - output.push(Ok(ReturnSuccess::Value(dict.into_value()))); - } - Ok(output) - } - _ => Err(ShellError::labeled_error_with_secondary( + + Ok(ReturnSuccess::Value(dict.into_value())) + }) + .collect()) + } else { + Err(ShellError::labeled_error_with_secondary( "Expected string input", "expected string input", &self.name, "value originated here", input.tag, - )), + )) } } } -fn parse(input: &str) -> Vec { - let mut output = vec![]; +fn build_regex(input: &str, tag: Tag) -> Result { + let mut output = "(?s)\\A".to_string(); //let mut loop_input = input; let mut loop_input = input.chars().peekable(); loop { let mut before = String::new(); - while let Some(c) = loop_input.next() { if c == '{' { // If '{{', still creating a plaintext parse command, but just for a single '{' char @@ -142,20 +111,30 @@ fn parse(input: &str) -> Vec { } if !before.is_empty() { - output.push(ParseCommand::Text(before.to_string())); + output.push_str(®ex::escape(&before)); } + // Look for column as we're now at one let mut column = String::new(); - while let Some(c) = loop_input.next() { if c == '}' { break; } column.push(c); + + if loop_input.peek().is_none() { + return Err(ShellError::labeled_error( + "Found opening `{` without an associated closing `}`", + "invalid parse pattern", + tag, + )); + } } if !column.is_empty() { - output.push(ParseCommand::Column(column.to_string())); + output.push_str("(?P<"); + output.push_str(&column); + output.push_str(">.*?)"); } if before.is_empty() && column.is_empty() { @@ -163,56 +142,18 @@ fn parse(input: &str) -> Vec { } } - output -} - -impl From<&[ParseCommand]> for ColumnNames { - fn from(commands: &[ParseCommand]) -> ColumnNames { - let mut output = vec![]; - - for command in commands { - if let ParseCommand::Column(c) = command { - output.push(c.clone()); - } - } - - ColumnNames(output) - } -} - -impl From<&Regex> for ColumnNames { - fn from(regex: &Regex) -> ColumnNames { - let output = regex - .capture_names() - .enumerate() - .skip(1) - .map(|(i, name)| name.map(String::from).unwrap_or(format!("Capture{}", i))) - .collect::>(); - ColumnNames(output) - } -} - -fn build_regex(commands: &[ParseCommand]) -> String { - let mut output = "(?s)\\A".to_string(); - - for command in commands { - match command { - ParseCommand::Text(s) => { - output.push_str(®ex::escape(&s)); - } - ParseCommand::Column(_) => { - output.push_str("(.*?)"); - } - } - } - output.push_str("\\z"); - - output + Ok(output) } -#[derive(Debug)] -enum ParseCommand { - Text(String), - Column(String), +fn column_names(regex: &Regex) -> Vec { + regex + .capture_names() + .enumerate() + .skip(1) + .map(|(i, name)| { + name.map(String::from) + .unwrap_or_else(|| format!("Capture{}", i)) + }) + .collect() } diff --git a/crates/nu_plugin_parse/src/parse.rs b/crates/nu_plugin_parse/src/parse.rs index 9eb6f7d65b..ba82632442 100644 --- a/crates/nu_plugin_parse/src/parse.rs +++ b/crates/nu_plugin_parse/src/parse.rs @@ -5,11 +5,9 @@ pub struct Parse { pub regex: Regex, pub name: Tag, pub pattern_tag: Tag, - pub column_names: ColumnNames, + pub column_names: Vec, } -pub struct ColumnNames(pub Vec); - impl Parse { #[allow(clippy::trivial_regex)] pub fn new() -> Result> { @@ -17,7 +15,7 @@ impl Parse { regex: Regex::new("")?, name: Tag::unknown(), pattern_tag: Tag::unknown(), - column_names: ColumnNames(vec![]), + column_names: vec![], }) } }