Add regex separators for split row/list/column (#8707)

# Description Verified on Discord with a maintainer. This change adds regex separators to split row/column/list. The primary motivation was to make it easier to split on separators padded with arbitrary amounts of whitespace without requiring a lot of extra trimming. A secondary motivation is the same as the motivation for adding regex splitting to most languages. # User-Facing Changes Adds a -r (--regex) option to split row/column/list. # Tests + Formatting Ran the tests; however, tests.nu fails with unrelated errors: ``` ~/src/nushell> cargo run -- crates/nu-utils/standard_library/tests.nu 04/02/2023 02:07:25 AM Finished dev [unoptimized + debuginfo] target(s) in 0.24s Running `target/debug/nu crates/nu-utils/standard_library/tests.nu` INF|2023-04-02T02:07:27.060|Running tests in test_asserts INF|2023-04-02T02:07:27.141|Running tests in test_dirs Error: × list is just pwd after initialization INF|2023-04-02T02:07:27.167|Running tests in test_logger INF|2023-04-02T02:07:27.286|Running tests in test_std Error: × some tests did not pass (see complete errors above): │ │ test_asserts test_assert │ test_asserts test_assert_equal │ test_asserts test_assert_error │ test_asserts test_assert_greater │ test_asserts test_assert_greater_or_equal │ test_asserts test_assert_length │ test_asserts test_assert_less │ test_asserts test_assert_less_or_equal │ test_asserts test_assert_not_equal │ ⨯ test_dirs test_dirs_command │ test_logger test_critical │ test_logger test_debug │ test_logger test_error │ test_logger test_info │ test_logger test_warning │ test_std test_path_add │ ``` Upon investigating, I see this difference: ``` ╭───┬─────────────────────────────────────────────────────────────────────────────────────────────────────────╮ │ 0 │ /var/folders/1f/ltbr1m8s5s1811k6n1rhpc0r0000gn/T/test_dirs_c1ed89d6-19f7-47c7-9e1f-74c39f3623b5 │ │ 1 │ /private/var/folders/1f/ltbr1m8s5s1811k6n1rhpc0r0000gn/T/test_dirs_c1ed89d6-19f7-47c7-9e1f-74c39f3623b5 │ 
╰───┴─────────────────────────────────────────────────────────────────────────────────────────────────────────╯ ``` This seems unrelated to my changes, but I can investigate further if desired. # After Submitting If your PR had any user-facing changes, update [the documentation](https://github.com/nushell/nushell.github.io) after the PR is merged, if necessary. This will help us keep the docs up to date. Co-authored-by: Robert Waugh <robert@waugh.io>
2025-05-29 14:21:45 +02:00 · 2023-04-07 04:46:11 -07:00 · 2023-04-07 04:46:11 -07:00 · 4fda6d7eaa
commit 4fda6d7eaa
parent 771e24913d
5 changed files with 205 additions and 33 deletions
--- a/crates/nu-command/src/strings/split/column.rs
+++ b/crates/nu-command/src/strings/split/column.rs
@ -5,6 +5,7 @@ use nu_protocol::{
    Category, Example, PipelineData, ShellError, Signature, Span, Spanned, SyntaxShape, Type,
    Value,
 };
+use regex::Regex;

 #[derive(Clone)]
 pub struct SubCommand;
@ -30,6 +31,7 @@ impl Command for SubCommand {
                "the character or string that denotes what separates columns",
            )
            .switch("collapse-empty", "remove empty columns", Some('c'))
+            .switch("regex", "separator is a regular expression", Some('r'))
            .rest(
                "rest",
                SyntaxShape::String,
@ -117,6 +119,25 @@ impl Command for SubCommand {
                    span: Span::test_data(),
                }),
            },
+            Example {
+                description: "Split a list of strings into a table, ignoring padding",
+                example: r"['a -  b' 'c  -    d'] | split column -r '\s*-\s*'",
+                result: Some(Value::List {
+                    vals: vec![
+                        Value::Record {
+                            cols: vec!["column1".to_string(), "column2".to_string()],
+                            vals: vec![Value::test_string("a"), Value::test_string("b")],
+                            span: Span::test_data(),
+                        },
+                        Value::Record {
+                            cols: vec!["column1".to_string(), "column2".to_string()],
+                            vals: vec![Value::test_string("c"), Value::test_string("d")],
+                            span: Span::test_data(),
+                        },
+                    ],
+                    span: Span::test_data(),
+                }),
+            },
        ]
    }
 }
@ -132,30 +153,43 @@ fn split_column(
    let rest: Vec<Spanned<String>> = call.rest(engine_state, stack, 1)?;
    let collapse_empty = call.has_flag("collapse-empty");

+    let regex = if call.has_flag("regex") {
+        Regex::new(&separator.item)
+    } else {
+        let escaped = regex::escape(&separator.item);
+        Regex::new(&escaped)
+    }
+    .map_err(|err| {
+        ShellError::GenericError(
+            "Error with regular expression".into(),
+            err.to_string(),
+            Some(separator.span),
+            None,
+            Vec::new(),
+        )
+    })?;
+
    input.flat_map(
-        move |x| split_column_helper(&x, &separator, &rest, collapse_empty, name_span),
+        move |x| split_column_helper(&x, &regex, &rest, collapse_empty, name_span),
        engine_state.ctrlc.clone(),
    )
 }

 fn split_column_helper(
    v: &Value,
-    separator: &Spanned<String>,
+    separator: &Regex,
    rest: &[Spanned<String>],
    collapse_empty: bool,
    head: Span,
 ) -> Vec<Value> {
    if let Ok(s) = v.as_string() {
-        let split_result: Vec<_> = if collapse_empty {
-            s.split(&separator.item).filter(|s| !s.is_empty()).collect()
-        } else {
-            s.split(&separator.item).collect()
-        };
-
+        let split_result: Vec<_> = separator
+            .split(&s)
+            .filter(|x| !(collapse_empty && x.is_empty()))
+            .collect();
        let positional: Vec<_> = rest.iter().map(|f| f.item.clone()).collect();

        // If they didn't provide column names, make up our own
-
        let mut cols = vec![];
        let mut vals = vec![];
        if positional.is_empty() {
--- a/crates/nu-command/src/strings/split/list.rs
+++ b/crates/nu-command/src/strings/split/list.rs
@ -5,6 +5,7 @@ use nu_protocol::{
    Category, Example, IntoPipelineData, PipelineData, ShellError, Signature, Span, SyntaxShape,
    Type, Value,
 };
+use regex::Regex;

 #[derive(Clone)]
 pub struct SubCommand;
@ -25,6 +26,10 @@ impl Command for SubCommand {
                SyntaxShape::Any,
                "the value that denotes what separates the list",
            )
+            .switch(
+                "regex", 
+                "separator is a regular expression, matching values that can be coerced into a string", 
+                Some('r'))
            .category(Category::Filters)
    }

@ -121,10 +126,76 @@ impl Command for SubCommand {
                    span: Span::test_data(),
                }),
            },
+            Example {
+                description: "Split a list of chars into lists based on multiple characters",
+                example: r"[a, b, c, d, a, e, f, g] | split list -r '(b|e)'",
+                result: Some(Value::List {
+                    vals: vec![
+                        Value::List {
+                            vals: vec![Value::test_string("a")],
+                            span: Span::test_data(),
+                        },
+                        Value::List {
+                            vals: vec![
+                                Value::test_string("c"),
+                                Value::test_string("d"),
+                                Value::test_string("a"),
+                            ],
+                            span: Span::test_data(),
+                        },
+                        Value::List {
+                            vals: vec![Value::test_string("f"), Value::test_string("g")],
+                            span: Span::test_data(),
+                        },
+                    ],
+                    span: Span::test_data(),
+                }),
+            },
        ]
    }
 }

+enum Matcher { // Strategy for deciding whether a list element is a separator.
+    Regex(Regex), // Element is tested against a compiled regular expression.
+    Direct(Value), // Element is tested for equality with this value.
+}
+
+impl Matcher {
+    pub fn new(regex: bool, lhs: Value) -> Result<Self, ShellError> { // Builds a matcher from separator `lhs`; `regex` selects pattern matching over direct equality.
+        if regex {
+            Ok(Matcher::Regex(Regex::new(&lhs.as_string()?).map_err( // `?` on `as_string` propagates the error when `lhs` cannot be read as a string.
+                |err| {
+                    ShellError::GenericError( // Wraps the failure reported by `Regex::new` for an invalid pattern.
+                        "Error with regular expression".into(),
+                        err.to_string(),
+                        match lhs { // Attach the separator's span, except when the value is itself an error.
+                            Value::Error { error: _ } => None,
+                            _ => Some(lhs.expect_span()),
+                        },
+                        None,
+                        Vec::new(),
+                    )
+                },
+            )?))
+        } else {
+            Ok(Matcher::Direct(lhs)) // Nothing to compile; the separator value is stored as-is.
+        }
+    }
+
+    pub fn compare(&self, rhs: &Value) -> Result<bool, ShellError> { // Reports whether `rhs` counts as a separator; no branch in this body produces `Err`.
+        Ok(match self {
+            Matcher::Regex(regex) => {
+                if let Ok(rhs_str) = rhs.as_string() {
+                    regex.is_match(&rhs_str) // Unanchored: a match anywhere in the string counts.
+                } else {
+                    false // Values that cannot be read as strings never match a regex separator.
+                }
+            }
+            Matcher::Direct(lhs) => rhs == lhs, // Plain `Value` equality against the stored separator.
+        })
+    }
+}
+
 fn split_list(
    engine_state: &EngineState,
    stack: &mut Stack,
@ -134,9 +205,11 @@ fn split_list(
    let separator: Value = call.req(engine_state, stack, 0)?;
    let mut temp_list = Vec::new();
    let mut returned_list = Vec::new();
+
    let iter = input.into_interruptible_iter(engine_state.ctrlc.clone());
+    let matcher = Matcher::new(call.has_flag("regex"), separator)?;
    for val in iter {
-        if val == separator {
+        if matcher.compare(&val)? {
            if !temp_list.is_empty() {
                returned_list.push(Value::List {
                    vals: temp_list.clone(),
--- a/crates/nu-command/src/strings/split/row.rs
+++ b/crates/nu-command/src/strings/split/row.rs
@ -5,7 +5,7 @@ use nu_protocol::{
    Category, Example, PipelineData, ShellError, Signature, Span, Spanned, SyntaxShape, Type,
    Value,
 };
-
+use regex::Regex;
 #[derive(Clone)]
 pub struct SubCommand;

@ -21,7 +21,7 @@ impl Command for SubCommand {
            .required(
                "separator",
                SyntaxShape::String,
-                "the character that denotes what separates rows",
+                "a character or regex that denotes what separates rows",
            )
            .named(
                "number",
@ -29,6 +29,7 @@ impl Command for SubCommand {
                "Split into maximum number of items",
                Some('n'),
            )
+            .switch("regex", "use regex syntax for separator", Some('r'))
            .category(Category::Strings)
    }

@ -92,6 +93,18 @@ impl Command for SubCommand {
                    span: Span::test_data(),
                }),
            },
+            Example {
+                description: "Split a string by regex",
+                example: r"'a   b       c' | split row -r '\s+'",
+                result: Some(Value::List {
+                    vals: vec![
+                        Value::test_string("a"),
+                        Value::test_string("b"),
+                        Value::test_string("c"),
+                    ],
+                    span: Span::test_data(),
+                }),
+            },
        ]
    }
 }
@ -104,30 +117,40 @@ fn split_row(
 ) -> Result<PipelineData, ShellError> {
    let name_span = call.head;
    let separator: Spanned<String> = call.req(engine_state, stack, 0)?;
+    let regex = if call.has_flag("regex") {
+        Regex::new(&separator.item)
+    } else {
+        let escaped = regex::escape(&separator.item);
+        Regex::new(&escaped)
+    }
+    .map_err(|err| {
+        ShellError::GenericError(
+            "Error with regular expression".into(),
+            err.to_string(),
+            Some(separator.span),
+            None,
+            Vec::new(),
+        )
+    })?;
    let max_split: Option<usize> = call.get_flag(engine_state, stack, "number")?;
    input.flat_map(
-        move |x| split_row_helper(&x, &separator, max_split, name_span),
+        move |x| split_row_helper(&x, &regex, max_split, name_span),
        engine_state.ctrlc.clone(),
    )
 }

-fn split_row_helper(
-    v: &Value,
-    separator: &Spanned<String>,
-    max_split: Option<usize>,
-    name: Span,
-) -> Vec<Value> {
+fn split_row_helper(v: &Value, regex: &Regex, max_split: Option<usize>, name: Span) -> Vec<Value> {
    match v.span() {
        Ok(v_span) => {
            if let Ok(s) = v.as_string() {
                match max_split {
-                    Some(max_split) => s
-                        .splitn(max_split, &separator.item)
-                        .map(|s| Value::string(s, v_span))
+                    Some(max_split) => regex
+                        .splitn(&s, max_split)
+                        .map(|x: &str| Value::string(x, v_span))
                        .collect(),
-                    None => s
-                        .split(&separator.item)
-                        .map(|s| Value::string(s, v_span))
+                    None => regex
+                        .split(&s)
+                        .map(|x: &str| Value::string(x, v_span))
                        .collect(),
                }
            } else {
--- a/crates/nu-command/tests/commands/split_column.rs
+++ b/crates/nu-command/tests/commands/split_column.rs
@ -5,12 +5,20 @@ use nu_test_support::{nu, pipeline};
 #[test]
 fn to_column() {
    Playground::setup("split_column_test_1", |dirs, sandbox| {
-        sandbox.with_files(vec![FileWithContentToBeTrimmed(
-            "sample.txt",
-            r#"
+        sandbox.with_files(vec![
+            FileWithContentToBeTrimmed(
+                "sample.txt",
+                r#"
                importer,shipper,tariff_item,name,origin
            "#,
-        )]);
+            ),
+            FileWithContentToBeTrimmed(
+                "sample2.txt",
+                r#"
+                importer , shipper  , tariff_item  ,   name  ,  origin
+            "#,
+            ),
+        ]);

        let actual = nu!(
            cwd: dirs.test(), pipeline(
@ -24,5 +32,18 @@ fn to_column() {
        ));

        assert!(actual.out.contains("shipper"));
+
+        let actual = nu!(
+            cwd: dirs.test(), pipeline(
+            r#"
+                open sample2.txt
+                | lines
+                | str trim
+                | split column -r '\s*,\s*'
+                | get column2
+            "#
+        ));
+
+        assert!(actual.out.contains("shipper"));
    })
 }
--- a/crates/nu-command/tests/commands/split_row.rs
+++ b/crates/nu-command/tests/commands/split_row.rs
@ -5,12 +5,20 @@ use nu_test_support::{nu, pipeline};
 #[test]
 fn to_row() {
    Playground::setup("split_row_test_1", |dirs, sandbox| {
-        sandbox.with_files(vec![FileWithContentToBeTrimmed(
-            "sample.txt",
-            r#"
+        sandbox.with_files(vec![
+            FileWithContentToBeTrimmed(
+                "sample.txt",
+                r#"
                importer,shipper,tariff_item,name,origin
            "#,
-        )]);
+            ),
+            FileWithContentToBeTrimmed(
+                "sample2.txt",
+                r#"
+                importer      ,   shipper      ,  tariff_item,name      ,    origin
+            "#,
+            ),
+        ]);

        let actual = nu!(
            cwd: dirs.test(), pipeline(
@ -24,5 +32,18 @@ fn to_row() {
        ));

        assert!(actual.out.contains('5'));
+
+        let actual = nu!(
+            cwd: dirs.test(), pipeline(
+            r#"
+                open sample2.txt
+                | lines
+                | str trim
+                | split row -r '\s*,\s*'
+                | length
+            "#
+        ));
+
+        assert!(actual.out.contains('5'));
    })
 }