From df3892f32396fb94f8d9c10592312e3ca97614ca Mon Sep 17 00:00:00 2001
From: Jack Wright <56345+ayax79@users.noreply.github.com>
Date: Thu, 2 Jan 2025 13:03:24 -0800
Subject: [PATCH] Provide the ability to split strings in columns via `polars str-split` (#14723)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

# Description
Provides the ability to split string columns. This will change the column type to list.

```nushell
> ❯ : [[a]; ["one,two,three"]] | polars into-df | polars select (polars col a | polars str-split ",") | polars collect
╭───┬───────────────╮
│ # │       a       │
├───┼───────────────┤
│ 0 │ ╭───┬───────╮ │
│   │ │ 0 │ one   │ │
│   │ │ 1 │ two   │ │
│   │ │ 2 │ three │ │
│   │ ╰───┴───────╯ │
╰───┴───────────────╯
> ❯ : [[a]; ["one,two,three"]] | polars into-df | polars select (polars col a | polars str-split ",") | polars schema
╭───┬───────────╮
│ a │ list<str> │
╰───┴───────────╯
```

# User-Facing Changes
- Introduces new command `polars str-split`
---
 .../src/dataframe/command/string/mod.rs       |  2 +
 .../src/dataframe/command/string/str_split.rs | 92 +++++++++++++++++++
 2 files changed, 94 insertions(+)
 create mode 100644 crates/nu_plugin_polars/src/dataframe/command/string/str_split.rs

diff --git a/crates/nu_plugin_polars/src/dataframe/command/string/mod.rs b/crates/nu_plugin_polars/src/dataframe/command/string/mod.rs
index c921fa2d79..3538878190 100644
--- a/crates/nu_plugin_polars/src/dataframe/command/string/mod.rs
+++ b/crates/nu_plugin_polars/src/dataframe/command/string/mod.rs
@@ -5,6 +5,7 @@ mod replace_all;
 mod str_join;
 mod str_lengths;
 mod str_slice;
+mod str_split;
 mod to_lowercase;
 mod to_uppercase;
 
@@ -27,6 +28,7 @@ pub(crate) fn string_commands() -> Vec &str {
+        "polars str-split"
+    }
+
+    fn description(&self) -> &str {
+        "Split the string by a substring. The resulting dtype is list."
+    }
+
+    /// Command signature: one required positional argument named `expr` (the
+    /// separator expression). Input and output are both the custom
+    /// `expression` type, and the command is filed under the `dataframe`
+    /// category.
+    fn signature(&self) -> Signature {
+        Signature::build(self.name())
+            .required("expr", SyntaxShape::Any, "Separator expression")
+            .input_output_types(vec![(
+                Type::Custom("expression".into()),
+                Type::Custom("expression".into()),
+            )])
+            .category(Category::Custom("dataframe".into()))
+    }
+
+    /// Usage examples for the command.
+    ///
+    /// The single example splits a comma-separated string and then explodes
+    /// the resulting list so that each piece ends up in its own row.
+    fn examples(&self) -> Vec<Example> {
+        vec![Example {
+            description: "Split the string by comma, then create a new row for each string",
+            example: r#"[[a]; ["one,two,three"]] | polars into-df
+                | polars select (polars col a | polars str-split "," | polars explode)
+                | polars collect"#,
+            result: Some(
+                NuDataFrame::from(
+                    df!(
+                        "a" => ["one", "two", "three"]
+                    )
+                    .expect("Should be able to create a dataframe"),
+                )
+                .into_value(Span::test_data()),
+            ),
+        }]
+    }
+
+    /// Splits the input string expression by the separator expression.
+    ///
+    /// The separator is read from positional argument 0 and converted into a
+    /// `NuExpression`; the expression being split is taken from the pipeline
+    /// input.
+    ///
+    /// # Errors
+    /// Propagates failures from reading the argument, from converting either
+    /// value into an expression, and from building the output pipeline data.
+    fn run(
+        &self,
+        plugin: &Self::Plugin,
+        engine: &EngineInterface,
+        call: &EvaluatedCall,
+        input: PipelineData,
+    ) -> Result<PipelineData, LabeledError> {
+        // Keep the argument's span alongside the converted expression.
+        let separator = call.req::<Spanned<Value>>(0).and_then(|sep| {
+            let sep_expr = NuExpression::try_from_value(plugin, &sep.item)?;
+            Ok(Spanned {
+                item: sep_expr,
+                span: sep.span,
+            })
+        })?;
+
+        let expr = NuExpression::try_from_pipeline(plugin, input, call.head)?;
+        let res: NuExpression = expr
+            .into_polars()
+            .str()
+            .split(separator.item.into_polars())
+            .into();
+        res.to_pipeline_data(plugin, engine, call.head)
+            .map_err(LabeledError::from)
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use nu_protocol::ShellError;
+
+    use super::*;
+    use crate::test::test_polars_plugin_command;
+
+    #[test]
+    fn test_examples() -> Result<(), ShellError> {
+        test_polars_plugin_command(&StrSplit)
+    }
+}