Files
nushell/crates/nu-command/src/strings/split/words.rs
Embers-of-the-Fire 96493b26d9 Make string related commands parse-time evaluatable (#13032)
<!--
if this PR closes one or more issues, you can automatically link the PR
with
them by using one of the [*linking
keywords*](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue#linking-a-pull-request-to-an-issue-using-a-keyword),
e.g.
- this PR should close #xxxx
- fixes #xxxx

you can also mention related issues, PRs or discussions!
-->

Related meta-issue: #10239.

# Description
<!--
Thank you for improving Nushell. Please, check our [contributing
guide](../CONTRIBUTING.md) and talk to the core team before making major
changes.

Description of your pull request goes here. **Provide examples and/or
screenshots** if your changes affect the user experience.
-->

This PR will modify some `str`-related commands so that they can be
evaluated at parse time.

See the list below for the commands covered by this PR.

# User-Facing Changes
<!-- List of all changes that impact the user experience here. This
helps us keep track of breaking changes. -->

Available now:
- `str` subcommands
  - `trim`
  - `contains`
  - `distance`
  - `ends-with`
  - `expand`
  - `index-of`
  - `join`
  - `replace`
  - `reverse`
  - `starts-with`
  - `stats`
  - `substring`
  - `capitalize`
  - `downcase`
  - `upcase`
- `split` subcommands
  - `chars`
  - `column`
  - `list`
  - `row`
  - `words`
- `format` subcommands
  - `date`
  - `duration`
  - `filesize`
- string related commands
  - `parse`
  - `detect columns`
  - `encode` & `decode`

# Tests + Formatting
<!--
Don't forget to add tests that cover your changes.

Make sure you've run and fixed any issues with these commands:

- `cargo fmt --all -- --check` to check standard code formatting (`cargo
fmt --all` applies these changes)
- `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used` to
check that you're using the standard code style
- `cargo test --workspace` to check that all tests pass (on Windows make
sure to [enable developer
mode](https://learn.microsoft.com/en-us/windows/apps/get-started/developer-mode-features-and-debugging))
- `cargo run -- -c "use toolkit.nu; toolkit test stdlib"` to run the
tests for the standard library

> **Note**
> from `nushell` you can also use the `toolkit` as follows
> ```bash
> use toolkit.nu # or use an `env_change` hook to activate it
automatically
> toolkit check pr
> ```
-->

Unresolved questions:
- [ ] Is there an established routine for testing const expressions? I haven't found
one yet.
- [ ] Are const expressions required to behave just like their non-const
versions, as Rust promises?

# After Submitting
<!-- If your PR had any user-facing changes, update [the
documentation](https://github.com/nushell/nushell.github.io) after the
PR is merged, if necessary. This will help us keep the docs up to date.
-->

Unresolved questions:
- [ ] Do const commands need special marks in the docs?
2024-06-05 22:21:52 +03:00

426 lines
15 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

use crate::{grapheme_flags, grapheme_flags_const};
use fancy_regex::Regex;
use nu_engine::command_prelude::*;
use unicode_segmentation::UnicodeSegmentation;
/// Unit struct on which the `split words` command is implemented
/// (see its `Command` impl for the signature, examples and run logic).
#[derive(Clone)]
pub struct SubCommand;
impl Command for SubCommand {
fn name(&self) -> &str {
"split words"
}
fn signature(&self) -> Signature {
Signature::build("split words")
.input_output_types(vec![
(Type::String, Type::List(Box::new(Type::String))),
(
Type::List(Box::new(Type::String)),
Type::List(Box::new(Type::List(Box::new(Type::String))))
),
])
.allow_variants_without_examples(true)
.category(Category::Strings)
// .switch(
// "ignore-hyphenated",
// "ignore hyphenated words, splitting at the hyphen",
// Some('i'),
// )
// .switch(
// "ignore-apostrophes",
// "ignore apostrophes in words by removing them",
// Some('a'),
// )
// .switch(
// "ignore-punctuation",
// "ignore punctuation around words by removing them",
// Some('p'),
// )
.named(
"min-word-length",
SyntaxShape::Int,
"The minimum word length",
Some('l'),
)
.switch(
"grapheme-clusters",
"measure word length in grapheme clusters (requires -l)",
Some('g'),
)
.switch(
"utf-8-bytes",
"measure word length in UTF-8 bytes (default; requires -l; non-ASCII chars are length 2+)",
Some('b'),
)
}
fn usage(&self) -> &str {
"Split a string's words into separate rows."
}
fn search_terms(&self) -> Vec<&str> {
vec!["separate", "divide"]
}
fn examples(&self) -> Vec<Example> {
vec![
Example {
description: "Split the string's words into separate rows",
example: "'hello world' | split words",
result: Some(Value::list(
vec![Value::test_string("hello"), Value::test_string("world")],
Span::test_data(),
)),
},
Example {
description:
"Split the string's words, of at least 3 characters, into separate rows",
example: "'hello to the world' | split words --min-word-length 3",
result: Some(Value::list(
vec![
Value::test_string("hello"),
Value::test_string("the"),
Value::test_string("world"),
],
Span::test_data(),
)),
},
Example {
description:
"A real-world example of splitting words",
example: "http get https://www.gutenberg.org/files/11/11-0.txt | str downcase | split words --min-word-length 2 | uniq --count | sort-by count --reverse | first 10",
result: None,
},
]
}
fn is_const(&self) -> bool {
true
}
fn run(
&self,
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
let word_length: Option<usize> = call.get_flag(engine_state, stack, "min-word-length")?;
let has_grapheme = call.has_flag(engine_state, stack, "grapheme-clusters")?;
let has_utf8 = call.has_flag(engine_state, stack, "utf-8-bytes")?;
let graphemes = grapheme_flags(engine_state, stack, call)?;
let args = Arguments {
word_length,
has_grapheme,
has_utf8,
graphemes,
};
split_words(engine_state, call, input, args)
}
fn run_const(
&self,
working_set: &StateWorkingSet,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
let word_length: Option<usize> = call.get_flag_const(working_set, "min-word-length")?;
let has_grapheme = call.has_flag_const(working_set, "grapheme-clusters")?;
let has_utf8 = call.has_flag_const(working_set, "utf-8-bytes")?;
let graphemes = grapheme_flags_const(working_set, call)?;
let args = Arguments {
word_length,
has_grapheme,
has_utf8,
graphemes,
};
split_words(working_set.permanent(), call, input, args)
}
}
/// Flag values collected by `run` / `run_const` and handed to `split_words`.
struct Arguments {
    /// Value of `--min-word-length` (`-l`), if it was given.
    word_length: Option<usize>,
    /// Whether `--grapheme-clusters` (`-g`) was passed.
    has_grapheme: bool,
    /// Whether `--utf-8-bytes` (`-b`) was passed.
    has_utf8: bool,
    /// Result of `grapheme_flags` / `grapheme_flags_const`; when true, word
    /// length is counted in grapheme clusters instead of bytes.
    graphemes: bool,
}
fn split_words(
engine_state: &EngineState,
call: &Call,
input: PipelineData,
args: Arguments,
) -> Result<PipelineData, ShellError> {
let span = call.head;
// let ignore_hyphenated = call.has_flag(engine_state, stack, "ignore-hyphenated")?;
// let ignore_apostrophes = call.has_flag(engine_state, stack, "ignore-apostrophes")?;
// let ignore_punctuation = call.has_flag(engine_state, stack, "ignore-punctuation")?;
if args.word_length.is_none() {
if args.has_grapheme {
return Err(ShellError::IncompatibleParametersSingle {
msg: "--grapheme-clusters (-g) requires --min-word-length (-l)".to_string(),
span,
});
}
if args.has_utf8 {
return Err(ShellError::IncompatibleParametersSingle {
msg: "--utf-8-bytes (-b) requires --min-word-length (-l)".to_string(),
span,
});
}
}
input.map(
move |x| split_words_helper(&x, args.word_length, span, args.graphemes),
engine_state.ctrlc.clone(),
)
}
fn split_words_helper(v: &Value, word_length: Option<usize>, span: Span, graphemes: bool) -> Value {
// There are some options here with this regex.
// [^A-Za-z\'] = do not match uppercase or lowercase letters or apostrophes
// [^[:alpha:]\'] = do not match any uppercase or lowercase letters or apostrophes
// [^\p{L}\'] = do not match any unicode uppercase or lowercase letters or apostrophes
// Let's go with the unicode one in hopes that it works on more than just ascii characters
let regex_replace = Regex::new(r"[^\p{L}\']").expect("regular expression error");
let v_span = v.span();
match v {
Value::Error { error, .. } => Value::error(*error.clone(), v_span),
v => {
let v_span = v.span();
if let Ok(s) = v.coerce_str() {
// let splits = s.unicode_words();
// let words = trim_to_words(s);
// let words: Vec<&str> = s.split_whitespace().collect();
let replaced_string = regex_replace.replace_all(&s, " ").to_string();
let words = replaced_string
.split(' ')
.filter_map(|s| {
if s.trim() != "" {
if let Some(len) = word_length {
if if graphemes {
s.graphemes(true).count()
} else {
s.len()
} >= len
{
Some(Value::string(s, v_span))
} else {
None
}
} else {
Some(Value::string(s, v_span))
}
} else {
None
}
})
.collect::<Vec<Value>>();
Value::list(words, v_span)
} else {
Value::error(
ShellError::PipelineMismatch {
exp_input_type: "string".into(),
dst_span: span,
src_span: v_span,
},
v_span,
)
}
}
}
}
// original at least 1 char long
// curl -sL "https://www.gutenberg.org/files/11/11-0.txt" | tr '[:upper:]' '[:lower:]' | grep -oE "[a-z\']{1,}" | ^sort | ^uniq -c | ^sort -nr | head -n 10
// benchmark INCLUDING DOWNLOAD: 1sec 253ms 91µs 511ns
// 1839 the
// 942 and
// 811 to
// 695 a
// 638 of
// 610 it
// 553 she
// 546 i
// 486 you
// 462 said
// original at least 2 chars long
// curl -sL "https://www.gutenberg.org/files/11/11-0.txt" | tr '[:upper:]' '[:lower:]' | grep -oE "[a-z\']{2,}" | ^sort | ^uniq -c | ^sort -nr | head -n 10
// 1839 the
// 942 and
// 811 to
// 638 of
// 610 it
// 553 she
// 486 you
// 462 said
// 435 in
// 403 alice
// regex means, replace everything that is not A-Z or a-z or ' with a space
// $contents | str replace "[^A-Za-z\']" " " -a | split row ' ' | where ($it | str length) > 1 | uniq -i -c | sort-by count --reverse | first 10
// benchmark: 1sec 775ms 471µs 600ns
// ╭───┬───────┬───────╮
// │ # │ value │ count │
// ├───┼───────┼───────┤
// │ 0 │ the │ 1839 │
// │ 1 │ and │ 942 │
// │ 2 │ to │ 811 │
// │ 3 │ of │ 638 │
// │ 4 │ it │ 610 │
// │ 5 │ she │ 553 │
// │ 6 │ you │ 486 │
// │ 7 │ said │ 462 │
// │ 8 │ in │ 435 │
// │ 9 │ alice │ 403 │
// ╰───┴───────┴───────╯
// $alice |str replace "[^A-Za-z\']" " " -a | split row ' ' | uniq -i -c | sort-by count --reverse | first 10
// benchmark: 1sec 518ms 701µs 200ns
// ╭───┬───────┬───────╮
// │ # │ value │ count │
// ├───┼───────┼───────┤
// │ 0 │ the │ 1839 │
// │ 1 │ and │ 942 │
// │ 2 │ to │ 811 │
// │ 3 │ a │ 695 │
// │ 4 │ of │ 638 │
// │ 5 │ it │ 610 │
// │ 6 │ she │ 553 │
// │ 7 │ i │ 546 │
// │ 8 │ you │ 486 │
// │ 9 │ said │ 462 │
// ├───┼───────┼───────┤
// │ # │ value │ count │
// ╰───┴───────┴───────╯
// s.unicode_words()
// $alice | str downcase | split words | sort | uniq -c | sort-by count | reverse | first 10
// benchmark: 4sec 965ms 285µs 800ns
// ╭───┬───────┬───────╮
// │ # │ value │ count │
// ├───┼───────┼───────┤
// │ 0 │ the │ 1839 │
// │ 1 │ and │ 941 │
// │ 2 │ to │ 811 │
// │ 3 │ a │ 695 │
// │ 4 │ of │ 638 │
// │ 5 │ it │ 542 │
// │ 6 │ she │ 538 │
// │ 7 │ said │ 460 │
// │ 8 │ in │ 434 │
// │ 9 │ you │ 426 │
// ├───┼───────┼───────┤
// │ # │ value │ count │
// ╰───┴───────┴───────╯
// trim_to_words
// benchmark: 5sec 992ms 76µs 200ns
// ╭───┬───────┬───────╮
// │ # │ value │ count │
// ├───┼───────┼───────┤
// │ 0 │ the │ 1829 │
// │ 1 │ and │ 918 │
// │ 2 │ to │ 801 │
// │ 3 │ a │ 689 │
// │ 4 │ of │ 632 │
// │ 5 │ she │ 537 │
// │ 6 │ it │ 493 │
// │ 7 │ said │ 457 │
// │ 8 │ in │ 430 │
// │ 9 │ you │ 413 │
// ├───┼───────┼───────┤
// │ # │ value │ count │
// ╰───┴───────┴───────╯
// fn trim_to_words(content: String) -> std::vec::Vec<std::string::String> {
// let content: Vec<String> = content
// .to_lowercase()
// .replace(&['-'][..], " ")
// //should 's be replaced?
// .replace("'s", "")
// .replace(
// &[
// '(', ')', ',', '\"', '.', ';', ':', '=', '[', ']', '{', '}', '-', '_', '/', '\'',
// '', '?', '!', '“', '',
// ][..],
// "",
// )
// .split_whitespace()
// .map(String::from)
// .collect::<Vec<String>>();
// content
// }
// split_whitespace()
// benchmark: 9sec 379ms 790µs 900ns
// ╭───┬───────┬───────╮
// │ # │ value │ count │
// ├───┼───────┼───────┤
// │ 0 │ the │ 1683 │
// │ 1 │ and │ 783 │
// │ 2 │ to │ 778 │
// │ 3 │ a │ 667 │
// │ 4 │ of │ 605 │
// │ 5 │ she │ 485 │
// │ 6 │ said │ 416 │
// │ 7 │ in │ 406 │
// │ 8 │ it │ 357 │
// │ 9 │ was │ 329 │
// ├───┼───────┼───────┤
// │ # │ value │ count │
// ╰───┴───────┴───────╯
// current
// $alice | str downcase | split words | uniq -c | sort-by count --reverse | first 10
// benchmark: 1sec 481ms 604µs 700ns
// ╭───┬───────┬───────╮
// │ # │ value │ count │
// ├───┼───────┼───────┤
// │ 0 │ the │ 1839 │
// │ 1 │ and │ 942 │
// │ 2 │ to │ 811 │
// │ 3 │ a │ 695 │
// │ 4 │ of │ 638 │
// │ 5 │ it │ 610 │
// │ 6 │ she │ 553 │
// │ 7 │ i │ 546 │
// │ 8 │ you │ 486 │
// │ 9 │ said │ 462 │
// ├───┼───────┼───────┤
// │ # │ value │ count │
// ╰───┴───────┴───────╯
#[cfg(test)]
mod test {
    use super::*;
    use nu_test_support::nu;

    /// Passing `-b` and `-g` together (even with `-l`) must be reported as
    /// incompatible parameters.
    #[test]
    fn test_incompat_flags() {
        let outcome = nu!("'a' | split words -bg -l 2");
        assert!(outcome.err.contains("incompatible_parameters"));
    }

    /// `-g` without `-l` must be reported as incompatible parameters.
    #[test]
    fn test_incompat_flags_2() {
        let outcome = nu!("'a' | split words -g");
        assert!(outcome.err.contains("incompatible_parameters"));
    }

    /// Runs the command's declared examples through the crate's example checker.
    #[test]
    fn test_examples() {
        crate::test_examples(SubCommand {})
    }
}