Files
nushell/crates/nu-command/src/strings/str_/substring.rs
JT 786ba3bf91 Input output checking (#9680)
# Description

This PR tights input/output type-checking a bit more. There are a lot of
commands that don't have correct input/output types, so part of the
effort is updating them.

This PR now contains updates to commands that had wrong input/output
signatures. It doesn't add examples for these new signatures, but that
can be follow-up work.

# User-Facing Changes

BREAKING CHANGE BREAKING CHANGE

This work enforces many more checks on pipeline type correctness than
previous nushell versions. This strictness may uncover incompatibilities
in existing scripts or shortcomings in the type information for internal
commands.

# Tests + Formatting
<!--
Don't forget to add tests that cover your changes.

Make sure you've run and fixed any issues with these commands:

- `cargo fmt --all -- --check` to check standard code formatting (`cargo
fmt --all` applies these changes)
- `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used -A
clippy::needless_collect -A clippy::result_large_err` to check that
you're using the standard code style
- `cargo test --workspace` to check that all tests pass
- `cargo run -- -c "use std testing; testing run-tests --path
crates/nu-std"` to run the tests for the standard library

> **Note**
> from `nushell` you can also use the `toolkit` as follows
> ```bash
> use toolkit.nu # or use an `env_change` hook to activate it
automatically
> toolkit check pr
> ```
-->

# After Submitting
<!-- If your PR had any user-facing changes, update [the
documentation](https://github.com/nushell/nushell.github.io) after the
PR is merged, if necessary. This will help us keep the docs up to date.
-->
2023-07-14 15:20:35 +12:00

289 lines
9.3 KiB
Rust
Raw Blame History

use crate::grapheme_flags;
use nu_cmd_base::input_handler::{operate, CmdArgument};
use nu_cmd_base::util;
use nu_engine::CallExt;
use nu_protocol::ast::Call;
use nu_protocol::ast::CellPath;
use nu_protocol::engine::{Command, EngineState, Stack};
use nu_protocol::{
Example, PipelineData, Range, ShellError, Signature, Span, SyntaxShape, Type, Value,
};
use std::cmp::Ordering;
use unicode_segmentation::UnicodeSegmentation;
#[derive(Clone)]
pub struct SubCommand;
struct Arguments {
indexes: Substring,
cell_paths: Option<Vec<CellPath>>,
graphemes: bool,
}
impl CmdArgument for Arguments {
fn take_cell_paths(&mut self) -> Option<Vec<CellPath>> {
self.cell_paths.take()
}
}
#[derive(Clone)]
struct Substring(isize, isize);
impl From<(isize, isize)> for Substring {
fn from(input: (isize, isize)) -> Substring {
Substring(input.0, input.1)
}
}
impl Command for SubCommand {
fn name(&self) -> &str {
"str substring"
}
fn signature(&self) -> Signature {
Signature::build("str substring")
.input_output_types(vec![(Type::String, Type::String), (Type::Table(vec![]), Type::Table(vec![]))])
.vectorizes_over_list(true)
.allow_variants_without_examples(true)
.switch(
"grapheme-clusters",
"count indexes and split using grapheme clusters (all visible chars have length 1)",
Some('g'),
)
.switch(
"utf-8-bytes",
"count indexes and split using UTF-8 bytes (default; non-ASCII chars have length 2+)",
Some('b'),
)
.required(
"range",
SyntaxShape::Any,
"the indexes to substring [start end]",
)
.rest(
"rest",
SyntaxShape::CellPath,
"For a data structure input, turn strings at the given cell paths into substrings",
)
}
fn usage(&self) -> &str {
"Get part of a string. Note that the start is included but the end is excluded, and that the first character of a string is index 0."
}
fn search_terms(&self) -> Vec<&str> {
vec!["slice"]
}
fn run(
&self,
engine_state: &EngineState,
stack: &mut Stack,
call: &Call,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
let range: Range = call.req(engine_state, stack, 0)?;
let indexes = match util::process_range(&range) {
Ok(idxs) => idxs.into(),
Err(processing_error) => {
return Err(processing_error("could not perform substring", call.head))
}
};
let cell_paths: Vec<CellPath> = call.rest(engine_state, stack, 1)?;
let cell_paths = (!cell_paths.is_empty()).then_some(cell_paths);
let args = Arguments {
indexes,
cell_paths,
graphemes: grapheme_flags(call)?,
};
operate(action, args, input, call.head, engine_state.ctrlc.clone())
}
fn examples(&self) -> Vec<Example> {
vec![
Example {
description:
"Get a substring \"nushell\" from the text \"good nushell\" using a range",
example: " 'good nushell' | str substring 5..12",
result: Some(Value::test_string("nushell")),
},
Example {
description: "Count indexes and split using grapheme clusters",
example: " '🇯🇵ほげ ふが ぴよ' | str substring -g 4..6",
result: Some(Value::test_string("ふが")),
},
]
}
}
fn action(input: &Value, args: &Arguments, head: Span) -> Value {
let options = &args.indexes;
match input {
Value::String { val: s, .. } => {
let len: isize = s.len() as isize;
let start: isize = if options.0 < 0 {
options.0 + len
} else {
options.0
};
let end: isize = if options.1 < 0 {
std::cmp::max(len + options.1, 0)
} else {
options.1
};
if start < len && end >= 0 {
match start.cmp(&end) {
Ordering::Equal => Value::string("", head),
Ordering::Greater => Value::Error {
error: Box::new(ShellError::TypeMismatch {
err_message: "End must be greater than or equal to Start".to_string(),
span: head,
}),
},
Ordering::Less => Value::String {
val: {
if end == isize::max_value() {
if args.graphemes {
s.graphemes(true)
.skip(start as usize)
.collect::<Vec<&str>>()
.join("")
} else {
String::from_utf8_lossy(
&s.bytes().skip(start as usize).collect::<Vec<_>>(),
)
.to_string()
}
} else if args.graphemes {
s.graphemes(true)
.skip(start as usize)
.take((end - start) as usize)
.collect::<Vec<&str>>()
.join("")
} else {
String::from_utf8_lossy(
&s.bytes()
.skip(start as usize)
.take((end - start) as usize)
.collect::<Vec<_>>(),
)
.to_string()
}
},
span: head,
},
}
} else {
Value::string("", head)
}
}
// Propagate errors by explicitly matching them before the final case.
Value::Error { .. } => input.clone(),
other => Value::Error {
error: Box::new(ShellError::UnsupportedInput(
"Only string values are supported".into(),
format!("input type: {:?}", other.get_type()),
head,
// This line requires the Value::Error match above.
other.expect_span(),
)),
},
}
}
#[cfg(test)]
mod tests {
use super::{action, Arguments, Span, SubCommand, Substring, Value};
#[test]
fn test_examples() {
use crate::test_examples;
test_examples(SubCommand {})
}
struct Expectation<'a> {
options: (isize, isize),
expected: &'a str,
}
impl Expectation<'_> {
fn options(&self) -> Substring {
Substring(self.options.0, self.options.1)
}
}
fn expectation(word: &str, indexes: (isize, isize)) -> Expectation {
Expectation {
options: indexes,
expected: word,
}
}
#[test]
fn substrings_indexes() {
let word = Value::test_string("andres");
let cases = vec![
expectation("a", (0, 1)),
expectation("an", (0, 2)),
expectation("and", (0, 3)),
expectation("andr", (0, 4)),
expectation("andre", (0, 5)),
expectation("andres", (0, 6)),
expectation("", (0, -6)),
expectation("a", (0, -5)),
expectation("an", (0, -4)),
expectation("and", (0, -3)),
expectation("andr", (0, -2)),
expectation("andre", (0, -1)),
// str substring [ -4 , _ ]
// str substring -4 ,
expectation("dres", (-4, isize::max_value())),
expectation("", (0, -110)),
expectation("", (6, 0)),
expectation("", (6, -1)),
expectation("", (6, -2)),
expectation("", (6, -3)),
expectation("", (6, -4)),
expectation("", (6, -5)),
expectation("", (6, -6)),
];
for expectation in &cases {
let expected = expectation.expected;
let actual = action(
&word,
&Arguments {
indexes: expectation.options(),
cell_paths: None,
graphemes: false,
},
Span::test_data(),
);
assert_eq!(actual, Value::test_string(expected));
}
}
#[test]
fn use_utf8_bytes() {
let word = Value::String {
val: String::from("🇯🇵ほげ ふが ぴよ"),
span: Span::test_data(),
};
let options = Arguments {
cell_paths: None,
indexes: Substring(4, 5),
graphemes: false,
};
let actual = action(&word, &options, Span::test_data());
assert_eq!(actual, Value::test_string("<EFBFBD>"));
}
}