mirror of
https://github.com/nushell/nushell.git
synced 2025-01-06 22:40:01 +01:00
Add bytes split
command (#14652)
Related #10708 # Description Add `bytes split` command. `bytes split` splits its input on the provided separator on binary values _and_ binary streams without collecting. The separator can be a multiple character string or multiple byte binary. It can be used when neither `split row` (not streaming over raw input) nor `lines` (streaming, but can only split on newlines) is right. The backing iterator implemented in this PR, `SplitRead`, can be used to implement a streaming `split row` in the future. # User-Facing Changes `bytes split` command added, which can be used to split binary values and raw streams using a separator. # Tests + Formatting - 🟢 toolkit fmt - 🟢 toolkit clippy - 🟢 toolkit test - 🟢 toolkit test stdlib # After Submitting Mention in release notes.
This commit is contained in:
parent
23ba613b00
commit
469e23cae4
1
Cargo.lock
generated
1
Cargo.lock
generated
@ -4055,6 +4055,7 @@ dependencies = [
|
|||||||
"indexmap",
|
"indexmap",
|
||||||
"log",
|
"log",
|
||||||
"lru",
|
"lru",
|
||||||
|
"memchr",
|
||||||
"miette",
|
"miette",
|
||||||
"nix 0.29.0",
|
"nix 0.29.0",
|
||||||
"nu-derive-value",
|
"nu-derive-value",
|
||||||
|
@ -183,6 +183,7 @@ which = "7.0.0"
|
|||||||
windows = "0.56"
|
windows = "0.56"
|
||||||
windows-sys = "0.48"
|
windows-sys = "0.48"
|
||||||
winreg = "0.52"
|
winreg = "0.52"
|
||||||
|
memchr = "2.7.4"
|
||||||
|
|
||||||
[workspace.lints.clippy]
|
[workspace.lints.clippy]
|
||||||
# Warning: workspace lints affect library code as well as tests, so don't enable lints that would be too noisy in tests like that.
|
# Warning: workspace lints affect library code as well as tests, so don't enable lints that would be too noisy in tests like that.
|
||||||
|
@ -9,6 +9,7 @@ mod length;
|
|||||||
mod remove;
|
mod remove;
|
||||||
mod replace;
|
mod replace;
|
||||||
mod reverse;
|
mod reverse;
|
||||||
|
mod split;
|
||||||
mod starts_with;
|
mod starts_with;
|
||||||
|
|
||||||
pub use add::BytesAdd;
|
pub use add::BytesAdd;
|
||||||
@ -22,4 +23,5 @@ pub use length::BytesLen;
|
|||||||
pub use remove::BytesRemove;
|
pub use remove::BytesRemove;
|
||||||
pub use replace::BytesReplace;
|
pub use replace::BytesReplace;
|
||||||
pub use reverse::BytesReverse;
|
pub use reverse::BytesReverse;
|
||||||
|
pub use split::BytesSplit;
|
||||||
pub use starts_with::BytesStartsWith;
|
pub use starts_with::BytesStartsWith;
|
||||||
|
109
crates/nu-command/src/bytes/split.rs
Normal file
109
crates/nu-command/src/bytes/split.rs
Normal file
@ -0,0 +1,109 @@
|
|||||||
|
use nu_engine::command_prelude::*;
|
||||||
|
|
||||||
|
#[derive(Clone)]
|
||||||
|
pub struct BytesSplit;
|
||||||
|
|
||||||
|
impl Command for BytesSplit {
|
||||||
|
fn name(&self) -> &str {
|
||||||
|
"bytes split"
|
||||||
|
}
|
||||||
|
|
||||||
|
fn signature(&self) -> Signature {
|
||||||
|
Signature::build("bytes split")
|
||||||
|
.input_output_types(vec![(Type::Binary, Type::list(Type::Binary))])
|
||||||
|
.required(
|
||||||
|
"separator",
|
||||||
|
SyntaxShape::OneOf(vec![SyntaxShape::Binary, SyntaxShape::String]),
|
||||||
|
"Bytes or string that the input will be split on (must be non-empty).",
|
||||||
|
)
|
||||||
|
.category(Category::Bytes)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn description(&self) -> &str {
|
||||||
|
"Split input into multiple items using a separator."
|
||||||
|
}
|
||||||
|
|
||||||
|
fn search_terms(&self) -> Vec<&str> {
|
||||||
|
vec!["separate", "stream"]
|
||||||
|
}
|
||||||
|
|
||||||
|
fn examples(&self) -> Vec<Example> {
|
||||||
|
vec![
|
||||||
|
Example {
|
||||||
|
example: r#"0x[66 6F 6F 20 62 61 72 20 62 61 7A 20] | bytes split 0x[20]"#,
|
||||||
|
description: "Split a binary value using a binary separator",
|
||||||
|
result: Some(Value::test_list(vec![
|
||||||
|
Value::test_binary("foo"),
|
||||||
|
Value::test_binary("bar"),
|
||||||
|
Value::test_binary("baz"),
|
||||||
|
Value::test_binary(""),
|
||||||
|
])),
|
||||||
|
},
|
||||||
|
Example {
|
||||||
|
example: r#"0x[61 2D 2D 62 2D 2D 63] | bytes split "--""#,
|
||||||
|
description: "Split a binary value using a string separator",
|
||||||
|
result: Some(Value::test_list(vec![
|
||||||
|
Value::test_binary("a"),
|
||||||
|
Value::test_binary("b"),
|
||||||
|
Value::test_binary("c"),
|
||||||
|
])),
|
||||||
|
},
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
fn run(
|
||||||
|
&self,
|
||||||
|
engine_state: &EngineState,
|
||||||
|
stack: &mut Stack,
|
||||||
|
call: &Call,
|
||||||
|
input: PipelineData,
|
||||||
|
) -> Result<PipelineData, ShellError> {
|
||||||
|
let head = call.head;
|
||||||
|
let Spanned {
|
||||||
|
item: separator,
|
||||||
|
span,
|
||||||
|
}: Spanned<Vec<u8>> = call.req(engine_state, stack, 0)?;
|
||||||
|
|
||||||
|
if separator.is_empty() {
|
||||||
|
return Err(ShellError::IncorrectValue {
|
||||||
|
msg: "Separator can't be empty".into(),
|
||||||
|
val_span: span,
|
||||||
|
call_span: call.head,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
let (split_read, md) = match input {
|
||||||
|
PipelineData::Value(Value::Binary { val, .. }, md) => (
|
||||||
|
ByteStream::read_binary(val, head, engine_state.signals().clone()).split(separator),
|
||||||
|
md,
|
||||||
|
),
|
||||||
|
PipelineData::ByteStream(stream, md) => (stream.split(separator), md),
|
||||||
|
input => {
|
||||||
|
let span = input.span().unwrap_or(head);
|
||||||
|
return Err(input.unsupported_input_error("bytes", span));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
if let Some(split) = split_read {
|
||||||
|
Ok(split
|
||||||
|
.map(move |part| match part {
|
||||||
|
Ok(val) => Value::binary(val, head),
|
||||||
|
Err(err) => Value::error(err, head),
|
||||||
|
})
|
||||||
|
.into_pipeline_data_with_metadata(head, engine_state.signals().clone(), md))
|
||||||
|
} else {
|
||||||
|
Ok(PipelineData::empty())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod test {
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_examples() {
|
||||||
|
use crate::test_examples;
|
||||||
|
|
||||||
|
test_examples(BytesSplit {})
|
||||||
|
}
|
||||||
|
}
|
@ -380,6 +380,7 @@ pub fn add_shell_command_context(mut engine_state: EngineState) -> EngineState {
|
|||||||
bind_command! {
|
bind_command! {
|
||||||
Bytes,
|
Bytes,
|
||||||
BytesLen,
|
BytesLen,
|
||||||
|
BytesSplit,
|
||||||
BytesStartsWith,
|
BytesStartsWith,
|
||||||
BytesEndsWith,
|
BytesEndsWith,
|
||||||
BytesReverse,
|
BytesReverse,
|
||||||
|
@ -41,6 +41,7 @@ typetag = "0.2"
|
|||||||
os_pipe = { workspace = true, optional = true, features = ["io_safety"] }
|
os_pipe = { workspace = true, optional = true, features = ["io_safety"] }
|
||||||
log = { workspace = true }
|
log = { workspace = true }
|
||||||
web-time = { workspace = true }
|
web-time = { workspace = true }
|
||||||
|
memchr = { workspace = true }
|
||||||
|
|
||||||
[target.'cfg(unix)'.dependencies]
|
[target.'cfg(unix)'.dependencies]
|
||||||
nix = { workspace = true, default-features = false, features = ["signal"] }
|
nix = { workspace = true, default-features = false, features = ["signal"] }
|
||||||
|
@ -415,6 +415,18 @@ impl ByteStream {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Convert the [`ByteStream`] into a [`SplitRead`] iterator where each element is a `Result<String, ShellError>`.
|
||||||
|
///
|
||||||
|
/// Each call to [`next`](Iterator::next) reads the currently available data from the byte
|
||||||
|
/// stream source, until `delimiter` or the end of the stream is encountered.
|
||||||
|
///
|
||||||
|
/// If the source of the [`ByteStream`] is [`ByteStreamSource::Child`] and the child has no stdout,
|
||||||
|
/// then the stream is considered empty and `None` will be returned.
|
||||||
|
pub fn split(self, delimiter: Vec<u8>) -> Option<SplitRead> {
|
||||||
|
let reader = self.stream.reader()?;
|
||||||
|
Some(SplitRead::new(reader, delimiter, self.span, self.signals))
|
||||||
|
}
|
||||||
|
|
||||||
/// Convert the [`ByteStream`] into a [`Chunks`] iterator where each element is a `Result<Value, ShellError>`.
|
/// Convert the [`ByteStream`] into a [`Chunks`] iterator where each element is a `Result<Value, ShellError>`.
|
||||||
///
|
///
|
||||||
/// Each call to [`next`](Iterator::next) reads the currently available data from the byte stream source,
|
/// Each call to [`next`](Iterator::next) reads the currently available data from the byte stream source,
|
||||||
@ -746,6 +758,200 @@ impl Iterator for Lines {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
mod split_read {
|
||||||
|
use std::io::{BufRead, ErrorKind};
|
||||||
|
|
||||||
|
use memchr::memmem::Finder;
|
||||||
|
|
||||||
|
pub struct SplitRead<R> {
|
||||||
|
reader: Option<R>,
|
||||||
|
buf: Option<Vec<u8>>,
|
||||||
|
finder: Finder<'static>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<R: BufRead> SplitRead<R> {
|
||||||
|
pub fn new(reader: R, delim: impl AsRef<[u8]>) -> Self {
|
||||||
|
// empty delimiter results in an infinite stream of empty items
|
||||||
|
debug_assert!(!delim.as_ref().is_empty(), "delimiter can't be empty");
|
||||||
|
Self {
|
||||||
|
reader: Some(reader),
|
||||||
|
buf: Some(Vec::new()),
|
||||||
|
finder: Finder::new(delim.as_ref()).into_owned(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<R: BufRead> Iterator for SplitRead<R> {
|
||||||
|
type Item = Result<Vec<u8>, std::io::Error>;
|
||||||
|
|
||||||
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
|
let buf = self.buf.as_mut()?;
|
||||||
|
let mut search_start = 0usize;
|
||||||
|
|
||||||
|
loop {
|
||||||
|
if let Some(i) = self.finder.find(&buf[search_start..]) {
|
||||||
|
let needle_idx = search_start + i;
|
||||||
|
let right = buf.split_off(needle_idx + self.finder.needle().len());
|
||||||
|
buf.truncate(needle_idx);
|
||||||
|
let left = std::mem::replace(buf, right);
|
||||||
|
return Some(Ok(left));
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(mut r) = self.reader.take() {
|
||||||
|
search_start = buf.len().saturating_sub(self.finder.needle().len() + 1);
|
||||||
|
let available = match r.fill_buf() {
|
||||||
|
Ok(n) => n,
|
||||||
|
Err(ref e) if e.kind() == ErrorKind::Interrupted => continue,
|
||||||
|
Err(e) => return Some(Err(e)),
|
||||||
|
};
|
||||||
|
|
||||||
|
buf.extend_from_slice(available);
|
||||||
|
let used = available.len();
|
||||||
|
r.consume(used);
|
||||||
|
if used != 0 {
|
||||||
|
self.reader = Some(r);
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
} else {
|
||||||
|
return self.buf.take().map(Ok);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
use std::io::{self, Cursor, Read};
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn simple() {
|
||||||
|
let s = "foo-bar-baz";
|
||||||
|
let cursor = Cursor::new(String::from(s));
|
||||||
|
let mut split =
|
||||||
|
SplitRead::new(cursor, "-").map(|r| String::from_utf8(r.unwrap()).unwrap());
|
||||||
|
|
||||||
|
assert_eq!(split.next().as_deref(), Some("foo"));
|
||||||
|
assert_eq!(split.next().as_deref(), Some("bar"));
|
||||||
|
assert_eq!(split.next().as_deref(), Some("baz"));
|
||||||
|
assert_eq!(split.next(), None);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn with_empty_fields() -> Result<(), io::Error> {
|
||||||
|
let s = "\0\0foo\0\0bar\0\0\0\0baz\0\0";
|
||||||
|
let cursor = Cursor::new(String::from(s));
|
||||||
|
let mut split =
|
||||||
|
SplitRead::new(cursor, "\0\0").map(|r| String::from_utf8(r.unwrap()).unwrap());
|
||||||
|
|
||||||
|
assert_eq!(split.next().as_deref(), Some(""));
|
||||||
|
assert_eq!(split.next().as_deref(), Some("foo"));
|
||||||
|
assert_eq!(split.next().as_deref(), Some("bar"));
|
||||||
|
assert_eq!(split.next().as_deref(), Some(""));
|
||||||
|
assert_eq!(split.next().as_deref(), Some("baz"));
|
||||||
|
assert_eq!(split.next().as_deref(), Some(""));
|
||||||
|
assert_eq!(split.next().as_deref(), None);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn complex_delimiter() -> Result<(), io::Error> {
|
||||||
|
let s = "<|>foo<|>bar<|><|>baz<|>";
|
||||||
|
let cursor = Cursor::new(String::from(s));
|
||||||
|
let mut split =
|
||||||
|
SplitRead::new(cursor, "<|>").map(|r| String::from_utf8(r.unwrap()).unwrap());
|
||||||
|
|
||||||
|
assert_eq!(split.next().as_deref(), Some(""));
|
||||||
|
assert_eq!(split.next().as_deref(), Some("foo"));
|
||||||
|
assert_eq!(split.next().as_deref(), Some("bar"));
|
||||||
|
assert_eq!(split.next().as_deref(), Some(""));
|
||||||
|
assert_eq!(split.next().as_deref(), Some("baz"));
|
||||||
|
assert_eq!(split.next().as_deref(), Some(""));
|
||||||
|
assert_eq!(split.next().as_deref(), None);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn all_empty() -> Result<(), io::Error> {
|
||||||
|
let s = "<><>";
|
||||||
|
let cursor = Cursor::new(String::from(s));
|
||||||
|
let mut split =
|
||||||
|
SplitRead::new(cursor, "<>").map(|r| String::from_utf8(r.unwrap()).unwrap());
|
||||||
|
|
||||||
|
assert_eq!(split.next().as_deref(), Some(""));
|
||||||
|
assert_eq!(split.next().as_deref(), Some(""));
|
||||||
|
assert_eq!(split.next().as_deref(), Some(""));
|
||||||
|
assert_eq!(split.next(), None);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[should_panic = "delimiter can't be empty"]
|
||||||
|
#[test]
|
||||||
|
fn empty_delimiter() {
|
||||||
|
let s = "abc";
|
||||||
|
let cursor = Cursor::new(String::from(s));
|
||||||
|
let _split = SplitRead::new(cursor, "").map(|e| e.unwrap());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn delimiter_spread_across_reads() {
|
||||||
|
let reader = Cursor::new("<|>foo<|")
|
||||||
|
.chain(Cursor::new(">bar<|><"))
|
||||||
|
.chain(Cursor::new("|>baz<|>"));
|
||||||
|
|
||||||
|
let mut split =
|
||||||
|
SplitRead::new(reader, "<|>").map(|r| String::from_utf8(r.unwrap()).unwrap());
|
||||||
|
|
||||||
|
assert_eq!(split.next().unwrap(), "");
|
||||||
|
assert_eq!(split.next().unwrap(), "foo");
|
||||||
|
assert_eq!(split.next().unwrap(), "bar");
|
||||||
|
assert_eq!(split.next().unwrap(), "");
|
||||||
|
assert_eq!(split.next().unwrap(), "baz");
|
||||||
|
assert_eq!(split.next().unwrap(), "");
|
||||||
|
assert_eq!(split.next(), None);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct SplitRead {
|
||||||
|
internal: split_read::SplitRead<BufReader<SourceReader>>,
|
||||||
|
span: Span,
|
||||||
|
signals: Signals,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SplitRead {
|
||||||
|
fn new(
|
||||||
|
reader: SourceReader,
|
||||||
|
delimiter: impl AsRef<[u8]>,
|
||||||
|
span: Span,
|
||||||
|
signals: Signals,
|
||||||
|
) -> Self {
|
||||||
|
Self {
|
||||||
|
internal: split_read::SplitRead::new(BufReader::new(reader), delimiter),
|
||||||
|
span,
|
||||||
|
signals,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn span(&self) -> Span {
|
||||||
|
self.span
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Iterator for SplitRead {
|
||||||
|
type Item = Result<Vec<u8>, ShellError>;
|
||||||
|
|
||||||
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
|
if self.signals.interrupted() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
self.internal.next().map(|r| r.map_err(|e| e.into()))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Turn a readable stream into [`Value`]s.
|
/// Turn a readable stream into [`Value`]s.
|
||||||
///
|
///
|
||||||
/// The `Value` type depends on the type of the stream ([`ByteStreamType`]). If `Unknown`, the
|
/// The `Value` type depends on the type of the stream ([`ByteStreamType`]). If `Unknown`, the
|
||||||
|
Loading…
Reference in New Issue
Block a user