Replace ExternalStream with new ByteStream type (#12774)

# Description
This PR introduces a `ByteStream` type which is a `Read`-able stream of
bytes. Internally, it has an enum over three different byte stream
sources:
```rust
pub enum ByteStreamSource {
    Read(Box<dyn Read + Send + 'static>),
    File(File),
    Child(ChildProcess),
}
```

This is in comparison to the current `RawStream` type, which is an
`Iterator<Item = Vec<u8>>` and has to allocate for each read chunk.

Currently, `PipelineData::ExternalStream` serves a weird dual role where
it is either external command output or a wrapper around `RawStream`.
`ByteStream` makes this distinction more clear (via `ByteStreamSource`)
and replaces `PipelineData::ExternalStream` in this PR:
```rust
pub enum PipelineData {
    Empty,
    Value(Value, Option<PipelineMetadata>),
    ListStream(ListStream, Option<PipelineMetadata>),
    ByteStream(ByteStream, Option<PipelineMetadata>),
}
```

The PR is relatively large, but a decent amount of it is just repetitive
changes.

This PR fixes #7017, fixes #10763, and fixes #12369.

This PR also improves performance when piping external commands. Nushell
should, in most cases, have competitive pipeline throughput compared to,
e.g., bash.
| Command | Before (MB/s) | After (MB/s) | Bash (MB/s) |
| -------------------------------------------------- | -------------: | ------------: | -----------: |
| `throughput \| rg 'x'` | 3059 | 3744 | 3739 |
| `throughput \| nu --testbin relay o> /dev/null` | 3508 | 8087 | 8136 |

# User-Facing Changes
- This is a breaking change for the plugin communication protocol,
because the `ExternalStreamInfo` was replaced with `ByteStreamInfo`.
Plugins now only have to deal with a single input stream, as opposed to
the previous three streams: stdout, stderr, and exit code.
- The output of `describe` has been changed for external/byte streams.
- Temporary breaking change: `bytes starts-with` no longer works with
byte streams. This is to keep the PR smaller, and `bytes ends-with`
already does not work on byte streams.
- If a process core dumped, then instead of having a `Value::Error` in
the `exit_code` column of the output returned from `complete`, it now is
a `Value::Int` with the negation of the signal number.

# After Submitting
- Update docs and book as necessary
- Release notes (e.g., plugin protocol changes)
- Adapt/convert commands to work with byte streams (high priority is
`str length`, `bytes starts-with`, and maybe `bytes ends-with`).
- Refactor the `tee` code, Devyn has already done some work on this.

---------

Co-authored-by: Devyn Cairns <devyn.cairns@gmail.com>
This commit is contained in:
Ian Manske
2024-05-16 14:11:18 +00:00
committed by GitHub
parent 1b8eb23785
commit 6fd854ed9f
210 changed files with 3955 additions and 4012 deletions

View File

@ -59,7 +59,7 @@ impl Command for FromJson {
let (string_input, span, metadata) = input.collect_string_strict(span)?;
if string_input.is_empty() {
return Ok(PipelineData::new_with_metadata(metadata, span));
return Ok(Value::nothing(span).into_pipeline_data());
}
let strict = call.has_flag(engine_state, stack, "strict")?;

View File

@ -2,9 +2,8 @@
// implementation here is unique.
use std::{
collections::VecDeque,
error::Error,
io::{self, Cursor, ErrorKind, Write},
io::{self, Cursor, ErrorKind},
string::FromUtf8Error,
sync::{atomic::AtomicBool, Arc},
};
@ -12,7 +11,6 @@ use std::{
use byteorder::{BigEndian, ReadBytesExt};
use chrono::{TimeZone, Utc};
use nu_engine::command_prelude::*;
use nu_protocol::RawStream;
use rmp::decode::{self as mp, ValueReadError};
/// Max recursion depth
@ -121,12 +119,20 @@ MessagePack: https://msgpack.org/
read_msgpack(Cursor::new(bytes), opts)
}
// Deserialize from a raw stream directly without having to collect it
PipelineData::ExternalStream {
stdout: Some(raw_stream),
..
} => read_msgpack(ReadRawStream::new(raw_stream), opts),
PipelineData::ByteStream(stream, ..) => {
let span = stream.span();
if let Some(reader) = stream.reader() {
read_msgpack(reader, opts)
} else {
Err(ShellError::PipelineMismatch {
exp_input_type: "binary or byte stream".into(),
dst_span: call.head,
src_span: span,
})
}
}
input => Err(ShellError::PipelineMismatch {
exp_input_type: "binary".into(),
exp_input_type: "binary or byte stream".into(),
dst_span: call.head,
src_span: input.span().unwrap_or(call.head),
}),
@ -483,57 +489,6 @@ where
.map_err(|err| ReadError::Io(err, span))
}
/// Adapter to read MessagePack from a `RawStream`
///
/// TODO: contribute this back to `RawStream` in general, with more polish, if it works
pub(crate) struct ReadRawStream {
pub stream: RawStream,
// Use a `VecDeque` for read efficiency
pub leftover: VecDeque<u8>,
}
impl ReadRawStream {
pub(crate) fn new(mut stream: RawStream) -> ReadRawStream {
ReadRawStream {
leftover: std::mem::take(&mut stream.leftover).into(),
stream,
}
}
}
impl io::Read for ReadRawStream {
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
if buf.is_empty() {
Ok(0)
} else if !self.leftover.is_empty() {
// Take as many leftover bytes as possible
self.leftover.read(buf)
} else {
// Try to get data from the RawStream. We have to be careful not to break on a zero-len
// buffer though, since that would mean EOF
loop {
if let Some(result) = self.stream.stream.next() {
let bytes = result.map_err(|err| io::Error::new(ErrorKind::Other, err))?;
if !bytes.is_empty() {
let min_len = bytes.len().min(buf.len());
let (source, leftover_bytes) = bytes.split_at(min_len);
buf[0..min_len].copy_from_slice(source);
// Keep whatever bytes we couldn't use in the leftover vec
self.leftover.write_all(leftover_bytes)?;
return Ok(min_len);
} else {
// Zero-length buf, continue
continue;
}
} else {
// End of input
return Ok(0);
}
}
}
}
}
/// Return an error if this is not the end of file.
///
/// This can help detect if parsing succeeded incorrectly, perhaps due to corruption.

View File

@ -2,7 +2,7 @@ use std::io::Cursor;
use nu_engine::command_prelude::*;
use super::msgpack::{read_msgpack, Opts, ReadRawStream};
use super::msgpack::{read_msgpack, Opts};
const BUFFER_SIZE: usize = 65536;
@ -50,15 +50,21 @@ impl Command for FromMsgpackz {
read_msgpack(reader, opts)
}
// Deserialize from a raw stream directly without having to collect it
PipelineData::ExternalStream {
stdout: Some(raw_stream),
..
} => {
let reader = brotli::Decompressor::new(ReadRawStream::new(raw_stream), BUFFER_SIZE);
read_msgpack(reader, opts)
PipelineData::ByteStream(stream, ..) => {
let span = stream.span();
if let Some(reader) = stream.reader() {
let reader = brotli::Decompressor::new(reader, BUFFER_SIZE);
read_msgpack(reader, opts)
} else {
Err(ShellError::PipelineMismatch {
exp_input_type: "binary or byte stream".into(),
dst_span: call.head,
src_span: span,
})
}
}
_ => Err(ShellError::PipelineMismatch {
exp_input_type: "binary".into(),
exp_input_type: "binary or byte stream".into(),
dst_span: call.head,
src_span: span,
}),

View File

@ -81,28 +81,32 @@ fn convert_columns(columns: &[Value]) -> Result<Vec<String>, ShellError> {
}
fn collect_binary(input: PipelineData, span: Span) -> Result<Vec<u8>, ShellError> {
let mut bytes = vec![];
let mut values = input.into_iter();
if let PipelineData::ByteStream(stream, ..) = input {
stream.into_bytes()
} else {
let mut bytes = vec![];
let mut values = input.into_iter();
loop {
match values.next() {
Some(Value::Binary { val: b, .. }) => {
bytes.extend_from_slice(&b);
loop {
match values.next() {
Some(Value::Binary { val: b, .. }) => {
bytes.extend_from_slice(&b);
}
Some(Value::Error { error, .. }) => return Err(*error),
Some(x) => {
return Err(ShellError::UnsupportedInput {
msg: "Expected binary from pipeline".to_string(),
input: "value originates from here".into(),
msg_span: span,
input_span: x.span(),
})
}
None => break,
}
Some(Value::Error { error, .. }) => return Err(*error),
Some(x) => {
return Err(ShellError::UnsupportedInput {
msg: "Expected binary from pipeline".to_string(),
input: "value originates from here".into(),
msg_span: span,
input_span: x.span(),
})
}
None => break,
}
}
Ok(bytes)
Ok(bytes)
}
}
fn from_ods(

View File

@ -82,27 +82,31 @@ fn convert_columns(columns: &[Value]) -> Result<Vec<String>, ShellError> {
}
fn collect_binary(input: PipelineData, span: Span) -> Result<Vec<u8>, ShellError> {
let mut bytes = vec![];
let mut values = input.into_iter();
if let PipelineData::ByteStream(stream, ..) = input {
stream.into_bytes()
} else {
let mut bytes = vec![];
let mut values = input.into_iter();
loop {
match values.next() {
Some(Value::Binary { val: b, .. }) => {
bytes.extend_from_slice(&b);
loop {
match values.next() {
Some(Value::Binary { val: b, .. }) => {
bytes.extend_from_slice(&b);
}
Some(x) => {
return Err(ShellError::UnsupportedInput {
msg: "Expected binary from pipeline".to_string(),
input: "value originates from here".into(),
msg_span: span,
input_span: x.span(),
})
}
None => break,
}
Some(x) => {
return Err(ShellError::UnsupportedInput {
msg: "Expected binary from pipeline".to_string(),
input: "value originates from here".into(),
msg_span: span,
input_span: x.span(),
})
}
None => break,
}
}
Ok(bytes)
Ok(bytes)
}
}
fn from_xlsx(

View File

@ -150,7 +150,7 @@ pub fn to_delimited_data(
span: Span,
config: &Config,
) -> Result<PipelineData, ShellError> {
let value = input.into_value(span);
let value = input.into_value(span)?;
let output = match from_value_to_delimited_string(&value, sep, config, span) {
Ok(mut x) => {
if noheaders {

View File

@ -46,7 +46,7 @@ impl Command for ToJson {
let span = call.head;
// allow ranges to expand and turn into array
let input = input.try_expand_range()?;
let value = input.into_value(span);
let value = input.into_value(span)?;
let json_value = value_to_json_value(&value)?;
let json_result = if raw {

View File

@ -75,7 +75,7 @@ MessagePack: https://msgpack.org/
input: PipelineData,
) -> Result<PipelineData, ShellError> {
let value_span = input.span().unwrap_or(call.head);
let value = input.into_value(value_span);
let value = input.into_value(value_span)?;
let mut out = vec![];
write_value(&mut out, &value, 0)?;

View File

@ -70,7 +70,7 @@ impl Command for ToMsgpackz {
.transpose()?;
let value_span = input.span().unwrap_or(call.head);
let value = input.into_value(value_span);
let value = input.into_value(value_span)?;
let mut out_buf = vec![];
let mut out = brotli::CompressorWriter::new(
&mut out_buf,

View File

@ -53,7 +53,7 @@ impl Command for ToNuon {
};
let span = call.head;
let value = input.into_value(span);
let value = input.into_value(span)?;
match nuon::to_nuon(&value, style, Some(span)) {
Ok(serde_nuon_string) => {

View File

@ -1,6 +1,12 @@
use chrono_humanize::HumanTime;
use nu_engine::command_prelude::*;
use nu_protocol::{format_duration, format_filesize_from_conf, Config, RawStream, ValueIterator};
use nu_protocol::{format_duration, format_filesize_from_conf, ByteStream, Config};
const LINE_ENDING: &str = if cfg!(target_os = "windows") {
"\r\n"
} else {
"\n"
};
#[derive(Clone)]
pub struct ToText;
@ -28,39 +34,28 @@ impl Command for ToText {
input: PipelineData,
) -> Result<PipelineData, ShellError> {
let span = call.head;
let config = engine_state.get_config();
let line_ending = if cfg!(target_os = "windows") {
"\r\n"
} else {
"\n"
};
let input = input.try_expand_range()?;
if let PipelineData::ListStream(stream, _) = input {
Ok(PipelineData::ExternalStream {
stdout: Some(RawStream::new(
Box::new(ListStreamIterator {
stream: stream.into_inner(),
separator: line_ending.into(),
config: config.clone(),
}),
engine_state.ctrlc.clone(),
span,
None,
)),
stderr: None,
exit_code: None,
span,
metadata: None,
trim_end_newline: false,
})
} else {
// FIXME: don't collect! stream the output wherever possible!
// Even if the data is collected when it arrives at `to text`, we should be able to stream it out
let collected_input = local_into_string(input.into_value(span), line_ending, config);
Ok(Value::string(collected_input, span).into_pipeline_data())
match input {
PipelineData::Empty => Ok(Value::string(String::new(), span).into_pipeline_data()),
PipelineData::Value(value, ..) => {
let str = local_into_string(value, LINE_ENDING, engine_state.get_config());
Ok(Value::string(str, span).into_pipeline_data())
}
PipelineData::ListStream(stream, meta) => {
let span = stream.span();
let config = engine_state.get_config().clone();
let iter = stream.into_inner().map(move |value| {
let mut str = local_into_string(value, LINE_ENDING, &config);
str.push_str(LINE_ENDING);
str
});
Ok(PipelineData::ByteStream(
ByteStream::from_iter(iter, span, engine_state.ctrlc.clone()),
meta,
))
}
PipelineData::ByteStream(stream, meta) => Ok(PipelineData::ByteStream(stream, meta)),
}
}
@ -85,26 +80,6 @@ impl Command for ToText {
}
}
struct ListStreamIterator {
stream: ValueIterator,
separator: String,
config: Config,
}
impl Iterator for ListStreamIterator {
type Item = Result<Vec<u8>, ShellError>;
fn next(&mut self) -> Option<Self::Item> {
if let Some(item) = self.stream.next() {
let mut string = local_into_string(item, &self.separator, &self.config);
string.push_str(&self.separator);
Some(Ok(string.as_bytes().to_vec()))
} else {
None
}
}
}
fn local_into_string(value: Value, separator: &str, config: &Config) -> String {
let span = value.span();
match value {

View File

@ -141,7 +141,7 @@ fn to_toml(
input: PipelineData,
span: Span,
) -> Result<PipelineData, ShellError> {
let value = input.into_value(span);
let value = input.into_value(span)?;
let toml_value = value_to_toml_value(engine_state, &value, span)?;
match toml_value {

View File

@ -132,7 +132,7 @@ impl Job {
}
fn run(mut self, input: PipelineData, head: Span) -> Result<PipelineData, ShellError> {
let value = input.into_value(head);
let value = input.into_value(head)?;
self.write_xml_entry(value, true).and_then(|_| {
let b = self.writer.into_inner().into_inner();

View File

@ -95,7 +95,7 @@ pub fn value_to_yaml_value(v: &Value) -> Result<serde_yaml::Value, ShellError> {
}
fn to_yaml(input: PipelineData, head: Span) -> Result<PipelineData, ShellError> {
let value = input.into_value(head);
let value = input.into_value(head)?;
let yaml_value = value_to_yaml_value(&value)?;
match serde_yaml::to_string(&yaml_value) {