Replace ExternalStream with new ByteStream type (#12774)

# Description
This PR introduces a `ByteStream` type which is a `Read`-able stream of
bytes. Internally, it has an enum over three different byte stream
sources:
```rust
pub enum ByteStreamSource {
    Read(Box<dyn Read + Send + 'static>),
    File(File),
    Child(ChildProcess),
}
```

This is in comparison to the current `RawStream` type, which is an
`Iterator<Item = Vec<u8>>` and has to allocate for each read chunk.

Currently, `PipelineData::ExternalStream` serves a weird dual role where
it is either external command output or a wrapper around `RawStream`.
`ByteStream` makes this distinction more clear (via `ByteStreamSource`)
and replaces `PipelineData::ExternalStream` in this PR:
```rust
pub enum PipelineData {
    Empty,
    Value(Value, Option<PipelineMetadata>),
    ListStream(ListStream, Option<PipelineMetadata>),
    ByteStream(ByteStream, Option<PipelineMetadata>),
}
```

The PR is relatively large, but a decent amount of it is just repetitive
changes.

This PR fixes #7017, fixes #10763, and fixes #12369.

This PR also improves performance when piping external commands. Nushell
should, in most cases, have competitive pipeline throughput compared to,
e.g., bash.
| Command | Before (MB/s) | After (MB/s) | Bash (MB/s) |
| -------------------------------------------------- | -------------:|
------------:| -----------:|
| `throughput \| rg 'x'` | 3059 | 3744 | 3739 |
| `throughput \| nu --testbin relay o> /dev/null` | 3508 | 8087 | 8136 |

# User-Facing Changes
- This is a breaking change for the plugin communication protocol,
because the `ExternalStreamInfo` was replaced with `ByteStreamInfo`.
Plugins now only have to deal with a single input stream, as opposed to
the previous three streams: stdout, stderr, and exit code.
- The output of `describe` has been changed for external/byte streams.
- Temporary breaking change: `bytes starts-with` no longer works with
byte streams. This is to keep the PR smaller, and `bytes ends-with`
already does not work on byte streams.
- If a process core dumped, then instead of having a `Value::Error` in
the `exit_code` column of the output returned from `complete`, it now is
a `Value::Int` with the negation of the signal number.

# After Submitting
- Update docs and book as necessary
- Release notes (e.g., plugin protocol changes)
- Adapt/convert commands to work with byte streams (high priority is
`str length`, `bytes starts-with`, and maybe `bytes ends-with`).
- Refactor the `tee` code, Devyn has already done some work on this.

---------

Co-authored-by: Devyn Cairns <devyn.cairns@gmail.com>
This commit is contained in:
Ian Manske
2024-05-16 14:11:18 +00:00
committed by GitHub
parent 1b8eb23785
commit 6fd854ed9f
210 changed files with 3955 additions and 4012 deletions

View File

@ -1,15 +1,10 @@
//! Implements the stream multiplexing interface for both the plugin side and the engine side.
use nu_plugin_protocol::{
ExternalStreamInfo, ListStreamInfo, PipelineDataHeader, RawStreamInfo, StreamMessage,
};
use nu_protocol::{ListStream, PipelineData, RawStream, ShellError};
use nu_plugin_protocol::{ByteStreamInfo, ListStreamInfo, PipelineDataHeader, StreamMessage};
use nu_protocol::{ByteStream, IntoSpanned, ListStream, PipelineData, Reader, ShellError};
use std::{
io::Write,
sync::{
atomic::{AtomicBool, Ordering::Relaxed},
Arc, Mutex,
},
io::{Read, Write},
sync::{atomic::AtomicBool, Arc, Mutex},
thread,
};
@ -185,31 +180,10 @@ pub trait InterfaceManager {
let reader = handle.read_stream(info.id, self.get_interface())?;
ListStream::new(reader, info.span, ctrlc.cloned()).into()
}
PipelineDataHeader::ExternalStream(info) => {
PipelineDataHeader::ByteStream(info) => {
let handle = self.stream_manager().get_handle();
let span = info.span;
let new_raw_stream = |raw_info: RawStreamInfo| {
let reader = handle.read_stream(raw_info.id, self.get_interface())?;
let mut stream =
RawStream::new(Box::new(reader), ctrlc.cloned(), span, raw_info.known_size);
stream.is_binary = raw_info.is_binary;
Ok::<_, ShellError>(stream)
};
PipelineData::ExternalStream {
stdout: info.stdout.map(new_raw_stream).transpose()?,
stderr: info.stderr.map(new_raw_stream).transpose()?,
exit_code: info
.exit_code
.map(|list_info| {
handle
.read_stream(list_info.id, self.get_interface())
.map(|reader| ListStream::new(reader, info.span, ctrlc.cloned()))
})
.transpose()?,
span: info.span,
metadata: None,
trim_end_newline: info.trim_end_newline,
}
let reader = handle.read_stream(info.id, self.get_interface())?;
ByteStream::from_result_iter(reader, info.span, ctrlc.cloned()).into()
}
})
}
@ -271,11 +245,11 @@ pub trait Interface: Clone + Send {
Ok::<_, ShellError>((id, writer))
};
match self.prepare_pipeline_data(data, context)? {
PipelineData::Value(value, _) => {
PipelineData::Value(value, ..) => {
Ok((PipelineDataHeader::Value(value), PipelineDataWriter::None))
}
PipelineData::Empty => Ok((PipelineDataHeader::Empty, PipelineDataWriter::None)),
PipelineData::ListStream(stream, _) => {
PipelineData::ListStream(stream, ..) => {
let (id, writer) = new_stream(LIST_STREAM_HIGH_PRESSURE)?;
Ok((
PipelineDataHeader::ListStream(ListStreamInfo {
@ -285,50 +259,15 @@ pub trait Interface: Clone + Send {
PipelineDataWriter::ListStream(writer, stream),
))
}
PipelineData::ExternalStream {
stdout,
stderr,
exit_code,
span,
metadata: _,
trim_end_newline,
} => {
// Create the writers and stream ids
let stdout_stream = stdout
.is_some()
.then(|| new_stream(RAW_STREAM_HIGH_PRESSURE))
.transpose()?;
let stderr_stream = stderr
.is_some()
.then(|| new_stream(RAW_STREAM_HIGH_PRESSURE))
.transpose()?;
let exit_code_stream = exit_code
.is_some()
.then(|| new_stream(LIST_STREAM_HIGH_PRESSURE))
.transpose()?;
// Generate the header, with the stream ids
let header = PipelineDataHeader::ExternalStream(ExternalStreamInfo {
span,
stdout: stdout
.as_ref()
.zip(stdout_stream.as_ref())
.map(|(stream, (id, _))| RawStreamInfo::new(*id, stream)),
stderr: stderr
.as_ref()
.zip(stderr_stream.as_ref())
.map(|(stream, (id, _))| RawStreamInfo::new(*id, stream)),
exit_code: exit_code_stream
.as_ref()
.map(|&(id, _)| ListStreamInfo { id, span }),
trim_end_newline,
});
// Collect the writers
let writer = PipelineDataWriter::ExternalStream {
stdout: stdout_stream.map(|(_, writer)| writer).zip(stdout),
stderr: stderr_stream.map(|(_, writer)| writer).zip(stderr),
exit_code: exit_code_stream.map(|(_, writer)| writer).zip(exit_code),
};
Ok((header, writer))
PipelineData::ByteStream(stream, ..) => {
let span = stream.span();
if let Some(reader) = stream.reader() {
let (id, writer) = new_stream(RAW_STREAM_HIGH_PRESSURE)?;
let header = PipelineDataHeader::ByteStream(ByteStreamInfo { id, span });
Ok((header, PipelineDataWriter::ByteStream(writer, reader)))
} else {
Ok((PipelineDataHeader::Empty, PipelineDataWriter::None))
}
}
}
}
@ -355,11 +294,7 @@ pub enum PipelineDataWriter<W: WriteStreamMessage> {
#[default]
None,
ListStream(StreamWriter<W>, ListStream),
ExternalStream {
stdout: Option<(StreamWriter<W>, RawStream)>,
stderr: Option<(StreamWriter<W>, RawStream)>,
exit_code: Option<(StreamWriter<W>, ListStream)>,
},
ByteStream(StreamWriter<W>, Reader),
}
impl<W> PipelineDataWriter<W>
@ -376,49 +311,16 @@ where
writer.write_all(stream)?;
Ok(())
}
// Write all three possible streams of an ExternalStream on separate threads.
PipelineDataWriter::ExternalStream {
stdout,
stderr,
exit_code,
} => {
thread::scope(|scope| {
let stderr_thread = stderr
.map(|(mut writer, stream)| {
thread::Builder::new()
.name("plugin stderr writer".into())
.spawn_scoped(scope, move || {
writer.write_all(raw_stream_iter(stream))
})
})
.transpose()?;
let exit_code_thread = exit_code
.map(|(mut writer, stream)| {
thread::Builder::new()
.name("plugin exit_code writer".into())
.spawn_scoped(scope, move || writer.write_all(stream))
})
.transpose()?;
// Optimize for stdout: if only stdout is present, don't spawn any other
// threads.
if let Some((mut writer, stream)) = stdout {
writer.write_all(raw_stream_iter(stream))?;
}
let panicked = |thread_name: &str| {
Err(ShellError::NushellFailed {
msg: format!(
"{thread_name} thread panicked in PipelineDataWriter::write"
),
})
};
stderr_thread
.map(|t| t.join().unwrap_or_else(|_| panicked("stderr")))
.transpose()?;
exit_code_thread
.map(|t| t.join().unwrap_or_else(|_| panicked("exit_code")))
.transpose()?;
Ok(())
})
// Write a byte stream.
PipelineDataWriter::ByteStream(mut writer, mut reader) => {
let span = reader.span();
let buf = &mut [0; 8192];
writer.write_all(std::iter::from_fn(move || match reader.read(buf) {
Ok(0) => None,
Ok(len) => Some(Ok(buf[..len].to_vec())),
Err(err) => Some(Err(ShellError::from(err.into_spanned(span)))),
}))?;
Ok(())
}
}
}
@ -446,11 +348,3 @@ where
}
}
}
/// Custom iterator for [`RawStream`] that respects ctrlc, but still has binary chunks
fn raw_stream_iter(stream: RawStream) -> impl Iterator<Item = Result<Vec<u8>, ShellError>> {
let ctrlc = stream.ctrlc;
stream
.stream
.take_while(move |_| ctrlc.as_ref().map(|b| !b.load(Relaxed)).unwrap_or(true))
}

View File

@ -6,11 +6,12 @@ use super::{
Interface, InterfaceManager, PluginRead, PluginWrite,
};
use nu_plugin_protocol::{
ExternalStreamInfo, ListStreamInfo, PipelineDataHeader, PluginInput, PluginOutput,
RawStreamInfo, StreamData, StreamMessage,
ByteStreamInfo, ListStreamInfo, PipelineDataHeader, PluginInput, PluginOutput, StreamData,
StreamMessage,
};
use nu_protocol::{
DataSource, ListStream, PipelineData, PipelineMetadata, RawStream, ShellError, Span, Value,
ByteStream, ByteStreamSource, DataSource, ListStream, PipelineData, PipelineMetadata,
ShellError, Span, Value,
};
use std::{path::Path, sync::Arc};
@ -140,9 +141,9 @@ fn read_pipeline_data_value() -> Result<(), ShellError> {
let header = PipelineDataHeader::Value(value.clone());
match manager.read_pipeline_data(header, None)? {
PipelineData::Value(read_value, _) => assert_eq!(value, read_value),
PipelineData::ListStream(_, _) => panic!("unexpected ListStream"),
PipelineData::ExternalStream { .. } => panic!("unexpected ExternalStream"),
PipelineData::Value(read_value, ..) => assert_eq!(value, read_value),
PipelineData::ListStream(..) => panic!("unexpected ListStream"),
PipelineData::ByteStream(..) => panic!("unexpected ByteStream"),
PipelineData::Empty => panic!("unexpected Empty"),
}
@ -188,47 +189,25 @@ fn read_pipeline_data_list_stream() -> Result<(), ShellError> {
}
#[test]
fn read_pipeline_data_external_stream() -> Result<(), ShellError> {
fn read_pipeline_data_byte_stream() -> Result<(), ShellError> {
let test = TestCase::new();
let mut manager = TestInterfaceManager::new(&test);
let iterations = 100;
let out_pattern = b"hello".to_vec();
let err_pattern = vec![5, 4, 3, 2];
test.add(StreamMessage::Data(14, Value::test_int(1).into()));
for _ in 0..iterations {
test.add(StreamMessage::Data(
12,
StreamData::Raw(Ok(out_pattern.clone())),
));
test.add(StreamMessage::Data(
13,
StreamData::Raw(Ok(err_pattern.clone())),
));
}
test.add(StreamMessage::End(12));
test.add(StreamMessage::End(13));
test.add(StreamMessage::End(14));
let test_span = Span::new(10, 13);
let header = PipelineDataHeader::ExternalStream(ExternalStreamInfo {
let header = PipelineDataHeader::ByteStream(ByteStreamInfo {
id: 12,
span: test_span,
stdout: Some(RawStreamInfo {
id: 12,
is_binary: false,
known_size: Some((out_pattern.len() * iterations) as u64),
}),
stderr: Some(RawStreamInfo {
id: 13,
is_binary: true,
known_size: None,
}),
exit_code: Some(ListStreamInfo {
id: 14,
span: Span::test_data(),
}),
trim_end_newline: true,
});
let pipe = manager.read_pipeline_data(header, None)?;
@ -237,52 +216,28 @@ fn read_pipeline_data_external_stream() -> Result<(), ShellError> {
manager.consume_all()?;
match pipe {
PipelineData::ExternalStream {
stdout,
stderr,
exit_code,
span,
metadata,
trim_end_newline,
} => {
let stdout = stdout.expect("stdout is None");
let stderr = stderr.expect("stderr is None");
let exit_code = exit_code.expect("exit_code is None");
assert_eq!(test_span, span);
PipelineData::ByteStream(stream, metadata) => {
assert_eq!(test_span, stream.span());
assert!(
metadata.is_some(),
"expected metadata to be Some due to prepare_pipeline_data()"
);
assert!(trim_end_newline);
assert!(!stdout.is_binary);
assert!(stderr.is_binary);
assert_eq!(
Some((out_pattern.len() * iterations) as u64),
stdout.known_size
);
assert_eq!(None, stderr.known_size);
// check the streams
let mut count = 0;
for chunk in stdout.stream {
assert_eq!(out_pattern, chunk?);
count += 1;
match stream.into_source() {
ByteStreamSource::Read(mut read) => {
let mut buf = Vec::new();
read.read_to_end(&mut buf)?;
let iter = buf.chunks_exact(out_pattern.len());
assert_eq!(iter.len(), iterations);
for chunk in iter {
assert_eq!(out_pattern, chunk)
}
}
ByteStreamSource::File(..) => panic!("unexpected byte stream source: file"),
ByteStreamSource::Child(..) => {
panic!("unexpected byte stream source: child")
}
}
assert_eq!(iterations, count, "stdout length");
let mut count = 0;
for chunk in stderr.stream {
assert_eq!(err_pattern, chunk?);
count += 1;
}
assert_eq!(iterations, count, "stderr length");
assert_eq!(
vec![Value::test_int(1)],
exit_code.into_iter().collect::<Vec<_>>()
);
}
_ => panic!("unexpected PipelineData: {pipe:?}"),
}
@ -436,120 +391,51 @@ fn write_pipeline_data_list_stream() -> Result<(), ShellError> {
}
#[test]
fn write_pipeline_data_external_stream() -> Result<(), ShellError> {
fn write_pipeline_data_byte_stream() -> Result<(), ShellError> {
let test = TestCase::new();
let manager = TestInterfaceManager::new(&test);
let interface = manager.get_interface();
let stdout_bufs = vec![
b"hello".to_vec(),
b"world".to_vec(),
b"these are tests".to_vec(),
];
let stdout_len = stdout_bufs.iter().map(|b| b.len() as u64).sum::<u64>();
let stderr_bufs = vec![b"error messages".to_vec(), b"go here".to_vec()];
let exit_code = Value::test_int(7);
let expected = "hello\nworld\nthese are tests";
let span = Span::new(400, 500);
// Set up pipeline data for an external stream
let pipe = PipelineData::ExternalStream {
stdout: Some(RawStream::new(
Box::new(stdout_bufs.clone().into_iter().map(Ok)),
None,
span,
Some(stdout_len),
)),
stderr: Some(RawStream::new(
Box::new(stderr_bufs.clone().into_iter().map(Ok)),
None,
span,
None,
)),
exit_code: Some(ListStream::new(
std::iter::once(exit_code.clone()),
Span::test_data(),
None,
)),
span,
metadata: None,
trim_end_newline: true,
};
// Set up pipeline data for a byte stream
let data = PipelineData::ByteStream(
ByteStream::read(std::io::Cursor::new(expected), span, None),
None,
);
let (header, writer) = interface.init_write_pipeline_data(pipe, &())?;
let (header, writer) = interface.init_write_pipeline_data(data, &())?;
let info = match header {
PipelineDataHeader::ExternalStream(info) => info,
PipelineDataHeader::ByteStream(info) => info,
_ => panic!("unexpected header: {header:?}"),
};
writer.write()?;
let stdout_info = info.stdout.as_ref().expect("stdout info is None");
let stderr_info = info.stderr.as_ref().expect("stderr info is None");
let exit_code_info = info.exit_code.as_ref().expect("exit code info is None");
assert_eq!(span, info.span);
assert!(info.trim_end_newline);
assert_eq!(Some(stdout_len), stdout_info.known_size);
assert_eq!(None, stderr_info.known_size);
// Now make sure the stream messages have been written
let mut stdout_iter = stdout_bufs.into_iter();
let mut stderr_iter = stderr_bufs.into_iter();
let mut exit_code_iter = std::iter::once(exit_code);
let mut actual = Vec::new();
let mut ended = false;
let mut stdout_ended = false;
let mut stderr_ended = false;
let mut exit_code_ended = false;
// There's no specific order these messages must come in with respect to how the streams are
// interleaved, but all of the data for each stream must be in its original order, and the
// End must come after all Data
for msg in test.written() {
match msg {
PluginOutput::Data(id, data) => {
if id == stdout_info.id {
let result: Result<Vec<u8>, ShellError> =
data.try_into().expect("wrong data in stdout stream");
assert_eq!(
stdout_iter.next().expect("too much data in stdout"),
result.expect("unexpected error in stdout stream")
);
} else if id == stderr_info.id {
let result: Result<Vec<u8>, ShellError> =
data.try_into().expect("wrong data in stderr stream");
assert_eq!(
stderr_iter.next().expect("too much data in stderr"),
result.expect("unexpected error in stderr stream")
);
} else if id == exit_code_info.id {
let code: Value = data.try_into().expect("wrong data in stderr stream");
assert_eq!(
exit_code_iter.next().expect("too much data in stderr"),
code
);
if id == info.id {
let data: Result<Vec<u8>, ShellError> =
data.try_into().expect("wrong data in stream");
let data = data.expect("unexpected error in stream");
actual.extend(data);
} else {
panic!("unrecognized stream id: {id}");
}
}
PluginOutput::End(id) => {
if id == stdout_info.id {
assert!(!stdout_ended, "double End of stdout");
assert!(stdout_iter.next().is_none(), "unexpected end of stdout");
stdout_ended = true;
} else if id == stderr_info.id {
assert!(!stderr_ended, "double End of stderr");
assert!(stderr_iter.next().is_none(), "unexpected end of stderr");
stderr_ended = true;
} else if id == exit_code_info.id {
assert!(!exit_code_ended, "double End of exit_code");
assert!(
exit_code_iter.next().is_none(),
"unexpected end of exit_code"
);
exit_code_ended = true;
if id == info.id {
ended = true;
} else {
panic!("unrecognized stream id: {id}");
}
@ -558,9 +444,8 @@ fn write_pipeline_data_external_stream() -> Result<(), ShellError> {
}
}
assert!(stdout_ended, "stdout did not End");
assert!(stderr_ended, "stderr did not End");
assert!(exit_code_ended, "exit_code did not End");
assert_eq!(expected.as_bytes(), actual);
assert!(ended, "stream did not End");
Ok(())
}