Replace ExternalStream with new ByteStream type (#12774)

# Description
This PR introduces a `ByteStream` type which is a `Read`-able stream of
bytes. Internally, it has an enum over three different byte stream
sources:
```rust
pub enum ByteStreamSource {
    Read(Box<dyn Read + Send + 'static>),
    File(File),
    Child(ChildProcess),
}
```

This is in comparison to the current `RawStream` type, which is an
`Iterator<Item = Vec<u8>>` and has to allocate for each read chunk.

Currently, `PipelineData::ExternalStream` serves a weird dual role where
it is either external command output or a wrapper around `RawStream`.
`ByteStream` makes this distinction more clear (via `ByteStreamSource`)
and replaces `PipelineData::ExternalStream` in this PR:
```rust
pub enum PipelineData {
    Empty,
    Value(Value, Option<PipelineMetadata>),
    ListStream(ListStream, Option<PipelineMetadata>),
    ByteStream(ByteStream, Option<PipelineMetadata>),
}
```

The PR is relatively large, but a decent amount of it is just repetitive
changes.

This PR fixes #7017, fixes #10763, and fixes #12369.

This PR also improves performance when piping external commands. Nushell
should, in most cases, have competitive pipeline throughput compared to,
e.g., bash.

| Command | Before (MB/s) | After (MB/s) | Bash (MB/s) |
| -------------------------------------------------- | -------------:| ------------:| -----------:|
| `throughput \| rg 'x'` | 3059 | 3744 | 3739 |
| `throughput \| nu --testbin relay o> /dev/null` | 3508 | 8087 | 8136 |

# User-Facing Changes
- This is a breaking change for the plugin communication protocol,
because the `ExternalStreamInfo` was replaced with `ByteStreamInfo`.
Plugins now only have to deal with a single input stream, as opposed to
the previous three streams: stdout, stderr, and exit code.
- The output of `describe` has been changed for external/byte streams.
- Temporary breaking change: `bytes starts-with` no longer works with
byte streams. This is to keep the PR smaller, and `bytes ends-with`
already does not work on byte streams.
- If a process core dumped, the `exit_code` column of the output returned
from `complete` no longer contains a `Value::Error`; it is now a
`Value::Int` holding the negated signal number.

# After Submitting
- Update docs and book as necessary
- Release notes (e.g., plugin protocol changes)
- Adapt/convert commands to work with byte streams (high priority is
`str length`, `bytes starts-with`, and maybe `bytes ends-with`).
- Refactor the `tee` code, Devyn has already done some work on this.

---------

Co-authored-by: Devyn Cairns <devyn.cairns@gmail.com>
This commit is contained in:
Ian Manske
2024-05-16 14:11:18 +00:00
committed by GitHub
parent 1b8eb23785
commit 6fd854ed9f
210 changed files with 3955 additions and 4012 deletions

View File

@ -1,16 +1,16 @@
use nu_cmd_base::hook::eval_hook;
use nu_engine::{command_prelude::*, env_to_strings, get_eval_expression};
use nu_protocol::{ast::Expr, did_you_mean, ListStream, NuGlob, OutDest, RawStream};
use nu_protocol::{ast::Expr, did_you_mean, process::ChildProcess, ByteStream, NuGlob, OutDest};
use nu_system::ForegroundChild;
use nu_utils::IgnoreCaseExt;
use os_pipe::PipeReader;
use pathdiff::diff_paths;
use std::{
collections::HashMap,
io::{BufRead, BufReader, Read, Write},
io::Write,
path::{Path, PathBuf},
process::{Command as CommandSys, Stdio},
sync::{mpsc, Arc},
sync::Arc,
thread,
};
@ -163,89 +163,124 @@ impl ExternalCommand {
) -> Result<PipelineData, ShellError> {
let head = self.name.span;
#[allow(unused_mut)]
let (cmd, mut reader) = self.create_process(&input, false, head)?;
#[cfg(all(not(unix), not(windows)))] // are there any systems like this?
let child = ForegroundChild::spawn(cmd);
#[cfg(windows)]
let child = match ForegroundChild::spawn(cmd) {
Ok(child) => Ok(child),
Err(err) => {
// Running external commands on Windows has 2 points of complication:
// 1. Some common Windows commands are actually built in to cmd.exe, not executables in their own right.
// 2. We need to let users run batch scripts etc. (.bat, .cmd) without typing their extension
let (child, reader, input) = {
// We may need to run `create_process` again, so we have to clone the underlying
// file or pipe in `input` here first.
let (input_consumed, stdin) = match &input {
PipelineData::ByteStream(stream, ..) => match stream.source() {
nu_protocol::ByteStreamSource::Read(_) => (false, Stdio::piped()),
nu_protocol::ByteStreamSource::File(file) => {
(true, file.try_clone().err_span(head)?.into())
}
nu_protocol::ByteStreamSource::Child(child) => {
if let Some(nu_protocol::process::ChildPipe::Pipe(pipe)) = &child.stdout {
(true, pipe.try_clone().err_span(head)?.into())
} else {
(false, Stdio::piped())
}
}
},
PipelineData::Empty => (false, Stdio::inherit()),
_ => (false, Stdio::piped()),
};
// To support these situations, we have a fallback path that gets run if a command
// fails to be run as a normal executable:
// 1. "shell out" to cmd.exe if the command is a known cmd.exe internal command
// 2. Otherwise, use `which-rs` to look for batch files etc. then run those in cmd.exe
let mut input = input;
let (cmd, mut reader) = self.create_process(stdin, false, head)?;
let child = match ForegroundChild::spawn(cmd) {
Ok(child) => {
if input_consumed {
input = PipelineData::Empty;
}
Ok(child)
}
Err(err) => {
// Running external commands on Windows has 2 points of complication:
// 1. Some common Windows commands are actually built in to cmd.exe, not executables in their own right.
// 2. We need to let users run batch scripts etc. (.bat, .cmd) without typing their extension
// set the default value, maybe we'll override it later
let mut child = Err(err);
// To support these situations, we have a fallback path that gets run if a command
// fails to be run as a normal executable:
// 1. "shell out" to cmd.exe if the command is a known cmd.exe internal command
// 2. Otherwise, use `which-rs` to look for batch files etc. then run those in cmd.exe
// This has the full list of cmd.exe "internal" commands: https://ss64.com/nt/syntax-internal.html
// I (Reilly) went through the full list and whittled it down to ones that are potentially useful:
const CMD_INTERNAL_COMMANDS: [&str; 9] = [
"ASSOC", "CLS", "ECHO", "FTYPE", "MKLINK", "PAUSE", "START", "VER", "VOL",
];
let command_name = &self.name.item;
let looks_like_cmd_internal = CMD_INTERNAL_COMMANDS
.iter()
.any(|&cmd| command_name.eq_ignore_ascii_case(cmd));
// set the default value, maybe we'll override it later
let mut child = Err(err);
if looks_like_cmd_internal {
let (cmd, new_reader) = self.create_process(&input, true, head)?;
reader = new_reader;
child = ForegroundChild::spawn(cmd);
} else {
#[cfg(feature = "which-support")]
{
// maybe it's a batch file (foo.cmd) and the user typed `foo`. Try to find it with `which-rs`
// TODO: clean this up with an if-let chain once those are stable
if let Ok(path) =
nu_engine::env::path_str(engine_state, stack, self.name.span)
// This has the full list of cmd.exe "internal" commands: https://ss64.com/nt/syntax-internal.html
// I (Reilly) went through the full list and whittled it down to ones that are potentially useful:
const CMD_INTERNAL_COMMANDS: [&str; 9] = [
"ASSOC", "CLS", "ECHO", "FTYPE", "MKLINK", "PAUSE", "START", "VER", "VOL",
];
let command_name = &self.name.item;
let looks_like_cmd_internal = CMD_INTERNAL_COMMANDS
.iter()
.any(|&cmd| command_name.eq_ignore_ascii_case(cmd));
let (data, stdin) = extract_stdio(input);
input = data;
if looks_like_cmd_internal {
let (cmd, new_reader) = self.create_process(stdin, true, head)?;
reader = new_reader;
child = ForegroundChild::spawn(cmd);
} else {
#[cfg(feature = "which-support")]
{
if let Some(cwd) = self.env_vars.get("PWD") {
// append cwd to PATH so `which-rs` looks in the cwd too.
// this approximates what cmd.exe does.
let path_with_cwd = format!("{};{}", cwd, path);
if let Ok(which_path) =
which::which_in(&self.name.item, Some(path_with_cwd), cwd)
{
if let Some(file_name) = which_path.file_name() {
if !file_name.to_string_lossy().eq_ignore_case(command_name)
{
// which-rs found an executable file with a slightly different name
// than the one the user tried. Let's try running it
let mut new_command = self.clone();
new_command.name = Spanned {
item: file_name.to_string_lossy().to_string(),
span: self.name.span,
};
let (cmd, new_reader) =
new_command.create_process(&input, true, head)?;
reader = new_reader;
child = ForegroundChild::spawn(cmd);
// maybe it's a batch file (foo.cmd) and the user typed `foo`. Try to find it with `which-rs`
// TODO: clean this up with an if-let chain once those are stable
if let Ok(path) =
nu_engine::env::path_str(engine_state, stack, self.name.span)
{
if let Some(cwd) = self.env_vars.get("PWD") {
// append cwd to PATH so `which-rs` looks in the cwd too.
// this approximates what cmd.exe does.
let path_with_cwd = format!("{};{}", cwd, path);
if let Ok(which_path) =
which::which_in(&self.name.item, Some(path_with_cwd), cwd)
{
if let Some(file_name) = which_path.file_name() {
if !file_name
.to_string_lossy()
.eq_ignore_case(command_name)
{
// which-rs found an executable file with a slightly different name
// than the one the user tried. Let's try running it
let mut new_command = self.clone();
new_command.name = Spanned {
item: file_name.to_string_lossy().to_string(),
span: self.name.span,
};
let (cmd, new_reader) = new_command
.create_process(stdin, true, head)?;
reader = new_reader;
child = ForegroundChild::spawn(cmd);
}
}
}
}
}
}
}
}
child
}
child
}
};
(child, reader, input)
};
#[cfg(unix)]
let child = ForegroundChild::spawn(
cmd,
engine_state.is_interactive,
&engine_state.pipeline_externals_state,
);
let (child, reader, input) = {
let (input, stdin) = extract_stdio(input);
let (cmd, reader) = self.create_process(stdin, false, head)?;
let child = ForegroundChild::spawn(
cmd,
engine_state.is_interactive,
&engine_state.pipeline_externals_state,
);
(child, reader, input)
};
match child {
Err(err) => {
@ -381,9 +416,8 @@ impl ExternalCommand {
.name("external stdin worker".to_string())
.spawn(move || {
let input = match input {
input @ PipelineData::Value(Value::Binary { .. }, ..) => {
Ok(input)
}
input @ PipelineData::ByteStream(..) => input,
input @ PipelineData::Value(Value::Binary { .. }, ..) => input,
input => {
let stack = &mut stack.start_capture();
// Attempt to render the input as a table before piping it to the external.
@ -397,143 +431,39 @@ impl ExternalCommand {
stack,
&Call::new(head),
input,
)
)?
}
};
if let Ok(input) = input {
if let PipelineData::ByteStream(stream, ..) = input {
stream.write_to(&mut stdin_write)?;
} else {
for value in input.into_iter() {
let buf = match value {
Value::String { val, .. } => val.into_bytes(),
Value::Binary { val, .. } => val,
_ => return Err(()),
};
if stdin_write.write(&buf).is_err() {
return Ok(());
}
let buf = value.coerce_into_binary()?;
stdin_write.write_all(&buf)?;
}
}
Ok(())
Ok::<_, ShellError>(())
})
.err_span(head)?;
}
}
#[cfg(unix)]
let commandname = self.name.item.clone();
let span = self.name.span;
let (exit_code_tx, exit_code_rx) = mpsc::channel();
let child =
ChildProcess::new(child, reader, matches!(self.err, OutDest::Pipe), head)?;
let (stdout, stderr) = if let Some(combined) = reader {
(
Some(RawStream::new(
Box::new(ByteLines::new(combined)),
engine_state.ctrlc.clone(),
head,
None,
)),
None,
)
} else {
let stdout = child.as_mut().stdout.take().map(|out| {
RawStream::new(
Box::new(ByteLines::new(out)),
engine_state.ctrlc.clone(),
head,
None,
)
});
let stderr = child.as_mut().stderr.take().map(|err| {
RawStream::new(
Box::new(ByteLines::new(err)),
engine_state.ctrlc.clone(),
head,
None,
)
});
if matches!(self.err, OutDest::Pipe) {
(stderr, stdout)
} else {
(stdout, stderr)
}
};
// Create a thread to wait for an exit code.
thread::Builder::new()
.name("exit code waiter".into())
.spawn(move || match child.as_mut().wait() {
Err(err) => Err(ShellError::ExternalCommand {
label: "External command exited with error".into(),
help: err.to_string(),
span,
}),
Ok(x) => {
#[cfg(unix)]
{
use nix::sys::signal::Signal;
use nu_ansi_term::{Color, Style};
use std::os::unix::process::ExitStatusExt;
if x.core_dumped() {
let cause = x
.signal()
.and_then(|sig| {
Signal::try_from(sig).ok().map(Signal::as_str)
})
.unwrap_or("Something went wrong");
let style = Style::new().bold().on(Color::Red);
let message = format!(
"{cause}: child process '{commandname}' core dumped"
);
eprintln!("{}", style.paint(&message));
let _ = exit_code_tx.send(Value::error(
ShellError::ExternalCommand {
label: "core dumped".into(),
help: message,
span: head,
},
head,
));
return Ok(());
}
}
if let Some(code) = x.code() {
let _ = exit_code_tx.send(Value::int(code as i64, head));
} else if x.success() {
let _ = exit_code_tx.send(Value::int(0, head));
} else {
let _ = exit_code_tx.send(Value::int(-1, head));
}
Ok(())
}
})
.err_span(head)?;
let exit_code = Some(ListStream::new(
ValueReceiver::new(exit_code_rx),
head,
Ok(PipelineData::ByteStream(
ByteStream::child(child, head),
None,
));
Ok(PipelineData::ExternalStream {
stdout,
stderr,
exit_code,
span: head,
metadata: None,
trim_end_newline: true,
})
))
}
}
}
pub fn create_process(
&self,
input: &PipelineData,
stdin: Stdio,
use_cmd: bool,
span: Span,
) -> Result<(CommandSys, Option<PipeReader>), ShellError> {
@ -578,11 +508,7 @@ impl ExternalCommand {
None
};
// If there is an input from the pipeline. The stdin from the process
// is piped so it can be used to send the input information
if !input.is_nothing() {
process.stdin(Stdio::piped());
}
process.stdin(stdin);
Ok((process, reader))
}
@ -764,51 +690,14 @@ fn remove_quotes(input: String) -> String {
}
}
/// Iterator over the bytes of a reader, yielding one chunk per call.
///
/// A chunk ends right after the first `\n`, at EOF, or once as many bytes as the
/// internal buffer's capacity have been read, whichever comes first.
struct ByteLines<R: Read>(BufReader<R>);

impl<R: Read> ByteLines<R> {
    /// Wraps `read` in a `BufReader`.
    fn new(read: R) -> Self {
        Self(BufReader::new(read))
    }
}

impl<R: Read> Iterator for ByteLines<R> {
    type Item = Result<Vec<u8>, ShellError>;

    /// Reads the next chunk, returning `None` once a read yields zero bytes.
    fn next(&mut self) -> Option<Self::Item> {
        // On its own, `read_until` only stops at `\n` or EOF, so each read is bounded
        // with `take` (as the Rust docs suggest) to cap the size of a single chunk.
        let limit = self.0.capacity() as u64;
        let mut chunk = Vec::new();
        let bytes_read = (&mut self.0).take(limit).read_until(b'\n', &mut chunk);
        match bytes_read {
            Err(err) => Some(Err(err.into())),
            Ok(0) => None,
            Ok(_) => Some(Ok(chunk)),
        }
    }
}
/// Receiving end of an `mpsc` channel of `Value`s, used for the `ListStream`.
///
/// It implements `Iterator` so that it can be used as a `ListStream`.
struct ValueReceiver {
    rx: mpsc::Receiver<Value>,
}

impl ValueReceiver {
    /// Wraps the given channel receiver.
    pub fn new(rx: mpsc::Receiver<Value>) -> Self {
        ValueReceiver { rx }
    }
}
impl Iterator for ValueReceiver {
type Item = Value;
fn next(&mut self) -> Option<Self::Item> {
match self.rx.recv() {
Ok(v) => Some(v),
Err(_) => None,
}
/// Splits pipeline input into the data that is handed back to the caller and the
/// `Stdio` to use as the child process's stdin.
///
/// - A byte stream whose `into_stdio` succeeds is consumed: the returned data is
///   `PipelineData::Empty` and the resulting `Stdio` is returned.
/// - A byte stream whose `into_stdio` fails is handed back unchanged (with its metadata)
///   together with `Stdio::piped()`.
/// - `PipelineData::Empty` maps to `Stdio::inherit()`.
/// - Any other input is returned as-is together with `Stdio::piped()`.
fn extract_stdio(pipeline: PipelineData) -> (PipelineData, Stdio) {
    let (stream, metadata) = match pipeline {
        PipelineData::ByteStream(stream, metadata) => (stream, metadata),
        PipelineData::Empty => return (PipelineData::Empty, Stdio::inherit()),
        other => return (other, Stdio::piped()),
    };

    match stream.into_stdio() {
        Ok(handle) => (PipelineData::Empty, handle),
        // The conversion gives the stream back on failure, so it is returned to the caller.
        Err(stream) => (PipelineData::ByteStream(stream, metadata), Stdio::piped()),
    }
}