Keep order for par-each (#10249)

# Description
This PR adds new flag `--keep-order/-k` for the `par_each` filter. This
flag keeps sequence of output same as the order of input.

Output without the flag:
```nu
> 1..6 | par-each {|n| $n * 2 }
╭────╮
│  4 │
│ 10 │
│  2 │
│  8 │
│ 12 │
│  6 │
╰────╯
```

Output with the `--keep-order` flag:
```nu
> 1..6 | par-each --keep-order {|n| $n * 2 }
╭────╮
│  2 │
│  4 │
│  6 │
│  8 │
│ 10 │
│ 12 │
╰────╯
```

I think the presence of this flag is justified, since:
- Much easier to use than `.. | enumerate | par-each {|p| update item
..} | sort-by index | get item`
- Faster, as it uses internally parallel sorting in the same thread pool

A note about naming: it may conflict with `--keep-empty/-k` flag of the
`each` filter if the same feature will be used in `par-each`, so maybe
it needs some other name.
This commit is contained in:
Nano 2023-09-11 23:42:09 +12:00 committed by GitHub
parent eddff46155
commit 7b89fab327
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -37,6 +37,11 @@ impl Command for ParEach {
"the number of threads to use",
Some('t'),
)
.switch(
"keep-order",
"keep sequence of output same as the order of input",
Some('k'),
)
.required(
"closure",
SyntaxShape::Closure(Some(vec![SyntaxShape::Any, SyntaxShape::Int])),
@ -49,39 +54,43 @@ impl Command for ParEach {
fn examples(&self) -> Vec<Example> {
vec![
Example {
example: "[1 2 3] | par-each {|| 2 * $in }",
example: "[1 2 3] | par-each {|e| $e * 2 }",
description:
"Multiplies each number. Note that the list will become arbitrarily disordered.",
result: None,
},
Example {
example: r#"[foo bar baz] | par-each {|e| $e + '!' } | sort"#,
description: "Output can still be sorted afterward",
result: Some(Value::list(
vec![
Value::test_string("bar!"),
Value::test_string("baz!"),
Value::test_string("foo!"),
],
Span::test_data(),
)),
example: r#"[1 2 3] | par-each --keep-order {|e| $e * 2 }"#,
description: "Multiplies each number, keeping an original order",
result: Some(Value::test_list(vec![
Value::test_int(2),
Value::test_int(4),
Value::test_int(6),
])),
},
Example {
example: r#"1..3 | enumerate | par-each {|p| update item ($p.item * 2)} | sort-by item | get item"#,
description: "Enumerate and sort-by can be used to reconstruct the original order",
result: Some(Value::list(
vec![Value::test_int(2), Value::test_int(4), Value::test_int(6)],
Span::test_data(),
)),
result: Some(Value::test_list(vec![
Value::test_int(2),
Value::test_int(4),
Value::test_int(6),
])),
},
Example {
example: r#"[foo bar baz] | par-each {|e| $e + '!' } | sort"#,
description: "Output can still be sorted afterward",
result: Some(Value::test_list(vec![
Value::test_string("bar!"),
Value::test_string("baz!"),
Value::test_string("foo!"),
])),
},
Example {
example: r#"[1 2 3] | enumerate | par-each { |e| if $e.item == 2 { $"found 2 at ($e.index)!"} }"#,
description:
"Iterate over each element, producing a list showing indexes of any 2s",
result: Some(Value::list(
vec![Value::test_string("found 2 at 1!")],
Span::test_data(),
)),
result: Some(Value::test_list(vec![Value::test_string("found 2 at 1!")])),
},
]
}
@ -114,6 +123,7 @@ impl Command for ParEach {
let capture_block: Closure = call.req(engine_state, stack, 0)?;
let threads: Option<usize> = call.get_flag(engine_state, stack, "threads")?;
let max_threads = threads.unwrap_or(0);
let keep_order = call.has_flag("keep-order");
let metadata = input.metadata();
let ctrlc = engine_state.ctrlc.clone();
let outer_ctrlc = engine_state.ctrlc.clone();
@ -123,14 +133,27 @@ impl Command for ParEach {
let redirect_stdout = call.redirect_stdout;
let redirect_stderr = call.redirect_stderr;
// A helper function sorts the output if needed
let apply_order = |mut vec: Vec<(usize, Value)>| {
if keep_order {
// It runs inside the rayon's thread pool so parallel sorting can be used.
// There are no identical indexes, so unstable sorting can be used.
vec.par_sort_unstable_by_key(|(index, _)| *index);
}
vec.into_iter().map(|(_, val)| val)
};
match input {
PipelineData::Empty => Ok(PipelineData::Empty),
PipelineData::Value(Value::Range { val, .. }, ..) => Ok(create_pool(max_threads)?
.install(|| {
val.into_range_iter(ctrlc.clone())
let vec = val
.into_range_iter(ctrlc.clone())
.expect("unable to create a range iterator")
.enumerate()
.par_bridge()
.map(move |x| {
.map(move |(index, x)| {
let block = engine_state.get_block(block_id);
let mut stack = stack.clone();
@ -144,7 +167,7 @@ impl Command for ParEach {
let val_span = x.span();
let x_is_error = x.is_error();
match eval_block_with_early_return(
let val = match eval_block_with_early_return(
engine_state,
&mut stack,
block,
@ -153,21 +176,24 @@ impl Command for ParEach {
redirect_stderr,
) {
Ok(v) => v.into_value(span),
Err(error) => Value::error(
chain_error_with_input(error, x_is_error, val_span),
val_span,
),
}
};
(index, val)
})
.collect::<Vec<_>>()
.into_iter()
.into_pipeline_data(ctrlc)
.collect::<Vec<_>>();
apply_order(vec).into_pipeline_data(ctrlc)
})),
PipelineData::Value(Value::List { vals: val, .. }, ..) => Ok(create_pool(max_threads)?
.install(|| {
val.par_iter()
.map(move |x| {
let vec = val
.par_iter()
.enumerate()
.map(move |(index, x)| {
let block = engine_state.get_block(block_id);
let mut stack = stack.clone();
@ -181,7 +207,7 @@ impl Command for ParEach {
let val_span = x.span();
let x_is_error = x.is_error();
match eval_block_with_early_return(
let val = match eval_block_with_early_return(
engine_state,
&mut stack,
block,
@ -194,16 +220,19 @@ impl Command for ParEach {
chain_error_with_input(error, x_is_error, val_span),
val_span,
),
}
};
(index, val)
})
.collect::<Vec<_>>()
.into_iter()
.into_pipeline_data(ctrlc)
.collect::<Vec<_>>();
apply_order(vec).into_pipeline_data(ctrlc)
})),
PipelineData::ListStream(stream, ..) => Ok(create_pool(max_threads)?.install(|| {
stream
let vec = stream
.enumerate()
.par_bridge()
.map(move |x| {
.map(move |(index, x)| {
let block = engine_state.get_block(block_id);
let mut stack = stack.clone();
@ -217,7 +246,7 @@ impl Command for ParEach {
let val_span = x.span();
let x_is_error = x.is_error();
match eval_block_with_early_return(
let val = match eval_block_with_early_return(
engine_state,
&mut stack,
block,
@ -230,23 +259,26 @@ impl Command for ParEach {
chain_error_with_input(error, x_is_error, val_span),
val_span,
),
}
};
(index, val)
})
.collect::<Vec<_>>()
.into_iter()
.into_pipeline_data(ctrlc)
.collect::<Vec<_>>();
apply_order(vec).into_pipeline_data(ctrlc)
})),
PipelineData::ExternalStream { stdout: None, .. } => Ok(PipelineData::empty()),
PipelineData::ExternalStream {
stdout: Some(stream),
..
} => Ok(create_pool(max_threads)?.install(|| {
stream
let vec = stream
.enumerate()
.par_bridge()
.map(move |x| {
.map(move |(index, x)| {
let x = match x {
Ok(x) => x,
Err(err) => return Value::error(err, span),
Err(err) => return (index, Value::error(err, span)),
};
let block = engine_state.get_block(block_id);
@ -259,7 +291,7 @@ impl Command for ParEach {
}
}
match eval_block_with_early_return(
let val = match eval_block_with_early_return(
engine_state,
&mut stack,
block,
@ -269,11 +301,13 @@ impl Command for ParEach {
) {
Ok(v) => v.into_value(span),
Err(error) => Value::error(error, span),
}
};
(index, val)
})
.collect::<Vec<_>>()
.into_iter()
.into_pipeline_data(ctrlc)
.collect::<Vec<_>>();
apply_order(vec).into_pipeline_data(ctrlc)
})),
// This match allows non-iterables to be accepted,
// which is currently considered undesirable (Nov 2022).