Improve select row perf for large N (#10355)

# Description
While reviewing #10350 I noticed that a `HashSet<usize>` was used to
deduplicate the incoming rows, which are then sorted after cloning to a
separate `Vec`. This sounds like a candidate for a `BTreeSet` which
guarantees the ordering.

In the process I removed some dead code.

- Use `BTreeSet` instead of `HashSet`
- Remove dead `skip` logic
- Use `BTreeSet` directly in `NthIterator`
- Consume `BTreeSet` through `Peekable<IntoIter>`
This commit is contained in:
Stefan Holderbach 2023-09-21 23:51:13 +02:00 committed by GitHub
parent ef7ade59f3
commit 1bb953877e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -5,7 +5,7 @@ use nu_protocol::{
Category, Example, IntoInterruptiblePipelineData, IntoPipelineData, PipelineData, Category, Example, IntoInterruptiblePipelineData, IntoPipelineData, PipelineData,
PipelineIterator, Record, ShellError, Signature, Span, SyntaxShape, Type, Value, PipelineIterator, Record, ShellError, Signature, Span, SyntaxShape, Type, Value,
}; };
use std::collections::HashSet; use std::collections::BTreeSet;
#[derive(Clone)] #[derive(Clone)]
pub struct Select; pub struct Select;
@ -197,7 +197,7 @@ fn select(
columns: Vec<CellPath>, columns: Vec<CellPath>,
input: PipelineData, input: PipelineData,
) -> Result<PipelineData, ShellError> { ) -> Result<PipelineData, ShellError> {
let mut unique_rows: HashSet<usize> = HashSet::new(); let mut unique_rows: BTreeSet<usize> = BTreeSet::new();
let mut new_columns = vec![]; let mut new_columns = vec![];
@ -224,18 +224,15 @@ fn select(
}; };
} }
let columns = new_columns; let columns = new_columns;
let mut unique_rows: Vec<usize> = unique_rows.into_iter().collect();
let input = if !unique_rows.is_empty() { let input = if !unique_rows.is_empty() {
unique_rows.sort_unstable();
// let skip = call.has_flag("skip"); // let skip = call.has_flag("skip");
let metadata = input.metadata(); let metadata = input.metadata();
let pipeline_iter: PipelineIterator = input.into_iter(); let pipeline_iter: PipelineIterator = input.into_iter();
NthIterator { NthIterator {
input: pipeline_iter, input: pipeline_iter,
rows: unique_rows, rows: unique_rows.into_iter().peekable(),
skip: false,
current: 0, current: 0,
} }
.into_pipeline_data(engine_state.ctrlc.clone()) .into_pipeline_data(engine_state.ctrlc.clone())
@ -336,8 +333,7 @@ fn select(
struct NthIterator { struct NthIterator {
input: PipelineIterator, input: PipelineIterator,
rows: Vec<usize>, rows: std::iter::Peekable<std::collections::btree_set::IntoIter<usize>>,
skip: bool,
current: usize, current: usize,
} }
@ -346,32 +342,18 @@ impl Iterator for NthIterator {
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {
loop { loop {
if !self.skip { if let Some(row) = self.rows.peek() {
if let Some(row) = self.rows.first() {
if self.current == *row {
self.rows.remove(0);
self.current += 1;
return self.input.next();
} else {
self.current += 1;
let _ = self.input.next();
continue;
}
} else {
return None;
}
} else if let Some(row) = self.rows.first() {
if self.current == *row { if self.current == *row {
self.rows.remove(0); self.rows.next();
self.current += 1;
return self.input.next();
} else {
self.current += 1; self.current += 1;
let _ = self.input.next(); let _ = self.input.next();
continue; continue;
} else {
self.current += 1;
return self.input.next();
} }
} else { } else {
return self.input.next(); return None;
} }
} }
} }