From cf9813cbf81fbbc818e9b4278bab7fb70fd8b9e9 Mon Sep 17 00:00:00 2001 From: Ian Manske Date: Tue, 30 Jan 2024 21:56:19 +0000 Subject: [PATCH] Refactor `lines` command (#11685) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Description This PR uses `str::lines` to simplify the `lines` command (and one other section of code). This has two main benefits: 1. We no longer need to use regex to split on lines, as `str::lines` splits on `\r\n` or `\n`. 2. We no longer need to handle blank empty lines at the end. E.g., `str::lines` results in `["text"]` for both `"test\n"` and `"text"`. These changes give a slight boost to performance for the following benchmarks: 1. lines of `Value::String`: ```nushell let data = open Cargo.lock 1..10000 | each { $data | timeit { lines } } | math avg ``` current main: 392µs this PR: 270µs 2. lines of external stream: ```nushell 1..10000 | each { open Cargo.lock | timeit { lines } } | math avg ``` current main: 794µs this PR: 489µs --- crates/nu-command/src/filters/lines.rs | 156 +++++++++---------------- crates/nu-json/src/ser.rs | 20 +--- 2 files changed, 59 insertions(+), 117 deletions(-) diff --git a/crates/nu-command/src/filters/lines.rs b/crates/nu-command/src/filters/lines.rs index c0e8dd80b0..f97bdac78a 100644 --- a/crates/nu-command/src/filters/lines.rs +++ b/crates/nu-command/src/filters/lines.rs @@ -1,14 +1,12 @@ +use std::collections::VecDeque; + use nu_engine::CallExt; use nu_protocol::ast::Call; use nu_protocol::engine::{Command, EngineState, Stack}; use nu_protocol::{ - Category, Example, IntoInterruptiblePipelineData, PipelineData, RawStream, ShellError, - Signature, Span, Type, Value, + Category, Example, IntoInterruptiblePipelineData, IntoPipelineData, PipelineData, RawStream, + ShellError, Signature, Span, Type, Value, }; -use once_cell::sync::Lazy; -// regex can be replaced with fancy-regex once it supports `split()` -// https://github.com/fancy-regex/fancy-regex/issues/104 -use regex::Regex; #[derive(Clone)] pub struct Lines; @@ -39,38 +37,24 @@ impl Command for Lines { let ctrlc = engine_state.ctrlc.clone(); let skip_empty = call.has_flag(engine_state, stack, "skip-empty")?; - // match \r\n or \n - static LINE_BREAK_REGEX: Lazy = - Lazy::new(|| Regex::new(r"\r\n|\n").expect("unable to compile regex")); let span = input.span().unwrap_or(call.head); match input { - #[allow(clippy::needless_collect)] - // Collect is needed because the string may not live long enough for - // the Rc structure to continue using it. If split could take ownership - // of the split values, then this wouldn't be needed PipelineData::Value(Value::String { val, .. }, ..) => { - let mut lines = LINE_BREAK_REGEX - .split(&val) - .map(|s| s.to_string()) - .collect::>(); + let lines = if skip_empty { + val.lines() + .filter_map(|s| { + if s.trim().is_empty() { + None + } else { + Some(Value::string(s, span)) + } + }) + .collect() + } else { + val.lines().map(|s| Value::string(s, span)).collect() + }; - // if the last one is empty, remove it, as it was just - // a newline at the end of the input we got - if let Some(last) = lines.last() { - if last.is_empty() { - lines.pop(); - } - } - - let iter = lines.into_iter().filter_map(move |s| { - if skip_empty && s.trim().is_empty() { - None - } else { - Some(Value::string(s, span)) - } - }); - - Ok(iter.into_pipeline_data(engine_state.ctrlc.clone())) + Ok(Value::list(lines, span).into_pipeline_data()) } PipelineData::Empty => Ok(PipelineData::Empty), PipelineData::ListStream(stream, ..) => { @@ -79,26 +63,17 @@ impl Command for Lines { .filter_map(move |value| { let span = value.span(); if let Value::String { val, .. } = value { - let mut lines = LINE_BREAK_REGEX - .split(&val) - .filter_map(|s| { - if skip_empty && s.trim().is_empty() { - None - } else { - Some(s.to_string()) - } - }) - .collect::>(); - - // if the last one is empty, remove it, as it was just - // a newline at the end of the input we got - if let Some(last) = lines.last() { - if last.is_empty() { - lines.pop(); - } - } - - Some(lines.into_iter().map(move |x| Value::string(x, span))) + Some( + val.lines() + .filter_map(|s| { + if skip_empty && s.trim().is_empty() { + None + } else { + Some(Value::string(s, span)) + } + }) + .collect::>(), + ) } else { None } @@ -124,11 +99,7 @@ impl Command for Lines { stdout: Some(stream), .. } => Ok(RawStreamLinesAdapter::new(stream, head, skip_empty) - .enumerate() - .map(move |(_idx, x)| match x { - Ok(x) => x, - Err(err) => Value::error(err, head), - }) + .map(move |x| x.unwrap_or_else(|err| Value::error(err, head))) .into_pipeline_data(ctrlc)), } } @@ -152,38 +123,30 @@ struct RawStreamLinesAdapter { skip_empty: bool, span: Span, incomplete_line: String, - queue: Vec, + queue: VecDeque, } impl Iterator for RawStreamLinesAdapter { type Item = Result; fn next(&mut self) -> Option { - static LINE_BREAK_REGEX: Lazy = - Lazy::new(|| Regex::new(r"\r\n|\n").expect("unable to compile regex")); - loop { - if !self.queue.is_empty() { - let s = self.queue.remove(0usize); - + if let Some(s) = self.queue.pop_front() { if self.skip_empty && s.trim().is_empty() { continue; } - return Some(Ok(Value::string(s, self.span))); } else { // inner is complete, feed out remaining state if self.inner_complete { - if !self.incomplete_line.is_empty() { - let r = Some(Ok(Value::string( - self.incomplete_line.to_string(), + return if self.incomplete_line.is_empty() { + None + } else { + Some(Ok(Value::string( + std::mem::take(&mut self.incomplete_line), self.span, - ))); - self.incomplete_line = String::new(); - return r; - } - - return None; + ))) + }; } // pull more data from inner @@ -196,36 +159,29 @@ impl Iterator for RawStreamLinesAdapter { Value::String { val, .. } => { self.span = span; - let mut lines = LINE_BREAK_REGEX - .split(&val) - .map(|s| s.to_string()) - .collect::>(); + let mut lines = val.lines(); // handle incomplete line from previous if !self.incomplete_line.is_empty() { - if let Some(first) = lines.first() { - let new_incomplete_line = - self.incomplete_line.to_string() + first.as_str(); - lines.splice(0..1, vec![new_incomplete_line]); - self.incomplete_line = String::new(); - } - } - - // store incomplete line from current - if let Some(last) = lines.last() { - if last.is_empty() { - // we ended on a line ending - lines.pop(); - } else { - // incomplete line, save for next time - if let Some(s) = lines.pop() { - self.incomplete_line = s; - } + if let Some(first) = lines.next() { + self.incomplete_line.push_str(first); + self.queue.push_back(std::mem::take( + &mut self.incomplete_line, + )); } } // save completed lines - self.queue.append(&mut lines); + self.queue.extend(lines.map(String::from)); + + if !val.ends_with('\n') { + // incomplete line, save for next time + // if `val` and `incomplete_line` were empty, + // then pop will return none + if let Some(s) = self.queue.pop_back() { + self.incomplete_line = s; + } + } } // Propagate errors by explicitly matching them before the final case. Value::Error { error, .. } => return Some(Err(*error)), @@ -256,7 +212,7 @@ impl RawStreamLinesAdapter { span, skip_empty, incomplete_line: String::new(), - queue: Vec::::new(), + queue: VecDeque::new(), inner_complete: false, } } diff --git a/crates/nu-json/src/ser.rs b/crates/nu-json/src/ser.rs index 246a5db209..128a294e2f 100644 --- a/crates/nu-json/src/ser.rs +++ b/crates/nu-json/src/ser.rs @@ -4,7 +4,6 @@ use std::fmt::{Display, LowerExp}; use std::io; -use std::io::{BufRead, BufReader}; use std::num::FpCategory; use super::error::{Error, ErrorCode, Result}; @@ -1034,20 +1033,7 @@ where T: ser::Serialize, { let vec = to_vec(value)?; - let string = remove_json_whitespace(vec); - Ok(string) -} - -fn remove_json_whitespace(v: Vec) -> String { - let reader = BufReader::new(&v[..]); - let mut output = String::new(); - for line in reader.lines() { - match line { - Ok(line) => output.push_str(line.trim().trim_end()), - _ => { - eprintln!("Error removing JSON whitespace"); - } - } - } - output + let string = String::from_utf8(vec)?; + let output = string.lines().map(str::trim).collect(); + Ok(output) }