From ee5a3873008dd32aa87acb36583e67dcbfcfea76 Mon Sep 17 00:00:00 2001 From: Reilly Wood <26268125+rgwood@users.noreply.github.com> Date: Fri, 2 Dec 2022 08:30:26 -0800 Subject: [PATCH] Handle mixed LF+CRLF in `lines` (#7316) This closes #4989. Previously `lines` was unable to handle text input with CRLF line breaks _and_ LF line breaks. ### Before: ![image](https://user-images.githubusercontent.com/26268125/205207685-b25da9e1-19fa-4abb-8ab2-0dd216c63fc0.png) ### After: ![image](https://user-images.githubusercontent.com/26268125/205207808-9f687242-a8c2-4b79-a12c-38b0583d8d52.png) --- Cargo.lock | 1 + crates/nu-command/Cargo.toml | 1 + crates/nu-command/src/filters/lines.rs | 34 +++++++++++------------ crates/nu-command/tests/commands/lines.rs | 13 +++++++++ 4 files changed, 32 insertions(+), 17 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0f53db7c24..0a25dc5305 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2638,6 +2638,7 @@ dependencies = [ "rand 0.8.5", "rayon", "reedline", + "regex", "reqwest", "roxmltree", "rstest", diff --git a/crates/nu-command/Cargo.toml b/crates/nu-command/Cargo.toml index 7e2282ef64..27d562c369 100644 --- a/crates/nu-command/Cargo.toml +++ b/crates/nu-command/Cargo.toml @@ -68,6 +68,7 @@ powierza-coefficient = "1.0.1" quick-xml = "0.25" rand = "0.8" rayon = "1.5.1" +regex = "1.6.0" reqwest = {version = "0.11", features = ["blocking", "json"] } roxmltree = "0.16.0" rust-embed = "6.3.0" diff --git a/crates/nu-command/src/filters/lines.rs b/crates/nu-command/src/filters/lines.rs index 02b4e8c6cc..3947a30a5b 100644 --- a/crates/nu-command/src/filters/lines.rs +++ b/crates/nu-command/src/filters/lines.rs @@ -4,6 +4,10 @@ use nu_protocol::{ Category, Example, IntoInterruptiblePipelineData, PipelineData, RawStream, ShellError, Signature, Span, Type, Value, }; +use once_cell::sync::Lazy; +// regex can be replaced with fancy-regex once it suppports `split()` +// https://github.com/fancy-regex/fancy-regex/issues/104 +use regex::Regex; #[derive(Clone)] pub struct Lines; @@ -34,16 +38,18 @@ impl Command for Lines { let head = call.head; let ctrlc = engine_state.ctrlc.clone(); let skip_empty = call.has_flag("skip-empty"); + + // match \r\n or \n + static LINE_BREAK_REGEX: Lazy = + Lazy::new(|| Regex::new(r"\r\n|\n").expect("unable to compile regex")); match input { #[allow(clippy::needless_collect)] // Collect is needed because the string may not live long enough for // the Rc structure to continue using it. If split could take ownership // of the split values, then this wouldn't be needed PipelineData::Value(Value::String { val, span }, ..) => { - let split_char = if val.contains("\r\n") { "\r\n" } else { "\n" }; - - let mut lines = val - .split(split_char) + let mut lines = LINE_BREAK_REGEX + .split(&val) .map(|s| s.to_string()) .collect::>(); @@ -66,18 +72,12 @@ impl Command for Lines { Ok(iter.into_pipeline_data(engine_state.ctrlc.clone())) } PipelineData::ListStream(stream, ..) => { - let mut split_char = "\n"; - let iter = stream .into_iter() .filter_map(move |value| { if let Value::String { val, span } = value { - if split_char != "\r\n" && val.contains("\r\n") { - split_char = "\r\n"; - } - - let mut lines = val - .split(split_char) + let mut lines = LINE_BREAK_REGEX + .split(&val) .filter_map(|s| { if skip_empty && s.trim().is_empty() { None @@ -153,6 +153,9 @@ impl Iterator for RawStreamLinesAdapter { type Item = Result; fn next(&mut self) -> Option { + static LINE_BREAK_REGEX: Lazy = + Lazy::new(|| Regex::new(r"\r\n|\n").expect("unable to compile regex")); + loop { if !self.queue.is_empty() { let s = self.queue.remove(0usize); @@ -188,11 +191,8 @@ impl Iterator for RawStreamLinesAdapter { Value::String { val, span } => { self.span = span; - let split_char = - if val.contains("\r\n") { "\r\n" } else { "\n" }; - - let mut lines = val - .split(split_char) + let mut lines = LINE_BREAK_REGEX + .split(&val) .map(|s| s.to_string()) .collect::>(); diff --git a/crates/nu-command/tests/commands/lines.rs b/crates/nu-command/tests/commands/lines.rs index c78bf43922..d2d8134545 100644 --- a/crates/nu-command/tests/commands/lines.rs +++ b/crates/nu-command/tests/commands/lines.rs @@ -48,3 +48,16 @@ fn lines_multi_value_split() { assert_eq!(actual.out, "6"); } + +/// test whether this handles CRLF and LF in the same input +#[test] +fn lines_mixed_line_endings() { + let actual = nu!( + cwd: "tests/fixtures/formats", pipeline( + r#" + "foo\nbar\r\nquux" | lines | length + "# + )); + + assert_eq!(actual.out, "3"); +}