Handle mixed LF+CRLF in lines (#7316)

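This closes #4989. Previously `lines` was unable to handle text input
that mixed CRLF line breaks _and_ LF line breaks: it picked a single
separator per input (`\r\n` if the text contained one, otherwise `\n`), so
bare LF breaks were not split in input that also contained a CRLF.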

### Before:

![image](https://user-images.githubusercontent.com/26268125/205207685-b25da9e1-19fa-4abb-8ab2-0dd216c63fc0.png)

### After:


![image](https://user-images.githubusercontent.com/26268125/205207808-9f687242-a8c2-4b79-a12c-38b0583d8d52.png)
This commit is contained in:
Reilly Wood 2022-12-02 08:30:26 -08:00 committed by GitHub
parent 3ac36879e0
commit ee5a387300
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 32 additions and 17 deletions

1
Cargo.lock generated
View File

@ -2638,6 +2638,7 @@ dependencies = [
"rand 0.8.5", "rand 0.8.5",
"rayon", "rayon",
"reedline", "reedline",
"regex",
"reqwest", "reqwest",
"roxmltree", "roxmltree",
"rstest", "rstest",

View File

@ -68,6 +68,7 @@ powierza-coefficient = "1.0.1"
quick-xml = "0.25" quick-xml = "0.25"
rand = "0.8" rand = "0.8"
rayon = "1.5.1" rayon = "1.5.1"
regex = "1.6.0"
reqwest = {version = "0.11", features = ["blocking", "json"] } reqwest = {version = "0.11", features = ["blocking", "json"] }
roxmltree = "0.16.0" roxmltree = "0.16.0"
rust-embed = "6.3.0" rust-embed = "6.3.0"

View File

@ -4,6 +4,10 @@ use nu_protocol::{
Category, Example, IntoInterruptiblePipelineData, PipelineData, RawStream, ShellError, Category, Example, IntoInterruptiblePipelineData, PipelineData, RawStream, ShellError,
Signature, Span, Type, Value, Signature, Span, Type, Value,
}; };
use once_cell::sync::Lazy;
// regex can be replaced with fancy-regex once it supports `split()`
// https://github.com/fancy-regex/fancy-regex/issues/104
use regex::Regex;
#[derive(Clone)] #[derive(Clone)]
pub struct Lines; pub struct Lines;
@ -34,16 +38,18 @@ impl Command for Lines {
let head = call.head; let head = call.head;
let ctrlc = engine_state.ctrlc.clone(); let ctrlc = engine_state.ctrlc.clone();
let skip_empty = call.has_flag("skip-empty"); let skip_empty = call.has_flag("skip-empty");
// match \r\n or \n
static LINE_BREAK_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r"\r\n|\n").expect("unable to compile regex"));
match input { match input {
#[allow(clippy::needless_collect)] #[allow(clippy::needless_collect)]
// Collect is needed because the string may not live long enough for // Collect is needed because the string may not live long enough for
// the Rc structure to continue using it. If split could take ownership // the Rc structure to continue using it. If split could take ownership
// of the split values, then this wouldn't be needed // of the split values, then this wouldn't be needed
PipelineData::Value(Value::String { val, span }, ..) => { PipelineData::Value(Value::String { val, span }, ..) => {
let split_char = if val.contains("\r\n") { "\r\n" } else { "\n" }; let mut lines = LINE_BREAK_REGEX
.split(&val)
let mut lines = val
.split(split_char)
.map(|s| s.to_string()) .map(|s| s.to_string())
.collect::<Vec<String>>(); .collect::<Vec<String>>();
@ -66,18 +72,12 @@ impl Command for Lines {
Ok(iter.into_pipeline_data(engine_state.ctrlc.clone())) Ok(iter.into_pipeline_data(engine_state.ctrlc.clone()))
} }
PipelineData::ListStream(stream, ..) => { PipelineData::ListStream(stream, ..) => {
let mut split_char = "\n";
let iter = stream let iter = stream
.into_iter() .into_iter()
.filter_map(move |value| { .filter_map(move |value| {
if let Value::String { val, span } = value { if let Value::String { val, span } = value {
if split_char != "\r\n" && val.contains("\r\n") { let mut lines = LINE_BREAK_REGEX
split_char = "\r\n"; .split(&val)
}
let mut lines = val
.split(split_char)
.filter_map(|s| { .filter_map(|s| {
if skip_empty && s.trim().is_empty() { if skip_empty && s.trim().is_empty() {
None None
@ -153,6 +153,9 @@ impl Iterator for RawStreamLinesAdapter {
type Item = Result<Value, ShellError>; type Item = Result<Value, ShellError>;
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {
static LINE_BREAK_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r"\r\n|\n").expect("unable to compile regex"));
loop { loop {
if !self.queue.is_empty() { if !self.queue.is_empty() {
let s = self.queue.remove(0usize); let s = self.queue.remove(0usize);
@ -188,11 +191,8 @@ impl Iterator for RawStreamLinesAdapter {
Value::String { val, span } => { Value::String { val, span } => {
self.span = span; self.span = span;
let split_char = let mut lines = LINE_BREAK_REGEX
if val.contains("\r\n") { "\r\n" } else { "\n" }; .split(&val)
let mut lines = val
.split(split_char)
.map(|s| s.to_string()) .map(|s| s.to_string())
.collect::<Vec<_>>(); .collect::<Vec<_>>();

View File

@ -48,3 +48,16 @@ fn lines_multi_value_split() {
assert_eq!(actual.out, "6"); assert_eq!(actual.out, "6");
} }
/// Tests whether `lines` handles CRLF and LF line breaks in the same input.
///
/// The input string literal contains one `\n` and one `\r\n` escape between
/// three words, and the pipeline's output is expected to be "3".
// NOTE(review): presumably `length` reports the number of values emitted by
// `lines`, so "3" means both kinds of break were split on — confirm.
#[test]
fn lines_mixed_line_endings() {
let actual = nu!(
cwd: "tests/fixtures/formats", pipeline(
r#"
"foo\nbar\r\nquux" | lines | length
"#
));
// Splitting this input on "\r\n" alone would give two pieces, not three.
assert_eq!(actual.out, "3");
}