Refactor parse command (#12791)

# Description
- Switches the `excess` buffer in the `ParseStreamer` and
`ParseStreamerExternal` types from a `Vec` to a `VecDeque`
- Removes the unnecessary `regex` and `columns` clones made for each call to `stream_helper`
- Other simplifications and loop restructuring
- Merges the `ParseStreamer` and `ParseStreamerExternal` types into a
common `ParseIter`
- `parse` now streams its output when the input is a list value, instead of collecting all matches first
This commit is contained in:
Ian Manske 2024-05-08 11:50:58 +00:00 committed by GitHub
parent e462b6cd99
commit 3b26c08dab
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -1,9 +1,9 @@
use fancy_regex::Regex; use fancy_regex::{Captures, Regex};
use nu_engine::command_prelude::*; use nu_engine::command_prelude::*;
use nu_protocol::{ListStream, ValueIterator}; use nu_protocol::ListStream;
use std::sync::{ use std::{
atomic::{AtomicBool, Ordering}, collections::VecDeque,
Arc, sync::{atomic::AtomicBool, Arc},
}; };
#[derive(Clone)] #[derive(Clone)]
@ -119,7 +119,6 @@ fn operate(
let head = call.head; let head = call.head;
let pattern: Spanned<String> = call.req(engine_state, stack, 0)?; let pattern: Spanned<String> = call.req(engine_state, stack, 0)?;
let regex: bool = call.has_flag(engine_state, stack, "regex")?; let regex: bool = call.has_flag(engine_state, stack, "regex")?;
let ctrlc = engine_state.ctrlc.clone();
let pattern_item = pattern.item; let pattern_item = pattern.item;
let pattern_span = pattern.span; let pattern_span = pattern.span;
@ -130,7 +129,7 @@ fn operate(
build_regex(&pattern_item, pattern_span)? build_regex(&pattern_item, pattern_span)?
}; };
let regex_pattern = Regex::new(&item_to_parse).map_err(|e| ShellError::GenericError { let regex = Regex::new(&item_to_parse).map_err(|e| ShellError::GenericError {
error: "Error with regular expression".into(), error: "Error with regular expression".into(),
msg: e.to_string(), msg: e.to_string(),
span: Some(pattern_span), span: Some(pattern_span),
@ -138,92 +137,108 @@ fn operate(
inner: vec![], inner: vec![],
})?; })?;
let columns = column_names(&regex_pattern); let columns = regex
.capture_names()
.skip(1)
.enumerate()
.map(|(i, name)| {
name.map(String::from)
.unwrap_or_else(|| format!("capture{i}"))
})
.collect::<Vec<_>>();
let ctrlc = engine_state.ctrlc.clone();
match input { match input {
PipelineData::Empty => Ok(PipelineData::Empty), PipelineData::Empty => Ok(PipelineData::Empty),
PipelineData::Value(..) => { PipelineData::Value(value, ..) => match value {
let mut parsed: Vec<Value> = Vec::new(); Value::String { val, .. } => {
let captures = regex
.captures_iter(&val)
.map(|captures| captures_to_value(captures, &columns, head))
.collect::<Result<_, _>>()?;
for v in input { Ok(Value::list(captures, head).into_pipeline_data())
let v_span = v.span();
match v.coerce_into_string() {
Ok(s) => {
let results = regex_pattern.captures_iter(&s);
for c in results {
let captures = match c {
Ok(c) => c,
Err(e) => {
return Err(ShellError::GenericError {
error: "Error with regular expression captures".into(),
msg: e.to_string(),
span: None,
help: None,
inner: vec![],
})
}
};
let record = columns
.iter()
.zip(captures.iter().skip(1))
.map(|(column_name, cap)| {
let cap_string = cap.map(|v| v.as_str()).unwrap_or("");
(column_name.clone(), Value::string(cap_string, v_span))
})
.collect();
parsed.push(Value::record(record, head));
}
}
Err(_) => {
return Err(ShellError::PipelineMismatch {
exp_input_type: "string".into(),
dst_span: head,
src_span: v_span,
})
}
}
} }
Value::List { vals, .. } => {
let iter = vals.into_iter().map(move |val| {
let span = val.span();
val.into_string().map_err(|_| ShellError::PipelineMismatch {
exp_input_type: "string".into(),
dst_span: head,
src_span: span,
})
});
Ok(ListStream::new(parsed.into_iter(), head, ctrlc).into()) let iter = ParseIter {
} captures: VecDeque::new(),
regex,
columns,
iter,
span: head,
ctrlc,
};
Ok(ListStream::new(iter, head, None).into())
}
value => Err(ShellError::PipelineMismatch {
exp_input_type: "string".into(),
dst_span: head,
src_span: value.span(),
}),
},
PipelineData::ListStream(stream, ..) => Ok(stream PipelineData::ListStream(stream, ..) => Ok(stream
.modify(|stream| ParseStreamer { .modify(|stream| {
span: head, let iter = stream.map(move |val| {
excess: Vec::new(), let span = val.span();
regex: regex_pattern, val.into_string().map_err(|_| ShellError::PipelineMismatch {
columns, exp_input_type: "string".into(),
stream, dst_span: head,
ctrlc, src_span: span,
})
});
ParseIter {
captures: VecDeque::new(),
regex,
columns,
iter,
span: head,
ctrlc,
}
}) })
.into()), .into()),
PipelineData::ExternalStream { stdout: None, .. } => Ok(PipelineData::Empty), PipelineData::ExternalStream { stdout: None, .. } => Ok(PipelineData::Empty),
PipelineData::ExternalStream { PipelineData::ExternalStream {
stdout: Some(stream), stdout: Some(stream),
.. ..
} => Ok(ListStream::new( } => {
ParseStreamerExternal { // Collect all `stream` chunks into a single `chunk` to be able to deal with matches that
span: head, // extend across chunk boundaries.
excess: Vec::new(), // This is a stop-gap solution until the `regex` crate supports streaming or an alternative
regex: regex_pattern, // solution is found.
// See https://github.com/nushell/nushell/issues/9795
let str = stream.into_string()?.item;
// let iter = stream.lines();
let iter = ParseIter {
captures: VecDeque::new(),
regex,
columns, columns,
stream: stream.stream, iter: std::iter::once(Ok(str)),
}, span: head,
head, ctrlc,
ctrlc, };
)
.into()), Ok(ListStream::new(iter, head, None).into())
}
} }
} }
fn build_regex(input: &str, span: Span) -> Result<String, ShellError> { fn build_regex(input: &str, span: Span) -> Result<String, ShellError> {
let mut output = "(?s)\\A".to_string(); let mut output = "(?s)\\A".to_string();
//let mut loop_input = input;
let mut loop_input = input.chars().peekable(); let mut loop_input = input.chars().peekable();
loop { loop {
let mut before = String::new(); let mut before = String::new();
@ -274,172 +289,73 @@ fn build_regex(input: &str, span: Span) -> Result<String, ShellError> {
Ok(output) Ok(output)
} }
fn column_names(regex: &Regex) -> Vec<String> { struct ParseIter<I: Iterator<Item = Result<String, ShellError>>> {
regex captures: VecDeque<Value>,
.capture_names()
.enumerate()
.skip(1)
.map(|(i, name)| {
name.map(String::from)
.unwrap_or_else(|| format!("capture{}", i - 1))
})
.collect()
}
pub struct ParseStreamer {
span: Span,
excess: Vec<Value>,
regex: Regex, regex: Regex,
columns: Vec<String>, columns: Vec<String>,
stream: ValueIterator, iter: I,
span: Span,
ctrlc: Option<Arc<AtomicBool>>, ctrlc: Option<Arc<AtomicBool>>,
} }
impl Iterator for ParseStreamer { impl<I: Iterator<Item = Result<String, ShellError>>> ParseIter<I> {
type Item = Value; fn populate_captures(&mut self, str: &str) -> Result<(), ShellError> {
fn next(&mut self) -> Option<Value> { for captures in self.regex.captures_iter(str) {
if !self.excess.is_empty() { self.captures
return Some(self.excess.remove(0)); .push_back(captures_to_value(captures, &self.columns, self.span)?);
} }
Ok(())
}
}
impl<I: Iterator<Item = Result<String, ShellError>>> Iterator for ParseIter<I> {
type Item = Value;
fn next(&mut self) -> Option<Value> {
loop { loop {
if let Some(ctrlc) = &self.ctrlc { if nu_utils::ctrl_c::was_pressed(&self.ctrlc) {
if ctrlc.load(Ordering::SeqCst) { return None;
break None;
}
} }
let v = self.stream.next()?; if let Some(val) = self.captures.pop_front() {
let span = v.span(); return Some(val);
}
let Ok(s) = v.coerce_into_string() else { let result = self
return Some(Value::error( .iter
ShellError::PipelineMismatch { .next()?
exp_input_type: "string".into(), .and_then(|str| self.populate_captures(&str));
dst_span: self.span,
src_span: span,
},
span,
));
};
let parsed = stream_helper( if let Err(err) = result {
self.regex.clone(), return Some(Value::error(err, self.span));
span, }
s,
self.columns.clone(),
&mut self.excess,
);
if parsed.is_none() {
continue;
};
return parsed;
} }
} }
} }
pub struct ParseStreamerExternal { fn captures_to_value(
captures: Result<Captures, fancy_regex::Error>,
columns: &[String],
span: Span, span: Span,
excess: Vec<Value>, ) -> Result<Value, ShellError> {
regex: Regex, let captures = captures.map_err(|err| ShellError::GenericError {
columns: Vec<String>, error: "Error with regular expression captures".into(),
stream: Box<dyn Iterator<Item = Result<Vec<u8>, ShellError>> + Send + 'static>, msg: err.to_string(),
} span: Some(span),
help: None,
inner: vec![],
})?;
impl Iterator for ParseStreamerExternal { let record = columns
type Item = Value; .iter()
fn next(&mut self) -> Option<Value> { .zip(captures.iter().skip(1))
if !self.excess.is_empty() { .map(|(column, match_)| {
return Some(self.excess.remove(0)); let match_str = match_.map(|m| m.as_str()).unwrap_or("");
} (column.clone(), Value::string(match_str, span))
})
.collect();
let mut chunk = self.stream.next(); Ok(Value::record(record, span))
// Collect all `stream` chunks into a single `chunk` to be able to deal with matches that
// extend across chunk boundaries.
// This is a stop-gap solution until the `regex` crate supports streaming or an alternative
// solution is found.
// See https://github.com/nushell/nushell/issues/9795
while let Some(Ok(chunks)) = &mut chunk {
match self.stream.next() {
Some(Ok(mut next_chunk)) => chunks.append(&mut next_chunk),
error @ Some(Err(_)) => chunk = error,
None => break,
}
}
let chunk = match chunk {
Some(Ok(chunk)) => chunk,
Some(Err(err)) => return Some(Value::error(err, self.span)),
_ => return None,
};
let Ok(chunk) = String::from_utf8(chunk) else {
return Some(Value::error(
ShellError::PipelineMismatch {
exp_input_type: "string".into(),
dst_span: self.span,
src_span: self.span,
},
self.span,
));
};
stream_helper(
self.regex.clone(),
self.span,
chunk,
self.columns.clone(),
&mut self.excess,
)
}
}
fn stream_helper(
regex: Regex,
span: Span,
s: String,
columns: Vec<String>,
excess: &mut Vec<Value>,
) -> Option<Value> {
let results = regex.captures_iter(&s);
for c in results {
let captures = match c {
Ok(c) => c,
Err(e) => {
return Some(Value::error(
ShellError::GenericError {
error: "Error with regular expression captures".into(),
msg: e.to_string(),
span: Some(span),
help: Some(e.to_string()),
inner: vec![],
},
span,
))
}
};
let record = columns
.iter()
.zip(captures.iter().skip(1))
.map(|(column_name, cap)| {
let cap_string = cap.map(|v| v.as_str()).unwrap_or("");
(column_name.clone(), Value::string(cap_string, span))
})
.collect();
excess.push(Value::record(record, span));
}
if !excess.is_empty() {
Some(excess.remove(0))
} else {
None
}
} }
#[cfg(test)] #[cfg(test)]