From ae0e13733d09524e9380cd45c47db0d489cfa05e Mon Sep 17 00:00:00 2001
From: Kira
Date: Wed, 28 Aug 2024 22:53:56 +0200
Subject: [PATCH] Fix parsing record values containing colons (#13413)

This PR is an attempt to fix #8257 and #10985 (which is more or less a
duplicate).

# Description

The parser currently doesn't know how to deal with colons that appear while
lexing whitespace-terminated tokens specifying a record value. Most notably,
this means you can't use datetime literals in record value position (and, as a
consequence, `| to nuon | from nuon` roundtrips can fail), but it also means
that bare words containing colons produce an unhelpful error message.

![image](https://github.com/user-attachments/assets/f04a8417-ee18-44e7-90eb-a0ecef943a0f)

`parser::parse_record` calls `lex::lex` with the `:` colon character in the
`special_tokens` argument. This allows colons to terminate record keys, but as
a side effect, it also causes colons to terminate record *values*. I added a
new function `lex::lex_n_tokens`, which allows the caller to drive the lexing
process more explicitly, and used it in `parser::parse_record` to let colons
terminate record keys while not giving them special treatment when they appear
in record values.

This PR description previously said:

*Another approach suggested in one of the issues was to support an additional
datetime literal format that doesn't require colons. I like that it wouldn't
require new `lex::lex_internal` behaviour, but an advantage of my approach is
that it also newly allows string record values given as bare words containing
colons. I think this eliminates another possible source of confusion.*

It was determined that this is undesirable, and in the current state of this
PR, bare word record values with colons are rejected explicitly. The better
error message is still a win.

# User-Facing Changes

In addition to the above, this PR also disables the use of "special"
(non-item) tokens in record key and value position, and the use of a single
bare `:` as a record key.

Examples of behaviour *before* this PR:

```nu
{ a: b }                    # Valid, same as { 'a': 'b' }
{ a: b:c }                  # Error: expected ':'
{ a: 2024-08-13T22:11:09 }  # Error: expected ':'
{ :: 1 }                    # Valid, same as { ':': 1 }
{ ;: 1 }                    # Valid, same as { ';': 1 }
{ a: || }                   # Valid, same as { 'a': '||' }
```

Examples of behaviour *after* this PR:

```nu
{ a: b }                    # (Unchanged) Valid, same as { 'a': 'b' }
{ a: b:c }                  # Error: colon in bare word specifying record value
{ a: 2024-08-13T22:11:09 }  # Valid, same as { a: (2024-08-13T22:11:09) }
{ :: 1 }                    # Error: colon in bare word specifying record key
{ ;: 1 }                    # Error: expected item in record key position
{ a: || }                   # Error: expected item in record value position
```

# Tests + Formatting

I added tests, but I'm not sure whether they are sufficient and in the right
place.

# After Submitting

I don't think documentation changes are needed for this, but please let me
know if you disagree.
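
For reviewers, here is a small self-contained sketch of the difference at the
lexer level. It is illustrative only (the `main` below is hypothetical and not
part of the patch); it uses nothing beyond the `lex`, `lex_n_tokens`, and
`LexState` APIs shown in the diff, and its loop mirrors the one `parse_record`
now uses:

```rust
// Illustrative sketch only, not part of this patch.
use nu_parser::{lex, lex_n_tokens, LexState};

fn main() {
    let src = b"a: 2024-08-13T22:11:09 b: c";

    // Old approach: a single `lex` call with `:` in `special_tokens`.
    // Colons terminate record keys, but they also split the datetime value.
    let (tokens, _err) = lex(src, 0, &[b'\n', b'\r', b','], &[b':'], true);
    println!("single lex call produced {} tokens", tokens.len());

    // New approach: the caller drives the lexer, alternating between
    // "key plus trailing colon" (with `:` special) and "value" (with no
    // special tokens), so colons inside the value are left alone.
    let mut state = LexState {
        input: src,
        output: Vec::new(),
        error: None,
        span_offset: 0,
    };
    loop {
        // key and its `:` terminator (two tokens)
        if lex_n_tokens(&mut state, &[b'\n', b'\r', b','], &[b':'], true, 2) < 2 {
            break;
        }
        // the value, with `:` treated as an ordinary character (one token)
        if lex_n_tokens(&mut state, &[b'\n', b'\r', b','], &[], true, 1) < 1 {
            break;
        }
    }
    for token in &state.output {
        let text = String::from_utf8_lossy(&src[token.span.start..token.span.end]);
        println!("driven lexing produced token: {text}");
    }
}
```

With the single call the datetime ends up split at each colon; with the driven
loop it survives as one item token, which is what lets `parse_value` see the
whole datetime literal.
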
---
 crates/nu-parser/src/lex.rs           | 140 ++++++++++++++++++--------
 crates/nu-parser/src/lib.rs           |   2 +-
 crates/nu-parser/src/parser.rs        | 110 +++++++++++++++++++-
 crates/nu-parser/tests/test_lex.rs    |  25 ++++-
 crates/nu-parser/tests/test_parser.rs |  53 ++++++++++
 5 files changed, 281 insertions(+), 49 deletions(-)

diff --git a/crates/nu-parser/src/lex.rs b/crates/nu-parser/src/lex.rs
index bad9e94a10..f0802fcd7a 100644
--- a/crates/nu-parser/src/lex.rs
+++ b/crates/nu-parser/src/lex.rs
@@ -447,14 +447,61 @@ pub fn lex_signature(
     special_tokens: &[u8],
     skip_comment: bool,
 ) -> (Vec<Token>, Option<ParseError>) {
-    lex_internal(
+    let mut state = LexState {
         input,
+        output: Vec::new(),
+        error: None,
         span_offset,
+    };
+    lex_internal(
+        &mut state,
         additional_whitespace,
         special_tokens,
         skip_comment,
         true,
-    )
+        None,
+    );
+    (state.output, state.error)
+}
+
+#[derive(Debug)]
+pub struct LexState<'a> {
+    pub input: &'a [u8],
+    pub output: Vec<Token>,
+    pub error: Option<ParseError>,
+    pub span_offset: usize,
+}
+
+/// Lex until the output is `max_tokens` longer than before the call, or until the input is exhausted.
+/// The return value indicates how many tokens the call added to / removed from the output.
+///
+/// The behaviour here is non-obvious when `additional_whitespace` doesn't include newline:
+/// If you pass a `state` where the last token in the output is an Eol, this might *remove* tokens.
+pub fn lex_n_tokens(
+    state: &mut LexState,
+    additional_whitespace: &[u8],
+    special_tokens: &[u8],
+    skip_comment: bool,
+    max_tokens: usize,
+) -> isize {
+    let n_tokens = state.output.len();
+    lex_internal(
+        state,
+        additional_whitespace,
+        special_tokens,
+        skip_comment,
+        false,
+        Some(max_tokens),
+    );
+    // If this lex_internal call reached the end of the input, there may now be fewer tokens
+    // in the output than before.
+    let tokens_n_diff = (state.output.len() as isize) - (n_tokens as isize);
+    let next_offset = state.output.last().map(|token| token.span.end);
+    if let Some(next_offset) = next_offset {
+        state.input = &state.input[next_offset - state.span_offset..];
+        state.span_offset = next_offset;
+    }
+    tokens_n_diff
 }
 
 pub fn lex(
@@ -464,33 +511,43 @@ pub fn lex(
     special_tokens: &[u8],
     skip_comment: bool,
 ) -> (Vec<Token>, Option<ParseError>) {
-    lex_internal(
+    let mut state = LexState {
         input,
+        output: Vec::new(),
+        error: None,
         span_offset,
+    };
+    lex_internal(
+        &mut state,
         additional_whitespace,
         special_tokens,
         skip_comment,
         false,
-    )
+        None,
+    );
+    (state.output, state.error)
 }
 
 fn lex_internal(
-    input: &[u8],
-    span_offset: usize,
+    state: &mut LexState,
     additional_whitespace: &[u8],
     special_tokens: &[u8],
     skip_comment: bool,
     // within signatures we want to treat `<` and `>` specially
     in_signature: bool,
-) -> (Vec<Token>, Option<ParseError>) {
-    let mut error = None;
+    max_tokens: Option<usize>,
+) {
+    let initial_output_len = state.output.len();
     let mut curr_offset = 0;
 
-    let mut output = vec![];
     let mut is_complete = true;
-
-    while let Some(c) = input.get(curr_offset) {
+    while let Some(c) = state.input.get(curr_offset) {
+        if max_tokens
+            .is_some_and(|max_tokens| state.output.len() >= initial_output_len + max_tokens)
+        {
+            break;
+        }
         let c = *c;
         if c == b'|' {
             // If the next character is `|`, it's either `|` or `||`.
@@ -499,13 +556,13 @@ fn lex_internal(
             curr_offset += 1;
 
             // If the next character is `|`, we're looking at a `||`.
-            if let Some(c) = input.get(curr_offset) {
+            if let Some(c) = state.input.get(curr_offset) {
                 if *c == b'|' {
                     let idx = curr_offset;
                     curr_offset += 1;
-                    output.push(Token::new(
+                    state.output.push(Token::new(
                         TokenContents::PipePipe,
-                        Span::new(span_offset + prev_idx, span_offset + idx + 1),
+                        Span::new(state.span_offset + prev_idx, state.span_offset + idx + 1),
                     ));
                     continue;
                 }
@@ -515,12 +572,12 @@ fn lex_internal(
 
             // Before we push, check to see if the previous character was a newline.
             // If so, then this is a continuation of the previous line
-            if let Some(prev) = output.last_mut() {
+            if let Some(prev) = state.output.last_mut() {
                 match prev.contents {
                     TokenContents::Eol => {
                         *prev = Token::new(
                             TokenContents::Pipe,
-                            Span::new(span_offset + idx, span_offset + idx + 1),
+                            Span::new(state.span_offset + idx, state.span_offset + idx + 1),
                         );
                         // And this is a continuation of the previous line if previous line is a
                         // comment line (combined with EOL + Comment)
@@ -528,12 +585,12 @@ fn lex_internal(
                         // Initially, the last one token is TokenContents::Pipe, we don't need to
                         // check it, so the beginning offset is 2.
                         let mut offset = 2;
-                        while output.len() > offset {
-                            let index = output.len() - offset;
-                            if output[index].contents == TokenContents::Comment
-                                && output[index - 1].contents == TokenContents::Eol
+                        while state.output.len() > offset {
+                            let index = state.output.len() - offset;
+                            if state.output[index].contents == TokenContents::Comment
+                                && state.output[index - 1].contents == TokenContents::Eol
                             {
-                                output.remove(index - 1);
+                                state.output.remove(index - 1);
                                 offset += 1;
                             } else {
                                 break;
@@ -541,16 +598,16 @@ fn lex_internal(
                         }
                     }
                     _ => {
-                        output.push(Token::new(
+                        state.output.push(Token::new(
                             TokenContents::Pipe,
-                            Span::new(span_offset + idx, span_offset + idx + 1),
+                            Span::new(state.span_offset + idx, state.span_offset + idx + 1),
                         ));
                     }
                 }
             } else {
-                output.push(Token::new(
+                state.output.push(Token::new(
                     TokenContents::Pipe,
-                    Span::new(span_offset + idx, span_offset + idx + 1),
+                    Span::new(state.span_offset + idx, state.span_offset + idx + 1),
                 ));
             }
 
@@ -558,17 +615,17 @@ fn lex_internal(
         } else if c == b';' {
            // If the next character is a `;`, we're looking at a semicolon token.
 
-            if !is_complete && error.is_none() {
-                error = Some(ParseError::ExtraTokens(Span::new(
+            if !is_complete && state.error.is_none() {
+                state.error = Some(ParseError::ExtraTokens(Span::new(
                     curr_offset,
                     curr_offset + 1,
                 )));
            }
            let idx = curr_offset;
            curr_offset += 1;
-            output.push(Token::new(
+            state.output.push(Token::new(
                TokenContents::Semicolon,
-                Span::new(span_offset + idx, span_offset + idx + 1),
+                Span::new(state.span_offset + idx, state.span_offset + idx + 1),
            ));
        } else if c == b'\r' {
            // Ignore a stand-alone carriage return
@@ -578,9 +635,9 @@ fn lex_internal(
            let idx = curr_offset;
            curr_offset += 1;
            if !additional_whitespace.contains(&c) {
-                output.push(Token::new(
+                state.output.push(Token::new(
                    TokenContents::Eol,
-                    Span::new(span_offset + idx, span_offset + idx + 1),
+                    Span::new(state.span_offset + idx, state.span_offset + idx + 1),
                ));
            }
        } else if c == b'#' {
            // If the next character is `#`, we're at the beginning of a line
            // comment. The comment continues until the next newline.
            let mut start = curr_offset;

-            while let Some(input) = input.get(curr_offset) {
+            while let Some(input) = state.input.get(curr_offset) {
                 if *input == b'\n' {
                     if !skip_comment {
-                        output.push(Token::new(
+                        state.output.push(Token::new(
                             TokenContents::Comment,
-                            Span::new(span_offset + start, span_offset + curr_offset),
+                            Span::new(state.span_offset + start, state.span_offset + curr_offset),
                         ));
                     }
                     start = curr_offset;
@@ -604,9 +661,9 @@ fn lex_internal(
                 }
             }
             if start != curr_offset && !skip_comment {
-                output.push(Token::new(
+                state.output.push(Token::new(
                     TokenContents::Comment,
-                    Span::new(span_offset + start, span_offset + curr_offset),
+                    Span::new(state.span_offset + start, state.span_offset + curr_offset),
                 ));
             }
         } else if c == b' ' || c == b'\t' || additional_whitespace.contains(&c) {
@@ -614,21 +671,20 @@ fn lex_internal(
             curr_offset += 1;
         } else {
             let (token, err) = lex_item(
-                input,
+                state.input,
                 &mut curr_offset,
-                span_offset,
+                state.span_offset,
                 additional_whitespace,
                 special_tokens,
                 in_signature,
             );
-            if error.is_none() {
-                error = err;
+            if state.error.is_none() {
+                state.error = err;
             }
             is_complete = true;
-            output.push(token);
+            state.output.push(token);
         }
     }
-    (output, error)
 }
 
 /// True if this the start of a redirection. Does not match `>>` or `>|` forms.
diff --git a/crates/nu-parser/src/lib.rs b/crates/nu-parser/src/lib.rs
index 8f871aa815..c5d69cb270 100644
--- a/crates/nu-parser/src/lib.rs
+++ b/crates/nu-parser/src/lib.rs
@@ -16,7 +16,7 @@ pub use flatten::{
     flatten_block, flatten_expression, flatten_pipeline, flatten_pipeline_element, FlatShape,
 };
 pub use known_external::KnownExternal;
-pub use lex::{lex, lex_signature, Token, TokenContents};
+pub use lex::{lex, lex_n_tokens, lex_signature, LexState, Token, TokenContents};
 pub use lite_parser::{lite_parse, LiteBlock, LiteCommand};
 pub use nu_protocol::parser_path::*;
 pub use parse_keywords::*;
diff --git a/crates/nu-parser/src/parser.rs b/crates/nu-parser/src/parser.rs
index b9147ed7f9..3a84defe95 100644
--- a/crates/nu-parser/src/parser.rs
+++ b/crates/nu-parser/src/parser.rs
@@ -1,5 +1,5 @@
 use crate::{
-    lex::{is_assignment_operator, lex, lex_signature},
+    lex::{is_assignment_operator, lex, lex_n_tokens, lex_signature, LexState},
     lite_parser::{lite_parse, LiteCommand, LitePipeline, LiteRedirection, LiteRedirectionTarget},
     parse_keywords::*,
     parse_patterns::parse_pattern,
@@ -5650,6 +5650,49 @@ pub fn parse_builtin_commands(
     }
 }
 
+fn check_record_key_or_value(
+    working_set: &StateWorkingSet,
+    expr: &Expression,
+    position: &str,
+) -> Option<ParseError> {
+    let bareword_error = |string_value: &Expression| {
+        working_set
+            .get_span_contents(string_value.span)
+            .iter()
+            .find_position(|b| **b == b':')
+            .map(|(i, _)| {
+                let colon_position = i + string_value.span.start;
+                ParseError::InvalidLiteral(
+                    "colon".to_string(),
+                    format!("bare word specifying record {}", position),
+                    Span::new(colon_position, colon_position + 1),
+                )
+            })
+    };
+    let value_span = working_set.get_span_contents(expr.span);
+    match expr.expr {
+        Expr::String(_) => {
+            if ![b'"', b'\'', b'`'].contains(&value_span[0]) {
+                bareword_error(expr)
+            } else {
+                None
+            }
+        }
+        Expr::StringInterpolation(ref expressions) => {
+            if value_span[0] != b'$' {
+                expressions
+                    .iter()
+                    .filter(|expr| matches!(expr.expr, Expr::String(_)))
+                    .filter_map(bareword_error)
+                    .next()
+            } else {
+                None
+            }
+        }
+        _ => None,
+    }
+}
+
 pub fn parse_record(working_set: &mut StateWorkingSet, span: Span) -> Expression {
     let bytes = working_set.get_span_contents(span);
 
@@ -5670,9 +5713,32 @@ pub fn parse_record(working_set: &mut StateWorkingSet, span: Span) -> Expression
     }
 
     let inner_span = Span::new(start, end);
-    let source = working_set.get_span_contents(inner_span);
 
-    let (tokens, err) = lex(source, start, &[b'\n', b'\r', b','], &[b':'], true);
+    let mut lex_state = LexState {
+        input: working_set.get_span_contents(inner_span),
+        output: Vec::new(),
+        error: None,
+        span_offset: start,
+    };
+    let mut lex_n = |additional_whitespace, special_tokens, max_tokens| {
+        lex_n_tokens(
+            &mut lex_state,
+            additional_whitespace,
+            special_tokens,
+            true,
+            max_tokens,
+        )
+    };
+    loop {
+        if lex_n(&[b'\n', b'\r', b','], &[b':'], 2) < 2 {
+            break;
+        };
+        if lex_n(&[b'\n', b'\r', b','], &[], 1) < 1 {
+            break;
+        };
+    }
+    let (tokens, err) = (lex_state.output, lex_state.error);
+
     if let Some(err) = err {
         working_set.error(err);
     }
@@ -5716,7 +5782,22 @@ pub fn parse_record(working_set: &mut StateWorkingSet, span: Span) -> Expression
             ));
         } else {
             // Normal key-value pair
-            let field = parse_value(working_set, curr_span, &SyntaxShape::Any);
+            let field_token = &tokens[idx];
+            let field = if field_token.contents != TokenContents::Item {
+                working_set.error(ParseError::Expected(
+                    "item in record key position",
+                    Span::new(field_token.span.start, field_token.span.end),
+                ));
+                garbage(working_set, curr_span)
+            } else {
+                let field = parse_value(working_set, curr_span, &SyntaxShape::Any);
+                if let Some(error) = check_record_key_or_value(working_set, &field, "key") {
+                    working_set.error(error);
+                    garbage(working_set, field.span)
+                } else {
+                    field
+                }
+            };
 
             idx += 1;
             if idx == tokens.len() {
@@ -5761,7 +5842,26 @@ pub fn parse_record(working_set: &mut StateWorkingSet, span: Span) -> Expression
                 ));
                 break;
             }
-            let value = parse_value(working_set, tokens[idx].span, &SyntaxShape::Any);
+
+            let value_token = &tokens[idx];
+            let value = if value_token.contents != TokenContents::Item {
+                working_set.error(ParseError::Expected(
+                    "item in record value position",
+                    Span::new(value_token.span.start, value_token.span.end),
+                ));
+                garbage(
+                    working_set,
+                    Span::new(value_token.span.start, value_token.span.end),
+                )
+            } else {
+                let value = parse_value(working_set, tokens[idx].span, &SyntaxShape::Any);
+                if let Some(parse_error) = check_record_key_or_value(working_set, &value, "value") {
+                    working_set.error(parse_error);
+                    garbage(working_set, value.span)
+                } else {
+                    value
+                }
+            };
             idx += 1;
 
             if let Some(field) = field.as_string() {
diff --git a/crates/nu-parser/tests/test_lex.rs b/crates/nu-parser/tests/test_lex.rs
index 07470a310e..22fe4c4715 100644
--- a/crates/nu-parser/tests/test_lex.rs
+++ b/crates/nu-parser/tests/test_lex.rs
@@ -1,4 +1,4 @@
-use nu_parser::{lex, lex_signature, Token, TokenContents};
+use nu_parser::{lex, lex_n_tokens, lex_signature, LexState, Token, TokenContents};
 use nu_protocol::{ParseError, Span};
 
 #[test]
@@ -281,3 +281,26 @@ fn lex_comments() {
         }
     );
 }
+
+#[test]
+fn lex_manually() {
+    let file = b"'a'\n#comment\n#comment again\n| continue";
+    let mut lex_state = LexState {
+        input: file,
+        output: Vec::new(),
+        error: None,
+        span_offset: 10,
+    };
+    assert_eq!(lex_n_tokens(&mut lex_state, &[], &[], false, 1), 1);
+    assert_eq!(lex_state.output.len(), 1);
+    assert_eq!(lex_n_tokens(&mut lex_state, &[], &[], false, 5), 5);
+    assert_eq!(lex_state.output.len(), 6);
+    // Next token is the pipe.
+    // This shortens the output because it exhausts the input before it can
+    // compensate for the EOL tokens lost to the line continuation
+    assert_eq!(lex_n_tokens(&mut lex_state, &[], &[], false, 1), -1);
+    assert_eq!(lex_state.output.len(), 5);
+    assert_eq!(file.len(), lex_state.span_offset - 10);
+    let last_span = lex_state.output.last().unwrap().span;
+    assert_eq!(&file[last_span.start - 10..last_span.end - 10], b"continue");
+}
diff --git a/crates/nu-parser/tests/test_parser.rs b/crates/nu-parser/tests/test_parser.rs
index ed0a288a47..973ba1b689 100644
--- a/crates/nu-parser/tests/test_parser.rs
+++ b/crates/nu-parser/tests/test_parser.rs
@@ -2478,3 +2478,56 @@ mod operator {
         );
     }
 }
+
+mod record {
+    use super::*;
+
+    use nu_protocol::ast::RecordItem;
+
+    #[rstest]
+    #[case(b"{ :: x }", "Invalid literal")] // Key is bare colon
+    #[case(b"{ a: x:y }", "Invalid literal")] // Value is bare word with colon
+    #[case(b"{ a: x('y'):z }", "Invalid literal")] // Value is bare string interpolation with colon
+    #[case(b"{ ;: x }", "Parse mismatch during operation.")] // Key is a non-item token
+    #[case(b"{ a: || }", "Parse mismatch during operation.")] // Value is a non-item token
+    fn refuse_confusing_record(#[case] expr: &[u8], #[case] error: &str) {
+        dbg!(String::from_utf8_lossy(expr));
+        let engine_state = EngineState::new();
+        let mut working_set = StateWorkingSet::new(&engine_state);
+        parse(&mut working_set, None, expr, false);
+        assert_eq!(
+            working_set.parse_errors.first().map(|e| e.to_string()),
+            Some(error.to_string())
+        );
+    }
+
+    #[rstest]
+    #[case(b"{ a: 2024-07-23T22:54:54.532100627+02:00 b:xy }")]
+    fn parse_datetime_in_record(#[case] expr: &[u8]) {
+        dbg!(String::from_utf8_lossy(expr));
+        let engine_state = EngineState::new();
+        let mut working_set = StateWorkingSet::new(&engine_state);
+        let block = parse(&mut working_set, None, expr, false);
+        assert!(working_set.parse_errors.first().is_none());
+        let pipeline_el_expr = &block
+            .pipelines
+            .first()
+            .unwrap()
+            .elements
+            .first()
+            .unwrap()
+            .expr
+            .expr;
+        dbg!(pipeline_el_expr);
+        match pipeline_el_expr {
+            Expr::FullCellPath(v) => match &v.head.expr {
+                Expr::Record(fields) => assert!(matches!(
+                    fields[0],
+                    RecordItem::Pair(_, Expression { ty: Type::Date, .. })
+                )),
+                _ => panic!("Expected record head"),
+            },
+            _ => panic!("Expected full cell path"),
+        }
+    }
+}