Fix parsing record values containing colons (#13413)

This PR is an attempt to fix #8257 and fix #10985 (which is
duplicate-ish)

# Description
The parser currently doesn't know how to handle colons that appear while
lexing the whitespace-terminated tokens that make up a record value. Most
notably, this means you can't use datetime literals in record value
position (and, as a consequence, `| to nuon | from nuon` round trips can
fail), but it also means that bare words containing colons produce an
unhelpful error message.

![image](https://github.com/user-attachments/assets/f04a8417-ee18-44e7-90eb-a0ecef943a0f)
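
For example, before this change a record holding a datetime value would serialize fine but then fail to parse back (a minimal illustration of the round-trip failure mentioned above, assuming the default nuon serialization of datetimes as literals):

```nu
# Before this PR: `to nuon` writes the datetime as a literal containing
# colons, which `from nuon` then cannot parse back in record value position.
{ a: (date now) } | to nuon | from nuon
```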

`parser::parse_record` calls `lex::lex` with the `:` colon character in
the `special_tokens` argument. This allows colons to terminate record
keys, but as a side effect, it also causes colons to terminate record
*values*. I added a new function `lex::lex_n_tokens`, which allows the
caller to drive the lexing process more explicitly, and used it in
`parser::parse_record` to let colons terminate record keys while not
giving them special treatment when appearing in record values.
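
Schematically, the new lexing loop in `parse_record` looks like this (an annotated excerpt of the change shown in the diff further down; `lex_n` is a small closure around `lex_n_tokens`, and the surrounding token/error plumbing is elided):

```rust
loop {
    // Lex up to two tokens: the record key and the `:` that follows it.
    // Only here is `:` passed as a special token, so it may terminate the key.
    if lex_n(&[b'\n', b'\r', b','], &[b':'], 2) < 2 {
        break;
    };
    // Lex exactly one value token with no special tokens, so colons inside
    // the value (e.g. in a datetime literal) no longer terminate it.
    if lex_n(&[b'\n', b'\r', b','], &[], 1) < 1 {
        break;
    };
}
```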

This PR description previously said: *Another approach suggested in one
of the issues was to support an additional datetime literal format that
doesn't require colons. I like that that wouldn't require new
`lex::lex_internal` behaviour, but an advantage of my approach is that
it also newly allows for string record values given as bare words
containing colons. I think this eliminates another possible source of
confusion.* It was determined that this is undesirable, and in the
current state of this PR, bare word record values with colons are
rejected explicitly. The better error message is still a win.

# User-Facing Changes
In addition to the above, this PR also disables the use of "special"
(non-item) tokens in record key and value position, and the use of a
single bare `:` as a record key.

Examples of behaviour *before* this PR:
```nu
{ a: b } # Valid, same as { 'a': 'b' }
{ a: b:c } # Error: expected ':'
{ a: 2024-08-13T22:11:09 } # Error: expected ':'
{ :: 1 } # Valid, same as { ':': 1 }
{ ;: 1 } # Valid, same as { ';': 1 }
{ a: || } # Valid, same as { 'a': '||' }
```

Examples of behaviour *after* this PR:
```nu
{ a: b } # (Unchanged) Valid, same as { 'a': 'b' }
{ a: b:c } # Error: colon in bare word specifying record value
{ a: 2024-08-13T22:11:09 } # Valid, same as { a: (2024-08-13T22:11:09) }
{ :: 1 } # Error: colon in bare word specifying record key
{ ;: 1 } # Error: expected item in record key position
{ a: || } # Error: expected item in record value position
```

# Tests + Formatting
I added tests, but I'm not sure whether they're sufficient or in the right
place.
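
For illustration, the behaviours in question could be pinned down with parser-level tests along these lines (a hypothetical sketch in the style of the existing `nu-parser` unit tests; the tests actually added by this PR may differ in location and detail):

```rust
use nu_parser::parse;
use nu_protocol::engine::{EngineState, StateWorkingSet};

#[test]
fn datetime_literal_in_record_value_parses() {
    let engine_state = EngineState::new();
    let mut working_set = StateWorkingSet::new(&engine_state);

    // Datetime literals are now accepted in record value position.
    parse(&mut working_set, None, b"{ a: 2024-08-13T22:11:09 }", false);
    assert!(working_set.parse_errors.is_empty());
}

#[test]
fn colon_in_bare_word_record_value_is_rejected() {
    let engine_state = EngineState::new();
    let mut working_set = StateWorkingSet::new(&engine_state);

    // A bare word containing a colon in value position now produces a
    // dedicated parse error instead of the old "expected ':'".
    parse(&mut working_set, None, b"{ a: b:c }", false);
    assert!(!working_set.parse_errors.is_empty());
}
```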

# After Submitting
I don't think documentation changes are needed for this, but please let
me know if you disagree.
Commit ae0e13733d (parent 2c379cba71), authored by Kira on 2024-08-28 22:53:56 +02:00 and committed by GitHub.
5 changed files with 281 additions and 49 deletions.

@@ -1,5 +1,5 @@
```rust
use crate::{
    lex::{is_assignment_operator, lex, lex_signature},
    lex::{is_assignment_operator, lex, lex_n_tokens, lex_signature, LexState},
    lite_parser::{lite_parse, LiteCommand, LitePipeline, LiteRedirection, LiteRedirectionTarget},
    parse_keywords::*,
    parse_patterns::parse_pattern,
```

@@ -5650,6 +5650,49 @@ pub fn parse_builtin_commands(
```rust
    }
}

fn check_record_key_or_value(
    working_set: &StateWorkingSet,
    expr: &Expression,
    position: &str,
) -> Option<ParseError> {
    let bareword_error = |string_value: &Expression| {
        working_set
            .get_span_contents(string_value.span)
            .iter()
            .find_position(|b| **b == b':')
            .map(|(i, _)| {
                let colon_position = i + string_value.span.start;
                ParseError::InvalidLiteral(
                    "colon".to_string(),
                    format!("bare word specifying record {}", position),
                    Span::new(colon_position, colon_position + 1),
                )
            })
    };
    let value_span = working_set.get_span_contents(expr.span);
    match expr.expr {
        Expr::String(_) => {
            if ![b'"', b'\'', b'`'].contains(&value_span[0]) {
                bareword_error(expr)
            } else {
                None
            }
        }
        Expr::StringInterpolation(ref expressions) => {
            if value_span[0] != b'$' {
                expressions
                    .iter()
                    .filter(|expr| matches!(expr.expr, Expr::String(_)))
                    .filter_map(bareword_error)
                    .next()
            } else {
                None
            }
        }
        _ => None,
    }
}

pub fn parse_record(working_set: &mut StateWorkingSet, span: Span) -> Expression {
    let bytes = working_set.get_span_contents(span);
```

@@ -5670,9 +5713,32 @@ pub fn parse_record(working_set: &mut StateWorkingSet, span: Span) -> Expression
```rust
    }
    let inner_span = Span::new(start, end);
    let source = working_set.get_span_contents(inner_span);
    let (tokens, err) = lex(source, start, &[b'\n', b'\r', b','], &[b':'], true);
    let mut lex_state = LexState {
        input: working_set.get_span_contents(inner_span),
        output: Vec::new(),
        error: None,
        span_offset: start,
    };
    let mut lex_n = |additional_whitespace, special_tokens, max_tokens| {
        lex_n_tokens(
            &mut lex_state,
            additional_whitespace,
            special_tokens,
            true,
            max_tokens,
        )
    };
    loop {
        if lex_n(&[b'\n', b'\r', b','], &[b':'], 2) < 2 {
            break;
        };
        if lex_n(&[b'\n', b'\r', b','], &[], 1) < 1 {
            break;
        };
    }
    let (tokens, err) = (lex_state.output, lex_state.error);
    if let Some(err) = err {
        working_set.error(err);
    }
```

@@ -5716,7 +5782,22 @@ pub fn parse_record(working_set: &mut StateWorkingSet, span: Span) -> Expression
```rust
            ));
        } else {
            // Normal key-value pair
            let field = parse_value(working_set, curr_span, &SyntaxShape::Any);
            let field_token = &tokens[idx];
            let field = if field_token.contents != TokenContents::Item {
                working_set.error(ParseError::Expected(
                    "item in record key position",
                    Span::new(field_token.span.start, field_token.span.end),
                ));
                garbage(working_set, curr_span)
            } else {
                let field = parse_value(working_set, curr_span, &SyntaxShape::Any);
                if let Some(error) = check_record_key_or_value(working_set, &field, "key") {
                    working_set.error(error);
                    garbage(working_set, field.span)
                } else {
                    field
                }
            };
            idx += 1;
            if idx == tokens.len() {
```

@@ -5761,7 +5842,26 @@ pub fn parse_record(working_set: &mut StateWorkingSet, span: Span) -> Expression
```rust
                ));
                break;
            }
            let value = parse_value(working_set, tokens[idx].span, &SyntaxShape::Any);
            let value_token = &tokens[idx];
            let value = if value_token.contents != TokenContents::Item {
                working_set.error(ParseError::Expected(
                    "item in record value position",
                    Span::new(value_token.span.start, value_token.span.end),
                ));
                garbage(
                    working_set,
                    Span::new(value_token.span.start, value_token.span.end),
                )
            } else {
                let value = parse_value(working_set, tokens[idx].span, &SyntaxShape::Any);
                if let Some(parse_error) = check_record_key_or_value(working_set, &value, "value") {
                    working_set.error(parse_error);
                    garbage(working_set, value.span)
                } else {
                    value
                }
            };
            idx += 1;
            if let Some(field) = field.as_string() {
```
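
With these parser changes in place, the round trip from the description works end to end; a quick way to check (assuming the default nuon serialization of datetimes):

```nu
# After this PR: the datetime literal produced by `to nuon` is accepted in
# record value position, so the record parses back and `get a` yields a datetime.
{ a: 2024-08-13T22:11:09 } | to nuon | from nuon | get a
```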