Fix parsing record values containing colons (#13413)

This PR is an attempt to fix #8257 and fix #10985 (which is
duplicate-ish)

# Description
The parser currently doesn't know how to deal with colons appearing
while lexing whitespace-terminated tokens specifying a record value.
Most notably, this means you can't use datetime literals in record value
position (and as a consequence, `| to nuon | from nuon` roundtrips can
fail), but it also means that bare words containing colons produce an
unhelpful error message.

![image](https://github.com/user-attachments/assets/f04a8417-ee18-44e7-90eb-a0ecef943a0f)
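To make the failure mode concrete, here is a minimal sketch of the old behaviour, assuming `lex`'s current signature (input, span offset, additional whitespace, special tokens, skip-comment flag); it is illustrative, not code from this PR:

```rust
use nu_parser::lex;

fn main() {
    // With b':' in special_tokens (which parse_record passed while lexing
    // the whole record body), lexing terminates at every colon, so the
    // datetime literal falls apart into several tokens instead of staying
    // one record value.
    let (tokens, err) = lex(b"2024-08-13T22:11:09", 0, &[], &[b':'], true);
    assert!(err.is_none());
    for token in &tokens {
        println!("{:?} {:?}", token.contents, token.span);
    }
}
```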

`parser::parse_record` calls `lex::lex` with the `:` colon character in
the `special_tokens` argument. This allows colons to terminate record
keys, but as a side effect, it also causes colons to terminate record
*values*. I added a new function `lex::lex_n_tokens`, which allows the
caller to drive the lexing process more explicitly, and used it in
`parser::parse_record` to let colons terminate record keys while not
giving them special treatment when appearing in record values.
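Roughly, the idea can be pictured as below. This is a hedged sketch rather than the PR's actual `parse_record` code: `lex_record_tokens` is a hypothetical helper, the delimiter set is invented for illustration, and `lex_n_tokens`'s argument order is assumed to mirror `lex` (additional whitespace before special tokens):

```rust
use nu_parser::{lex_n_tokens, LexState};

// Hypothetical helper: lex a record body so that ':' is only treated as
// special while reading a key, never while reading a value.
fn lex_record_tokens(state: &mut LexState) {
    loop {
        // Key plus the ':' terminating it: lex up to two tokens with ':'
        // in the special set.
        if lex_n_tokens(state, &[b'\n', b'\r', b','], &[b':'], true, 2) < 2 {
            break; // input exhausted
        }
        // Value: lex one token *without* ':' in the special set, so colons
        // inside the value (e.g. a datetime literal) stay in a single token.
        if lex_n_tokens(state, &[b'\n', b'\r', b','], &[], true, 1) < 1 {
            break;
        }
    }
    // state.output now holds the tokens; state.error holds any lex error.
}
```

The caller constructs a `LexState` (as in the `lex_manually` test below) and then reads the accumulated tokens out of `state.output`.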

This PR description previously said: *Another approach suggested in one
of the issues was to support an additional datetime literal format that
doesn't require colons. I like that it wouldn't require new
`lex::lex_internal` behaviour, but an advantage of my approach is that
it also newly allows string record values given as bare words
containing colons. I think this eliminates another possible source of
confusion.* It was determined that this is undesirable, and in the
current state of this PR, bare word record values with colons are
rejected explicitly. The better error message is still a win.

# User-Facing Changes
In addition to the above, this PR also disables the use of "special"
(non-item) tokens in record key and value position, and the use of a
single bare `:` as a record key.

Examples of behaviour *before* this PR:
```nu
{ a: b } # Valid, same as { 'a': 'b' }
{ a: b:c } # Error: expected ':'
{ a: 2024-08-13T22:11:09 } # Error: expected ':'
{ :: 1 } # Valid, same as { ':': 1 }
{ ;: 1 } # Valid, same as { ';': 1 }
{ a: || } # Valid, same as { 'a': '||' }
```

Examples of behaviour *after* this PR:
```nu
{ a: b } # (Unchanged) Valid, same as { 'a': 'b' }
{ a: b:c } # Error: colon in bare word specifying record value
{ a: 2024-08-13T22:11:09 } # Valid, same as { a: (2024-08-13T22:11:09) }
{ :: 1 } # Error: colon in bare word specifying record key
{ ;: 1 } # Error: expected item in record key position
{ a: || } # Error: expected item in record value position
```

# Tests + Formatting
I added tests, but I'm not sure if they're sufficient and in the right
place.

# After Submitting
I don't think documentation changes are needed for this, but please let
me know if you disagree.
Commit ae0e13733d (parent 2c379cba71), authored by Kira on 2024-08-28 22:53:56 +02:00 and committed by GitHub. 5 changed files with 281 additions and 49 deletions.


```diff
@@ -1,4 +1,4 @@
-use nu_parser::{lex, lex_signature, Token, TokenContents};
+use nu_parser::{lex, lex_n_tokens, lex_signature, LexState, Token, TokenContents};
 use nu_protocol::{ParseError, Span};
 
 #[test]
```
```diff
@@ -281,3 +281,26 @@ fn lex_comments() {
         }
     );
 }
+
+#[test]
+fn lex_manually() {
+    let file = b"'a'\n#comment\n#comment again\n| continue";
+    let mut lex_state = LexState {
+        input: file,
+        output: Vec::new(),
+        error: None,
+        span_offset: 10,
+    };
+    assert_eq!(lex_n_tokens(&mut lex_state, &[], &[], false, 1), 1);
+    assert_eq!(lex_state.output.len(), 1);
+    assert_eq!(lex_n_tokens(&mut lex_state, &[], &[], false, 5), 5);
+    assert_eq!(lex_state.output.len(), 6);
+    // Next token is the pipe.
+    // This shortens the output because it exhausts the input before it can
+    // compensate for the EOL tokens lost to the line continuation.
+    assert_eq!(lex_n_tokens(&mut lex_state, &[], &[], false, 1), -1);
+    assert_eq!(lex_state.output.len(), 5);
+    assert_eq!(file.len(), lex_state.span_offset - 10);
+    let last_span = lex_state.output.last().unwrap().span;
+    assert_eq!(&file[last_span.start - 10..last_span.end - 10], b"continue");
+}
```


```diff
@@ -2478,3 +2478,56 @@ mod operator {
         );
     }
 }
+
+mod record {
+    use super::*;
+
+    use nu_protocol::ast::RecordItem;
+
+    #[rstest]
+    #[case(b"{ :: x }", "Invalid literal")] // Key is bare colon
+    #[case(b"{ a: x:y }", "Invalid literal")] // Value is bare word with colon
+    #[case(b"{ a: x('y'):z }", "Invalid literal")] // Value is bare string interpolation with colon
+    #[case(b"{ ;: x }", "Parse mismatch during operation.")] // Key is a non-item token
+    #[case(b"{ a: || }", "Parse mismatch during operation.")] // Value is a non-item token
+    fn refuse_confusing_record(#[case] expr: &[u8], #[case] error: &str) {
+        dbg!(String::from_utf8_lossy(expr));
+        let engine_state = EngineState::new();
+        let mut working_set = StateWorkingSet::new(&engine_state);
+        parse(&mut working_set, None, expr, false);
+        assert_eq!(
+            working_set.parse_errors.first().map(|e| e.to_string()),
+            Some(error.to_string())
+        );
+    }
+
+    #[rstest]
+    #[case(b"{ a: 2024-07-23T22:54:54.532100627+02:00 b:xy }")]
+    fn parse_datetime_in_record(#[case] expr: &[u8]) {
+        dbg!(String::from_utf8_lossy(expr));
+        let engine_state = EngineState::new();
+        let mut working_set = StateWorkingSet::new(&engine_state);
+        let block = parse(&mut working_set, None, expr, false);
+        assert!(working_set.parse_errors.first().is_none());
+        let pipeline_el_expr = &block
+            .pipelines
+            .first()
+            .unwrap()
+            .elements
+            .first()
+            .unwrap()
+            .expr
+            .expr;
+        dbg!(pipeline_el_expr);
+        match pipeline_el_expr {
+            Expr::FullCellPath(v) => match &v.head.expr {
+                Expr::Record(fields) => assert!(matches!(
+                    fields[0],
+                    RecordItem::Pair(_, Expression { ty: Type::Date, .. })
+                )),
+                _ => panic!("Expected record head"),
+            },
+            _ => panic!("Expected full cell path"),
+        }
+    }
+}
```