2024-11-29 21:46:58 +01:00
|
|
|
#![allow(clippy::byte_char_slices)]
|
|
|
|
|
Fix parsing record values containing colons (#13413)
This PR is an attempt to fix #8257 and fix #10985 (which is
duplicate-ish)
# Description
The parser currently doesn't know how to deal with colons appearing
while lexing whitespace-terminated tokens specifying a record value.
Most notably, this means you can't use datetime literals in record value
position (and as a consequence, `| to nuon | from nuon` roundtrips can
fail), but it also means that bare words containing colons cause a
non-useful error message.
![image](https://github.com/user-attachments/assets/f04a8417-ee18-44e7-90eb-a0ecef943a0f)
`parser::parse_record` calls `lex::lex` with the `:` colon character in
the `special_tokens` argument. This allows colons to terminate record
keys, but as a side effect, it also causes colons to terminate record
*values*. I added a new function `lex::lex_n_tokens`, which allows the
caller to drive the lexing process more explicitly, and used it in
`parser::parse_record` to let colons terminate record keys while not
giving them special treatment when appearing in record values.
This PR description previously said: *Another approach suggested in one
of the issues was to support an additional datetime literal format that
doesn't require colons. I like that that wouldn't require new
`lex::lex_internal` behaviour, but an advantage of my approach is that
it also newly allows for string record values given as bare words
containing colons. I think this eliminates another possible source of
confusion.* It was determined that this is undesirable, and in the
current state of this PR, bare word record values with colons are
rejected explicitly. The better error message is still a win.
# User-Facing Changes
In addition to the above, this PR also disables the use of "special"
(non-item) tokens in record key and value position, and the use of a
single bare `:` as a record key.
Examples of behaviour *before* this PR:
```nu
{ a: b } # Valid, same as { 'a': 'b' }
{ a: b:c } # Error: expected ':'
{ a: 2024-08-13T22:11:09 } # Error: expected ':'
{ :: 1 } # Valid, same as { ':': 1 }
{ ;: 1 } # Valid, same as { ';': 1 }
{ a: || } # Valid, same as { 'a': '||' }
```
Examples of behaviour *after* this PR:
```nu
{ a: b } # (Unchanged) Valid, same as { 'a': 'b' }
{ a: b:c } # Error: colon in bare word specifying record value
{ a: 2024-08-13T22:11:09 } # Valid, same as { a: (2024-08-13T22:11:09) }
{ :: 1 } # Error: colon in bare word specifying record key
{ ;: 1 } # Error: expected item in record key position
{ a: || } # Error: expected item in record value position
```
# Tests + Formatting
I added tests, but I'm not sure if they're sufficient and in the right
place.
# After Submitting
I don't think documentation changes are needed for this, but please let
me know if you disagree.
2024-08-28 22:53:56 +02:00
|
|
|
use nu_parser::{lex, lex_n_tokens, lex_signature, LexState, Token, TokenContents};
|
2023-04-07 02:35:45 +02:00
|
|
|
use nu_protocol::{ParseError, Span};
|
2021-08-30 20:36:07 +02:00
|
|
|
|
|
|
|
// A plain `let` statement must lex without reporting an error.
#[test]
fn lex_basic() {
    let source = b"let x = 4";

    let (_tokens, error) = lex(source, 0, &[], &[], true);

    assert!(error.is_none());
}
|
|
|
|
|
|
|
|
// The newline separating the two statements must show up as an `Eol` token
// covering exactly the byte at offset 11.
#[test]
fn lex_newline() {
    let source = b"let x = 300\nlet y = 500;";

    let (tokens, _error) = lex(source, 0, &[], &[], true);

    let expected_eol = Token {
        contents: TokenContents::Eol,
        span: Span::new(11, 12),
    };
    assert!(tokens.contains(&expected_eol));
}
|
|
|
|
|
2023-03-24 12:54:06 +01:00
|
|
|
// `name: list<string>` must lex cleanly into three tokens in signature mode.
#[test]
fn lex_annotations_list() {
    let extra_whitespace: &[u8] = b"\n\r";
    let special_tokens: &[u8] = b":=,";
    let source = b"items: list<string>";

    let (tokens, error) = lex_signature(source, 0, extra_whitespace, special_tokens, false);

    assert!(error.is_none());
    assert_eq!(3, tokens.len());
}
|
|
|
|
|
|
|
|
// A record annotation containing its own `:` and a space must still produce
// three tokens and no error.
#[test]
fn lex_annotations_record() {
    let extra_whitespace: &[u8] = b"\n\r";
    let special_tokens: &[u8] = b":=,";
    let source = b"config: record<name: string>";

    let (tokens, error) = lex_signature(source, 0, extra_whitespace, special_tokens, false);

    assert!(error.is_none());
    assert_eq!(3, tokens.len());
}
|
|
|
|
|
|
|
|
// Empty angle brackets (`list<>`) are accepted by the lexer: three tokens, no error.
#[test]
fn lex_annotations_empty() {
    let extra_whitespace: &[u8] = b"\n\r";
    let special_tokens: &[u8] = b":=,";
    let source = b"items: list<>";

    let (tokens, error) = lex_signature(source, 0, extra_whitespace, special_tokens, false);

    assert!(error.is_none());
    assert_eq!(3, tokens.len());
}
|
|
|
|
|
|
|
|
// A space between `list` and `<string>` yields one extra token (four instead
// of three), still without an error.
#[test]
fn lex_annotations_space_before_annotations() {
    let extra_whitespace: &[u8] = b"\n\r";
    let special_tokens: &[u8] = b":=,";
    let source = b"items: list <string>";

    let (tokens, error) = lex_signature(source, 0, extra_whitespace, special_tokens, false);

    assert!(error.is_none());
    assert_eq!(4, tokens.len());
}
|
|
|
|
|
|
|
|
// Spaces just inside the angle brackets (leading, trailing, or both) must not
// change the outcome: every variant lexes to three tokens without an error.
#[test]
fn lex_annotations_space_within_annotations() {
    let extra_whitespace: &[u8] = b"\n\r";
    let special_tokens: &[u8] = b":=,";
    let variants: [&[u8]; 3] = [
        b"items: list< string>",
        b"items: list<string >",
        b"items: list< string >",
    ];

    for source in variants.iter() {
        let (tokens, error) = lex_signature(source, 0, extra_whitespace, special_tokens, false);

        assert!(error.is_none());
        assert_eq!(3, tokens.len());
    }
}
|
|
|
|
|
|
|
|
// Nested angle brackets (`list<record<...>>`) lex to three tokens with no error.
#[test]
fn lex_annotations_nested() {
    let extra_whitespace: &[u8] = b"\n\r";
    let special_tokens: &[u8] = b":=,";
    let source = b"items: list<record<name: string>>";

    let (tokens, error) = lex_signature(source, 0, extra_whitespace, special_tokens, false);

    assert!(error.is_none());
    assert_eq!(3, tokens.len());
}
|
|
|
|
|
|
|
|
// With the outer `>` missing, the lexer must report `UnexpectedEof` while
// still returning three tokens.
#[test]
fn lex_annotations_nested_unterminated() {
    let extra_whitespace: &[u8] = b"\n\r";
    let special_tokens: &[u8] = b":=,";
    let source = b"items: list<record<name: string>";

    let (tokens, error) = lex_signature(source, 0, extra_whitespace, special_tokens, false);

    assert!(matches!(error, Some(ParseError::UnexpectedEof(_, _))));
    assert_eq!(3, tokens.len());
}
|
|
|
|
|
|
|
|
// A missing closing `>` on a simple annotation is reported as `UnexpectedEof`;
// three tokens are still produced.
#[test]
fn lex_annotations_unterminated() {
    let extra_whitespace: &[u8] = b"\n\r";
    let special_tokens: &[u8] = b":=,";
    let source = b"items: list<string";

    let (tokens, error) = lex_signature(source, 0, extra_whitespace, special_tokens, false);

    assert!(matches!(error, Some(ParseError::UnexpectedEof(_, _))));
    assert_eq!(3, tokens.len());
}
|
|
|
|
|
2021-08-30 20:36:07 +02:00
|
|
|
// Empty input produces neither tokens nor an error.
#[test]
fn lex_empty() {
    let source = b"";

    let (tokens, error) = lex(source, 0, &[], &[], true);

    assert!(tokens.is_empty());
    assert!(error.is_none());
}
|
|
|
|
|
|
|
|
// The lexer treats a complete parenthesised expression, nested parentheses
// included, as one `Item`.
#[test]
fn lex_parenthesis() {
    let source = b"let x = (300 + (322 * 444));";

    let (tokens, _error) = lex(source, 0, &[], &[], true);

    // Bytes 8..27 run from the opening `(` through the matching `)`.
    let whole_expression = Token {
        contents: TokenContents::Item,
        span: Span::new(8, 27),
    };
    assert_eq!(tokens.get(3), Some(&whole_expression));
}
|
|
|
|
|
|
|
|
// With comment skipping disabled, the trailing comment is emitted as its own
// token whose span runs from the `#` up to the newline.
#[test]
fn lex_comment() {
    let source = b"let x = 300 # a comment \n $x + 444";

    let (tokens, _error) = lex(source, 0, &[], &[], false);

    let expected_comment = Token {
        contents: TokenContents::Comment,
        span: Span::new(12, 24),
    };
    assert_eq!(tokens.get(4), Some(&expected_comment));
}
|
|
|
|
|
2024-12-13 14:02:07 +01:00
|
|
|
// A `#` glued to the preceding word (`test#testing`) must lex without error.
#[test]
fn lex_not_comment_needs_space_in_front_of_hashtag() {
    let source = b"1..10 | each {echo test#testing }";

    let (_tokens, error) = lex(source, 0, &[], &[], false);

    assert!(error.is_none());
}
|
|
|
|
|
|
|
|
// When the `#` is preceded by a space, the lexer must report an unexpected
// end of input for a missing `}` at the last byte of the source.
#[test]
fn lex_comment_with_space_in_front_of_hashtag() {
    let source = b"1..10 | each {echo test #testing }";

    let (_tokens, error) = lex(source, 0, &[], &[], false);

    match error {
        Some(ParseError::UnexpectedEof(missing_token, span)) => {
            assert!(missing_token == "}");
            assert!(span == Span::new(33, 34));
        }
        Some(_) => panic!("expected an UnexpectedEof error, got a different error"),
        None => panic!("expected an UnexpectedEof error, got none"),
    }
}
|
|
|
|
|
2021-08-30 20:36:07 +02:00
|
|
|
// A pipe followed directly by `;` is rejected with `ExtraTokens`.
#[test]
fn lex_is_incomplete() {
    let source = b"let x = 300 | ;";

    let (_tokens, error) = lex(source, 0, &[], &[], true);

    assert!(matches!(error, Some(ParseError::ExtraTokens(_))));
}
|
|
|
|
|
|
|
|
// An unclosed outer parenthesis is reported as `UnexpectedEof` naming `)`.
#[test]
fn lex_incomplete_paren() {
    let source = b"let x = (300 + ( 4 + 1)";

    let (_tokens, error) = lex(source, 0, &[], &[], true);

    assert!(matches!(
        error,
        Some(ParseError::UnexpectedEof(expected, _)) if expected == ")"
    ));
}
|
|
|
|
|
|
|
|
// An unterminated single-quoted string is reported as `UnexpectedEof` naming `'`.
#[test]
fn lex_incomplete_quote() {
    let source = b"let x = '300 + 4 + 1";

    let (_tokens, error) = lex(source, 0, &[], &[], true);

    assert!(matches!(
        error,
        Some(ParseError::UnexpectedEof(expected, _)) if expected == "'"
    ));
}
|
2021-08-31 21:33:41 +02:00
|
|
|
|
fix: fix lexing of comments, such that a#b becomes a coherent Item (#8151)
# Description
Previously `nix run nixpkgs#hello` was lexed as `Item, Item, Item,
Comment`, however, `#hello` is *not* supposed to be a comment here and
should be parsed as part of the third `Item`.
This change introduces this behavior by not interrupting the parse of
the current token upon seeing a `#`.
Thank you so much for considering this, I think many `nix` users will be
grateful for this change and I think this will lead to more adoption
in the ecosystem.
- closes #8137 and #6335
# User-Facing Changes
- code like `somecode# bla` and `somecode#bla` will not be parsed as
`somecode, comment` but as `somecode#bla`, hence this is a breaking
change for all users who didn't put a space before a comment introducing
token (`#`)
# Tests + Formatting
I've added tests that cover this behavior in `test_lex.rs`
- [x] `cargo fmt --all -- --check` to check standard code formatting
(`cargo fmt --all` applies these changes)
- [x] `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used -A
clippy::needless_collect` to check that you're using the standard code
style
- [x] `cargo test --workspace` to check that all tests pass
# After Submitting
> If your PR had any user-facing changes, update [the
documentation](https://github.com/nushell/nushell.github.io) after the
PR is merged, if necessary. This will help us keep the docs up to date.
I think this is expected behavior in most other shells, so the
documentation was lacking for not documenting the unexpected behavior
before and hence now is automatically more complete >D
2023-02-22 13:59:47 +01:00
|
|
|
// Checks how `#` is treated when it appears inside a word versus after
// whitespace. The lexed source is:
//
//   let z = 42 #the comment
//    let x#y = 69 #hello
//    let flk = nixpkgs#hello #hello
//
// `x#y` and `nixpkgs#hello` must each stay a single `Item`, while every `#`
// that follows a space opens a `Comment`.
#[test]
fn lex_comments_no_space() {
    let source = b"let z = 42 #the comment \n let x#y = 69 #hello \n let flk = nixpkgs#hello #hello";
    let (tokens, _error) = lex(source, 0, &[], &[], false);

    let item = |start: usize, end: usize| Token {
        contents: TokenContents::Item,
        span: Span::new(start, end),
    };
    let comment = |start: usize, end: usize| Token {
        contents: TokenContents::Comment,
        span: Span::new(start, end),
    };

    // (index into the token stream, token expected at that index)
    let expectations = vec![
        (4, comment(11, 24)),
        (7, item(30, 33)),
        (10, comment(39, 46)),
        (15, item(58, 71)),
        (16, comment(72, 78)),
    ];

    for (index, expected) in expectations.iter() {
        assert_eq!(tokens.get(*index), Some(expected));
    }
}
|
|
|
|
|
2021-08-31 21:33:41 +02:00
|
|
|
// A comment must not swallow the end-of-line token that follows it. The lexed
// source is:
//
//   let z = 4 #comment
//    let x = 4 # comment
//    let y = 1 # comment
#[test]
fn lex_comments() {
    let source = b"let z = 4 #comment \n let x = 4 # comment\n let y = 1 # comment";
    let (tokens, _error) = lex(source, 0, &[], &[], false);

    let comment = |start: usize, end: usize| Token {
        contents: TokenContents::Comment,
        span: Span::new(start, end),
    };
    let eol = |start: usize, end: usize| Token {
        contents: TokenContents::Eol,
        span: Span::new(start, end),
    };

    // (index into the token stream, token expected at that index)
    let expectations = vec![
        // First line: the comment span takes in the trailing space before the
        // newline, and the EOL token follows it.
        (4, comment(10, 19)),
        (5, eol(19, 20)),
        // Second line: no space between the comment text and the newline, so
        // the comment span ends exactly where the EOL span begins.
        (10, comment(31, 40)),
        (11, eol(40, 41)),
    ];

    for (index, expected) in expectations.iter() {
        assert_eq!(tokens.get(*index), Some(expected));
    }
}
|
Fix parsing record values containing colons (#13413)
This PR is an attempt to fix #8257 and fix #10985 (which is
duplicate-ish)
# Description
The parser currently doesn't know how to deal with colons appearing
while lexing whitespace-terminated tokens specifying a record value.
Most notably, this means you can't use datetime literals in record value
position (and as a consequence, `| to nuon | from nuon` roundtrips can
fail), but it also means that bare words containing colons cause a
non-useful error message.
![image](https://github.com/user-attachments/assets/f04a8417-ee18-44e7-90eb-a0ecef943a0f)
`parser::parse_record` calls `lex::lex` with the `:` colon character in
the `special_tokens` argument. This allows colons to terminate record
keys, but as a side effect, it also causes colons to terminate record
*values*. I added a new function `lex::lex_n_tokens`, which allows the
caller to drive the lexing process more explicitly, and used it in
`parser::parse_record` to let colons terminate record keys while not
giving them special treatment when appearing in record values.
This PR description previously said: *Another approach suggested in one
of the issues was to support an additional datetime literal format that
doesn't require colons. I like that that wouldn't require new
`lex::lex_internal` behaviour, but an advantage of my approach is that
it also newly allows for string record values given as bare words
containing colons. I think this eliminates another possible source of
confusion.* It was determined that this is undesirable, and in the
current state of this PR, bare word record values with colons are
rejected explicitly. The better error message is still a win.
# User-Facing Changes
In addition to the above, this PR also disables the use of "special"
(non-item) tokens in record key and value position, and the use of a
single bare `:` as a record key.
Examples of behaviour *before* this PR:
```nu
{ a: b } # Valid, same as { 'a': 'b' }
{ a: b:c } # Error: expected ':'
{ a: 2024-08-13T22:11:09 } # Error: expected ':'
{ :: 1 } # Valid, same as { ':': 1 }
{ ;: 1 } # Valid, same as { ';': 1 }
{ a: || } # Valid, same as { 'a': '||' }
```
Examples of behaviour *after* this PR:
```nu
{ a: b } # (Unchanged) Valid, same as { 'a': 'b' }
{ a: b:c } # Error: colon in bare word specifying record value
{ a: 2024-08-13T22:11:09 } # Valid, same as { a: (2024-08-13T22:11:09) }
{ :: 1 } # Error: colon in bare word specifying record key
{ ;: 1 } # Error: expected item in record key position
{ a: || } # Error: expected item in record value position
```
# Tests + Formatting
I added tests, but I'm not sure if they're sufficient and in the right
place.
# After Submitting
I don't think documentation changes are needed for this, but please let
me know if you disagree.
2024-08-28 22:53:56 +02:00
|
|
|
|
|
|
|
// Drives the lexer step by step through `lex_n_tokens`, checking both the
// returned token-count delta and the accumulated output after each step.
#[test]
fn lex_manually() {
    // Starting span offset; spans must be shifted back by this amount before
    // they can index into `source`.
    const OFFSET: usize = 10;

    let source = b"'a'\n#comment\n#comment again\n| continue";
    let mut state = LexState {
        input: source,
        output: Vec::new(),
        error: None,
        span_offset: OFFSET,
    };

    // Ask for a single token first.
    let first_step = lex_n_tokens(&mut state, &[], &[], false, 1);
    assert_eq!(first_step, 1);
    assert_eq!(state.output.len(), 1);

    // Then five more, for six in total.
    let second_step = lex_n_tokens(&mut state, &[], &[], false, 5);
    assert_eq!(second_step, 5);
    assert_eq!(state.output.len(), 6);

    // The pipe comes next. Handling it as a line continuation drops EOL
    // tokens, and the input is exhausted before the lexer can make up for
    // them, so the output ends up one token shorter than before.
    let third_step = lex_n_tokens(&mut state, &[], &[], false, 1);
    assert_eq!(third_step, -1);
    assert_eq!(state.output.len(), 5);

    // The whole input has been consumed.
    assert_eq!(source.len(), state.span_offset - OFFSET);

    // The final token is the trailing `continue` word.
    let final_span = state.output.last().unwrap().span;
    let final_bytes = &source[final_span.start - OFFSET..final_span.end - OFFSET];
    assert_eq!(final_bytes, b"continue");
}
|