Support extended unicode escapes in strings: "\u{10fff}" (#7883)

# Description

Support extended Unicode escapes in strings, using the same syntax as Rust:
`"\u{6e}"`.

# User-Facing Changes

New syntax in string literals, `\u{NNNNNN}`, to go along with the
existing `\uNNNN`.
The new syntax accepts 1-6 hex digits and rejects values greater than
0x10FFFF (the maximum Unicode code point).

_(List of all changes that impact the user experience here. This helps
us keep track of breaking changes.)_

This won't break existing scripts, since it is new syntax.  

We might consider deprecating `char -u`, since users can now embed
unicode chars > 0xFFFF with the new escape.

# Tests + Formatting

Several unit tests and one integration test added.

- [x] `cargo fmt --all -- --check` to check standard code formatting
(`cargo fmt --all` applies these changes)
Done
- [x] `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used -A
clippy::needless_collect` to check that you're using the standard code
style
Done
- [x] `cargo test --workspace` to check that all tests pass  
Done

# After Submitting

- [ ] If your PR had any user-facing changes, update [the
documentation](https://github.com/nushell/nushell.github.io) after the
PR is merged, if necessary. This will help us keep the docs up to date.
This commit is contained in:
Bob Hyman
2023-01-28 15:25:53 -05:00
committed by GitHub
parent 2a39332d51
commit e616b2e247
4 changed files with 146 additions and 64 deletions

View File

@ -2465,7 +2465,7 @@ pub fn unescape_string(bytes: &[u8], span: Span) -> (Vec<u8>, Option<ParseError>
let mut idx = 0;
let mut err = None;
while idx < bytes.len() {
'us_loop: while idx < bytes.len() {
if bytes[idx] == b'\\' {
// We're in an escape
idx += 1;
@ -2552,53 +2552,67 @@ pub fn unescape_string(bytes: &[u8], span: Span) -> (Vec<u8>, Option<ParseError>
idx += 1;
}
Some(b'u') => {
match (
bytes.get(idx + 1),
bytes.get(idx + 2),
bytes.get(idx + 3),
bytes.get(idx + 4),
) {
(Some(h1), Some(h2), Some(h3), Some(h4)) => {
let s = String::from_utf8(vec![*h1, *h2, *h3, *h4]);
let mut digits = String::with_capacity(10);
let mut cur_idx = idx + 1; // index of first beyond current end of token
if let Ok(s) = s {
let int = u32::from_str_radix(&s, 16);
if let Ok(int) = int {
let result = char::from_u32(int);
if let Some(result) = result {
let mut buffer = vec![0; 4];
let result = result.encode_utf8(&mut buffer);
for elem in result.bytes() {
output.push(elem);
}
idx += 5;
continue;
}
if let Some(b'{') = bytes.get(idx + 1) {
cur_idx = idx + 2;
loop {
match bytes.get(cur_idx) {
Some(b'}') => {
cur_idx += 1;
break;
}
Some(c) => {
digits.push(*c as char);
cur_idx += 1;
}
_ => {
err = Some(ParseError::Expected(
"closing '}' in unicode escape `\\u{n..}`".into(),
Span::new(span.start + idx, span.end),
));
break 'us_loop;
}
}
err = Some(ParseError::Expected(
"unicode hex value".into(),
Span::new(span.start + idx, span.end),
));
}
_ => {
err = Some(ParseError::Expected(
"unicode hex value".into(),
Span::new(span.start + idx, span.end),
));
}
}
idx += 5;
if (1..=6).contains(&digits.len()) {
let int = u32::from_str_radix(&digits, 16);
if let Ok(int) = int {
if int <= 0x10ffff {
let result = char::from_u32(int);
if let Some(result) = result {
let mut buffer = vec![0; 4];
let result = result.encode_utf8(&mut buffer);
for elem in result.bytes() {
output.push(elem);
}
idx = cur_idx;
continue 'us_loop;
}
}
}
}
// fall through -- escape not accepted above, must be error.
err = Some(ParseError::Expected(
"unicode escape \\u{n..}".into(),
Span::new(span.start + idx, span.end),
));
break 'us_loop;
}
_ => {
err = Some(ParseError::Expected(
"supported escape character".into(),
Span::new(span.start + idx, span.end),
));
break 'us_loop;
}
}
} else {