Support extended unicode escapes in strings: "\u{10fff}" (#7883)

# Description Support extended unicode escapes in strings with the same syntax as Rust: `"\u{6e}"`. # User-Facing Changes New syntax in string literals, `\u{NNNNNN}`, to go along with the existing `\uNNNN`. New syntax accepts 1-6 hex digits and rejects values greater than 0x10FFFF (max Unicode char). _(List of all changes that impact the user experience here. This helps us keep track of breaking changes.)_ Won't break existing scripts, since this is new syntax. We might consider deprecating `char -u`, since users can now embed unicode chars > 0xFFFF with the new escape. # Tests + Formatting Several unit tests and one integration test added. - [x] `cargo fmt --all -- --check` to check standard code formatting (`cargo fmt --all` applies these changes) Done - [x] `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used -A clippy::needless_collect` to check that you're using the standard code style Done - [x] `cargo test --workspace` to check that all tests pass Done # After Submitting - [ ] If your PR had any user-facing changes, update [the documentation](https://github.com/nushell/nushell.github.io) after the PR is merged, if necessary. This will help us keep the docs up to date.
2023-01-28 15:25:53 -05:00
parent 2a39332d51
commit e616b2e247
4 changed files with 146 additions and 64 deletions
--- a/crates/nu-parser/src/parser.rs
+++ b/crates/nu-parser/src/parser.rs
@ -2465,7 +2465,7 @@ pub fn unescape_string(bytes: &[u8], span: Span) -> (Vec<u8>, Option<ParseError>
    let mut idx = 0;
    let mut err = None;

-    while idx < bytes.len() {
+    'us_loop: while idx < bytes.len() {
        if bytes[idx] == b'\\' {
            // We're in an escape
            idx += 1;
@ -2552,53 +2552,67 @@ pub fn unescape_string(bytes: &[u8], span: Span) -> (Vec<u8>, Option<ParseError>
                    idx += 1;
                }
                Some(b'u') => {
-                    match (
-                        bytes.get(idx + 1),
-                        bytes.get(idx + 2),
-                        bytes.get(idx + 3),
-                        bytes.get(idx + 4),
-                    ) {
-                        (Some(h1), Some(h2), Some(h3), Some(h4)) => {
-                            let s = String::from_utf8(vec![*h1, *h2, *h3, *h4]);
+                    let mut digits = String::with_capacity(10);
+                    let mut cur_idx = idx + 1; // index of first beyond current end of token

-                            if let Ok(s) = s {
-                                let int = u32::from_str_radix(&s, 16);
-
-                                if let Ok(int) = int {
-                                    let result = char::from_u32(int);
-
-                                    if let Some(result) = result {
-                                        let mut buffer = vec![0; 4];
-                                        let result = result.encode_utf8(&mut buffer);
-
-                                        for elem in result.bytes() {
-                                            output.push(elem);
-                                        }
-
-                                        idx += 5;
-                                        continue;
-                                    }
+                    if let Some(b'{') = bytes.get(idx + 1) {
+                        cur_idx = idx + 2;
+                        loop {
+                            match bytes.get(cur_idx) {
+                                Some(b'}') => {
+                                    cur_idx += 1;
+                                    break;
+                                }
+                                Some(c) => {
+                                    digits.push(*c as char);
+                                    cur_idx += 1;
+                                }
+                                _ => {
+                                    err = Some(ParseError::Expected(
+                                        "closing '}' in unicode escape `\\u{n..}`".into(),
+                                        Span::new(span.start + idx, span.end),
+                                    ));
+                                    break 'us_loop;
                                }
                            }
-                            err = Some(ParseError::Expected(
-                                "unicode hex value".into(),
-                                Span::new(span.start + idx, span.end),
-                            ));
-                        }
-                        _ => {
-                            err = Some(ParseError::Expected(
-                                "unicode hex value".into(),
-                                Span::new(span.start + idx, span.end),
-                            ));
                        }
                    }
-                    idx += 5;
+
+                    if (1..=6).contains(&digits.len()) {
+                        let int = u32::from_str_radix(&digits, 16);
+
+                        if let Ok(int) = int {
+                            if int <= 0x10ffff {
+                                let result = char::from_u32(int);
+
+                                if let Some(result) = result {
+                                    let mut buffer = vec![0; 4];
+                                    let result = result.encode_utf8(&mut buffer);
+
+                                    for elem in result.bytes() {
+                                        output.push(elem);
+                                    }
+
+                                    idx = cur_idx;
+                                    continue 'us_loop;
+                                }
+                            }
+                        }
+                    }
+                    // fall through -- escape not accepted above, must be error.
+                    err = Some(ParseError::Expected(
+                        "unicode escape \\u{n..}".into(),
+                        Span::new(span.start + idx, span.end),
+                    ));
+                    break 'us_loop;
                }
+
                _ => {
                    err = Some(ParseError::Expected(
                        "supported escape character".into(),
                        Span::new(span.start + idx, span.end),
                    ));
+                    break 'us_loop;
                }
            }
        } else {