From e616b2e247a329489cc838edf2c8e3590d7efe2e Mon Sep 17 00:00:00 2001
From: Bob Hyman <bob.hyman@gmail.com>
Date: Sat, 28 Jan 2023 15:25:53 -0500
Subject: [PATCH] Support extended unicode escapes in strings: "\u{10fff}"
 (#7883)

# Description

Support extended unicode escapes in strings with the same syntax as Rust:
`"\u{6e}"`.

# User-Facing Changes

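New syntax in string literals, `\u{NNNNNN}`, which replaces the
existing `\uNNNN` form.
New syntax accepts 1-6 hex digits and rejects values greater than
0x10FFFF (max Unicode char).

_(List of all changes that impact the user experience here. This helps
us keep track of breaking changes.)_

Scripts that use only the new syntax are unaffected. However, the parser
change in this patch no longer accepts the old 4-digit `\uNNNN` escape
(its tests are dropped or rewritten below), so existing scripts that use
it must switch to `\u{NNNN}`.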

We might consider deprecating `char -u`, since users can now embed
unicode chars > 0xFFFF with the new escape.

# Tests + Formatting

Several unit tests and one integration test added.

- [x] `cargo fmt --all -- --check` to check standard code formatting
(`cargo fmt --all` applies these changes)
Done
- [x] `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used -A
clippy::needless_collect` to check that you're using the standard code
style
Done
- [x] `cargo test --workspace` to check that all tests pass
Done

# After Submitting

- [ ] If your PR had any user-facing changes, update [the
documentation](https://github.com/nushell/nushell.github.io) after the
PR is merged, if necessary. This will help us keep the docs up to date.
---
 crates/nu-parser/src/parser.rs                | 88 ++++++++++--------
 crates/nu-parser/tests/test_parser.rs         | 24 -----
 .../tests/test_parser_unicode_escapes.rs      | 91 +++++++++++++++++++
 src/tests/test_parser.rs                      |  7 +-
 4 files changed, 146 insertions(+), 64 deletions(-)
 create mode 100644 crates/nu-parser/tests/test_parser_unicode_escapes.rs
diff --git a/crates/nu-parser/src/parser.rs b/crates/nu-parser/src/parser.rs
index c809a3d6f..23049cc40 100644
--- a/crates/nu-parser/src/parser.rs
+++ b/crates/nu-parser/src/parser.rs
@@ -2465,7 +2465,7 @@ pub fn unescape_string(bytes: &[u8], span: Span) -> (Vec<u8>, Option<ParseError>
     let mut idx = 0;
     let mut err = None;
 
-    while idx < bytes.len() {
+    'us_loop: while idx < bytes.len() {
         if bytes[idx] == b'\\' {
             // We're in an escape
             idx += 1;
@@ -2552,53 +2552,67 @@ pub fn unescape_string(bytes: &[u8], span: Span) -> (Vec<u8>, Option<ParseError>
                     idx += 1;
                 }
                 Some(b'u') => {
-                    match (
-                        bytes.get(idx + 1),
-                        bytes.get(idx + 2),
-                        bytes.get(idx + 3),
-                        bytes.get(idx + 4),
-                    ) {
-                        (Some(h1), Some(h2), Some(h3), Some(h4)) => {
-                            let s = String::from_utf8(vec![*h1, *h2, *h3, *h4]);
+                    let mut digits = String::with_capacity(10);
+                    let mut cur_idx = idx + 1; // index of first beyond current end of token
 
-                            if let Ok(s) = s {
-                                let int = u32::from_str_radix(&s, 16);
-
-                                if let Ok(int) = int {
-                                    let result = char::from_u32(int);
-
-                                    if let Some(result) = result {
-                                        let mut buffer = vec![0; 4];
-                                        let result = result.encode_utf8(&mut buffer);
-
-                                        for elem in result.bytes() {
-                                            output.push(elem);
-                                        }
-
-                                        idx += 5;
-                                        continue;
-                                    }
+                    if let Some(b'{') = bytes.get(idx + 1) {
+                        cur_idx = idx + 2;
+                        loop {
+                            match bytes.get(cur_idx) {
+                                Some(b'}') => {
+                                    cur_idx += 1;
+                                    break;
+                                }
+                                Some(c) => {
+                                    digits.push(*c as char);
+                                    cur_idx += 1;
+                                }
+                                _ => {
+                                    err = Some(ParseError::Expected(
+                                        "closing '}' in unicode escape `\\u{n..}`".into(),
+                                        Span::new(span.start + idx, span.end),
+                                    ));
+                                    break 'us_loop;
                                 }
                             }
-                            err = Some(ParseError::Expected(
-                                "unicode hex value".into(),
-                                Span::new(span.start + idx, span.end),
-                            ));
-                        }
-                        _ => {
-                            err = Some(ParseError::Expected(
-                                "unicode hex value".into(),
-                                Span::new(span.start + idx, span.end),
-                            ));
                         }
                     }
-                    idx += 5;
+
+                    if (1..=6).contains(&digits.len()) {
+                        let int = u32::from_str_radix(&digits, 16);
+
+                        if let Ok(int) = int {
+                            if int <= 0x10ffff {
+                                let result = char::from_u32(int);
+
+                                if let Some(result) = result {
+                                    let mut buffer = vec![0; 4];
+                                    let result = result.encode_utf8(&mut buffer);
+
+                                    for elem in result.bytes() {
+                                        output.push(elem);
+                                    }
+
+                                    idx = cur_idx;
+                                    continue 'us_loop;
+                                }
+                            }
+                        }
+                    }
+                    // fall through -- escape not accepted above, must be error.
+                    err = Some(ParseError::Expected(
+                        "unicode escape \\u{n..}".into(),
+                        Span::new(span.start + idx, span.end),
+                    ));
+                    break 'us_loop;
                 }
+
                 _ => {
                     err = Some(ParseError::Expected(
                         "supported escape character".into(),
                         Span::new(span.start + idx, span.end),
                     ));
+                    break 'us_loop;
                 }
             }
         } else {
diff --git a/crates/nu-parser/tests/test_parser.rs b/crates/nu-parser/tests/test_parser.rs
index fd35599ea..78930a841 100644
--- a/crates/nu-parser/tests/test_parser.rs
+++ b/crates/nu-parser/tests/test_parser.rs
@@ -412,30 +412,6 @@ mod string {
         }
     }
 
-    #[test]
-    pub fn parse_escaped_string() {
-        let engine_state = EngineState::new();
-        let mut working_set = StateWorkingSet::new(&engine_state);
-
-        let (block, err) = parse(
-            &mut working_set,
-            None,
-            b"\"hello \\u006e\\u0075\\u0073hell\"",
-            true,
-            &[],
-        );
-
-        assert!(err.is_none());
-        assert_eq!(block.len(), 1);
-        let expressions = &block[0];
-        assert_eq!(expressions.len(), 1);
-        if let PipelineElement::Expression(_, expr) = &expressions[0] {
-            assert_eq!(expr.expr, Expr::String("hello nushell".to_string()))
-        } else {
-            panic!("Not an expression")
-        }
-    }
-
     mod interpolation {
         use nu_protocol::Span;
 
diff --git a/crates/nu-parser/tests/test_parser_unicode_escapes.rs b/crates/nu-parser/tests/test_parser_unicode_escapes.rs
new file mode 100644
index 000000000..0cc884b1d
--- /dev/null
+++ b/crates/nu-parser/tests/test_parser_unicode_escapes.rs
@@ -0,0 +1,91 @@
+#![cfg(test)]
+
+//use nu_parser::ParseError;
+use nu_parser::*;
+use nu_protocol::{
+    //ast::{Expr, Expression, PipelineElement},
+    ast::{Expr, PipelineElement},
+    //engine::{Command, EngineState, Stack, StateWorkingSet},
+    engine::{EngineState, StateWorkingSet},
+    //Signature, SyntaxShape,
+};
+/// Parses `test`, then checks for the single string `expected` or an error matching `error_contains`.
+pub fn do_test(test: &[u8], expected: &str, error_contains: Option<&str>) {
+    let engine_state = EngineState::new();
+    let mut working_set = StateWorkingSet::new(&engine_state);
+
+    let (block, err) = parse(&mut working_set, None, test, true, &[]);
+    // With no parse error, the result is checked against `expected` even if `error_contains` is set.
+    match err {
+        None => {
+            assert_eq!(block.len(), 1);
+            let expressions = &block[0];
+            assert_eq!(expressions.len(), 1);
+            if let PipelineElement::Expression(_, expr) = &expressions[0] {
+                assert_eq!(expr.expr, Expr::String(expected.to_string()))
+            } else {
+                panic!("Not an expression")
+            }
+        }
+        Some(pev) => match error_contains {
+            None => {
+                panic!("Err:{:#?}", pev); // no error was expected: fail and show it
+            }
+            Some(contains_string) => {
+                let full_err = format!("{:#?}", pev); // substring match runs on the pretty Debug text
+                assert!(
+                    full_err.contains(contains_string),
+                    "Expected error containing {}, instead got {}",
+                    contains_string,
+                    full_err
+                );
+            }
+        },
+    }
+}
+
+// Escapes that must parse: each case pairs nushell source bytes with the expected string value.
+#[test]
+pub fn unicode_escapes_in_strings() {
+    pub struct Tc(&'static [u8], &'static str); // (source to parse, expected string)
+
+    let test_vec = vec![
+        Tc(b"\"hello \\u{6e}\\u{000075}\\u{073}hell\"", "hello nushell"),
+        // template: Tc(br#""<string literal without #'s>""#, "<Rust literal comparand>")
+        // (the former 4-digit form, e.g. "\u006e", is intentionally not exercised here)
+        Tc(br#""hello \u{6e}\u{000075}\u{073}hell""#, "hello nushell"),
+        Tc(br#""\u{39}8\u{10ffff}""#, "98\u{10ffff}"), // literal char between escapes; 0x10ffff is the max
+        Tc(br#""abc\u{41}""#, "abcA"), // at end of string
+        Tc(br#""\u{41}abc""#, "Aabc"), // at start of string
+        Tc(br#""\u{a}""#, "\n"),       // single digit
+    ];
+
+    for tci in test_vec {
+        println!("Expecting: {}", tci.1);
+        do_test(tci.0, tci.1, None);
+    }
+}
+
+// Escapes that must be rejected: the parse error's debug text must contain the given substring.
+#[test]
+pub fn unicode_escapes_in_strings_expected_failures() {
+    // (source to parse, substring expected in the error's `{:#?}` rendering)
+    pub struct Tc(&'static [u8], &'static str);
+
+    let test_vec = vec![
+        // template: Tc(br#""<string literal without #'s>""#, "<pattern in expected error>")
+        // (the former 4-digit cases, "\u06e" and "\u06ex", are no longer exercised here)
+        // NOTE(review): every pattern is the generic "any shape"; confirm that is the intended text.
+        Tc(br#""hello \u{6e""#, "any shape"), // extended, missing close delim
+        Tc(
+            br#""\u{39}8\u{000000000000000000000000000000000000000000000037}""#,
+            "any shape",
+        ), // more than 6 hex digits, even though the value itself is small
+        Tc(br#""\u{110000}""#, "any shape"),  // 0x110000 is one past the 0x10ffff maximum
+    ];
+
+    for tci in test_vec {
+        println!("Expecting failure containing: {}", tci.1);
+        do_test(tci.0, "--success not expected--", Some(tci.1));
+    }
+}
diff --git a/src/tests/test_parser.rs b/src/tests/test_parser.rs
index e054c1a7b..21e2ff018 100644
--- a/src/tests/test_parser.rs
+++ b/src/tests/test_parser.rs
@@ -380,14 +380,15 @@ fn block_arity_check1() -> TestResult {
     )
 }
 
+// Support for 4-digit escapes like `\uNNNN` was removed; this test now covers the `\u{...}` form.
 #[test]
-fn string_escape() -> TestResult {
-    run_test(r#""\u015B""#, "ś")
+fn string_escape_unicode_extended() -> TestResult {
+    run_test(r#""\u{015B}\u{1f10b}""#, "ś🄋")
 }
 
 #[test]
 fn string_escape_interpolation() -> TestResult {
-    run_test(r#"$"\u015B(char hamburger)abc""#, "ś≡abc")
+    run_test(r#"$"\u{015B}(char hamburger)abc""#, "ś≡abc")
 }
 
 #[test]