From e616b2e247a329489cc838edf2c8e3590d7efe2e Mon Sep 17 00:00:00 2001 From: Bob Hyman Date: Sat, 28 Jan 2023 15:25:53 -0500 Subject: [PATCH] Support extended unicode escapes in strings: "\u{10fff}" (#7883) # Description Support extended unicode escapes in strings with the same syntax as Rust: `"\u{6e}"`. # User-Facing Changes New syntax in string literals, `\u{NNNNNN}`, to go along with the existing `\uNNNN`. New syntax accepts 1-6 hex digits and rejects values greater than 0x10FFFF (max Unicode char). _(List of all changes that impact the user experience here. This helps us keep track of breaking changes.)_ Won't break existing scripts, since this is new syntax. We might consider deprecating `char -u`, since users can now embed unicode chars > 0xFFFF with the new escape. # Tests + Formatting Several unit tests and one integration test added. - [x] `cargo fmt --all -- --check` to check standard code formatting (`cargo fmt --all` applies these changes) Done - [x] `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used -A clippy::needless_collect` to check that you're using the standard code style Done - [x] `cargo test --workspace` to check that all tests pass Done # After Submitting - [ ] If your PR had any user-facing changes, update [the documentation](https://github.com/nushell/nushell.github.io) after the PR is merged, if necessary. This will help us keep the docs up to date. 
--- crates/nu-parser/src/parser.rs | 88 ++++++++++-------- crates/nu-parser/tests/test_parser.rs | 24 ----- .../tests/test_parser_unicode_escapes.rs | 91 +++++++++++++++++++ src/tests/test_parser.rs | 7 +- 4 files changed, 146 insertions(+), 64 deletions(-) create mode 100644 crates/nu-parser/tests/test_parser_unicode_escapes.rs diff --git a/crates/nu-parser/src/parser.rs b/crates/nu-parser/src/parser.rs index c809a3d6f..23049cc40 100644 --- a/crates/nu-parser/src/parser.rs +++ b/crates/nu-parser/src/parser.rs @@ -2465,7 +2465,7 @@ pub fn unescape_string(bytes: &[u8], span: Span) -> (Vec, Option let mut idx = 0; let mut err = None; - while idx < bytes.len() { + 'us_loop: while idx < bytes.len() { if bytes[idx] == b'\\' { // We're in an escape idx += 1; @@ -2552,53 +2552,67 @@ pub fn unescape_string(bytes: &[u8], span: Span) -> (Vec, Option idx += 1; } Some(b'u') => { - match ( - bytes.get(idx + 1), - bytes.get(idx + 2), - bytes.get(idx + 3), - bytes.get(idx + 4), - ) { - (Some(h1), Some(h2), Some(h3), Some(h4)) => { - let s = String::from_utf8(vec![*h1, *h2, *h3, *h4]); + let mut digits = String::with_capacity(10); + let mut cur_idx = idx + 1; // index of first beyond current end of token - if let Ok(s) = s { - let int = u32::from_str_radix(&s, 16); - - if let Ok(int) = int { - let result = char::from_u32(int); - - if let Some(result) = result { - let mut buffer = vec![0; 4]; - let result = result.encode_utf8(&mut buffer); - - for elem in result.bytes() { - output.push(elem); - } - - idx += 5; - continue; - } + if let Some(b'{') = bytes.get(idx + 1) { + cur_idx = idx + 2; + loop { + match bytes.get(cur_idx) { + Some(b'}') => { + cur_idx += 1; + break; + } + Some(c) => { + digits.push(*c as char); + cur_idx += 1; + } + _ => { + err = Some(ParseError::Expected( + "closing '}' in unicode escape `\\u{n..}`".into(), + Span::new(span.start + idx, span.end), + )); + break 'us_loop; } } - err = Some(ParseError::Expected( - "unicode hex value".into(), - Span::new(span.start 
+ idx, span.end), - )); - } - _ => { - err = Some(ParseError::Expected( - "unicode hex value".into(), - Span::new(span.start + idx, span.end), - )); } } - idx += 5; + + if (1..=6).contains(&digits.len()) { + let int = u32::from_str_radix(&digits, 16); + + if let Ok(int) = int { + if int <= 0x10ffff { + let result = char::from_u32(int); + + if let Some(result) = result { + let mut buffer = vec![0; 4]; + let result = result.encode_utf8(&mut buffer); + + for elem in result.bytes() { + output.push(elem); + } + + idx = cur_idx; + continue 'us_loop; + } + } + } + } + // fall through -- escape not accepted above, must be error. + err = Some(ParseError::Expected( + "unicode escape \\u{n..}".into(), + Span::new(span.start + idx, span.end), + )); + break 'us_loop; } + _ => { err = Some(ParseError::Expected( "supported escape character".into(), Span::new(span.start + idx, span.end), )); + break 'us_loop; } } } else { diff --git a/crates/nu-parser/tests/test_parser.rs b/crates/nu-parser/tests/test_parser.rs index fd35599ea..78930a841 100644 --- a/crates/nu-parser/tests/test_parser.rs +++ b/crates/nu-parser/tests/test_parser.rs @@ -412,30 +412,6 @@ mod string { } } - #[test] - pub fn parse_escaped_string() { - let engine_state = EngineState::new(); - let mut working_set = StateWorkingSet::new(&engine_state); - - let (block, err) = parse( - &mut working_set, - None, - b"\"hello \\u006e\\u0075\\u0073hell\"", - true, - &[], - ); - - assert!(err.is_none()); - assert_eq!(block.len(), 1); - let expressions = &block[0]; - assert_eq!(expressions.len(), 1); - if let PipelineElement::Expression(_, expr) = &expressions[0] { - assert_eq!(expr.expr, Expr::String("hello nushell".to_string())) - } else { - panic!("Not an expression") - } - } - mod interpolation { use nu_protocol::Span; diff --git a/crates/nu-parser/tests/test_parser_unicode_escapes.rs b/crates/nu-parser/tests/test_parser_unicode_escapes.rs new file mode 100644 index 000000000..0cc884b1d --- /dev/null +++ 
b/crates/nu-parser/tests/test_parser_unicode_escapes.rs @@ -0,0 +1,91 @@ +#![cfg(test)] + +//use nu_parser::ParseError; +use nu_parser::*; +use nu_protocol::{ + //ast::{Expr, Expression, PipelineElement}, + ast::{Expr, PipelineElement}, + //engine::{Command, EngineState, Stack, StateWorkingSet}, + engine::{EngineState, StateWorkingSet}, + //Signature, SyntaxShape, +}; + +pub fn do_test(test: &[u8], expected: &str, error_contains: Option<&str>) { + let engine_state = EngineState::new(); + let mut working_set = StateWorkingSet::new(&engine_state); + + let (block, err) = parse(&mut working_set, None, test, true, &[]); + + match err { + None => { + assert_eq!(block.len(), 1); + let expressions = &block[0]; + assert_eq!(expressions.len(), 1); + if let PipelineElement::Expression(_, expr) = &expressions[0] { + assert_eq!(expr.expr, Expr::String(expected.to_string())) + } else { + panic!("Not an expression") + } + } + Some(pev) => match error_contains { + None => { + panic!("Err:{:#?}", pev); + } + Some(contains_string) => { + let full_err = format!("{:#?}", pev); + assert!( + full_err.contains(contains_string), + "Expected error containing {}, instead got {}", + contains_string, + full_err + ); + } + }, + } +} + +// cases that all should work +#[test] +pub fn unicode_escapes_in_strings() { + pub struct Tc(&'static [u8], &'static str); + + let test_vec = vec![ + Tc(b"\"hello \\u{6e}\\u{000075}\\u{073}hell\"", "hello nushell"), + // template: Tc(br#"""", "") + //deprecated Tc(br#""\u006enu\u0075\u0073\u0073""#, "nnuuss"), + Tc(br#""hello \u{6e}\u{000075}\u{073}hell""#, "hello nushell"), + Tc(br#""\u{39}8\u{10ffff}""#, "98\u{10ffff}"), + Tc(br#""abc\u{41}""#, "abcA"), // at end of string + Tc(br#""\u{41}abc""#, "Aabc"), // at start of string + Tc(br#""\u{a}""#, "\n"), // single digit + ]; + + for tci in test_vec { + println!("Expecting: {}", tci.1); + do_test(tci.0, tci.1, None); + } +} + +// cases that all should fail (in expected way) +#[test] +pub fn 
unicode_escapes_in_strings_expected_failures() { + // input, substring of expected failure + pub struct Tc(&'static [u8], &'static str); + + let test_vec = vec![ + // template: Tc(br#"""", "") + //deprecated Tc(br#""\u06e""#, "any shape"), // 4digit too short, next char is EOF + //deprecatedTc(br#""\u06ex""#, "any shape"), // 4digit too short, next char is non-hex-digit + Tc(br#""hello \u{6e""#, "any shape"), // extended, missing close delim + Tc( + br#""\u{39}8\u{000000000000000000000000000000000000000000000037}""#, + "any shape", + ), // hex too long, but small value + Tc(br#""\u{110000}""#, "any shape"), // max unicode <= 0x10ffff + ]; + + for tci in test_vec { + println!("Expecting failure containing: {}", tci.1); + do_test(tci.0, "--success not expected--", Some(tci.1)); + } +} diff --git a/src/tests/test_parser.rs b/src/tests/test_parser.rs index e054c1a7b..21e2ff018 100644 --- a/src/tests/test_parser.rs +++ b/src/tests/test_parser.rs @@ -380,14 +380,15 @@ fn block_arity_check1() -> TestResult { ) } +// deprecating former support for escapes like `\uNNNN`, dropping test. #[test] -fn string_escape() -> TestResult { - run_test(r#""\u015B""#, "ś") +fn string_escape_unicode_extended() -> TestResult { + run_test(r#""\u{015B}\u{1f10b}""#, "ś🄋") } #[test] fn string_escape_interpolation() -> TestResult { - run_test(r#"$"\u015B(char hamburger)abc""#, "ś≡abc") + run_test(r#"$"\u{015B}(char hamburger)abc""#, "ś≡abc") } #[test]