mirror of
https://github.com/nushell/nushell.git
synced 2025-04-02 20:27:11 +02:00
Support extended unicode escapes in strings: "\u{10fff}" (#7883)
# Description Support extended unicode escapes in strings with the same syntax as Rust: `"\u{6e}"`. # User-Facing Changes New syntax in string literals, `\u{NNNNNN}`, to go along with the existing `\uNNNN`. New syntax accepts 1-6 hex digits and rejects values greater than 0x10FFFF (max Unicode char). _(List of all changes that impact the user experience here. This helps us keep track of breaking changes.)_ Won't break existing scripts, since this is new syntax. We might consider deprecating `char -u`, since users can now embed unicode chars > 0xFFFF with the new escape. # Tests + Formatting Several unit tests and one integration test added. - [x] `cargo fmt --all -- --check` to check standard code formatting (`cargo fmt --all` applies these changes) Done - [x] `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used -A clippy::needless_collect` to check that you're using the standard code style Done - [x] `cargo test --workspace` to check that all tests pass Done # After Submitting - [ ] If your PR had any user-facing changes, update [the documentation](https://github.com/nushell/nushell.github.io) after the PR is merged, if necessary. This will help us keep the docs up to date.
This commit is contained in:
parent
2a39332d51
commit
e616b2e247
crates/nu-parser
src/tests
@ -2465,7 +2465,7 @@ pub fn unescape_string(bytes: &[u8], span: Span) -> (Vec<u8>, Option<ParseError>
|
|||||||
let mut idx = 0;
|
let mut idx = 0;
|
||||||
let mut err = None;
|
let mut err = None;
|
||||||
|
|
||||||
while idx < bytes.len() {
|
'us_loop: while idx < bytes.len() {
|
||||||
if bytes[idx] == b'\\' {
|
if bytes[idx] == b'\\' {
|
||||||
// We're in an escape
|
// We're in an escape
|
||||||
idx += 1;
|
idx += 1;
|
||||||
@ -2552,53 +2552,67 @@ pub fn unescape_string(bytes: &[u8], span: Span) -> (Vec<u8>, Option<ParseError>
|
|||||||
idx += 1;
|
idx += 1;
|
||||||
}
|
}
|
||||||
Some(b'u') => {
|
Some(b'u') => {
|
||||||
match (
|
let mut digits = String::with_capacity(10);
|
||||||
bytes.get(idx + 1),
|
let mut cur_idx = idx + 1; // index of first beyond current end of token
|
||||||
bytes.get(idx + 2),
|
|
||||||
bytes.get(idx + 3),
|
|
||||||
bytes.get(idx + 4),
|
|
||||||
) {
|
|
||||||
(Some(h1), Some(h2), Some(h3), Some(h4)) => {
|
|
||||||
let s = String::from_utf8(vec![*h1, *h2, *h3, *h4]);
|
|
||||||
|
|
||||||
if let Ok(s) = s {
|
if let Some(b'{') = bytes.get(idx + 1) {
|
||||||
let int = u32::from_str_radix(&s, 16);
|
cur_idx = idx + 2;
|
||||||
|
loop {
|
||||||
if let Ok(int) = int {
|
match bytes.get(cur_idx) {
|
||||||
let result = char::from_u32(int);
|
Some(b'}') => {
|
||||||
|
cur_idx += 1;
|
||||||
if let Some(result) = result {
|
break;
|
||||||
let mut buffer = vec![0; 4];
|
}
|
||||||
let result = result.encode_utf8(&mut buffer);
|
Some(c) => {
|
||||||
|
digits.push(*c as char);
|
||||||
for elem in result.bytes() {
|
cur_idx += 1;
|
||||||
output.push(elem);
|
}
|
||||||
}
|
_ => {
|
||||||
|
err = Some(ParseError::Expected(
|
||||||
idx += 5;
|
"closing '}' in unicode escape `\\u{n..}`".into(),
|
||||||
continue;
|
Span::new(span.start + idx, span.end),
|
||||||
}
|
));
|
||||||
|
break 'us_loop;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
err = Some(ParseError::Expected(
|
|
||||||
"unicode hex value".into(),
|
|
||||||
Span::new(span.start + idx, span.end),
|
|
||||||
));
|
|
||||||
}
|
|
||||||
_ => {
|
|
||||||
err = Some(ParseError::Expected(
|
|
||||||
"unicode hex value".into(),
|
|
||||||
Span::new(span.start + idx, span.end),
|
|
||||||
));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
idx += 5;
|
|
||||||
|
if (1..=6).contains(&digits.len()) {
|
||||||
|
let int = u32::from_str_radix(&digits, 16);
|
||||||
|
|
||||||
|
if let Ok(int) = int {
|
||||||
|
if int <= 0x10ffff {
|
||||||
|
let result = char::from_u32(int);
|
||||||
|
|
||||||
|
if let Some(result) = result {
|
||||||
|
let mut buffer = vec![0; 4];
|
||||||
|
let result = result.encode_utf8(&mut buffer);
|
||||||
|
|
||||||
|
for elem in result.bytes() {
|
||||||
|
output.push(elem);
|
||||||
|
}
|
||||||
|
|
||||||
|
idx = cur_idx;
|
||||||
|
continue 'us_loop;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// fall through -- escape not accepted above, must be error.
|
||||||
|
err = Some(ParseError::Expected(
|
||||||
|
"unicode escape \\u{n..}".into(),
|
||||||
|
Span::new(span.start + idx, span.end),
|
||||||
|
));
|
||||||
|
break 'us_loop;
|
||||||
}
|
}
|
||||||
|
|
||||||
_ => {
|
_ => {
|
||||||
err = Some(ParseError::Expected(
|
err = Some(ParseError::Expected(
|
||||||
"supported escape character".into(),
|
"supported escape character".into(),
|
||||||
Span::new(span.start + idx, span.end),
|
Span::new(span.start + idx, span.end),
|
||||||
));
|
));
|
||||||
|
break 'us_loop;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
@ -412,30 +412,6 @@ mod string {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
|
||||||
pub fn parse_escaped_string() {
|
|
||||||
let engine_state = EngineState::new();
|
|
||||||
let mut working_set = StateWorkingSet::new(&engine_state);
|
|
||||||
|
|
||||||
let (block, err) = parse(
|
|
||||||
&mut working_set,
|
|
||||||
None,
|
|
||||||
b"\"hello \\u006e\\u0075\\u0073hell\"",
|
|
||||||
true,
|
|
||||||
&[],
|
|
||||||
);
|
|
||||||
|
|
||||||
assert!(err.is_none());
|
|
||||||
assert_eq!(block.len(), 1);
|
|
||||||
let expressions = &block[0];
|
|
||||||
assert_eq!(expressions.len(), 1);
|
|
||||||
if let PipelineElement::Expression(_, expr) = &expressions[0] {
|
|
||||||
assert_eq!(expr.expr, Expr::String("hello nushell".to_string()))
|
|
||||||
} else {
|
|
||||||
panic!("Not an expression")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
mod interpolation {
|
mod interpolation {
|
||||||
use nu_protocol::Span;
|
use nu_protocol::Span;
|
||||||
|
|
||||||
|
91
crates/nu-parser/tests/test_parser_unicode_escapes.rs
Normal file
91
crates/nu-parser/tests/test_parser_unicode_escapes.rs
Normal file
@ -0,0 +1,91 @@
|
|||||||
|
#![cfg(test)]
|
||||||
|
|
||||||
|
//use nu_parser::ParseError;
|
||||||
|
use nu_parser::*;
|
||||||
|
use nu_protocol::{
|
||||||
|
//ast::{Expr, Expression, PipelineElement},
|
||||||
|
ast::{Expr, PipelineElement},
|
||||||
|
//engine::{Command, EngineState, Stack, StateWorkingSet},
|
||||||
|
engine::{EngineState, StateWorkingSet},
|
||||||
|
//Signature, SyntaxShape,
|
||||||
|
};
|
||||||
|
|
||||||
|
pub fn do_test(test: &[u8], expected: &str, error_contains: Option<&str>) {
|
||||||
|
let engine_state = EngineState::new();
|
||||||
|
let mut working_set = StateWorkingSet::new(&engine_state);
|
||||||
|
|
||||||
|
let (block, err) = parse(&mut working_set, None, test, true, &[]);
|
||||||
|
|
||||||
|
match err {
|
||||||
|
None => {
|
||||||
|
assert_eq!(block.len(), 1);
|
||||||
|
let expressions = &block[0];
|
||||||
|
assert_eq!(expressions.len(), 1);
|
||||||
|
if let PipelineElement::Expression(_, expr) = &expressions[0] {
|
||||||
|
assert_eq!(expr.expr, Expr::String(expected.to_string()))
|
||||||
|
} else {
|
||||||
|
panic!("Not an expression")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Some(pev) => match error_contains {
|
||||||
|
None => {
|
||||||
|
panic!("Err:{:#?}", pev);
|
||||||
|
}
|
||||||
|
Some(contains_string) => {
|
||||||
|
let full_err = format!("{:#?}", pev);
|
||||||
|
assert!(
|
||||||
|
full_err.contains(contains_string),
|
||||||
|
"Expected error containing {}, instead got {}",
|
||||||
|
contains_string,
|
||||||
|
full_err
|
||||||
|
);
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// cases that all should work
|
||||||
|
#[test]
|
||||||
|
pub fn unicode_escapes_in_strings() {
|
||||||
|
pub struct Tc(&'static [u8], &'static str);
|
||||||
|
|
||||||
|
let test_vec = vec![
|
||||||
|
Tc(b"\"hello \\u{6e}\\u{000075}\\u{073}hell\"", "hello nushell"),
|
||||||
|
// template: Tc(br#""<string literal without #'s>"", "<Rust literal comparand>")
|
||||||
|
//deprecated Tc(br#""\u006enu\u0075\u0073\u0073""#, "nnuuss"),
|
||||||
|
Tc(br#""hello \u{6e}\u{000075}\u{073}hell""#, "hello nushell"),
|
||||||
|
Tc(br#""\u{39}8\u{10ffff}""#, "98\u{10ffff}"),
|
||||||
|
Tc(br#""abc\u{41}""#, "abcA"), // at end of string
|
||||||
|
Tc(br#""\u{41}abc""#, "Aabc"), // at start of string
|
||||||
|
Tc(br#""\u{a}""#, "\n"), // single digit
|
||||||
|
];
|
||||||
|
|
||||||
|
for tci in test_vec {
|
||||||
|
println!("Expecting: {}", tci.1);
|
||||||
|
do_test(tci.0, tci.1, None);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// cases that all should fail (in expected way)
|
||||||
|
#[test]
|
||||||
|
pub fn unicode_escapes_in_strings_expected_failures() {
|
||||||
|
// input, substring of expected failure
|
||||||
|
pub struct Tc(&'static [u8], &'static str);
|
||||||
|
|
||||||
|
let test_vec = vec![
|
||||||
|
// template: Tc(br#""<string literal without #'s>"", "<pattern in expected error>")
|
||||||
|
//deprecated Tc(br#""\u06e""#, "any shape"), // 4digit too short, next char is EOF
|
||||||
|
//deprecatedTc(br#""\u06ex""#, "any shape"), // 4digit too short, next char is non-hex-digit
|
||||||
|
Tc(br#""hello \u{6e""#, "any shape"), // extended, missing close delim
|
||||||
|
Tc(
|
||||||
|
br#""\u{39}8\u{000000000000000000000000000000000000000000000037}""#,
|
||||||
|
"any shape",
|
||||||
|
), // hex too long, but small value
|
||||||
|
Tc(br#""\u{110000}""#, "any shape"), // max unicode <= 0x10ffff
|
||||||
|
];
|
||||||
|
|
||||||
|
for tci in test_vec {
|
||||||
|
println!("Expecting failure containing: {}", tci.1);
|
||||||
|
do_test(tci.0, "--success not expected--", Some(tci.1));
|
||||||
|
}
|
||||||
|
}
|
@ -380,14 +380,15 @@ fn block_arity_check1() -> TestResult {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Deprecating former support for escapes like `\uNNNN`; dropping the test.
|
||||||
#[test]
|
#[test]
|
||||||
fn string_escape() -> TestResult {
|
fn string_escape_unicode_extended() -> TestResult {
|
||||||
run_test(r#""\u015B""#, "ś")
|
run_test(r#""\u{015B}\u{1f10b}""#, "ś🄋")
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn string_escape_interpolation() -> TestResult {
|
fn string_escape_interpolation() -> TestResult {
|
||||||
run_test(r#"$"\u015B(char hamburger)abc""#, "ś≡abc")
|
run_test(r#"$"\u{015B}(char hamburger)abc""#, "ś≡abc")
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
Loading…
Reference in New Issue
Block a user