From e211e96d33e62033970ad9487e1ea2643ae7bbdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rodrigo=20Fri=C3=A3es?= <119532691+friaes@users.noreply.github.com> Date: Sat, 6 Apr 2024 15:07:55 +0100 Subject: [PATCH] Fix nushell#10591: encode returns error with utf-16le and utf-16be encodings (nushell#10591) (#12411) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Description This closes nushell#10591. The `encode` command's help text says that the utf-16le and utf-16be encodings are not supported; however, you could still use these encodings, and they did not work properly: the input was returned as UTF-8-encoded bytes instead: ```bash "䆺ש" | encode utf-16 Length: 5 (0x5) bytes | printable whitespace ascii_other non_ascii 00000000: e4 86 ba d7 a9 ××××× ``` # User-Facing Changes The `encode` command's help text was updated, and trying to encode with utf-16le or utf-16be now returns an error: ![screenshot](https://github.com/nushell/nushell/assets/119532691/c346dc57-8b42-4dfc-93d5-638b0041d89f) # Tests + Formatting - :green_circle: `toolkit fmt` - :green_circle: `toolkit clippy` - :green_circle: `toolkit test` - :green_circle: `toolkit test stdlib` --- .../src/strings/encode_decode/encode.rs | 1 + .../src/strings/encode_decode/encoding.rs | 17 ++++++++++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/crates/nu-command/src/strings/encode_decode/encode.rs b/crates/nu-command/src/strings/encode_decode/encode.rs index b017da4a8ac..98fcc34179a 100644 --- a/crates/nu-command/src/strings/encode_decode/encode.rs +++ b/crates/nu-command/src/strings/encode_decode/encode.rs @@ -37,6 +37,7 @@ impl Command for Encode { big5, euc-jp, euc-kr, gbk, iso-8859-1, cp1252, latin5 Note that since the Encoding Standard doesn't specify encoders for utf-16le and utf-16be, these are not yet supported. 
+More information can be found here: https://docs.rs/encoding_rs/latest/encoding_rs/#utf-16le-utf-16be-and-unicode-encoding-schemes For a more complete list of encodings, please refer to the encoding_rs documentation link at https://docs.rs/encoding_rs/latest/encoding_rs/#statics"# diff --git a/crates/nu-command/src/strings/encode_decode/encoding.rs b/crates/nu-command/src/strings/encode_decode/encoding.rs index a063820d4cb..91c7e72b17c 100644 --- a/crates/nu-command/src/strings/encode_decode/encoding.rs +++ b/crates/nu-command/src/strings/encode_decode/encoding.rs @@ -50,6 +50,19 @@ pub fn encode( } else { parse_encoding(encoding_name.span, &encoding_name.item) }?; + + // Since the Encoding Standard doesn't specify encoders for "UTF-16BE" and "UTF-16LE" + // Check if the encoding is one of them and return an error + if ["UTF-16BE", "UTF-16LE"].contains(&encoding.name()) { + return Err(ShellError::GenericError { + error: format!(r#"{} encoding is not supported"#, &encoding_name.item), + msg: "invalid encoding".into(), + span: Some(encoding_name.span), + help: Some("refer to https://docs.rs/encoding_rs/latest/encoding_rs/index.html#statics for a valid list of encodings".into()), + inner: vec![], + }); + } + let (result, _actual_encoding, replacements) = encoding.encode(s); // Because encoding_rs is a Web-facing crate, it defaults to replacing unknowns with HTML entities. // This behaviour can be enabled with -i. Otherwise, it becomes an error. @@ -102,9 +115,7 @@ mod test { #[case::iso_8859_1("iso-8859-1", "Some ¼½¿ Data µ¶·¸¹º")] #[case::cp1252("cp1252", "Some ¼½¿ Data")] #[case::latin5("latin5", "Some ¼½¿ Data µ¶·¸¹º")] - // Tests for specific renditions of UTF-16 and UTF-8 labels - #[case::utf16("utf16", "")] - #[case::utf_hyphen_16("utf-16", "")] + // Tests for specific renditions of UTF-8 labels #[case::utf8("utf8", "")] #[case::utf_hyphen_8("utf-8", "")] fn smoke(#[case] encoding: String, #[case] expected: &str) {