diff --git a/crates/nu-command/src/strings/encode_decode/decode.rs b/crates/nu-command/src/strings/encode_decode/decode.rs index 2ce93f128..5044b21f0 100644 --- a/crates/nu-command/src/strings/encode_decode/decode.rs +++ b/crates/nu-command/src/strings/encode_decode/decode.rs @@ -15,7 +15,7 @@ impl Command for Decode { } fn usage(&self) -> &str { - "Decode bytes as a string." + "Decode bytes into a string." } fn search_terms(&self) -> Vec<&str> { @@ -30,11 +30,11 @@ impl Command for Decode { } fn extra_usage(&self) -> &str { - r#"Multiple encodings are supported, here is an example of a few: + r#"Multiple encodings are supported; here are a few: big5, euc-jp, euc-kr, gbk, iso-8859-1, utf-16, cp1252, latin5 For a more complete list of encodings please refer to the encoding_rs -documentation link at https://docs.rs/encoding_rs/0.8.28/encoding_rs/#statics"# +documentation link at https://docs.rs/encoding_rs/latest/encoding_rs/#statics"# } fn examples(&self) -> Vec<Example> { diff --git a/crates/nu-command/src/strings/encode_decode/encode.rs b/crates/nu-command/src/strings/encode_decode/encode.rs index 65d2d69eb..b3d406d7f 100644 --- a/crates/nu-command/src/strings/encode_decode/encode.rs +++ b/crates/nu-command/src/strings/encode_decode/encode.rs @@ -15,7 +15,11 @@ impl Command for Encode { } fn usage(&self) -> &str { - "Encode an UTF-8 string into other kind of representations." + // Note: "Encode a UTF-8 string into other forms" is semantically incorrect because + // Nushell strings, as abstract values, have no user-facing encoding. + // (Remember that "encoding" exclusively means "how the characters are + // observably represented by bytes"). + "Encode a string into bytes." 
} fn search_terms(&self) -> Vec<&str> { @@ -26,33 +30,48 @@ impl Command for Encode { Signature::build("encode") .input_output_types(vec![(Type::String, Type::Binary)]) .required("encoding", SyntaxShape::String, "the text encoding to use") + .switch( + "ignore-errors", + "when a character isn't in the given encoding, replace with a HTML entity (like `&#127880;`)", + Some('i'), + ) .category(Category::Strings) } fn extra_usage(&self) -> &str { - r#"Multiple encodings are supported, here is an example of a few: + r#"Multiple encodings are supported; here are a few: big5, euc-jp, euc-kr, gbk, iso-8859-1, cp1252, latin5 Note that since the Encoding Standard doesn't specify encoders for utf-16le and utf-16be, these are not yet supported. -For a more complete list of encodings please refer to the encoding_rs -documentation link at https://docs.rs/encoding_rs/0.8.28/encoding_rs/#statics"# +For a more complete list of encodings, please refer to the encoding_rs +documentation link at https://docs.rs/encoding_rs/latest/encoding_rs/#statics"# } fn examples(&self) -> Vec<Example> { - vec![Example { - description: "Encode an UTF-8 string into Shift-JIS", - example: r#""負けると知って戦うのが、遥かに美しいのだ" | encode shift-jis"#, - result: Some(Value::Binary { - val: vec![ - 0x95, 0x89, 0x82, 0xaf, 0x82, 0xe9, 0x82, 0xc6, 0x92, 0x6d, 0x82, 0xc1, 0x82, - 0xc4, 0x90, 0xed, 0x82, 0xa4, 0x82, 0xcc, 0x82, 0xaa, 0x81, 0x41, 0x97, 0x79, - 0x82, 0xa9, 0x82, 0xc9, 0x94, 0xfc, 0x82, 0xb5, 0x82, 0xa2, 0x82, 0xcc, 0x82, - 0xbe, - ], - span: Span::test_data(), - }), - }] + vec![ + Example { + description: "Encode an UTF-8 string into Shift-JIS", + example: r#""負けると知って戦うのが、遥かに美しいのだ" | encode shift-jis"#, + result: Some(Value::Binary { + val: vec![ + 0x95, 0x89, 0x82, 0xaf, 0x82, 0xe9, 0x82, 0xc6, 0x92, 0x6d, 0x82, 0xc1, + 0x82, 0xc4, 0x90, 0xed, 0x82, 0xa4, 0x82, 0xcc, 0x82, 0xaa, 0x81, 0x41, + 0x97, 0x79, 0x82, 0xa9, 0x82, 0xc9, 0x94, 0xfc, 0x82, 0xb5, 0x82, 0xa2, + 0x82, 0xcc, 0x82, 0xbe, + ], + span: Span::test_data(), + 
}), + }, + Example { + description: "Replace characters with HTML entities if they can't be encoded", + example: r#""🎈" | encode -i shift-jis"#, + result: Some(Value::Binary { + val: vec![0x26, 0x23, 0x31, 0x32, 0x37, 0x38, 0x38, 0x30, 0x3b], + span: Span::test_data(), + }), + }, + ] } fn run( @@ -64,6 +83,7 @@ documentation link at https://docs.rs/encoding_rs/0.8.28/encoding_rs/#statics"# ) -> Result<PipelineData, ShellError> { let head = call.head; let encoding: Spanned<String> = call.req(engine_state, stack, 0)?; + let ignore_errors = call.has_flag("ignore-errors"); match input { PipelineData::ExternalStream { stdout: None, .. } => Ok(PipelineData::empty()), @@ -71,12 +91,14 @@ documentation link at https://docs.rs/encoding_rs/0.8.28/encoding_rs/#statics"# stdout: Some(stream), .. } => { - let s = stream.into_string()?.item; - super::encoding::encode(head, encoding, &s).map(|val| val.into_pipeline_data()) + let s = stream.into_string()?; + super::encoding::encode(head, encoding, &s.item, s.span, ignore_errors) + .map(|val| val.into_pipeline_data()) } PipelineData::Value(v, ..) => match v { - Value::String { val: s, .. } => { - super::encoding::encode(head, encoding, &s).map(|val| val.into_pipeline_data()) + Value::String { val: s, span } => { + super::encoding::encode(head, encoding, &s, span, ignore_errors) + .map(|val| val.into_pipeline_data()) } Value::Error { error } => Err(error), _ => Err(ShellError::OnlySupportsThisInputType( @@ -87,7 +109,7 @@ documentation link at https://docs.rs/encoding_rs/0.8.28/encoding_rs/#statics"# )), }, // This should be more precise, but due to difficulties in getting spans - // from PipelineData::ListData, this is as it is. + // from PipelineData::ListStream, this is as it is. 
_ => Err(ShellError::UnsupportedInput( "non-string input".into(), "value originates from here".into(), diff --git a/crates/nu-command/src/strings/encode_decode/encoding.rs b/crates/nu-command/src/strings/encode_decode/encoding.rs index bc7883312..39f481fe0 100644 --- a/crates/nu-command/src/strings/encode_decode/encoding.rs +++ b/crates/nu-command/src/strings/encode_decode/encoding.rs @@ -1,8 +1,17 @@ use encoding_rs::Encoding; use nu_protocol::{ShellError, Span, Spanned, Value}; -pub fn decode(head: Span, encoding: Spanned<String>, bytes: &[u8]) -> Result<Value, ShellError> { - let encoding = parse_encoding(encoding.span, &encoding.item)?; +pub fn decode( + head: Span, + encoding_name: Spanned<String>, + bytes: &[u8], +) -> Result<Value, ShellError> { + // Workaround for a bug in the Encodings Specification. + let encoding = if encoding_name.item.to_lowercase() == "utf16" { + parse_encoding(encoding_name.span, "utf-16") + } else { + parse_encoding(encoding_name.span, &encoding_name.item) + }?; let (result, ..) = encoding.decode(bytes); Ok(Value::String { val: result.into_owned(), @@ -10,26 +19,56 @@ pub fn decode(head: Span, encoding: Spanned<String>, bytes: &[u8]) -> Result<Value, ShellError> { }) } -pub fn encode(head: Span, encoding: Spanned<String>, s: &str) -> Result<Value, ShellError> { - let encoding = parse_encoding(encoding.span, &encoding.item)?; - let (result, ..) = encoding.encode(s); - Ok(Value::Binary { - val: result.into_owned(), - span: head, - }) +pub fn encode( + head: Span, + encoding_name: Spanned<String>, + s: &str, + s_span: Span, + ignore_errors: bool, +) -> Result<Value, ShellError> { + // Workaround for a bug in the Encodings Specification. + let encoding = if encoding_name.item.to_lowercase() == "utf16" { + parse_encoding(encoding_name.span, "utf-16") + } else { + parse_encoding(encoding_name.span, &encoding_name.item) + }?; + let (result, _actual_encoding, replacements) = encoding.encode(s); + // Because encoding_rs is a Web-facing crate, it defaults to replacing unknowns with HTML entities. + // This behaviour can be enabled with -i. Otherwise, it becomes an error. 
+ if replacements && !ignore_errors { + // TODO: make GenericError accept two spans (including head) + Err(ShellError::GenericError( + "error while encoding string".into(), + format!("string contained characters not in {}", &encoding_name.item), + Some(s_span), + None, + vec![], + )) + } else { + Ok(Value::Binary { + val: result.into_owned(), + span: head, + }) + } } fn parse_encoding(span: Span, label: &str) -> Result<&'static Encoding, ShellError> { + // Workaround for a bug in the Encodings Specification. + let label = if label.to_lowercase() == "utf16" { + "utf-16" + } else { + label + }; match Encoding::for_label_no_replacement(label.as_bytes()) { None => Err(ShellError::GenericError( format!( - r#"{} is not a valid encoding, refer to https://docs.rs/encoding_rs/0.8.23/encoding_rs/#statics for a valid list of encodings"#, + r#"{} is not a valid encoding"#, label ), "invalid encoding".into(), Some(span), - None, - Vec::new(), + Some("refer to https://docs.rs/encoding_rs/latest/encoding_rs/index.html#statics for a valid list of encodings".into()), + vec![], )), Some(encoding) => Ok(encoding), } @@ -49,6 +88,11 @@ mod test { #[case::iso_8859_1("iso-8859-1", "Some ¼½¿ Data µ¶·¸¹º")] #[case::cp1252("cp1252", "Some ¼½¿ Data")] #[case::latin5("latin5", "Some ¼½¿ Data µ¶·¸¹º")] + // Tests for specific renditions of UTF-16 and UTF-8 labels + #[case::utf16("utf16", "")] + #[case::utf_hyphen_16("utf-16", "")] + #[case::utf8("utf8", "")] + #[case::utf_hyphen_8("utf-8", "")] fn smoke(#[case] encoding: String, #[case] expected: &str) { let test_span = Span::test_data(); let encoding = Spanned { @@ -56,7 +100,7 @@ mod test { span: test_span, }; - let encoded = encode(test_span, encoding.clone(), expected).unwrap(); + let encoded = encode(test_span, encoding.clone(), expected, test_span, true).unwrap(); let encoded = encoded.as_binary().unwrap(); let decoded = decode(test_span, encoding, encoded).unwrap();