mirror of
https://github.com/nushell/nushell.git
synced 2025-04-24 21:28:20 +02:00
Disallow `encode`'s silent conversion to HTML entities (and add `-i`/`--ignore-errors` flag to re-allow it) (#7738)
# Description

Closes #7514.

* For both `encode` and `decode`: add a special case allowing `utf16` as a valid alias for `utf-16` (just as `utf-8` has `utf8`).
* For `encode`, make it an error when encoding_rs replaces characters outside the given encoding with HTML entities
* For `encode`, add `-i`/`--ignore-errors` flag to bring back this behaviour.

Note: `--ignore-errors` does NOT ignore the error for using a wrong encoding label like `uft8`

# User-Facing Changes

See above.

# Tests + Formatting

Don't forget to add tests that cover your changes.

Make sure you've run and fixed any issues with these commands:

- `cargo fmt --all -- --check` to check standard code formatting (`cargo fmt --all` applies these changes)
- `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used -A clippy::needless_collect` to check that you're using the standard code style
- `cargo test --workspace` to check that all tests pass

# After Submitting

If your PR had any user-facing changes, update [the documentation](https://github.com/nushell/nushell.github.io) after the PR is merged, if necessary. This will help us keep the docs up to date.

Co-authored-by: Darren Schroeder <343840+fdncred@users.noreply.github.com>
This commit is contained in:
parent
b004aacd69
commit
54dd65cfe1
@ -15,7 +15,7 @@ impl Command for Decode {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn usage(&self) -> &str {
|
fn usage(&self) -> &str {
|
||||||
"Decode bytes as a string."
|
"Decode bytes into a string."
|
||||||
}
|
}
|
||||||
|
|
||||||
fn search_terms(&self) -> Vec<&str> {
|
fn search_terms(&self) -> Vec<&str> {
|
||||||
@ -30,11 +30,11 @@ impl Command for Decode {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn extra_usage(&self) -> &str {
|
fn extra_usage(&self) -> &str {
|
||||||
r#"Multiple encodings are supported, here is an example of a few:
|
r#"Multiple encodings are supported; here are a few:
|
||||||
big5, euc-jp, euc-kr, gbk, iso-8859-1, utf-16, cp1252, latin5
|
big5, euc-jp, euc-kr, gbk, iso-8859-1, utf-16, cp1252, latin5
|
||||||
|
|
||||||
For a more complete list of encodings please refer to the encoding_rs
|
For a more complete list of encodings please refer to the encoding_rs
|
||||||
documentation link at https://docs.rs/encoding_rs/0.8.28/encoding_rs/#statics"#
|
documentation link at https://docs.rs/encoding_rs/latest/encoding_rs/#statics"#
|
||||||
}
|
}
|
||||||
|
|
||||||
fn examples(&self) -> Vec<Example> {
|
fn examples(&self) -> Vec<Example> {
|
||||||
|
@ -15,7 +15,11 @@ impl Command for Encode {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn usage(&self) -> &str {
|
fn usage(&self) -> &str {
|
||||||
"Encode an UTF-8 string into other kind of representations."
|
// Note: "Encode a UTF-8 string into other forms" is semantically incorrect because
|
||||||
|
// Nushell strings, as abstract values, have no user-facing encoding.
|
||||||
|
// (Remember that "encoding" exclusively means "how the characters are
|
||||||
|
// observably represented by bytes").
|
||||||
|
"Encode a string into bytes."
|
||||||
}
|
}
|
||||||
|
|
||||||
fn search_terms(&self) -> Vec<&str> {
|
fn search_terms(&self) -> Vec<&str> {
|
||||||
@ -26,33 +30,48 @@ impl Command for Encode {
|
|||||||
Signature::build("encode")
|
Signature::build("encode")
|
||||||
.input_output_types(vec![(Type::String, Type::Binary)])
|
.input_output_types(vec![(Type::String, Type::Binary)])
|
||||||
.required("encoding", SyntaxShape::String, "the text encoding to use")
|
.required("encoding", SyntaxShape::String, "the text encoding to use")
|
||||||
|
.switch(
|
||||||
|
"ignore-errors",
|
||||||
|
"when a character isn't in the given encoding, replace with a HTML entity (like `&#127880;`)",
|
||||||
|
Some('i'),
|
||||||
|
)
|
||||||
.category(Category::Strings)
|
.category(Category::Strings)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn extra_usage(&self) -> &str {
|
fn extra_usage(&self) -> &str {
|
||||||
r#"Multiple encodings are supported, here is an example of a few:
|
r#"Multiple encodings are supported; here are a few:
|
||||||
big5, euc-jp, euc-kr, gbk, iso-8859-1, cp1252, latin5
|
big5, euc-jp, euc-kr, gbk, iso-8859-1, cp1252, latin5
|
||||||
|
|
||||||
Note that since the Encoding Standard doesn't specify encoders for utf-16le and utf-16be, these are not yet supported.
|
Note that since the Encoding Standard doesn't specify encoders for utf-16le and utf-16be, these are not yet supported.
|
||||||
|
|
||||||
For a more complete list of encodings please refer to the encoding_rs
|
For a more complete list of encodings, please refer to the encoding_rs
|
||||||
documentation link at https://docs.rs/encoding_rs/0.8.28/encoding_rs/#statics"#
|
documentation link at https://docs.rs/encoding_rs/latest/encoding_rs/#statics"#
|
||||||
}
|
}
|
||||||
|
|
||||||
fn examples(&self) -> Vec<Example> {
|
fn examples(&self) -> Vec<Example> {
|
||||||
vec![Example {
|
vec![
|
||||||
|
Example {
|
||||||
description: "Encode an UTF-8 string into Shift-JIS",
|
description: "Encode an UTF-8 string into Shift-JIS",
|
||||||
example: r#""負けると知って戦うのが、遥かに美しいのだ" | encode shift-jis"#,
|
example: r#""負けると知って戦うのが、遥かに美しいのだ" | encode shift-jis"#,
|
||||||
result: Some(Value::Binary {
|
result: Some(Value::Binary {
|
||||||
val: vec![
|
val: vec![
|
||||||
0x95, 0x89, 0x82, 0xaf, 0x82, 0xe9, 0x82, 0xc6, 0x92, 0x6d, 0x82, 0xc1, 0x82,
|
0x95, 0x89, 0x82, 0xaf, 0x82, 0xe9, 0x82, 0xc6, 0x92, 0x6d, 0x82, 0xc1,
|
||||||
0xc4, 0x90, 0xed, 0x82, 0xa4, 0x82, 0xcc, 0x82, 0xaa, 0x81, 0x41, 0x97, 0x79,
|
0x82, 0xc4, 0x90, 0xed, 0x82, 0xa4, 0x82, 0xcc, 0x82, 0xaa, 0x81, 0x41,
|
||||||
0x82, 0xa9, 0x82, 0xc9, 0x94, 0xfc, 0x82, 0xb5, 0x82, 0xa2, 0x82, 0xcc, 0x82,
|
0x97, 0x79, 0x82, 0xa9, 0x82, 0xc9, 0x94, 0xfc, 0x82, 0xb5, 0x82, 0xa2,
|
||||||
0xbe,
|
0x82, 0xcc, 0x82, 0xbe,
|
||||||
],
|
],
|
||||||
span: Span::test_data(),
|
span: Span::test_data(),
|
||||||
}),
|
}),
|
||||||
}]
|
},
|
||||||
|
Example {
|
||||||
|
description: "Replace characters with HTML entities if they can't be encoded",
|
||||||
|
example: r#""🎈" | encode -i shift-jis"#,
|
||||||
|
result: Some(Value::Binary {
|
||||||
|
val: vec![0x26, 0x23, 0x31, 0x32, 0x37, 0x38, 0x38, 0x30, 0x3b],
|
||||||
|
span: Span::test_data(),
|
||||||
|
}),
|
||||||
|
},
|
||||||
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
fn run(
|
fn run(
|
||||||
@ -64,6 +83,7 @@ documentation link at https://docs.rs/encoding_rs/0.8.28/encoding_rs/#statics"#
|
|||||||
) -> Result<PipelineData, ShellError> {
|
) -> Result<PipelineData, ShellError> {
|
||||||
let head = call.head;
|
let head = call.head;
|
||||||
let encoding: Spanned<String> = call.req(engine_state, stack, 0)?;
|
let encoding: Spanned<String> = call.req(engine_state, stack, 0)?;
|
||||||
|
let ignore_errors = call.has_flag("ignore-errors");
|
||||||
|
|
||||||
match input {
|
match input {
|
||||||
PipelineData::ExternalStream { stdout: None, .. } => Ok(PipelineData::empty()),
|
PipelineData::ExternalStream { stdout: None, .. } => Ok(PipelineData::empty()),
|
||||||
@ -71,12 +91,14 @@ documentation link at https://docs.rs/encoding_rs/0.8.28/encoding_rs/#statics"#
|
|||||||
stdout: Some(stream),
|
stdout: Some(stream),
|
||||||
..
|
..
|
||||||
} => {
|
} => {
|
||||||
let s = stream.into_string()?.item;
|
let s = stream.into_string()?;
|
||||||
super::encoding::encode(head, encoding, &s).map(|val| val.into_pipeline_data())
|
super::encoding::encode(head, encoding, &s.item, s.span, ignore_errors)
|
||||||
|
.map(|val| val.into_pipeline_data())
|
||||||
}
|
}
|
||||||
PipelineData::Value(v, ..) => match v {
|
PipelineData::Value(v, ..) => match v {
|
||||||
Value::String { val: s, .. } => {
|
Value::String { val: s, span } => {
|
||||||
super::encoding::encode(head, encoding, &s).map(|val| val.into_pipeline_data())
|
super::encoding::encode(head, encoding, &s, span, ignore_errors)
|
||||||
|
.map(|val| val.into_pipeline_data())
|
||||||
}
|
}
|
||||||
Value::Error { error } => Err(error),
|
Value::Error { error } => Err(error),
|
||||||
_ => Err(ShellError::OnlySupportsThisInputType(
|
_ => Err(ShellError::OnlySupportsThisInputType(
|
||||||
@ -87,7 +109,7 @@ documentation link at https://docs.rs/encoding_rs/0.8.28/encoding_rs/#statics"#
|
|||||||
)),
|
)),
|
||||||
},
|
},
|
||||||
// This should be more precise, but due to difficulties in getting spans
|
// This should be more precise, but due to difficulties in getting spans
|
||||||
// from PipelineData::ListData, this is as it is.
|
// from PipelineData::ListStream, this is as it is.
|
||||||
_ => Err(ShellError::UnsupportedInput(
|
_ => Err(ShellError::UnsupportedInput(
|
||||||
"non-string input".into(),
|
"non-string input".into(),
|
||||||
"value originates from here".into(),
|
"value originates from here".into(),
|
||||||
|
@ -1,8 +1,17 @@
|
|||||||
use encoding_rs::Encoding;
|
use encoding_rs::Encoding;
|
||||||
use nu_protocol::{ShellError, Span, Spanned, Value};
|
use nu_protocol::{ShellError, Span, Spanned, Value};
|
||||||
|
|
||||||
pub fn decode(head: Span, encoding: Spanned<String>, bytes: &[u8]) -> Result<Value, ShellError> {
|
pub fn decode(
|
||||||
let encoding = parse_encoding(encoding.span, &encoding.item)?;
|
head: Span,
|
||||||
|
encoding_name: Spanned<String>,
|
||||||
|
bytes: &[u8],
|
||||||
|
) -> Result<Value, ShellError> {
|
||||||
|
// Workaround for a bug in the Encodings Specification.
|
||||||
|
let encoding = if encoding_name.item.to_lowercase() == "utf16" {
|
||||||
|
parse_encoding(encoding_name.span, "utf-16")
|
||||||
|
} else {
|
||||||
|
parse_encoding(encoding_name.span, &encoding_name.item)
|
||||||
|
}?;
|
||||||
let (result, ..) = encoding.decode(bytes);
|
let (result, ..) = encoding.decode(bytes);
|
||||||
Ok(Value::String {
|
Ok(Value::String {
|
||||||
val: result.into_owned(),
|
val: result.into_owned(),
|
||||||
@ -10,26 +19,56 @@ pub fn decode(head: Span, encoding: Spanned<String>, bytes: &[u8]) -> Result<Val
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn encode(head: Span, encoding: Spanned<String>, s: &str) -> Result<Value, ShellError> {
|
pub fn encode(
|
||||||
let encoding = parse_encoding(encoding.span, &encoding.item)?;
|
head: Span,
|
||||||
let (result, ..) = encoding.encode(s);
|
encoding_name: Spanned<String>,
|
||||||
|
s: &str,
|
||||||
|
s_span: Span,
|
||||||
|
ignore_errors: bool,
|
||||||
|
) -> Result<Value, ShellError> {
|
||||||
|
// Workaround for a bug in the Encodings Specification.
|
||||||
|
let encoding = if encoding_name.item.to_lowercase() == "utf16" {
|
||||||
|
parse_encoding(encoding_name.span, "utf-16")
|
||||||
|
} else {
|
||||||
|
parse_encoding(encoding_name.span, &encoding_name.item)
|
||||||
|
}?;
|
||||||
|
let (result, _actual_encoding, replacements) = encoding.encode(s);
|
||||||
|
// Because encoding_rs is a Web-facing crate, it defaults to replacing unknowns with HTML entities.
|
||||||
|
// This behaviour can be enabled with -i. Otherwise, it becomes an error.
|
||||||
|
if replacements && !ignore_errors {
|
||||||
|
// TODO: make GenericError accept two spans (including head)
|
||||||
|
Err(ShellError::GenericError(
|
||||||
|
"error while encoding string".into(),
|
||||||
|
format!("string contained characters not in {}", &encoding_name.item),
|
||||||
|
Some(s_span),
|
||||||
|
None,
|
||||||
|
vec![],
|
||||||
|
))
|
||||||
|
} else {
|
||||||
Ok(Value::Binary {
|
Ok(Value::Binary {
|
||||||
val: result.into_owned(),
|
val: result.into_owned(),
|
||||||
span: head,
|
span: head,
|
||||||
})
|
})
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_encoding(span: Span, label: &str) -> Result<&'static Encoding, ShellError> {
|
fn parse_encoding(span: Span, label: &str) -> Result<&'static Encoding, ShellError> {
|
||||||
|
// Workaround for a bug in the Encodings Specification.
|
||||||
|
let label = if label.to_lowercase() == "utf16" {
|
||||||
|
"utf-16"
|
||||||
|
} else {
|
||||||
|
label
|
||||||
|
};
|
||||||
match Encoding::for_label_no_replacement(label.as_bytes()) {
|
match Encoding::for_label_no_replacement(label.as_bytes()) {
|
||||||
None => Err(ShellError::GenericError(
|
None => Err(ShellError::GenericError(
|
||||||
format!(
|
format!(
|
||||||
r#"{} is not a valid encoding, refer to https://docs.rs/encoding_rs/0.8.23/encoding_rs/#statics for a valid list of encodings"#,
|
r#"{} is not a valid encoding"#,
|
||||||
label
|
label
|
||||||
),
|
),
|
||||||
"invalid encoding".into(),
|
"invalid encoding".into(),
|
||||||
Some(span),
|
Some(span),
|
||||||
None,
|
Some("refer to https://docs.rs/encoding_rs/latest/encoding_rs/index.html#statics for a valid list of encodings".into()),
|
||||||
Vec::new(),
|
vec![],
|
||||||
)),
|
)),
|
||||||
Some(encoding) => Ok(encoding),
|
Some(encoding) => Ok(encoding),
|
||||||
}
|
}
|
||||||
@ -49,6 +88,11 @@ mod test {
|
|||||||
#[case::iso_8859_1("iso-8859-1", "Some ¼½¿ Data µ¶·¸¹º")]
|
#[case::iso_8859_1("iso-8859-1", "Some ¼½¿ Data µ¶·¸¹º")]
|
||||||
#[case::cp1252("cp1252", "Some ¼½¿ Data")]
|
#[case::cp1252("cp1252", "Some ¼½¿ Data")]
|
||||||
#[case::latin5("latin5", "Some ¼½¿ Data µ¶·¸¹º")]
|
#[case::latin5("latin5", "Some ¼½¿ Data µ¶·¸¹º")]
|
||||||
|
// Tests for specific renditions of UTF-16 and UTF-8 labels
|
||||||
|
#[case::utf16("utf16", "")]
|
||||||
|
#[case::utf_hyphen_16("utf-16", "")]
|
||||||
|
#[case::utf8("utf8", "")]
|
||||||
|
#[case::utf_hyphen_8("utf-8", "")]
|
||||||
fn smoke(#[case] encoding: String, #[case] expected: &str) {
|
fn smoke(#[case] encoding: String, #[case] expected: &str) {
|
||||||
let test_span = Span::test_data();
|
let test_span = Span::test_data();
|
||||||
let encoding = Spanned {
|
let encoding = Spanned {
|
||||||
@ -56,7 +100,7 @@ mod test {
|
|||||||
span: test_span,
|
span: test_span,
|
||||||
};
|
};
|
||||||
|
|
||||||
let encoded = encode(test_span, encoding.clone(), expected).unwrap();
|
let encoded = encode(test_span, encoding.clone(), expected, test_span, true).unwrap();
|
||||||
let encoded = encoded.as_binary().unwrap();
|
let encoded = encoded.as_binary().unwrap();
|
||||||
|
|
||||||
let decoded = decode(test_span, encoding, encoded).unwrap();
|
let decoded = decode(test_span, encoding, encoded).unwrap();
|
||||||
|
Loading…
Reference in New Issue
Block a user