forked from extern/nushell
# Description Closes #7514. * For both `encode` and `decode`: add a special case allowing `utf16` as a valid alias for `utf-16` (just as `utf-8` has `utf8`). * For `encode`, make it an error when encoding_rs replaces characters outside the given encoding with HTML entities. * For `encode`, add `-i`/`--ignore-errors` flag to bring back this behaviour. Note: `--ignore-errors` does NOT ignore the error for using a wrong encoding label like `uft8` # User-Facing Changes See above. # Tests + Formatting Don't forget to add tests that cover your changes. Make sure you've run and fixed any issues with these commands: - `cargo fmt --all -- --check` to check standard code formatting (`cargo fmt --all` applies these changes) - `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used -A clippy::needless_collect` to check that you're using the standard code style - `cargo test --workspace` to check that all tests pass # After Submitting If your PR had any user-facing changes, update [the documentation](https://github.com/nushell/nushell.github.io) after the PR is merged, if necessary. This will help us keep the docs up to date. Co-authored-by: Darren Schroeder <343840+fdncred@users.noreply.github.com>
112 lines
3.7 KiB
Rust
112 lines
3.7 KiB
Rust
use encoding_rs::Encoding;
|
|
use nu_protocol::{ShellError, Span, Spanned, Value};
|
|
|
|
pub fn decode(
|
|
head: Span,
|
|
encoding_name: Spanned<String>,
|
|
bytes: &[u8],
|
|
) -> Result<Value, ShellError> {
|
|
// Workaround for a bug in the Encodings Specification.
|
|
let encoding = if encoding_name.item.to_lowercase() == "utf16" {
|
|
parse_encoding(encoding_name.span, "utf-16")
|
|
} else {
|
|
parse_encoding(encoding_name.span, &encoding_name.item)
|
|
}?;
|
|
let (result, ..) = encoding.decode(bytes);
|
|
Ok(Value::String {
|
|
val: result.into_owned(),
|
|
span: head,
|
|
})
|
|
}
|
|
|
|
pub fn encode(
|
|
head: Span,
|
|
encoding_name: Spanned<String>,
|
|
s: &str,
|
|
s_span: Span,
|
|
ignore_errors: bool,
|
|
) -> Result<Value, ShellError> {
|
|
// Workaround for a bug in the Encodings Specification.
|
|
let encoding = if encoding_name.item.to_lowercase() == "utf16" {
|
|
parse_encoding(encoding_name.span, "utf-16")
|
|
} else {
|
|
parse_encoding(encoding_name.span, &encoding_name.item)
|
|
}?;
|
|
let (result, _actual_encoding, replacements) = encoding.encode(s);
|
|
// Because encoding_rs is a Web-facing crate, it defaults to replacing unknowns with HTML entities.
|
|
// This behaviour can be enabled with -i. Otherwise, it becomes an error.
|
|
if replacements && !ignore_errors {
|
|
// TODO: make GenericError accept two spans (including head)
|
|
Err(ShellError::GenericError(
|
|
"error while encoding string".into(),
|
|
format!("string contained characters not in {}", &encoding_name.item),
|
|
Some(s_span),
|
|
None,
|
|
vec![],
|
|
))
|
|
} else {
|
|
Ok(Value::Binary {
|
|
val: result.into_owned(),
|
|
span: head,
|
|
})
|
|
}
|
|
}
|
|
|
|
fn parse_encoding(span: Span, label: &str) -> Result<&'static Encoding, ShellError> {
|
|
// Workaround for a bug in the Encodings Specification.
|
|
let label = if label.to_lowercase() == "utf16" {
|
|
"utf-16"
|
|
} else {
|
|
label
|
|
};
|
|
match Encoding::for_label_no_replacement(label.as_bytes()) {
|
|
None => Err(ShellError::GenericError(
|
|
format!(
|
|
r#"{} is not a valid encoding"#,
|
|
label
|
|
),
|
|
"invalid encoding".into(),
|
|
Some(span),
|
|
Some("refer to https://docs.rs/encoding_rs/latest/encoding_rs/index.html#statics for a valid list of encodings".into()),
|
|
vec![],
|
|
)),
|
|
Some(encoding) => Ok(encoding),
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod test {
    use super::*;
    use rstest::rstest;

    /// Round-trip smoke test: encodes `expected` with the given label (with
    /// `ignore_errors` enabled), decodes the resulting bytes with the same
    /// label, and checks that the original text comes back.
    #[rstest]
    #[case::big5("big5", "簡体字")]
    #[case::shift_jis("shift-jis", "何だと?……無駄な努力だ?……百も承知だ!")]
    #[case::euc_jp("euc-jp", "だがな、勝つ望みがある時ばかり、戦うのとは訳が違うぞ!")]
    #[case::euc_kr("euc-kr", "가셨어요?")]
    #[case::gbk("gbk", "簡体字")]
    #[case::iso_8859_1("iso-8859-1", "Some ¼½¿ Data µ¶·¸¹º")]
    #[case::cp1252("cp1252", "Some ¼½¿ Data")]
    #[case::latin5("latin5", "Some ¼½¿ Data µ¶·¸¹º")]
    // Tests for specific renditions of UTF-16 and UTF-8 labels
    #[case::utf16("utf16", "")]
    #[case::utf_hyphen_16("utf-16", "")]
    #[case::utf8("utf8", "")]
    #[case::utf_hyphen_8("utf-8", "")]
    fn smoke(#[case] encoding: String, #[case] expected: &str) {
        let span = Span::test_data();
        let label = Spanned {
            item: encoding,
            span,
        };

        // `label` is needed again for decoding, so hand `encode` a copy.
        let binary = encode(span, label.clone(), expected, span, true).unwrap();
        let bytes = binary.as_binary().unwrap();

        let round_tripped = decode(span, label, bytes)
            .unwrap()
            .as_string()
            .unwrap();

        assert_eq!(round_tripped, expected);
    }
}
|