forked from extern/nushell
Add encoding auto-detection for decode
(#10030)
# Description

Allow the `decode` command to guess the encoding of its input if no encoding name is given.

# User-Facing Changes

* `decode` now takes an optional parameter instead of a required one. Users can just run `decode` to let the command automatically detect the encoding and convert the input to UTF-8. <img width="575" alt="Example" src="https://github.com/nushell/nushell/assets/1991933/03a0ba11-910e-4db9-89aa-79cfec06893f">
* Based on the detection result, the user may have to give an encoding name <img width="572" alt="Error Sample1" src="https://github.com/nushell/nushell/assets/1991933/f21fda85-1f04-4cb3-9feb-cb9fb7dcee07"> or be informed that the input is not supported by `decode` <img width="568" alt="Error Sample2" src="https://github.com/nushell/nushell/assets/1991933/dd3cc4c0-f119-493e-8609-d07594fc055a">

# Tests + Formatting

* `cargo fmt --all -- --check`: OK
* `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used -A clippy::needless_collect -A clippy::result_large_err`: OK
* `cargo test --workspace`: OK
* `cargo run -- -c "use std testing; testing run-tests --path crates/nu-std"`: OK

# After Submitting

The [command documentation](https://www.nushell.sh/commands/docs/decode.html) is auto-generated and requires no action.

---------

Co-authored-by: Horasal <horsal@horsal.dev>
This commit is contained in:
parent
16c15e83a3
commit
e25a795cf6
12
Cargo.lock
generated
12
Cargo.lock
generated
@ -549,6 +549,17 @@ version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
||||
|
||||
[[package]]
|
||||
name = "chardetng"
|
||||
version = "0.1.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "14b8f0b65b7b08ae3c8187e8d77174de20cb6777864c6b832d8ad365999cf1ea"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"encoding_rs",
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "chrono"
|
||||
version = "0.4.26"
|
||||
@ -2727,6 +2738,7 @@ dependencies = [
|
||||
"byteorder",
|
||||
"bytesize",
|
||||
"calamine",
|
||||
"chardetng",
|
||||
"chrono",
|
||||
"chrono-humanize",
|
||||
"chrono-tz",
|
||||
|
@ -94,6 +94,7 @@ uuid = { version = "1.3", features = ["v4"] }
|
||||
wax = { version = "0.5" }
|
||||
which = { version = "4.4", optional = true }
|
||||
bracoxide = "0.1.2"
|
||||
chardetng = "0.1.17"
|
||||
|
||||
[target.'cfg(windows)'.dependencies]
|
||||
winreg = "0.50"
|
||||
|
@ -25,7 +25,7 @@ impl Command for Decode {
|
||||
fn signature(&self) -> nu_protocol::Signature {
|
||||
Signature::build("decode")
|
||||
.input_output_types(vec![(Type::Binary, Type::String)])
|
||||
.required("encoding", SyntaxShape::String, "the text encoding to use")
|
||||
.optional("encoding", SyntaxShape::String, "the text encoding to use")
|
||||
.category(Category::Strings)
|
||||
}
|
||||
|
||||
@ -63,20 +63,35 @@ documentation link at https://docs.rs/encoding_rs/latest/encoding_rs/#statics"#
|
||||
input: PipelineData,
|
||||
) -> Result<PipelineData, ShellError> {
|
||||
let head = call.head;
|
||||
let encoding: Spanned<String> = call.req(engine_state, stack, 0)?;
|
||||
let encoding: Option<Spanned<String>> = call.opt(engine_state, stack, 0)?;
|
||||
|
||||
match input {
|
||||
PipelineData::ExternalStream { stdout: None, .. } => Ok(PipelineData::empty()),
|
||||
PipelineData::ExternalStream {
|
||||
stdout: Some(stream),
|
||||
span: input_span,
|
||||
..
|
||||
} => {
|
||||
let bytes: Vec<u8> = stream.into_bytes()?.item;
|
||||
super::encoding::decode(head, encoding, &bytes).map(|val| val.into_pipeline_data())
|
||||
match encoding {
|
||||
Some(encoding_name) => super::encoding::decode(head, encoding_name, &bytes),
|
||||
None => super::encoding::detect_encoding_name(head, input_span, &bytes)
|
||||
.map(|encoding| encoding.decode(&bytes).0.into_owned())
|
||||
.map(|s| Value::String { val: s, span: head }),
|
||||
}
|
||||
.map(|val| val.into_pipeline_data())
|
||||
}
|
||||
PipelineData::Value(v, ..) => match v {
|
||||
Value::Binary { val: bytes, .. } => super::encoding::decode(head, encoding, &bytes)
|
||||
.map(|val| val.into_pipeline_data()),
|
||||
Value::Binary {
|
||||
val: bytes,
|
||||
span: input_span,
|
||||
} => match encoding {
|
||||
Some(encoding_name) => super::encoding::decode(head, encoding_name, &bytes),
|
||||
None => super::encoding::detect_encoding_name(head, input_span, &bytes)
|
||||
.map(|encoding| encoding.decode(&bytes).0.into_owned())
|
||||
.map(|s| Value::String { val: s, span: head }),
|
||||
}
|
||||
.map(|val| val.into_pipeline_data()),
|
||||
Value::Error { error, .. } => Err(*error),
|
||||
_ => Err(ShellError::OnlySupportsThisInputType {
|
||||
exp_input_type: "binary".into(),
|
||||
|
@ -1,6 +1,27 @@
|
||||
use chardetng::EncodingDetector;
|
||||
use encoding_rs::Encoding;
|
||||
use nu_protocol::{ShellError, Span, Spanned, Value};
|
||||
|
||||
pub fn detect_encoding_name(
|
||||
head: Span,
|
||||
input: Span,
|
||||
bytes: &[u8],
|
||||
) -> Result<&'static Encoding, ShellError> {
|
||||
let mut detector = EncodingDetector::new();
|
||||
let _non_ascii = detector.feed(bytes, false);
|
||||
//Guess(TLD=None(usually used in HTML), Allow_UTF8=True)
|
||||
let (encoding, is_certain) = detector.guess_assess(None, true);
|
||||
if !is_certain {
|
||||
return Err(ShellError::UnsupportedInput(
|
||||
"Input contains unknown encoding, try giving a encoding name".into(),
|
||||
"value originates from here".into(),
|
||||
head,
|
||||
input,
|
||||
));
|
||||
}
|
||||
Ok(encoding)
|
||||
}
|
||||
|
||||
pub fn decode(
|
||||
head: Span,
|
||||
encoding_name: Spanned<String>,
|
||||
@ -107,4 +128,39 @@ mod test {
|
||||
|
||||
assert_eq!(decoded, expected);
|
||||
}
|
||||
|
||||
#[rstest]
|
||||
#[case::big5(&[186, 251, 176, 242, 164, 106, 168, 229, 161, 93, 87, 105, 107, 105, 112, 101, 100, 105, 97, 161,
|
||||
94, 170, 204, 161, 65, 186, 244, 184, 244, 172, 176, 194, 166, 161, 70, 182, 176, 164, 209, 164, 85,
|
||||
170, 190, 161, 66, 165, 124, 174, 252, 168, 165, 161, 66, 178, 179, 164, 72, 167, 211, 161, 65, 174,
|
||||
209, 166, 202, 172, 236, 178, 106, 161, 67, 169, 108, 167, 64, 170, 204, 161, 65, 186, 251, 176,
|
||||
242, 180, 67, 197, 233, 176, 242, 170, 247, 183, 124, 164, 93, 161, 67], "Big5")]
|
||||
// FIXME: chardetng fails on this
|
||||
//#[case::shiftjis(&[130, 162, 130, 235, 130, 205, 130, 201, 130, 217, 130, 214, 130, 198, 129, 64, 130,
|
||||
// 191, 130, 232, 130, 202, 130, 233, 130, 240], "SHIFT_JIS")]
|
||||
#[case::eucjp(&[164, 164, 164, 237, 164, 207, 164, 203, 164, 219, 164, 216, 164, 200, 161, 161, 164, 193,
|
||||
164, 234, 164, 204, 164, 235, 164, 242],"EUC-JP")]
|
||||
#[case::euckr(&[192, 167, 197, 176, 185, 233, 176, 250, 40, 45, 219, 221, 206, 161, 41, 32, 182, 199, 180,
|
||||
194, 32, 192, 167, 197, 176, 199, 199, 181, 240, 190, 198, 180, 194, 32, 180, 169, 177, 184, 179, 170,
|
||||
32, 192, 218, 192, 175, 183, 211, 176, 212, 32, 190, 181, 32, 188, 246, 32, 192, 214, 180, 194, 32, 180,
|
||||
217, 190, 240, 190, 238, 198, 199, 32, 192, 206, 197, 205, 179, 221, 32, 185, 233, 176, 250, 187, 231, 192,
|
||||
252, 192, 204, 180, 217],"EUC-KR")]
|
||||
#[case::gb2312(&[206, 172, 187, 249, 202, 199, 210, 187, 214, 214, 212, 218, 205, 248, 194, 231, 201, 207, 191,
|
||||
170, 183, 197, 199, 210, 191, 201, 185, 169, 182, 224, 200, 203, 208, 173, 205, 172, 180, 180, 215, 247,
|
||||
181, 196, 179, 172, 206, 196, 177, 190, 207, 181, 205, 179, 163, 172, 211, 201, 195, 192, 185, 250, 200,
|
||||
203, 206, 214, 181, 194, 161, 164, 191, 178, 196, 254, 176, 178, 211, 218, 49, 57, 57, 53, 196, 234, 202,
|
||||
215, 207, 200, 191, 170, 183, 162], "GB2312")]
|
||||
#[case::tis620(&[199, 212, 185, 226, 180, 199, 202, 236, 45, 49, 50, 53, 50, 224, 187, 231, 185, 195, 203, 209,
|
||||
202, 205, 209, 161, 162, 195, 208, 225, 186, 186, 203, 185, 214, 232, 167, 228, 186, 181, 236, 183, 213,
|
||||
232, 227, 170, 233, 161, 209, 186, 205, 209, 161, 201, 195, 197, 208, 181, 212, 185, 32, 193, 209, 161,
|
||||
182, 217, 161, 227, 170, 233, 227, 185, 205, 167, 164, 236, 187, 195, 208, 161, 205, 186, 195, 216, 232,
|
||||
185, 224, 161, 232, 210, 227, 185, 228, 193, 226, 164, 195, 171, 205, 191, 183, 236], "TIS-620")]
|
||||
fn smoke_encoding_name(#[case] bytes: &[u8], #[case] expected: &str) {
|
||||
let encoding_name =
|
||||
detect_encoding_name(Span::test_data(), Span::test_data(), bytes).unwrap();
|
||||
assert_eq!(
|
||||
encoding_name,
|
||||
Encoding::for_label(expected.as_bytes()).unwrap()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user