forked from extern/nushell
allow oem code pages to be used to decode text (#14187)
# Description This PR allows oem code pages to be used in decoding by specifying the code page number. ## Before  ## After (umlauts)  closes https://github.com/nushell/nushell/issues/14168 I abstracted the decoding a bit. Here are my function comments on how/why. ```rust // Since we have two different decoding mechanisms, we allow oem_cp to be // specified by only a number like `open file | decode 850`. If this decode // parameter parses as a usize then we assume it was intentional and use oem_cp // crate. Otherwise, if it doesn't parse as a usize, we assume it was a string // and use the encoding_rs crate to try and decode it. ``` # User-Facing Changes <!-- List of all changes that impact the user experience here. This helps us keep track of breaking changes. --> # Tests + Formatting <!-- Don't forget to add tests that cover your changes. Make sure you've run and fixed any issues with these commands: - `cargo fmt --all -- --check` to check standard code formatting (`cargo fmt --all` applies these changes) - `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used` to check that you're using the standard code style - `cargo test --workspace` to check that all tests pass (on Windows make sure to [enable developer mode](https://learn.microsoft.com/en-us/windows/apps/get-started/developer-mode-features-and-debugging)) - `cargo run -- -c "use toolkit.nu; toolkit test stdlib"` to run the tests for the standard library > **Note** > from `nushell` you can also use the `toolkit` as follows > ```bash > use toolkit.nu # or use an `env_change` hook to activate it automatically > toolkit check pr > ``` --> # After Submitting <!-- If your PR had any user-facing changes, update [the documentation](https://github.com/nushell/nushell.github.io) after the PR is merged, if necessary. This will help us keep the docs up to date. -->
This commit is contained in:
@ -1,4 +1,35 @@
|
||||
use nu_engine::command_prelude::*;
|
||||
use oem_cp::decode_string_complete_table;
|
||||
use once_cell::sync::Lazy;
|
||||
use std::collections::HashMap;
|
||||
|
||||
// create a lazycell of all the code_table "Complete" code pages
|
||||
// the commented out code pages are "Incomplete", which means they
|
||||
// are stored as Option<char> and not &[char; 128]
|
||||
static OEM_DECODE: Lazy<HashMap<usize, &[char; 128]>> = Lazy::new(|| {
|
||||
let mut m = HashMap::new();
|
||||
m.insert(437, &oem_cp::code_table::DECODING_TABLE_CP437);
|
||||
// m.insert(720, &oem_cp::code_table::DECODING_TABLE_CP720);
|
||||
m.insert(737, &oem_cp::code_table::DECODING_TABLE_CP737);
|
||||
m.insert(775, &oem_cp::code_table::DECODING_TABLE_CP775);
|
||||
|
||||
m.insert(850, &oem_cp::code_table::DECODING_TABLE_CP850);
|
||||
m.insert(852, &oem_cp::code_table::DECODING_TABLE_CP852);
|
||||
m.insert(855, &oem_cp::code_table::DECODING_TABLE_CP855);
|
||||
// m.insert(857, &oem_cp::code_table::DECODING_TABLE_CP857);
|
||||
m.insert(858, &oem_cp::code_table::DECODING_TABLE_CP858);
|
||||
m.insert(860, &oem_cp::code_table::DECODING_TABLE_CP860);
|
||||
m.insert(861, &oem_cp::code_table::DECODING_TABLE_CP861);
|
||||
m.insert(862, &oem_cp::code_table::DECODING_TABLE_CP862);
|
||||
m.insert(863, &oem_cp::code_table::DECODING_TABLE_CP863);
|
||||
// m.insert(864, &oem_cp::code_table::DECODING_TABLE_CP864);
|
||||
m.insert(865, &oem_cp::code_table::DECODING_TABLE_CP865);
|
||||
m.insert(866, &oem_cp::code_table::DECODING_TABLE_CP866);
|
||||
// m.insert(869, &oem_cp::code_table::DECODING_TABLE_CP869);
|
||||
// m.insert(874, &oem_cp::code_table::DECODING_TABLE_CP874);
|
||||
|
||||
m
|
||||
});
|
||||
|
||||
#[derive(Clone)]
|
||||
/// Field-less marker type for this file's `decode` command.
// NOTE(review): presumably this implements the engine's `Command` trait further
// down in the file; that impl is not visible in this diff, so confirm there.
pub struct Decode;
|
||||
@ -84,7 +115,7 @@ fn run(
|
||||
let span = stream.span();
|
||||
let bytes = stream.into_bytes()?;
|
||||
match encoding {
|
||||
Some(encoding_name) => super::encoding::decode(head, encoding_name, &bytes),
|
||||
Some(encoding_name) => detect_and_decode(encoding_name, head, bytes),
|
||||
None => super::encoding::detect_encoding_name(head, span, &bytes)
|
||||
.map(|encoding| encoding.decode(&bytes).0.into_owned())
|
||||
.map(|s| Value::string(s, head)),
|
||||
@ -95,7 +126,7 @@ fn run(
|
||||
let input_span = v.span();
|
||||
match v {
|
||||
Value::Binary { val: bytes, .. } => match encoding {
|
||||
Some(encoding_name) => super::encoding::decode(head, encoding_name, &bytes),
|
||||
Some(encoding_name) => detect_and_decode(encoding_name, head, bytes),
|
||||
None => super::encoding::detect_encoding_name(head, input_span, &bytes)
|
||||
.map(|encoding| encoding.decode(&bytes).0.into_owned())
|
||||
.map(|s| Value::string(s, head)),
|
||||
@ -121,6 +152,27 @@ fn run(
|
||||
}
|
||||
}
|
||||
|
||||
// Since we have two different decoding mechanisms, we allow oem_cp to be
|
||||
// specified by only a number like `open file | decode 850`. If this decode
|
||||
// parameter parses as a usize then we assume it was intentional and use oem_cp
|
||||
// crate. Otherwise, if it doesn't parse as a usize, we assume it was a string
|
||||
// and use the encoding_rs crate to try and decode it.
|
||||
fn detect_and_decode(
|
||||
encoding_name: Spanned<String>,
|
||||
head: Span,
|
||||
bytes: Vec<u8>,
|
||||
) -> Result<Value, ShellError> {
|
||||
let dec_table_id = encoding_name.item.parse::<usize>().unwrap_or(0usize);
|
||||
if dec_table_id == 0 {
|
||||
super::encoding::decode(head, encoding_name, &bytes)
|
||||
} else {
|
||||
Ok(Value::string(
|
||||
decode_string_complete_table(&bytes, OEM_DECODE[&dec_table_id]),
|
||||
head,
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
|
Reference in New Issue
Block a user