allow oem code pages to be used to decode text (#14187)

# Description This PR allows OEM code pages to be used for decoding by specifying the code page number (for example, `decode 850`). ## Before ![image](https://github.com/user-attachments/assets/27f5d288-49f1-4743-a2fc-154f5291d190) ## After (umlauts) ![image](https://github.com/user-attachments/assets/d37c11be-b1fe-4159-822d-7d38018e1c57) Closes https://github.com/nushell/nushell/issues/14168 I abstracted the decoding a bit; the function comment below explains how and why. ```rust // Since we have two different decoding mechanisms, we allow oem_cp to be // specified by only a number like `open file | decode 850`. If this decode // parameter parses as a usize then we assume it was intentional and use oem_cp // crate. Otherwise, if it doesn't parse as a usize, we assume it was a string // and use the encoding_rs crate to try and decode it. ``` # User-Facing Changes  # Tests + Formatting  # After Submitting
2025-06-20 01:48:09 +02:00 · 2024-10-29 06:32:35 -05:00 · 2024-10-29 06:32:35 -05:00 · 88b0982dac
commit 88b0982dac
parent 8c2e12ad79
4 changed files with 69 additions and 2 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -3162,6 +3162,7 @@ dependencies = [
 "num-format",
 "num-traits",
 "nuon",
+ "oem_cp",
 "once_cell",
 "open",
 "os_pipe",
@ -3868,6 +3869,18 @@ dependencies = [
 "memchr",
 ]

+[[package]]
+name = "oem_cp"
+version = "2.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "330138902ab4dab09a86e6b7ab7ddeffb5f8435d52fe0df1bce8b06a17b10ee4"
+dependencies = [
+ "phf 0.11.2",
+ "phf_codegen 0.11.2",
+ "serde",
+ "serde_json",
+]
+
 [[package]]
 name = "omnipath"
 version = "0.1.6"
--- a/Cargo.toml
+++ b/Cargo.toml
@ -117,6 +117,7 @@ notify-debouncer-full = { version = "0.3", default-features = false }
 nu-ansi-term = "0.50.1"
 num-format = "0.4"
 num-traits = "0.2"
+oem_cp = "2.0.0"
 omnipath = "0.1"
 once_cell = "1.20"
 open = "5.3"
--- a/crates/nu-command/Cargo.toml
+++ b/crates/nu-command/Cargo.toml
@ -66,6 +66,7 @@ native-tls = { workspace = true }
 notify-debouncer-full = { workspace = true, default-features = false }
 num-format = { workspace = true }
 num-traits = { workspace = true }
+oem_cp = { workspace = true }
 once_cell = { workspace = true }
 open = { workspace = true }
 os_pipe = { workspace = true }
--- a/crates/nu-command/src/strings/encode_decode/decode.rs
+++ b/crates/nu-command/src/strings/encode_decode/decode.rs
@ -1,4 +1,35 @@
 use nu_engine::command_prelude::*;
+use oem_cp::decode_string_complete_table;
+use once_cell::sync::Lazy;
+use std::collections::HashMap;
+
+/// Lazily built lookup from an OEM code page number to its decoding table.
+///
+/// Only the `oem_cp` "Complete" code pages are registered. "Incomplete" code
+/// pages are stored as `Option<char>` and not `&[char; 128]`, so they do not
+/// fit this map's value type; the notes below mark where they are left out.
+static OEM_DECODE: Lazy<HashMap<usize, &[char; 128]>> = Lazy::new(|| {
+    use oem_cp::code_table as tables;
+
+    HashMap::from([
+        (437, &tables::DECODING_TABLE_CP437),
+        // 720 is left out: "Incomplete" table
+        (737, &tables::DECODING_TABLE_CP737),
+        (775, &tables::DECODING_TABLE_CP775),
+        (850, &tables::DECODING_TABLE_CP850),
+        (852, &tables::DECODING_TABLE_CP852),
+        (855, &tables::DECODING_TABLE_CP855),
+        // 857 is left out: "Incomplete" table
+        (858, &tables::DECODING_TABLE_CP858),
+        (860, &tables::DECODING_TABLE_CP860),
+        (861, &tables::DECODING_TABLE_CP861),
+        (862, &tables::DECODING_TABLE_CP862),
+        (863, &tables::DECODING_TABLE_CP863),
+        // 864 is left out: "Incomplete" table
+        (865, &tables::DECODING_TABLE_CP865),
+        (866, &tables::DECODING_TABLE_CP866),
+        // 869 and 874 are left out: "Incomplete" tables
+    ])
+});

 #[derive(Clone)]
 pub struct Decode;
@ -84,7 +115,7 @@ fn run(
            let span = stream.span();
            let bytes = stream.into_bytes()?;
            match encoding {
-                Some(encoding_name) => super::encoding::decode(head, encoding_name, &bytes),
+                Some(encoding_name) => detect_and_decode(encoding_name, head, bytes),
                None => super::encoding::detect_encoding_name(head, span, &bytes)
                    .map(|encoding| encoding.decode(&bytes).0.into_owned())
                    .map(|s| Value::string(s, head)),
@ -95,7 +126,7 @@ fn run(
            let input_span = v.span();
            match v {
                Value::Binary { val: bytes, .. } => match encoding {
-                    Some(encoding_name) => super::encoding::decode(head, encoding_name, &bytes),
+                    Some(encoding_name) => detect_and_decode(encoding_name, head, bytes),
                    None => super::encoding::detect_encoding_name(head, input_span, &bytes)
                        .map(|encoding| encoding.decode(&bytes).0.into_owned())
                        .map(|s| Value::string(s, head)),
@ -121,6 +152,27 @@ fn run(
    }
 }

+/// Decodes `bytes` with the encoding selected by `encoding_name`.
+///
+/// Two decoding mechanisms are available. A name that parses as a `usize` and
+/// has a table in `OEM_DECODE` (e.g. `open file | decode 850`) is decoded with
+/// the `oem_cp` crate. Every other name, including a number without a table,
+/// is passed to `super::encoding::decode` and its result is returned as-is;
+/// indexing `OEM_DECODE` directly would panic on such a number.
+fn detect_and_decode(
+    encoding_name: Spanned<String>,
+    head: Span,
+    bytes: Vec<u8>,
+) -> Result<Value, ShellError> {
+    // `ok()` turns a non-numeric name into `None`, so no sentinel value is
+    // needed to tell "not a code page number" apart from a real one.
+    let code_page = encoding_name.item.parse::<usize>().ok();
+    match code_page.and_then(|id| OEM_DECODE.get(&id).copied()) {
+        Some(table) => Ok(Value::string(decode_string_complete_table(&bytes, table), head)),
+        None => super::encoding::decode(head, encoding_name, &bytes),
+    }
+}
+
 #[cfg(test)]
 mod test {
    use super::*;