allow oem code pages to be used to decode text (#14187)

# Description This PR allows oem code pages to be used in decoding by specifying the code page number. ## Before ![image](https://github.com/user-attachments/assets/27f5d288-49f1-4743-a2fc-154f5291d190) ## After (umlauts) ![image](https://github.com/user-attachments/assets/d37c11be-b1fe-4159-822d-7d38018e1c57) closes https://github.com/nushell/nushell/issues/14168 I abstracted the decoding a bit. Here are my function comments on how/why. ```rust // Since we have two different decoding mechanisms, we allow oem_cp to be // specified by only a number like `open file | decode 850`. If this decode // parameter parses as a usize then we assume it was intentional and use oem_cp // crate. Otherwise, if it doesn't parse as a usize, we assume it was a string // and use the encoding_rs crate to try and decode it. ``` # User-Facing Changes  # Tests + Formatting  # After Submitting
2024-10-29 06:32:35 -05:00
parent 8c2e12ad79
commit 88b0982dac
4 changed files with 69 additions and 2 deletions
--- a/crates/nu-command/src/strings/encode_decode/decode.rs
+++ b/crates/nu-command/src/strings/encode_decode/decode.rs
@ -1,4 +1,35 @@
 use nu_engine::command_prelude::*;
+use oem_cp::decode_string_complete_table;
+use once_cell::sync::Lazy;
+use std::collections::HashMap;
+
+// create a lazycell of all the code_table "Complete" code pages
+// the commented out code pages are "Incomplete", which means they
+// are stored as Option<char> and not &[char; 128]
+static OEM_DECODE: Lazy<HashMap<usize, &[char; 128]>> = Lazy::new(|| {
+    let mut m = HashMap::new();
+    m.insert(437, &oem_cp::code_table::DECODING_TABLE_CP437);
+    // m.insert(720, &oem_cp::code_table::DECODING_TABLE_CP720);
+    m.insert(737, &oem_cp::code_table::DECODING_TABLE_CP737);
+    m.insert(775, &oem_cp::code_table::DECODING_TABLE_CP775);
+
+    m.insert(850, &oem_cp::code_table::DECODING_TABLE_CP850);
+    m.insert(852, &oem_cp::code_table::DECODING_TABLE_CP852);
+    m.insert(855, &oem_cp::code_table::DECODING_TABLE_CP855);
+    // m.insert(857, &oem_cp::code_table::DECODING_TABLE_CP857);
+    m.insert(858, &oem_cp::code_table::DECODING_TABLE_CP858);
+    m.insert(860, &oem_cp::code_table::DECODING_TABLE_CP860);
+    m.insert(861, &oem_cp::code_table::DECODING_TABLE_CP861);
+    m.insert(862, &oem_cp::code_table::DECODING_TABLE_CP862);
+    m.insert(863, &oem_cp::code_table::DECODING_TABLE_CP863);
+    // m.insert(864, &oem_cp::code_table::DECODING_TABLE_CP864);
+    m.insert(865, &oem_cp::code_table::DECODING_TABLE_CP865);
+    m.insert(866, &oem_cp::code_table::DECODING_TABLE_CP866);
+    // m.insert(869, &oem_cp::code_table::DECODING_TABLE_CP869);
+    // m.insert(874, &oem_cp::code_table::DECODING_TABLE_CP874);
+
+    m
+});

 #[derive(Clone)]
 pub struct Decode;
@ -84,7 +115,7 @@ fn run(
            let span = stream.span();
            let bytes = stream.into_bytes()?;
            match encoding {
-                Some(encoding_name) => super::encoding::decode(head, encoding_name, &bytes),
+                Some(encoding_name) => detect_and_decode(encoding_name, head, bytes),
                None => super::encoding::detect_encoding_name(head, span, &bytes)
                    .map(|encoding| encoding.decode(&bytes).0.into_owned())
                    .map(|s| Value::string(s, head)),
@ -95,7 +126,7 @@ fn run(
            let input_span = v.span();
            match v {
                Value::Binary { val: bytes, .. } => match encoding {
-                    Some(encoding_name) => super::encoding::decode(head, encoding_name, &bytes),
+                    Some(encoding_name) => detect_and_decode(encoding_name, head, bytes),
                    None => super::encoding::detect_encoding_name(head, input_span, &bytes)
                        .map(|encoding| encoding.decode(&bytes).0.into_owned())
                        .map(|s| Value::string(s, head)),
@ -121,6 +152,27 @@ fn run(
    }
 }

+// Since we have two different decoding mechanisms, we allow oem_cp to be
+// specified by only a number like `open file | decode 850`. If this decode
+// parameter parses as a usize then we assume it was intentional and use oem_cp
+// crate. Otherwise, if it doesn't parse as a usize, we assume it was a string
+// and use the encoding_rs crate to try and decode it.
+fn detect_and_decode(
+    encoding_name: Spanned<String>,
+    head: Span,
+    bytes: Vec<u8>,
+) -> Result<Value, ShellError> {
+    let dec_table_id = encoding_name.item.parse::<usize>().unwrap_or(0usize);
+    if dec_table_id == 0 {
+        super::encoding::decode(head, encoding_name, &bytes)
+    } else {
+        Ok(Value::string(
+            decode_string_complete_table(&bytes, OEM_DECODE[&dec_table_id]),
+            head,
+        ))
+    }
+}
+
 #[cfg(test)]
 mod test {
    use super::*;