Fix multibyte codepoint handling in detect columns --guess (#13272)

# Description

This PR fixes #13269. The splitting code in `guess_width.rs` was slicing the line with char indices instead of byte indices. This works perfectly fine for 1-byte code points, but panics or returns wrong results as soon as multibyte code points appear in the input. I originally discovered this by piping `winget list` into `detect columns --guess`, since winget sometimes uses the Unicode ellipsis symbol (`…`), which is 3 bytes long when encoded in UTF-8.

# User-Facing Changes

`detect columns --guess` should no longer crash on multibyte Unicode input.

Before: ![image](https://github.com/nushell/nushell/assets/20356389/833cd732-be3b-4158-97f7-0ca2616ce23f)

After: ![image](https://github.com/nushell/nushell/assets/20356389/15358b40-4083-4a33-9f2c-87e63f39d985)

# Tests + Formatting

- Added tests to `guess_width.rs` covering the handling of multibyte code points as well as combining diacritical marks.

# After Submitting
2025-08-09 05:34:58 +02:00 · 2024-06-29 23:12:17 +02:00
parent 1b1928c103
commit 40e629beb1
1 changed files with 50 additions and 10 deletions
--- a/crates/nu-command/src/strings/guess_width.rs
+++ b/crates/nu-command/src/strings/guess_width.rs
@@ -175,34 +175,34 @@ fn separator_position(lr: &[char], p: usize, pos: &[usize], n: usize) -> usize {

 fn split(line: &str, pos: &[usize], trim_space: bool) -> Vec<String> {
    let mut n = 0;
-    let mut start = 0;
+    let mut start_char = 0;
    let mut columns = Vec::with_capacity(pos.len() + 1);
-    let lr: Vec<char> = line.chars().collect();
+    let (line_char_boundaries, line_chars): (Vec<usize>, Vec<char>) = line.char_indices().unzip();
    let mut w = 0;

-    for p in 0..lr.len() {
+    for p in 0..line_char_boundaries.len() {
        if pos.is_empty() || n > pos.len() - 1 {
-            start = p;
+            start_char = p;
            break;
        }

        if pos[n] <= w {
-            let end = separator_position(&lr, p, pos, n);
-            if start > end {
+            let end_char = separator_position(&line_chars, p, pos, n);
+            if start_char > end_char {
                break;
            }
-            let col = &line[start..end];
+            let col = &line[line_char_boundaries[start_char]..line_char_boundaries[end_char]];
            let col = if trim_space { col.trim() } else { col };
            columns.push(col.to_string());
            n += 1;
-            start = end;
+            start_char = end_char;
        }

-        w += UnicodeWidthStr::width(lr[p].to_string().as_str());
+        w += UnicodeWidthStr::width(line_chars[p].to_string().as_str());
    }

    // add last part.
-    let col = &line[start..];
+    let col = &line[line_char_boundaries[start_char]..];
    let col = if trim_space { col.trim() } else { col };
    columns.push(col.to_string());
    columns
@@ -423,6 +423,46 @@ D:             104792064  17042676  87749388  17% /d";
        assert_eq!(got, want);
    }

+    #[test]
+    fn test_guess_width_multibyte() {
+        let input = "A… B\nC… D";
+        let r = Box::new(std::io::BufReader::new(input.as_bytes())) as Box<dyn std::io::Read>;
+        let reader = std::io::BufReader::new(r);
+
+        let mut guess_width = GuessWidth {
+            reader,
+            pos: Vec::new(),
+            pre_lines: Vec::new(),
+            pre_count: 0,
+            limit_split: 0,
+        };
+
+        let want = vec![vec!["A…", "B"], vec!["C…", "D"]];
+        let got = guess_width.read_all();
+        assert_eq!(got, want);
+    }
+
+    #[test]
+    fn test_guess_width_combining_diacritical_marks() {
+        let input = "Name        Surname
+Ștefan         Țincu ";
+
+        let r = Box::new(std::io::BufReader::new(input.as_bytes())) as Box<dyn std::io::Read>;
+        let reader = std::io::BufReader::new(r);
+
+        let mut guess_width = GuessWidth {
+            reader,
+            pos: Vec::new(),
+            pre_lines: Vec::new(),
+            pre_count: 0,
+            limit_split: 0,
+        };
+
+        let want = vec![vec!["Name", "Surname"], vec!["Ștefan", "Țincu"]];
+        let got = guess_width.read_all();
+        assert_eq!(got, want);
+    }
+
    #[test]
    fn test_to_table() {
        let lines = vec![