Fix multibyte codepoint handling in detect columns --guess (#13272)

<!--
if this PR closes one or more issues, you can automatically link the PR
with
them by using one of the [*linking
keywords*](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue#linking-a-pull-request-to-an-issue-using-a-keyword),
e.g.
- this PR should close #xxxx
- fixes #xxxx

you can also mention related issues, PRs or discussions!
-->

# Description
<!--
Thank you for improving Nushell. Please, check our [contributing
guide](../CONTRIBUTING.md) and talk to the core team before making major
changes.

Description of your pull request goes here. **Provide examples and/or
screenshots** if your changes affect the user experience.
-->
This PR fixes #13269. The splitting code in `guess_width.rs` was
creating slices from char indices, instead of byte indices. This works
perfectly fine for 1-byte code points, but panics or returns wrong
results as soon as multibyte codepoints appear in the input. I
originally discovered this by piping `winget list` into `detect columns
--guess`, since winget sometimes uses the unicode ellipsis symbol (`…`)
which is 3 bytes long when encoded in utf-8.

# User-Facing Changes
<!-- List of all changes that impact the user experience here. This
helps us keep track of breaking changes. -->
`detect columns --guess` should not crash due to multibyte unicode input
anymore

before:

![image](https://github.com/nushell/nushell/assets/20356389/833cd732-be3b-4158-97f7-0ca2616ce23f)

after:

![image](https://github.com/nushell/nushell/assets/20356389/15358b40-4083-4a33-9f2c-87e63f39d985)


# Tests + Formatting
<!--
Don't forget to add tests that cover your changes.

Make sure you've run and fixed any issues with these commands:

- `cargo fmt --all -- --check` to check standard code formatting (`cargo
fmt --all` applies these changes)
- `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used` to
check that you're using the standard code style
- `cargo test --workspace` to check that all tests pass (on Windows make
sure to [enable developer
mode](https://learn.microsoft.com/en-us/windows/apps/get-started/developer-mode-features-and-debugging))
- `cargo run -- -c "use toolkit.nu; toolkit test stdlib"` to run the
tests for the standard library

> **Note**
> from `nushell` you can also use the `toolkit` as follows
> ```bash
> use toolkit.nu # or use an `env_change` hook to activate it
automatically
> toolkit check pr
> ```
-->
- Added tests to `guess_width.rs` for testing handling of multibyte as
well as combining diacritical marks

# After Submitting
<!-- If your PR had any user-facing changes, update [the
documentation](https://github.com/nushell/nushell.github.io) after the
PR is merged, if necessary. This will help us keep the docs up to date.
-->
This commit is contained in:
alex-tdrn 2024-06-29 23:12:17 +02:00 committed by GitHub
parent 1b1928c103
commit 40e629beb1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -175,34 +175,34 @@ fn separator_position(lr: &[char], p: usize, pos: &[usize], n: usize) -> usize {
fn split(line: &str, pos: &[usize], trim_space: bool) -> Vec<String> {
let mut n = 0;
let mut start = 0;
let mut start_char = 0;
let mut columns = Vec::with_capacity(pos.len() + 1);
let lr: Vec<char> = line.chars().collect();
let (line_char_boundaries, line_chars): (Vec<usize>, Vec<char>) = line.char_indices().unzip();
let mut w = 0;
for p in 0..lr.len() {
for p in 0..line_char_boundaries.len() {
if pos.is_empty() || n > pos.len() - 1 {
start = p;
start_char = p;
break;
}
if pos[n] <= w {
let end = separator_position(&lr, p, pos, n);
if start > end {
let end_char = separator_position(&line_chars, p, pos, n);
if start_char > end_char {
break;
}
let col = &line[start..end];
let col = &line[line_char_boundaries[start_char]..line_char_boundaries[end_char]];
let col = if trim_space { col.trim() } else { col };
columns.push(col.to_string());
n += 1;
start = end;
start_char = end_char;
}
w += UnicodeWidthStr::width(lr[p].to_string().as_str());
w += UnicodeWidthStr::width(line_chars[p].to_string().as_str());
}
// add last part.
let col = &line[start..];
let col = &line[line_char_boundaries[start_char]..];
let col = if trim_space { col.trim() } else { col };
columns.push(col.to_string());
columns
@ -423,6 +423,46 @@ D: 104792064 17042676 87749388 17% /d";
assert_eq!(got, want);
}
#[test]
fn test_guess_width_multibyte() {
let input = "A… B\nC… D";
let r = Box::new(std::io::BufReader::new(input.as_bytes())) as Box<dyn std::io::Read>;
let reader = std::io::BufReader::new(r);
let mut guess_width = GuessWidth {
reader,
pos: Vec::new(),
pre_lines: Vec::new(),
pre_count: 0,
limit_split: 0,
};
let want = vec![vec!["A…", "B"], vec!["C…", "D"]];
let got = guess_width.read_all();
assert_eq!(got, want);
}
#[test]
fn test_guess_width_combining_diacritical_marks() {
let input = "Name Surname
Ștefan Țincu ";
let r = Box::new(std::io::BufReader::new(input.as_bytes())) as Box<dyn std::io::Read>;
let reader = std::io::BufReader::new(r);
let mut guess_width = GuessWidth {
reader,
pos: Vec::new(),
pre_lines: Vec::new(),
pre_count: 0,
limit_split: 0,
};
let want = vec![vec!["Name", "Surname"], vec!["Ștefan", "Țincu"]];
let got = guess_width.read_all();
assert_eq!(got, want);
}
#[test]
fn test_to_table() {
let lines = vec![