mirror of
https://github.com/nushell/nushell.git
synced 2024-12-23 07:30:13 +01:00
Fix multibyte codepoint handling in detect columns --guess
(#13272)
<!-- if this PR closes one or more issues, you can automatically link the PR with them by using one of the [*linking keywords*](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue#linking-a-pull-request-to-an-issue-using-a-keyword), e.g. - this PR should close #xxxx - fixes #xxxx you can also mention related issues, PRs or discussions! --> # Description <!-- Thank you for improving Nushell. Please, check our [contributing guide](../CONTRIBUTING.md) and talk to the core team before making major changes. Description of your pull request goes here. **Provide examples and/or screenshots** if your changes affect the user experience. --> This PR fixes #13269. The splitting code in `guess_width.rs` was creating slices from char indices, instead of byte indices. This works perfectly fine for 1-byte code points, but panics or returns wrong results as soon as multibyte codepoints appear in the input. I originally discovered this by piping `winget list` into `detect columns --guess`, since winget sometimes uses the unicode ellipsis symbol (`…`) which is 3 bytes long when encoded in utf-8. # User-Facing Changes <!-- List of all changes that impact the user experience here. This helps us keep track of breaking changes. --> `detect columns --guess` should not crash due to multibyte unicode input anymore before: ![image](https://github.com/nushell/nushell/assets/20356389/833cd732-be3b-4158-97f7-0ca2616ce23f) after: ![image](https://github.com/nushell/nushell/assets/20356389/15358b40-4083-4a33-9f2c-87e63f39d985) # Tests + Formatting <!-- Don't forget to add tests that cover your changes. Make sure you've run and fixed any issues with these commands: - `cargo fmt --all -- --check` to check standard code formatting (`cargo fmt --all` applies these changes) - `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used` to check that you're using the standard code style - `cargo test --workspace` to check that all tests pass (on Windows make sure to [enable developer mode](https://learn.microsoft.com/en-us/windows/apps/get-started/developer-mode-features-and-debugging)) - `cargo run -- -c "use toolkit.nu; toolkit test stdlib"` to run the tests for the standard library > **Note** > from `nushell` you can also use the `toolkit` as follows > ```bash > use toolkit.nu # or use an `env_change` hook to activate it automatically > toolkit check pr > ``` --> - Added tests to `guess_width.rs` for testing handling of multibyte as well as combining diacritical marks # After Submitting <!-- If your PR had any user-facing changes, update [the documentation](https://github.com/nushell/nushell.github.io) after the PR is merged, if necessary. This will help us keep the docs up to date. -->
This commit is contained in:
parent
1b1928c103
commit
40e629beb1
@ -175,34 +175,34 @@ fn separator_position(lr: &[char], p: usize, pos: &[usize], n: usize) -> usize {
|
||||
|
||||
fn split(line: &str, pos: &[usize], trim_space: bool) -> Vec<String> {
|
||||
let mut n = 0;
|
||||
let mut start = 0;
|
||||
let mut start_char = 0;
|
||||
let mut columns = Vec::with_capacity(pos.len() + 1);
|
||||
let lr: Vec<char> = line.chars().collect();
|
||||
let (line_char_boundaries, line_chars): (Vec<usize>, Vec<char>) = line.char_indices().unzip();
|
||||
let mut w = 0;
|
||||
|
||||
for p in 0..lr.len() {
|
||||
for p in 0..line_char_boundaries.len() {
|
||||
if pos.is_empty() || n > pos.len() - 1 {
|
||||
start = p;
|
||||
start_char = p;
|
||||
break;
|
||||
}
|
||||
|
||||
if pos[n] <= w {
|
||||
let end = separator_position(&lr, p, pos, n);
|
||||
if start > end {
|
||||
let end_char = separator_position(&line_chars, p, pos, n);
|
||||
if start_char > end_char {
|
||||
break;
|
||||
}
|
||||
let col = &line[start..end];
|
||||
let col = &line[line_char_boundaries[start_char]..line_char_boundaries[end_char]];
|
||||
let col = if trim_space { col.trim() } else { col };
|
||||
columns.push(col.to_string());
|
||||
n += 1;
|
||||
start = end;
|
||||
start_char = end_char;
|
||||
}
|
||||
|
||||
w += UnicodeWidthStr::width(lr[p].to_string().as_str());
|
||||
w += UnicodeWidthStr::width(line_chars[p].to_string().as_str());
|
||||
}
|
||||
|
||||
// add last part.
|
||||
let col = &line[start..];
|
||||
let col = &line[line_char_boundaries[start_char]..];
|
||||
let col = if trim_space { col.trim() } else { col };
|
||||
columns.push(col.to_string());
|
||||
columns
|
||||
@ -423,6 +423,46 @@ D: 104792064 17042676 87749388 17% /d";
|
||||
assert_eq!(got, want);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_guess_width_multibyte() {
|
||||
let input = "A… B\nC… D";
|
||||
let r = Box::new(std::io::BufReader::new(input.as_bytes())) as Box<dyn std::io::Read>;
|
||||
let reader = std::io::BufReader::new(r);
|
||||
|
||||
let mut guess_width = GuessWidth {
|
||||
reader,
|
||||
pos: Vec::new(),
|
||||
pre_lines: Vec::new(),
|
||||
pre_count: 0,
|
||||
limit_split: 0,
|
||||
};
|
||||
|
||||
let want = vec![vec!["A…", "B"], vec!["C…", "D"]];
|
||||
let got = guess_width.read_all();
|
||||
assert_eq!(got, want);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_guess_width_combining_diacritical_marks() {
|
||||
let input = "Name Surname
|
||||
Ștefan Țincu ";
|
||||
|
||||
let r = Box::new(std::io::BufReader::new(input.as_bytes())) as Box<dyn std::io::Read>;
|
||||
let reader = std::io::BufReader::new(r);
|
||||
|
||||
let mut guess_width = GuessWidth {
|
||||
reader,
|
||||
pos: Vec::new(),
|
||||
pre_lines: Vec::new(),
|
||||
pre_count: 0,
|
||||
limit_split: 0,
|
||||
};
|
||||
|
||||
let want = vec![vec!["Name", "Surname"], vec!["Ștefan", "Țincu"]];
|
||||
let got = guess_width.read_all();
|
||||
assert_eq!(got, want);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_to_table() {
|
||||
let lines = vec![
|
||||
|
Loading…
Reference in New Issue
Block a user