From 66ad83c15ce0289a9901b5f618e9ea95ea763974 Mon Sep 17 00:00:00 2001 From: Christian Friedow Date: Wed, 22 Mar 2023 23:54:18 +0100 Subject: [PATCH] from ssv --aligned-columns should separate lines by character index instead of byte index (#8558) # Description ## Symptom Lines which are input into `from ssv --aligned-columns` are split incorrectly if they contain UTF-8 characters that are multiple bytes long. Notice how the values of the `Bars` column bleed into the `Security` column in the following output (the big grey areas are censored data ;) ): ![before-patch](https://user-images.githubusercontent.com/17351844/226757737-be7ca493-5c64-4a91-9153-984df515bb8c.png) ## Problem The function behind `from ssv --aligned-columns` splits lines into fields by byte index (which is the default behavior of `str.get(...)` in Rust) instead of character index. If the header row has a different length in bytes than the remaining table rows, the split is executed incorrectly. ## Solution The function behind `from ssv --aligned-columns` now separates lines by character index instead of byte index. 
This produces the following (correct) output (the big grey areas are censored data ;) ): ![after-patch](https://user-images.githubusercontent.com/17351844/226757850-7acaebf3-2d40-4f85-b76e-64e465254bda.png) --- crates/nu-command/src/formats/from/ssv.rs | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/crates/nu-command/src/formats/from/ssv.rs b/crates/nu-command/src/formats/from/ssv.rs index 15e931074..2b9490a79 100644 --- a/crates/nu-command/src/formats/from/ssv.rs +++ b/crates/nu-command/src/formats/from/ssv.rs @@ -85,15 +85,23 @@ fn parse_aligned_columns<'a>( .iter() .enumerate() .map(|(i, (header_name, start_position))| { + let char_index_start = match l.char_indices().nth(*start_position) { + Some(idx) => idx.0, + None => *start_position, + }; let val = match headers.get(i + 1) { Some((_, end)) => { if *end < l.len() { - l.get(*start_position..*end) + let char_index_end = match l.char_indices().nth(*end) { + Some(idx) => idx.0, + None => *end, + }; + l.get(char_index_start..char_index_end) } else { - l.get(*start_position..) + l.get(char_index_start..) } } - None => l.get(*start_position..), + None => l.get(char_index_start..), } .unwrap_or("") .trim()