feat: allow from csv to accept 4 byte unicode separator chars (#10138)

- this PR should close #10132

# Description
* added a flag to `from csv --ascii` that replaces the given `separator`
with the Unicode separator x1f https://www.codetable.net/hex/1f (aka
Information Separator One)

# User-Facing Changes
New flags are available for `from csv` ( `--ascii` or short `-a`)

# Tests + Formatting
There are no tests at the moment. Code has been formatted.
- `cargo test --workspace` (fails on an unrelated test on my
machine)
This commit is contained in:
Matthias Q 2023-08-31 18:55:39 +02:00 committed by GitHub
parent 02318cf3a7
commit 93f20b406e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 75 additions and 8 deletions

View File

@ -21,7 +21,7 @@ impl Command for FromCsv {
.named( .named(
"separator", "separator",
SyntaxShape::String, SyntaxShape::String,
"a character to separate columns, defaults to ','", "a character to separate columns (either single char or 4 byte unicode sequence), defaults to ','",
Some('s'), Some('s'),
) )
.named( .named(
@ -134,11 +134,22 @@ fn from_csv(
) -> Result<PipelineData, ShellError> { ) -> Result<PipelineData, ShellError> {
let name = call.head; let name = call.head;
let separator = call let separator = match call.get_flag::<String>(engine_state, stack, "separator")? {
.get_flag(engine_state, stack, "separator")? Some(sep) => {
.map(|v: Value| v.as_char()) if sep.len() == 1 {
.transpose()? sep.chars().next().unwrap_or(',')
.unwrap_or(','); } else if sep.len() == 4 {
let unicode_sep = u32::from_str_radix(&sep, 16);
char::from_u32(unicode_sep.unwrap_or(b'\x1f' as u32)).unwrap_or(',')
} else {
return Err(ShellError::NonUtf8Custom(
"separator should be a single char or a 4-byte unicode".to_string(),
call.span(),
));
}
}
None => ',',
};
let comment = call let comment = call
.get_flag(engine_state, stack, "comment")? .get_flag(engine_state, stack, "comment")?
.map(|v: Value| v.as_char()) .map(|v: Value| v.as_char())

View File

@ -341,7 +341,9 @@ fn from_csv_text_with_multiple_char_separator() {
"# "#
)); ));
assert!(actual.err.contains("single character separator")); assert!(actual
.err
.contains("separator should be a single char or a 4-byte unicode"));
}) })
} }
@ -366,7 +368,7 @@ fn from_csv_text_with_wrong_type_separator() {
"# "#
)); ));
assert!(actual.err.contains("can't convert int to char")); assert!(actual.err.contains("can't convert int to string"));
}) })
} }
@ -404,3 +406,57 @@ fn string_to_csv_error() {
assert!(actual.err.contains("command doesn't support")) assert!(actual.err.contains("command doesn't support"))
} }
/// Verifies that `from csv` accepts a separator written as a 4-digit
/// hexadecimal code point: `003B` is `;`, which delimits the fixture's columns.
#[test]
fn parses_csv_with_unicode_sep() {
    Playground::setup("filter_from_csv_unicode_sep_test_3", |dirs, sandbox| {
        // Semicolon-separated fixture: one header row followed by three records.
        let fixture = r#"
first_name;last_name;rusty_luck
Andrés;Robalino;1
JT;Turner;1
Yehuda;Katz;1
"#;
        sandbox.with_files(vec![FileWithContentToBeTrimmed(
            "los_tres_caballeros.txt",
            fixture,
        )]);

        // Parse the file with the hex-encoded separator and count the values
        // found in the `rusty_luck` column.
        let query = r#"
open los_tres_caballeros.txt
| from csv --separator "003B"
| get rusty_luck
| length
"#;
        let actual = nu!(cwd: dirs.test(), pipeline(query));

        // One value per record means every row was split into columns.
        assert_eq!(actual.out, "3");
    })
}
/// Verifies that `from csv --separator "001F"` splits columns on U+001F
/// (Information Separator One), given as a 4-digit hexadecimal code point.
#[test]
fn parses_csv_with_unicode_x1f_sep() {
    Playground::setup("filter_from_csv_unicode_sep_x1f_test_3", |dirs, sandbox| {
        // U+001F is an invisible control character. Writing it as an explicit
        // `\u{1f}` escape in a regular (non-raw) string keeps the separators
        // visible in review and stops editors, formatters, or copy/paste from
        // silently dropping them, which would leave a single-column fixture.
        sandbox.with_files(vec![FileWithContentToBeTrimmed(
            "los_tres_caballeros.txt",
            "
first_name\u{1f}last_name\u{1f}rusty_luck
Andrés\u{1f}Robalino\u{1f}1
JT\u{1f}Turner\u{1f}1
Yehuda\u{1f}Katz\u{1f}1
",
        )]);

        let actual = nu!(
            cwd: dirs.test(), pipeline(
            r#"
open los_tres_caballeros.txt
| from csv --separator "001F"
| get rusty_luck
| length
"#
        ));

        // Three values in `rusty_luck` means each record was split on U+001F.
        assert_eq!(actual.out, "3");
    })
}