new command: into value (#10427)

# Description

This new command `into value` is a command that tries to infer the type
of data you have in a table. It converts each cell to a string and then
runs a set of regular expressions on that string. This was mostly
cobbled together after looking at how polars does similar things. The
regular expressions were taken straight from polars and tweaked.

### Before
```nushell
❯ [[col1 col2 col3 col4 col5 col6]; ["1" "two" "3.4" "true" "2023-08-10 14:07:17.922050800 -05:00" "2023-09-19"]] |
  update col1 {|r| $r.col1 | into int } |
  update col3 {|r| $r.col3 | into float } |
  update col4 {|r| $r.col4 | into bool } |
  update col5 {|r| $r.col5 | into datetime } |
  update col6 {|r| $r.col6 | into datetime }
╭#┬col1┬col2┬col3┬col4┬───col5────┬───col6────╮
│0│   1│two │3.40│true│a month ago│8 hours ago│
╰─┴────┴────┴────┴────┴───────────┴───────────╯
```
or
```nushell
❯ [[col1 col2 col3 col4 col5 col6]; ["1" "two" "3.4" "true" "2023-08-10 14:07:17.922050800 -05:00" "2023-09-19"]] |
  into int col1 |
  into float col3 |
  into bool col4 |
  into datetime col5 col6
╭#┬col1┬col2┬col3┬col4┬───col5────┬───col6────╮
│0│   1│two │3.40│true│a month ago│8 hours ago│
╰─┴────┴────┴────┴────┴───────────┴───────────╯
```

### After
```nushell
❯ [[col1 col2 col3 col4 col5 col6]; ["1" "two" "3.4" "true" "2023-08-10 14:07:17.922050800 -05:00" "2023-09-19"]] | into value
╭#┬col1┬col2┬col3┬col4┬───col5────┬───col6────╮
│0│   1│two │3.40│true│a month ago│8 hours ago│
╰─┴────┴────┴────┴────┴───────────┴───────────╯
```

It's definitely not perfect. There are ways it will fail because the
regular expressions do not cover all formats. My hope is that people
will pick this up and add more regular expressions and if there are
problems with the existing ones, change them. This is meant as a
"starter command" with easy entry for newcomers that are looking to chip
in and help out.

Also, some tests probably need to be added to ensure what we have now
doesn't break with updates.

# User-Facing Changes
<!-- List of all changes that impact the user experience here. This
helps us keep track of breaking changes. -->

# Tests + Formatting
<!--
Don't forget to add tests that cover your changes.

Make sure you've run and fixed any issues with these commands:

- `cargo fmt --all -- --check` to check standard code formatting (`cargo
fmt --all` applies these changes)
- `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used` to
check that you're using the standard code style
- `cargo test --workspace` to check that all tests pass (on Windows make
sure to [enable developer
mode](https://learn.microsoft.com/en-us/windows/apps/get-started/developer-mode-features-and-debugging))
- `cargo run -- -c "use std testing; testing run-tests --path
crates/nu-std"` to run the tests for the standard library

> **Note**
> from `nushell` you can also use the `toolkit` as follows
> ```bash
> use toolkit.nu # or use an `env_change` hook to activate it
automatically
> toolkit check pr
> ```
-->

# After Submitting
<!-- If your PR had any user-facing changes, update [the
documentation](https://github.com/nushell/nushell.github.io) after the
PR is merged, if necessary. This will help us keep the docs up to date.
-->
This commit is contained in:
Darren Schroeder 2023-09-20 12:57:58 -05:00 committed by GitHub
parent 29e809ad77
commit 4ae53d93fb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 490 additions and 13 deletions

24
Cargo.lock generated
View File

@ -480,7 +480,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6798148dccfbff0fae41c7574d2fa8f1ef3492fba0face179de5d8d447d67b05"
dependencies = [
"memchr",
"regex-automata 0.3.0",
"regex-automata 0.3.8",
"serde",
]
@ -2397,9 +2397,9 @@ dependencies = [
[[package]]
name = "memchr"
version = "2.5.0"
version = "2.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
checksum = "8f232d6ef707e1956a43342693d2a31e72989554d58299d7a88738cc95b0d35c"
[[package]]
name = "memmap2"
@ -4313,14 +4313,14 @@ dependencies = [
[[package]]
name = "regex"
version = "1.9.0"
version = "1.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "89089e897c013b3deb627116ae56a6955a72b8bed395c9526af31c9fe528b484"
checksum = "697061221ea1b4a94a624f67d0ae2bfe4e22b8a17b6a192afb11046542cc8c47"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata 0.3.0",
"regex-syntax 0.7.3",
"regex-automata 0.3.8",
"regex-syntax 0.7.5",
]
[[package]]
@ -4331,13 +4331,13 @@ checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
[[package]]
name = "regex-automata"
version = "0.3.0"
version = "0.3.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa250384981ea14565685dea16a9ccc4d1c541a13f82b9c168572264d1df8c56"
checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax 0.7.3",
"regex-syntax 0.7.5",
]
[[package]]
@ -4348,9 +4348,9 @@ checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
[[package]]
name = "regex-syntax"
version = "0.7.3"
version = "0.7.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2ab07dc67230e4a4718e70fd5c20055a4334b121f1f9db8fe63ef39ce9b8c846"
checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da"
[[package]]
name = "relative-path"

View File

@ -71,7 +71,7 @@ print-positions = "0.6"
quick-xml = "0.30"
rand = "0.8"
rayon = "1.7"
regex = "1.7"
regex = "1.9.5"
roxmltree = "0.18"
rusqlite = { version = "0.29", features = ["bundled"], optional = true }
same-file = "1.0"

View File

@ -9,6 +9,7 @@ mod float;
mod int;
mod record;
mod string;
mod value;
pub use self::bool::SubCommand as IntoBool;
pub use self::filesize::SubCommand as IntoFilesize;
@ -21,3 +22,4 @@ pub use float::SubCommand as IntoFloat;
pub use int::SubCommand as IntoInt;
pub use record::SubCommand as IntoRecord;
pub use string::SubCommand as IntoString;
pub use value::IntoValue;

View File

@ -0,0 +1,474 @@
use crate::parse_date_from_string;
use nu_engine::CallExt;
use nu_protocol::{
ast::Call,
engine::{Command, EngineState, Stack},
Category, Example, IntoInterruptiblePipelineData, PipelineData, PipelineIterator, ShellError,
Signature, Span, SyntaxShape, Type, Value,
};
use once_cell::sync::Lazy;
use regex::{Regex, RegexBuilder};
use std::{collections::HashSet, iter::FromIterator};
/// Command type registered under the name `into value`.
///
/// It is a stateless unit struct; all behaviour lives in its `Command` impl.
#[derive(Clone)]
pub struct IntoValue;
impl Command for IntoValue {
    /// Name under which the command is invoked.
    fn name(&self) -> &str {
        "into value"
    }

    /// Table in, table out, with an optional `--columns`/`-c` flag that
    /// restricts inference to the named columns.
    fn signature(&self) -> Signature {
        Signature::build("into value")
            .input_output_types(vec![(Type::Table(vec![]), Type::Table(vec![]))])
            .named(
                "columns",
                SyntaxShape::Table(vec![]),
                "list of columns to update",
                Some('c'),
            )
            .allow_variants_without_examples(true)
            .category(Category::Filters)
    }

    /// One-line help text.
    fn usage(&self) -> &str {
        "Infer nushell datatype for each cell."
    }

    /// Usage examples; neither carries an expected result because both
    /// depend on a caller-supplied `$table`.
    fn examples(&self) -> Vec<Example> {
        vec![
            Example {
                description: "Infer Nushell values for each cell.",
                example: "$table | into value",
                result: None,
            },
            Example {
                description: "Infer Nushell values for each cell in the given columns.",
                example: "$table | into value -c [column1, column5]",
                result: None,
            },
        ]
    }

    /// Wraps the input stream in an [`UpdateCellIterator`] so cells are
    /// converted as the stream is consumed.
    ///
    /// # Errors
    ///
    /// Fails if the `--columns` flag cannot be evaluated, is not a list, or
    /// contains an element that is not a string.
    fn run(
        &self,
        engine_state: &EngineState,
        stack: &mut Stack,
        call: &Call,
        input: PipelineData,
    ) -> Result<PipelineData, ShellError> {
        // Only the interrupt flag is needed from the engine state, so borrow
        // it instead of cloning the whole `EngineState`.
        let metadata = input.metadata();
        let ctrlc = engine_state.ctrlc.clone();
        let span = call.head;

        // the columns to update
        let columns: Option<Value> = call.get_flag(engine_state, stack, "columns")?;
        let columns: Option<HashSet<String>> = match columns {
            Some(val) => {
                let cols = val
                    .as_list()?
                    .iter()
                    .map(|val| val.as_string())
                    .collect::<Result<Vec<String>, ShellError>>()?;
                Some(HashSet::from_iter(cols))
            }
            None => None,
        };

        Ok(UpdateCellIterator {
            input: input.into_iter(),
            columns,
            span,
        }
        .into_pipeline_data(ctrlc)
        .set_metadata(metadata))
    }
}
/// Iterator adaptor that runs type inference over every value it pulls from
/// the upstream pipeline.
struct UpdateCellIterator {
    /// Upstream pipeline values, consumed one at a time.
    input: PipelineIterator,
    /// Column names to convert; `None` means every column is converted.
    columns: Option<HashSet<String>>,
    /// Span of the command call; used for non-record values and their errors.
    span: Span,
}
impl Iterator for UpdateCellIterator {
type Item = Value;
fn next(&mut self) -> Option<Self::Item> {
match self.input.next() {
Some(val) => {
if let Some(ref cols) = self.columns {
if !val.columns().iter().any(|c| cols.contains(c)) {
return Some(val);
}
}
let span = val.span();
match val {
Value::Record { val, .. } => Some(Value::record(
val.into_iter()
.map(|(col, val)| match &self.columns {
Some(cols) if !cols.contains(&col) => (col, val),
_ => (
col,
match process_cell(val, span) {
Ok(val) => val,
Err(err) => Value::error(err, span),
},
),
})
.collect(),
span,
)),
val => match process_cell(val, self.span) {
Ok(val) => Some(val),
Err(err) => Some(Value::error(err, self.span)),
},
}
}
None => None,
}
}
}
/// Infers a more specific type for a single cell.
///
/// The cell is viewed as a string and tested against the datatype regexes in
/// this order: boolean, float, integer, integer with `,`/`_` separators,
/// day-first datetime, year-first datetime, year-first datetime with zone.
/// The first pattern that matches decides the target type. A value that
/// `as_string` rejects becomes the empty string, which matches no pattern, so
/// it is returned unchanged, as is any string that matches nothing.
///
/// # Errors
///
/// Returns `ShellError::CantConvert` when a pattern matches but the actual
/// parser rejects the text (for example an integer that overflows `i64`).
fn process_cell(val: Value, span: Span) -> Result<Value, ShellError> {
    // step 1: convert value to string
    let val_str = val.as_string().unwrap_or_default();
    // The scalar patterns tolerate leading whitespace but `str::parse` does
    // not, so parse the trimmed text while matching on the original.
    let trimmed = val_str.trim_start();

    // step 2: bounce string up against regexes
    if BOOLEAN_RE.is_match(&val_str) {
        // The pattern is case-insensitive; `bool::from_str` only accepts
        // lowercase `true`/`false`.
        let bval = trimmed
            .to_ascii_lowercase()
            .parse::<bool>()
            .map_err(|_| scalar_conversion_error("bool", "boolean", &val_str, span))?;

        Ok(Value::bool(bval, span))
    } else if FLOAT_RE.is_match(&val_str) {
        let fval = trimmed
            .parse::<f64>()
            .map_err(|_| scalar_conversion_error("float", "floating point", &val_str, span))?;

        Ok(Value::float(fval, span))
    } else if INTEGER_RE.is_match(&val_str) {
        let ival = trimmed
            .parse::<i64>()
            .map_err(|_| scalar_conversion_error("int", "integer", &val_str, span))?;

        Ok(Value::int(ival, span))
    } else if INTEGER_WITH_DELIMS_RE.is_match(&val_str) {
        // Drop the thousands separators before handing the digits to the parser.
        let digits: String = trimmed
            .chars()
            .filter(|c| !matches!(c, '_' | ','))
            .collect();
        let ival = digits
            .parse::<i64>()
            .map_err(|_| scalar_conversion_error("int", "integer", &val_str, span))?;

        Ok(Value::int(ival, span))
    } else if DATETIME_DMY_RE.is_match(&val_str) {
        parse_date_cell(&val_str, span, "DATETIME_DMY_RE")
    } else if DATETIME_YMD_RE.is_match(&val_str) {
        parse_date_cell(&val_str, span, "DATETIME_YMD_RE")
    } else if DATETIME_YMDZ_RE.is_match(&val_str) {
        parse_date_cell(&val_str, span, "DATETIME_YMDZ_RE")
    } else {
        // If we don't know what it is, just return whatever it was passed in as
        Ok(val)
    }
}

/// Builds the `CantConvert` error for a string that matched a scalar pattern
/// but was rejected by the corresponding parser.
fn scalar_conversion_error(to_type: &str, kind: &str, val_str: &str, span: Span) -> ShellError {
    ShellError::CantConvert {
        to_type: to_type.to_string(),
        from_type: "string".to_string(),
        span,
        help: Some(format!(
            r#""{val_str}" does not represent a valid {kind} value"#
        )),
    }
}

/// Parses `val_str` as a date, reporting `pattern_name` (the regex that
/// selected this branch) in the error help on failure.
fn parse_date_cell(val_str: &str, span: Span, pattern_name: &str) -> Result<Value, ShellError> {
    let dt = parse_date_from_string(val_str, span).map_err(|_| ShellError::CantConvert {
        to_type: "date".to_string(),
        from_type: "string".to_string(),
        span,
        help: Some(format!(
            r#""{val_str}" does not represent a valid {pattern_name} value"#
        )),
    })?;

    Ok(Value::date(dt, span))
}
// region: datatype regexes
/// Verbose-mode (`(?x)`) pattern for day-first dates such as `31-12-2021` or
/// `01/01/2022`.
///
/// An optional time part (`T` or space, then `HH:MM`, optionally `:SS` and up
/// to nine fractional digits) may follow, and the whole value may be wrapped
/// in single or double quotes. Only the shape is checked: the day and time
/// fields accept any digits, and the month accepts `0`-`19`.
const DATETIME_DMY_PATTERN: &str = r#"(?x)
^
['"]? # optional quotes
(?:\d{1,2}) # day
[-/] # separator
(?P<month>[01]?\d{1}) # month
[-/] # separator
(?:\d{4,}) # year
(?:
[T\ ] # separator
(?:\d{2}) # hour
:? # separator
(?:\d{2}) # minute
(?:
:? # separator
(?:\d{2}) # second
(?:
\.(?:\d{1,9}) # subsecond
)?
)?
)?
['"]? # optional quotes
$
"#;
/// Lazily compiled form of [`DATETIME_DMY_PATTERN`]; the `expect` can only
/// fire if that constant is not a valid regex.
static DATETIME_DMY_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(DATETIME_DMY_PATTERN).expect("datetime_dmy_pattern should be valid"));
/// Verbose-mode (`(?x)`) pattern for year-first dates without a time zone,
/// such as `2022-01-01`, `2022/01/01` or `2022-01-01T00:00:00.000000000`.
///
/// The time part is optional and the value may be wrapped in single or double
/// quotes. Only the shape is checked; out-of-range fields such as month `13`
/// or hour `24` still match.
const DATETIME_YMD_PATTERN: &str = r#"(?x)
^
['"]? # optional quotes
(?:\d{4,}) # year
[-/] # separator
(?P<month>[01]?\d{1}) # month
[-/] # separator
(?:\d{1,2}) # day
(?:
[T\ ] # separator
(?:\d{2}) # hour
:? # separator
(?:\d{2}) # minute
(?:
:? # separator
(?:\d{2}) # seconds
(?:
\.(?:\d{1,9}) # subsecond
)?
)?
)?
['"]? # optional quotes
$
"#;
/// Lazily compiled form of [`DATETIME_YMD_PATTERN`]; the `expect` can only
/// fire if that constant is not a valid regex.
static DATETIME_YMD_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(DATETIME_YMD_PATTERN).expect("datetime_ymd_pattern should be valid"));
/// Verbose-mode (`(?x)`) pattern for year-first datetimes that carry a time
/// zone, e.g. `2023-03-24 16:44:17.865147299 -05:00` or `2022-01-01T00:00:00Z`.
///
/// Unlike the zone-less pattern, the time part and the zone are mandatory.
/// The zone is either `Z` or a signed two-digit hour and two-digit minute
/// offset (colon optional), optionally preceded by one whitespace character.
const DATETIME_YMDZ_PATTERN: &str = r#"(?x)
^
['"]? # optional quotes
(?:\d{4,}) # year
[-/] # separator
(?P<month>[01]?\d{1}) # month
[-/] # separator
(?:\d{1,2}) # day
[T\ ] # separator
(?:\d{2}) # hour
:? # separator
(?:\d{2}) # minute
(?:
:? # separator
(?:\d{2}) # second
(?:
\.(?:\d{1,9}) # subsecond
)?
)?
\s? # optional space
(?:
# offset (e.g. +01:00)
[+-](?:\d{2})
:?
(?:\d{2})
# or Zulu suffix
|Z
)
['"]? # optional quotes
$
"#;
/// Lazily compiled form of [`DATETIME_YMDZ_PATTERN`]; the `expect` can only
/// fire if that constant is not a valid regex.
static DATETIME_YMDZ_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(DATETIME_YMDZ_PATTERN).expect("datetime_ymdz_pattern should be valid"));
/// Matches float-looking text after optional leading whitespace and sign:
/// decimals with a fractional part (`0.1`, `.5`, optionally with an
/// exponent), digits with an exponent (`7e-05`), a trailing-dot form (`3.`),
/// and the literals `inf` and `NaN`. Plain digit runs such as `1` do not match.
static FLOAT_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r"^\s*[-+]?((\d*\.\d+)([eE][-+]?\d+)?|inf|NaN|(\d+)[eE][-+]?\d+|\d+\.)$")
.expect("float pattern should be valid")
});
/// Matches a run of decimal digits with an optional leading `-`, after
/// optional leading whitespace. A leading `+` is not accepted and the digit
/// count is unbounded, so a match does not guarantee the value fits in `i64`.
static INTEGER_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"^\s*-?(\d+)$").expect("integer pattern should be valid"));
/// Matches integers written with thousands separators: one to three leading
/// digits followed by one or more groups of `,` or `_` plus exactly three
/// digits (`1,000`, `10_000_000`). Separators may be mixed within one number,
/// and at least one group is required, so `1000` does not match.
static INTEGER_WITH_DELIMS_RE: Lazy<Regex> = Lazy::new(|| {
Regex::new(r"^\s*-?(\d{1,3}([,_]\d{3})+)$")
.expect("integer with delimiters pattern should be valid")
});
/// Matches `true` or `false`, case-insensitively.
///
/// NOTE(review): leading whitespace is accepted before `true` but not before
/// `false`, because `\s*` sits only in the first alternative — confirm
/// whether that asymmetry is intended.
static BOOLEAN_RE: Lazy<Regex> = Lazy::new(|| {
RegexBuilder::new(r"^\s*(true)$|^(false)$")
.case_insensitive(true)
.build()
.expect("boolean pattern should be valid")
});
// endregion:
#[cfg(test)]
mod test {
    use super::*;

    /// Asserts that every input matches `re`, naming the offending input on
    /// failure. `#[track_caller]` makes the panic point at the calling test.
    #[track_caller]
    fn assert_all_match(re: &Regex, inputs: &[&str]) {
        for input in inputs {
            assert!(re.is_match(input), "expected {input:?} to match");
        }
    }

    /// Asserts that no input matches `re`, naming the offending input on
    /// failure.
    #[track_caller]
    fn assert_none_match(re: &Regex, inputs: &[&str]) {
        for input in inputs {
            assert!(!re.is_match(input), "expected {input:?} not to match");
        }
    }

    #[test]
    fn test_examples() {
        use crate::test_examples;

        test_examples(IntoValue {})
    }

    #[test]
    fn test_float_parse() {
        // The regex should work on all these but nushell's float parser is more strict
        assert_all_match(
            &FLOAT_RE,
            &[
                "0.1",
                "3.0",
                "3.00001",
                "-9.9990e-003",
                "9.9990e+003",
                "9.9990E+003",
                ".5",
                "2.5E-10",
                "2.5e10",
                "NaN",
                "-NaN",
                "-inf",
                "inf",
                "-7e-05",
                "7e-05",
                "+7e+05",
            ],
        );
        // Plain integers and non-numeric text must be left to the other patterns.
        assert_none_match(&FLOAT_RE, &["1", "abc", ""]);
    }

    #[test]
    fn test_int_parse() {
        assert_all_match(
            &INTEGER_RE,
            &[
                "0",
                "1",
                "10",
                "100",
                "1000",
                "10000",
                "100000",
                "1000000",
                "10000000",
                "100000000",
                "1000000000",
                "10000000000",
                "100000000000",
            ],
        );
        assert_none_match(&INTEGER_RE, &["1.5", "1_000"]);

        assert_all_match(
            &INTEGER_WITH_DELIMS_RE,
            &[
                "1_000",
                "10_000",
                "100_000",
                "1_000_000",
                "10_000_000",
                "100_000_000",
                "1_000_000_000",
                "10_000_000_000",
                "100_000_000_000",
                "1,000",
                "10,000",
                "100,000",
                "1,000,000",
                "10,000,000",
                "100,000,000",
                "1,000,000,000",
                "10,000,000,000",
            ],
        );
        // At least one well-formed three-digit group is required.
        assert_none_match(&INTEGER_WITH_DELIMS_RE, &["1000", "1,00"]);
    }

    #[test]
    fn test_bool_parse() {
        // The pattern is built case-insensitively.
        assert_all_match(&BOOLEAN_RE, &["true", "false", "TRUE", "False"]);
        assert_none_match(&BOOLEAN_RE, &["1", "0"]);
    }

    #[test]
    fn test_datetime_ymdz_pattern() {
        assert_all_match(
            &DATETIME_YMDZ_RE,
            &[
                "2022-01-01T00:00:00Z",
                "2022-01-01T00:00:00.123456789Z",
                "2022-01-01T00:00:00+01:00",
                "2022-01-01T00:00:00.123456789+01:00",
                "2022-01-01T00:00:00-01:00",
                "2022-01-01T00:00:00.123456789-01:00",
                "'2022-01-01T00:00:00Z'",
            ],
        );
        assert_none_match(
            &DATETIME_YMDZ_RE,
            &[
                "2022-01-01T00:00:00",
                "2022-01-01T00:00:00.",
                "2022-01-01T00:00:00.123456789",
                "2022-01-01T00:00:00+01",
                "2022-01-01T00:00:00+01:0",
                "2022-01-01T00:00:00+1:00",
                "2022-01-01T00:00:00.123456789+01",
                "2022-01-01T00:00:00.123456789+01:0",
                "2022-01-01T00:00:00.123456789+1:00",
                "2022-01-01T00:00:00-01",
                "2022-01-01T00:00:00-01:0",
                "2022-01-01T00:00:00-1:00",
                "2022-01-01T00:00:00.123456789-01",
                "2022-01-01T00:00:00.123456789-01:0",
                "2022-01-01T00:00:00.123456789-1:00",
            ],
        );
    }

    #[test]
    fn test_datetime_ymd_pattern() {
        assert_all_match(
            &DATETIME_YMD_RE,
            &[
                "2022-01-01",
                "2022/01/01",
                "2022-01-01T00:00:00",
                "2022-01-01T00:00:00.000000000",
                "'2022-01-01'",
            ],
        );
        // The regex isn't this specific, but it would be nice if it were.
        // Known gaps (these currently match even though they are invalid):
        //   "2022-13-01", "2022-01-32", "2022-01-01T24:00:00",
        //   "2022-01-01T00:60:00", "2022-01-01T00:00:60"
        assert_none_match(&DATETIME_YMD_RE, &["2022-01-01T00:00:00.0000000000"]);
    }

    #[test]
    fn test_datetime_dmy_pattern() {
        assert_all_match(
            &DATETIME_DMY_RE,
            &["31-12-2021", "01/01/2022", "15-06-2023 12:30"],
        );
        // Year-first inputs must not be mistaken for day-first ones.
        assert_none_match(
            &DATETIME_DMY_RE,
            &["2022-13-01", "2022-01-32", "2022-01-01 24:00"],
        );
    }
}

View File

@ -293,6 +293,7 @@ pub fn add_shell_command_context(mut engine_state: EngineState) -> EngineState {
IntoInt,
IntoRecord,
IntoString,
IntoValue,
};
// Env