diff --git a/Cargo.lock b/Cargo.lock
index 2e27590110..65b79999cd 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -480,7 +480,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6798148dccfbff0fae41c7574d2fa8f1ef3492fba0face179de5d8d447d67b05"
 dependencies = [
  "memchr",
- "regex-automata 0.3.0",
+ "regex-automata 0.3.8",
  "serde",
 ]
 
@@ -2397,9 +2397,9 @@ dependencies = [
 
 [[package]]
 name = "memchr"
-version = "2.5.0"
+version = "2.6.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
+checksum = "8f232d6ef707e1956a43342693d2a31e72989554d58299d7a88738cc95b0d35c"
 
 [[package]]
 name = "memmap2"
@@ -4313,14 +4313,14 @@ dependencies = [
 
 [[package]]
 name = "regex"
-version = "1.9.0"
+version = "1.9.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "89089e897c013b3deb627116ae56a6955a72b8bed395c9526af31c9fe528b484"
+checksum = "697061221ea1b4a94a624f67d0ae2bfe4e22b8a17b6a192afb11046542cc8c47"
 dependencies = [
  "aho-corasick",
  "memchr",
- "regex-automata 0.3.0",
- "regex-syntax 0.7.3",
+ "regex-automata 0.3.8",
+ "regex-syntax 0.7.5",
 ]
 
 [[package]]
@@ -4331,13 +4331,13 @@ checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
 
 [[package]]
 name = "regex-automata"
-version = "0.3.0"
+version = "0.3.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fa250384981ea14565685dea16a9ccc4d1c541a13f82b9c168572264d1df8c56"
+checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795"
 dependencies = [
  "aho-corasick",
  "memchr",
- "regex-syntax 0.7.3",
+ "regex-syntax 0.7.5",
 ]
 
 [[package]]
@@ -4348,9 +4348,9 @@ checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
 
 [[package]]
 name = "regex-syntax"
-version = "0.7.3"
+version = "0.7.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2ab07dc67230e4a4718e70fd5c20055a4334b121f1f9db8fe63ef39ce9b8c846"
+checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da"
 
 [[package]]
 name = "relative-path"
diff --git a/crates/nu-command/Cargo.toml b/crates/nu-command/Cargo.toml
index 06f7038011..8d4b8f8a49 100644
--- a/crates/nu-command/Cargo.toml
+++ b/crates/nu-command/Cargo.toml
@@ -71,7 +71,7 @@ print-positions = "0.6"
 quick-xml = "0.30"
 rand = "0.8"
 rayon = "1.7"
-regex = "1.7"
+regex = "1.9.5"
 roxmltree = "0.18"
 rusqlite = { version = "0.29", features = ["bundled"], optional = true }
 same-file = "1.0"
diff --git a/crates/nu-command/src/conversions/into/mod.rs b/crates/nu-command/src/conversions/into/mod.rs
index 9f0f27962a..f56a56167d 100644
--- a/crates/nu-command/src/conversions/into/mod.rs
+++ b/crates/nu-command/src/conversions/into/mod.rs
@@ -9,6 +9,7 @@ mod float;
 mod int;
 mod record;
 mod string;
+mod value;
 
 pub use self::bool::SubCommand as IntoBool;
 pub use self::filesize::SubCommand as IntoFilesize;
@@ -21,3 +22,4 @@ pub use float::SubCommand as IntoFloat;
 pub use int::SubCommand as IntoInt;
 pub use record::SubCommand as IntoRecord;
 pub use string::SubCommand as IntoString;
+pub use value::IntoValue;
diff --git a/crates/nu-command/src/conversions/into/value.rs b/crates/nu-command/src/conversions/into/value.rs
new file mode 100644
index 0000000000..e209b0ad9f
--- /dev/null
+++ b/crates/nu-command/src/conversions/into/value.rs
@@ -0,0 +1,461 @@
+use crate::parse_date_from_string;
+use nu_engine::CallExt;
+use nu_protocol::{
+    ast::Call,
+    engine::{Command, EngineState, Stack},
+    Category, Example, IntoInterruptiblePipelineData, PipelineData, PipelineIterator, ShellError,
+    Signature, Span, SyntaxShape, Type, Value,
+};
+use once_cell::sync::Lazy;
+use regex::{Regex, RegexBuilder};
+use std::{collections::HashSet, iter::FromIterator};
+
+/// The `into value` command: re-types string cells by matching their text
+/// against the datatype regexes defined at the bottom of this file.
+#[derive(Clone)]
+pub struct IntoValue;
+
+impl Command for IntoValue {
+    fn name(&self) -> &str {
+        "into value"
+    }
+
+    fn signature(&self) -> Signature {
+        Signature::build("into value")
+            .input_output_types(vec![(Type::Table(vec![]), Type::Table(vec![]))])
+            .named(
+                "columns",
+                SyntaxShape::Table(vec![]),
+                "list of columns to update",
+                Some('c'),
+            )
+            .allow_variants_without_examples(true)
+            .category(Category::Filters)
+    }
+
+    fn usage(&self) -> &str {
+        "Infer nushell datatype for each cell."
+    }
+
+    fn examples(&self) -> Vec<Example> {
+        vec![
+            Example {
+                description: "Infer Nushell values for each cell.",
+                example: "$table | into value",
+                result: None,
+            },
+            Example {
+                description: "Infer Nushell values for each cell in the given columns.",
+                example: "$table | into value -c [column1, column5]",
+                result: None,
+            },
+        ]
+    }
+
+    fn run(
+        &self,
+        engine_state: &EngineState,
+        stack: &mut Stack,
+        call: &Call,
+        input: PipelineData,
+    ) -> Result<PipelineData, ShellError> {
+        let metadata = input.metadata();
+        let ctrlc = engine_state.ctrlc.clone();
+        let span = call.head;
+
+        // the columns to update
+        let columns: Option<Value> = call.get_flag(engine_state, stack, "columns")?;
+        let columns: Option<HashSet<String>> = match columns {
+            Some(val) => {
+                let cols = val
+                    .as_list()?
+                    .iter()
+                    .map(|val| val.as_string())
+                    .collect::<Result<Vec<String>, ShellError>>()?;
+                Some(HashSet::from_iter(cols))
+            }
+            None => None,
+        };
+
+        Ok(UpdateCellIterator {
+            input: input.into_iter(),
+            columns,
+            span,
+        }
+        .into_pipeline_data(ctrlc)
+        .set_metadata(metadata))
+    }
+}
+
+/// Iterator adapter that runs `process_cell` over each value pulled from `input`.
+struct UpdateCellIterator {
+    input: PipelineIterator,
+    /// When `Some`, only record fields whose column name is in this set are converted.
+    columns: Option<HashSet<String>>,
+    /// Span of the `into value` call; used for values that are not records.
+    span: Span,
+}
+
+impl Iterator for UpdateCellIterator {
+    type Item = Value;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        match self.input.next() {
+            Some(val) => {
+                if let Some(ref cols) = self.columns {
+                    if !val.columns().iter().any(|c| cols.contains(c)) {
+                        return Some(val);
+                    }
+                }
+
+                let span = val.span();
+                match val {
+                    Value::Record { val, .. } => Some(Value::record(
+                        val.into_iter()
+                            .map(|(col, val)| match &self.columns {
+                                Some(cols) if !cols.contains(&col) => (col, val),
+                                _ => (
+                                    col,
+                                    match process_cell(val, span) {
+                                        Ok(val) => val,
+                                        Err(err) => Value::error(err, span),
+                                    },
+                                ),
+                            })
+                            .collect(),
+                        span,
+                    )),
+                    val => match process_cell(val, self.span) {
+                        Ok(val) => Some(val),
+                        Err(err) => Some(Value::error(err, self.span)),
+                    },
+                }
+            }
+            None => None,
+        }
+    }
+}
+
+/// Builds the `CantConvert` error reported when a cell matched one of the
+/// datatype regexes but its text could not be parsed into `to_type`.
+fn cant_convert(to_type: &str, val_str: &str, expected: &str, span: Span) -> ShellError {
+    ShellError::CantConvert {
+        to_type: to_type.to_string(),
+        from_type: "string".to_string(),
+        span,
+        help: Some(format!(r#""{val_str}" does not represent a valid {expected}"#)),
+    }
+}
+
+// This function will check each cell to see if it matches a regular expression
+// for a particular datatype. If it does, it will convert the cell to that datatype.
+fn process_cell(val: Value, span: Span) -> Result<Value, ShellError> {
+    // step 1: convert value to string
+    let val_str = val.as_string().unwrap_or_default();
+    // The boolean, float and integer regexes tolerate leading whitespace but the
+    // standard parsers do not, so those branches parse the trimmed text.
+    let trimmed = val_str.trim();
+
+    // step 2: bounce string up against regexes
+    if BOOLEAN_RE.is_match(&val_str) {
+        // BOOLEAN_RE is case-insensitive, `str::parse::<bool>` is not.
+        let bval = trimmed
+            .to_ascii_lowercase()
+            .parse::<bool>()
+            .map_err(|_| cant_convert("bool", &val_str, "boolean value", span))?;
+
+        Ok(Value::bool(bval, span))
+    } else if FLOAT_RE.is_match(&val_str) {
+        let fval = trimmed
+            .parse::<f64>()
+            .map_err(|_| cant_convert("float", &val_str, "floating point value", span))?;
+
+        Ok(Value::float(fval, span))
+    } else if INTEGER_RE.is_match(&val_str) {
+        let ival = trimmed
+            .parse::<i64>()
+            .map_err(|_| cant_convert("int", &val_str, "integer value", span))?;
+
+        Ok(Value::int(ival, span))
+    } else if INTEGER_WITH_DELIMS_RE.is_match(&val_str) {
+        // Strip the thousands separators before parsing.
+        let digits: String = trimmed.chars().filter(|&c| c != '_' && c != ',').collect();
+
+        let ival = digits
+            .parse::<i64>()
+            .map_err(|_| cant_convert("int", &digits, "integer value", span))?;
+
+        Ok(Value::int(ival, span))
+    } else if DATETIME_DMY_RE.is_match(&val_str) {
+        let dt = parse_date_from_string(&val_str, span)
+            .map_err(|_| cant_convert("date", &val_str, "DATETIME_DMY_RE value", span))?;
+
+        Ok(Value::date(dt, span))
+    } else if DATETIME_YMD_RE.is_match(&val_str) {
+        let dt = parse_date_from_string(&val_str, span)
+            .map_err(|_| cant_convert("date", &val_str, "DATETIME_YMD_RE value", span))?;
+
+        Ok(Value::date(dt, span))
+    } else if DATETIME_YMDZ_RE.is_match(&val_str) {
+        let dt = parse_date_from_string(&val_str, span)
+            .map_err(|_| cant_convert("date", &val_str, "DATETIME_YMDZ_RE value", span))?;
+
+        Ok(Value::date(dt, span))
+    } else {
+        // If we don't know what it is, just return whatever it was passed in as
+        Ok(val)
+    }
+}
+
+// region: datatype regexes
+const DATETIME_DMY_PATTERN: &str = r#"(?x)
+        ^
+        ['"]?                        # optional quotes
+        (?:\d{1,2})                  # day
+        [-/]                         # separator
+        (?P<month>[01]?\d{1})        # month
+        [-/]                         # separator
+        (?:\d{4,})                   # year
+        (?:
+            [T\ ]                    # separator
+            (?:\d{2})                # hour
+            :?                       # separator
+            (?:\d{2})                # minute
+            (?:
+                :?                   # separator
+                (?:\d{2})            # second
+                (?:
+                    \.(?:\d{1,9})    # subsecond
+                )?
+            )?
+        )?
+        ['"]?                        # optional quotes
+        $
+        "#;
+
+static DATETIME_DMY_RE: Lazy<Regex> =
+    Lazy::new(|| Regex::new(DATETIME_DMY_PATTERN).expect("datetime_dmy_pattern should be valid"));
+const DATETIME_YMD_PATTERN: &str = r#"(?x)
+        ^
+        ['"]?                      # optional quotes
+        (?:\d{4,})                 # year
+        [-/]                       # separator
+        (?P<month>[01]?\d{1})      # month
+        [-/]                       # separator
+        (?:\d{1,2})                # day
+        (?:
+            [T\ ]                  # separator
+            (?:\d{2})              # hour
+            :?                     # separator
+            (?:\d{2})              # minute
+            (?:
+                :?                 # separator
+                (?:\d{2})          # seconds
+                (?:
+                    \.(?:\d{1,9})  # subsecond
+                )?
+            )?
+        )?
+        ['"]?                      # optional quotes
+        $
+        "#;
+static DATETIME_YMD_RE: Lazy<Regex> =
+    Lazy::new(|| Regex::new(DATETIME_YMD_PATTERN).expect("datetime_ymd_pattern should be valid"));
+//2023-03-24 16:44:17.865147299 -05:00
+const DATETIME_YMDZ_PATTERN: &str = r#"(?x)
+        ^
+        ['"]?                  # optional quotes
+        (?:\d{4,})             # year
+        [-/]                   # separator
+        (?P<month>[01]?\d{1})  # month
+        [-/]                   # separator
+        (?:\d{1,2})            # day
+        [T\ ]                  # separator
+        (?:\d{2})              # hour
+        :?                     # separator
+        (?:\d{2})              # minute
+        (?:
+            :?                 # separator
+            (?:\d{2})          # second
+            (?:
+                \.(?:\d{1,9})  # subsecond
+            )?
+        )?
+        \s?                    # optional space
+        (?:
+            # offset (e.g. +01:00)
+            [+-](?:\d{2})
+            :?
+            (?:\d{2})
+            # or Zulu suffix
+            |Z
+        )
+        ['"]?                  # optional quotes
+        $
+        "#;
+static DATETIME_YMDZ_RE: Lazy<Regex> =
+    Lazy::new(|| Regex::new(DATETIME_YMDZ_PATTERN).expect("datetime_ymdz_pattern should be valid"));
+
+static FLOAT_RE: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(r"^\s*[-+]?((\d*\.\d+)([eE][-+]?\d+)?|inf|NaN|(\d+)[eE][-+]?\d+|\d+\.)$")
+        .expect("float pattern should be valid")
+});
+
+static INTEGER_RE: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r"^\s*-?(\d+)$").expect("integer pattern should be valid"));
+
+static INTEGER_WITH_DELIMS_RE: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(r"^\s*-?(\d{1,3}([,_]\d{3})+)$")
+        .expect("integer with delimiters pattern should be valid")
+});
+
+static BOOLEAN_RE: Lazy<Regex> = Lazy::new(|| {
+    RegexBuilder::new(r"^\s*(true|false)$")
+        .case_insensitive(true)
+        .build()
+        .expect("boolean pattern should be valid")
+});
+// endregion:
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn test_examples() {
+        use crate::test_examples;
+
+        test_examples(IntoValue {})
+    }
+
+    #[test]
+    fn test_float_parse() {
+        // The regex should work on all these but nushell's float parser is more strict
+        assert!(FLOAT_RE.is_match("0.1"));
+        assert!(FLOAT_RE.is_match("3.0"));
+        assert!(FLOAT_RE.is_match("3.00001"));
+        assert!(FLOAT_RE.is_match("-9.9990e-003"));
+        assert!(FLOAT_RE.is_match("9.9990e+003"));
+        assert!(FLOAT_RE.is_match("9.9990E+003"));
+        assert!(FLOAT_RE.is_match("9.9990E+003"));
+        assert!(FLOAT_RE.is_match(".5"));
+        assert!(FLOAT_RE.is_match("2.5E-10"));
+        assert!(FLOAT_RE.is_match("2.5e10"));
+        assert!(FLOAT_RE.is_match("NaN"));
+        assert!(FLOAT_RE.is_match("-NaN"));
+        assert!(FLOAT_RE.is_match("-inf"));
+        assert!(FLOAT_RE.is_match("inf"));
+        assert!(FLOAT_RE.is_match("-7e-05"));
+        assert!(FLOAT_RE.is_match("7e-05"));
+        assert!(FLOAT_RE.is_match("+7e+05"));
+    }
+
+    #[test]
+    fn test_int_parse() {
+        assert!(INTEGER_RE.is_match("0"));
+        assert!(INTEGER_RE.is_match("1"));
+        assert!(INTEGER_RE.is_match("10"));
+        assert!(INTEGER_RE.is_match("100"));
+        assert!(INTEGER_RE.is_match("1000"));
+        assert!(INTEGER_RE.is_match("10000"));
+        assert!(INTEGER_RE.is_match("100000"));
+        assert!(INTEGER_RE.is_match("1000000"));
+        assert!(INTEGER_RE.is_match("10000000"));
+        assert!(INTEGER_RE.is_match("100000000"));
+        assert!(INTEGER_RE.is_match("1000000000"));
+        assert!(INTEGER_RE.is_match("10000000000"));
+        assert!(INTEGER_RE.is_match("100000000000"));
+        assert!(INTEGER_WITH_DELIMS_RE.is_match("1_000"));
+        assert!(INTEGER_WITH_DELIMS_RE.is_match("10_000"));
+        assert!(INTEGER_WITH_DELIMS_RE.is_match("100_000"));
+        assert!(INTEGER_WITH_DELIMS_RE.is_match("1_000_000"));
+        assert!(INTEGER_WITH_DELIMS_RE.is_match("10_000_000"));
+        assert!(INTEGER_WITH_DELIMS_RE.is_match("100_000_000"));
+        assert!(INTEGER_WITH_DELIMS_RE.is_match("1_000_000_000"));
+        assert!(INTEGER_WITH_DELIMS_RE.is_match("10_000_000_000"));
+        assert!(INTEGER_WITH_DELIMS_RE.is_match("100_000_000_000"));
+        assert!(INTEGER_WITH_DELIMS_RE.is_match("1,000"));
+        assert!(INTEGER_WITH_DELIMS_RE.is_match("10,000"));
+        assert!(INTEGER_WITH_DELIMS_RE.is_match("100,000"));
+        assert!(INTEGER_WITH_DELIMS_RE.is_match("1,000,000"));
+        assert!(INTEGER_WITH_DELIMS_RE.is_match("10,000,000"));
+        assert!(INTEGER_WITH_DELIMS_RE.is_match("100,000,000"));
+        assert!(INTEGER_WITH_DELIMS_RE.is_match("1,000,000,000"));
+        assert!(INTEGER_WITH_DELIMS_RE.is_match("10,000,000,000"));
+    }
+
+    #[test]
+    fn test_bool_parse() {
+        assert!(BOOLEAN_RE.is_match("true"));
+        assert!(BOOLEAN_RE.is_match("false"));
+        assert!(!BOOLEAN_RE.is_match("1"));
+        assert!(!BOOLEAN_RE.is_match("0"));
+    }
+
+    #[test]
+    fn test_datetime_ymdz_pattern() {
+        assert!(DATETIME_YMDZ_RE.is_match("2022-01-01T00:00:00Z"));
+        assert!(DATETIME_YMDZ_RE.is_match("2022-01-01T00:00:00.123456789Z"));
+        assert!(DATETIME_YMDZ_RE.is_match("2022-01-01T00:00:00+01:00"));
+        assert!(DATETIME_YMDZ_RE.is_match("2022-01-01T00:00:00.123456789+01:00"));
+        assert!(DATETIME_YMDZ_RE.is_match("2022-01-01T00:00:00-01:00"));
+        assert!(DATETIME_YMDZ_RE.is_match("2022-01-01T00:00:00.123456789-01:00"));
+        assert!(DATETIME_YMDZ_RE.is_match("'2022-01-01T00:00:00Z'"));
+
+        assert!(!DATETIME_YMDZ_RE.is_match("2022-01-01T00:00:00"));
+        assert!(!DATETIME_YMDZ_RE.is_match("2022-01-01T00:00:00."));
+        assert!(!DATETIME_YMDZ_RE.is_match("2022-01-01T00:00:00.123456789"));
+        assert!(!DATETIME_YMDZ_RE.is_match("2022-01-01T00:00:00+01"));
+        assert!(!DATETIME_YMDZ_RE.is_match("2022-01-01T00:00:00+01:0"));
+        assert!(!DATETIME_YMDZ_RE.is_match("2022-01-01T00:00:00+1:00"));
+        assert!(!DATETIME_YMDZ_RE.is_match("2022-01-01T00:00:00.123456789+01"));
+        assert!(!DATETIME_YMDZ_RE.is_match("2022-01-01T00:00:00.123456789+01:0"));
+        assert!(!DATETIME_YMDZ_RE.is_match("2022-01-01T00:00:00.123456789+1:00"));
+        assert!(!DATETIME_YMDZ_RE.is_match("2022-01-01T00:00:00-01"));
+        assert!(!DATETIME_YMDZ_RE.is_match("2022-01-01T00:00:00-01:0"));
+        assert!(!DATETIME_YMDZ_RE.is_match("2022-01-01T00:00:00-1:00"));
+        assert!(!DATETIME_YMDZ_RE.is_match("2022-01-01T00:00:00.123456789-01"));
+        assert!(!DATETIME_YMDZ_RE.is_match("2022-01-01T00:00:00.123456789-01:0"));
+        assert!(!DATETIME_YMDZ_RE.is_match("2022-01-01T00:00:00.123456789-1:00"));
+    }
+
+    #[test]
+    fn test_datetime_ymd_pattern() {
+        assert!(DATETIME_YMD_RE.is_match("2022-01-01"));
+        assert!(DATETIME_YMD_RE.is_match("2022/01/01"));
+        assert!(DATETIME_YMD_RE.is_match("2022-01-01T00:00:00"));
+        assert!(DATETIME_YMD_RE.is_match("2022-01-01T00:00:00.000000000"));
+        assert!(DATETIME_YMD_RE.is_match("'2022-01-01'"));
+
+        // The regex isn't this specific, but it would be nice if it were
+        // assert!(!DATETIME_YMD_RE.is_match("2022-13-01"));
+        // assert!(!DATETIME_YMD_RE.is_match("2022-01-32"));
+        // assert!(!DATETIME_YMD_RE.is_match("2022-01-01T24:00:00"));
+        // assert!(!DATETIME_YMD_RE.is_match("2022-01-01T00:60:00"));
+        // assert!(!DATETIME_YMD_RE.is_match("2022-01-01T00:00:60"));
+        assert!(!DATETIME_YMD_RE.is_match("2022-01-01T00:00:00.0000000000"));
+    }
+
+    #[test]
+    fn test_datetime_dmy_pattern() {
+        assert!(DATETIME_DMY_RE.is_match("31-12-2021"));
+        assert!(DATETIME_DMY_RE.is_match("01/01/2022"));
+        assert!(DATETIME_DMY_RE.is_match("15-06-2023 12:30"));
+        assert!(!DATETIME_DMY_RE.is_match("2022-13-01"));
+        assert!(!DATETIME_DMY_RE.is_match("2022-01-32"));
+        assert!(!DATETIME_DMY_RE.is_match("2022-01-01 24:00"));
+    }
+
+    #[test]
+    fn test_process_cell_normalizes_before_parsing() {
+        let span = Span::test_data();
+        assert!(matches!(
+            process_cell(Value::test_string("TRUE"), span),
+            Ok(Value::Bool { val: true, .. })
+        ));
+        assert!(matches!(
+            process_cell(Value::test_string(" 42"), span),
+            Ok(Value::Int { val: 42, .. })
+        ));
+    }
+}
diff --git a/crates/nu-command/src/default_context.rs b/crates/nu-command/src/default_context.rs
index b1a64f39f8..aeedd4bed0 100644
--- a/crates/nu-command/src/default_context.rs
+++ b/crates/nu-command/src/default_context.rs
@@ -293,6 +293,7 @@ pub fn add_shell_command_context(mut engine_state: EngineState) -> EngineState {
             IntoInt,
             IntoRecord,
             IntoString,
+            IntoValue,
         };
 
         // Env