diff --git a/Cargo.lock b/Cargo.lock index f34a76ad24..ed8819785e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1456,6 +1456,7 @@ dependencies = [ "rand", "rayon", "regex", + "roxmltree", "serde", "serde_ini", "serde_urlencoded", @@ -2169,6 +2170,15 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "194d8e591e405d1eecf28819740abed6d719d1a2db87fc0bcdedee9a26d55560" +[[package]] +name = "roxmltree" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "921904a62e410e37e215c40381b7117f830d9d89ba60ab5236170541dd25646b" +dependencies = [ + "xmlparser", +] + [[package]] name = "rust-argon2" version = "0.8.3" @@ -2758,6 +2768,12 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "xmlparser" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "114ba2b24d2167ef6d67d7d04c8cc86522b87f490025f39f0303b7db5bf5e3d8" + [[package]] name = "yaml-rust" version = "0.4.5" diff --git a/crates/nu-command/Cargo.toml b/crates/nu-command/Cargo.toml index 165751f582..c61526801d 100644 --- a/crates/nu-command/Cargo.toml +++ b/crates/nu-command/Cargo.toml @@ -44,6 +44,7 @@ toml = "0.5.8" itertools = "0.10.0" ical = "0.7.0" calamine = "0.18.0" +roxmltree = "0.14.0" rand = "0.8" num = {version="0.4.0", optional=true} diff --git a/crates/nu-command/src/default_context.rs b/crates/nu-command/src/default_context.rs index 2b2c1b57ab..977aa39195 100644 --- a/crates/nu-command/src/default_context.rs +++ b/crates/nu-command/src/default_context.rs @@ -71,6 +71,8 @@ pub fn create_default_context() -> EngineState { FromIcs, FromIni, FromVcf, + FromSsv, + FromXml, FromXlsx, Get, Griddle, diff --git a/crates/nu-command/src/formats/from/mod.rs b/crates/nu-command/src/formats/from/mod.rs index 4698e7556d..54f007d498 100644 --- a/crates/nu-command/src/formats/from/mod.rs +++ b/crates/nu-command/src/formats/from/mod.rs @@ -6,11 +6,13 @@ mod ics; mod ini; mod json; mod ods; +mod ssv; mod toml; mod tsv; mod url; mod vcf; mod xlsx; +mod xml; mod yaml; pub use self::csv::FromCsv; @@ -21,9 +23,11 @@ pub use ics::FromIcs; pub use ini::FromIni; pub use json::FromJson; pub use ods::FromOds; +pub use ssv::FromSsv; pub use tsv::FromTsv; pub use url::FromUrl; pub use vcf::FromVcf; pub use xlsx::FromXlsx; +pub use xml::FromXml; pub use yaml::FromYaml; pub use yaml::FromYml; diff --git a/crates/nu-command/src/formats/from/ssv.rs b/crates/nu-command/src/formats/from/ssv.rs new file mode 100644 index 0000000000..f213fdb003 --- /dev/null +++ b/crates/nu-command/src/formats/from/ssv.rs @@ -0,0 +1,500 @@ +use indexmap::map::IndexMap; +use nu_engine::CallExt; +use nu_protocol::ast::Call; +use nu_protocol::engine::{Command, EngineState, Stack}; +use nu_protocol::{ + Category, Example, IntoPipelineData, PipelineData, ShellError, Signature, Span, Spanned, + SyntaxShape, Value, +}; + +#[derive(Clone)] +pub struct FromSsv; + +const DEFAULT_MINIMUM_SPACES: usize = 2; + +impl Command for FromSsv { + fn name(&self) -> &str { + "from ssv" + } + + fn signature(&self) -> Signature { + Signature::build("from ssv") + .switch( + "noheaders", + "don't treat the first row as column names", + Some('n'), + ) + .switch("aligned-columns", "assume columns are aligned", Some('a')) + .named( + "minimum-spaces", + SyntaxShape::Int, + "the minimum spaces to separate columns", + Some('m'), + ) + .category(Category::Formats) + } + + fn usage(&self) -> &str { + "Parse text as space-separated values and create a table. The default minimum number of spaces counted as a separator is 2." + } + + fn examples(&self) -> Vec { + vec![Example { + example: r#"'FOO BAR +1 2' | from ssv"#, + description: "Converts ssv formatted string to table", + result: Some(Value::List { vals: vec![Value::Record { cols: vec!["FOO".to_string(), "BAR".to_string()], vals: vec![Value::String { val: "1".to_string(), span: Span::unknown() }, Value::String { val: "2".to_string(), span: Span::unknown() }], span: Span::unknown() }], span: Span::unknown() }), + }, Example { + example: r#"'FOO BAR +1 2' | from ssv -n"#, + description: "Converts ssv formatted string to table but not treating the first row as column names", + result: Some( + Value::List { vals: vec![Value::Record { cols: vec!["Column1".to_string(), "Column2".to_string()], vals: vec![Value::String { val: "FOO".to_string(), span: Span::unknown() }, Value::String { val: "BAR".to_string(), span: Span::unknown() }], span: Span::unknown() }, Value::Record { cols: vec!["Column1".to_string(), "Column2".to_string()], vals: vec![Value::String { val: "1".to_string(), span: Span::unknown() }, Value::String { val: "2".to_string(), span: Span::unknown() }], span: Span::unknown() }], span: Span::unknown() }), + }] + } + + fn run( + &self, + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, + ) -> Result { + from_ssv(engine_state, stack, call, input) + } +} + +enum HeaderOptions<'a> { + WithHeaders(&'a str), + WithoutHeaders, +} + +fn parse_aligned_columns<'a>( + lines: impl Iterator, + headers: HeaderOptions, + separator: &str, +) -> Vec> { + fn construct<'a>( + lines: impl Iterator, + headers: Vec<(String, usize)>, + ) -> Vec> { + lines + .map(|l| { + headers + .iter() + .enumerate() + .map(|(i, (header_name, start_position))| { + let val = match headers.get(i + 1) { + Some((_, end)) => { + if *end < l.len() { + l.get(*start_position..*end) + } else { + l.get(*start_position..) + } + } + None => l.get(*start_position..), + } + .unwrap_or("") + .trim() + .into(); + (header_name.clone(), val) + }) + .collect() + }) + .collect() + } + + let find_indices = |line: &str| { + let values = line + .split(&separator) + .map(str::trim) + .filter(|s| !s.is_empty()); + values + .fold( + (0, vec![]), + |(current_pos, mut indices), value| match line[current_pos..].find(value) { + None => (current_pos, indices), + Some(index) => { + let absolute_index = current_pos + index; + indices.push(absolute_index); + (absolute_index + value.len(), indices) + } + }, + ) + .1 + }; + + let parse_with_headers = |lines, headers_raw: &str| { + let indices = find_indices(headers_raw); + let headers = headers_raw + .split(&separator) + .map(str::trim) + .filter(|s| !s.is_empty()) + .map(String::from) + .zip(indices); + + let columns = headers.collect::>(); + + construct(lines, columns) + }; + + let parse_without_headers = |ls: Vec<&str>| { + let mut indices = ls + .iter() + .flat_map(|s| find_indices(*s)) + .collect::>(); + + indices.sort_unstable(); + indices.dedup(); + + let headers: Vec<(String, usize)> = indices + .iter() + .enumerate() + .map(|(i, position)| (format!("Column{}", i + 1), *position)) + .collect(); + + construct(ls.iter().map(|s| s.to_owned()), headers) + }; + + match headers { + HeaderOptions::WithHeaders(headers_raw) => parse_with_headers(lines, headers_raw), + HeaderOptions::WithoutHeaders => parse_without_headers(lines.collect()), + } +} + +fn parse_separated_columns<'a>( + lines: impl Iterator, + headers: HeaderOptions, + separator: &str, +) -> Vec> { + fn collect<'a>( + headers: Vec, + rows: impl Iterator, + separator: &str, + ) -> Vec> { + rows.map(|r| { + headers + .iter() + .zip(r.split(separator).map(str::trim).filter(|s| !s.is_empty())) + .map(|(a, b)| (a.to_owned(), b.to_owned())) + .collect() + }) + .collect() + } + + let parse_with_headers = |lines, headers_raw: &str| { + let headers = headers_raw + .split(&separator) + .map(str::trim) + .map(str::to_owned) + .filter(|s| !s.is_empty()) + .collect(); + collect(headers, lines, separator) + }; + + let parse_without_headers = |ls: Vec<&str>| { + let num_columns = ls.iter().map(|r| r.len()).max().unwrap_or(0); + + let headers = (1..=num_columns) + .map(|i| format!("Column{}", i)) + .collect::>(); + collect(headers, ls.into_iter(), separator) + }; + + match headers { + HeaderOptions::WithHeaders(headers_raw) => parse_with_headers(lines, headers_raw), + HeaderOptions::WithoutHeaders => parse_without_headers(lines.collect()), + } +} + +fn string_to_table( + s: &str, + noheaders: bool, + aligned_columns: bool, + split_at: usize, +) -> Vec> { + let mut lines = s.lines().filter(|l| !l.trim().is_empty()); + let separator = " ".repeat(std::cmp::max(split_at, 1)); + + let (ls, header_options) = if noheaders { + (lines, HeaderOptions::WithoutHeaders) + } else { + match lines.next() { + Some(header) => (lines, HeaderOptions::WithHeaders(header)), + None => return vec![], + } + }; + + let f = if aligned_columns { + parse_aligned_columns + } else { + parse_separated_columns + }; + + f(ls, header_options, &separator) +} + +fn from_ssv_string_to_value( + s: &str, + noheaders: bool, + aligned_columns: bool, + split_at: usize, + span: Span, +) -> Value { + let rows = string_to_table(s, noheaders, aligned_columns, split_at) + .iter() + .map(|row| { + let mut dict = IndexMap::new(); + for (col, entry) in row { + dict.insert( + col.to_string(), + Value::String { + val: entry.to_string(), + span, + }, + ); + } + Value::from(Spanned { item: dict, span }) + }) + .collect(); + + Value::List { vals: rows, span } +} + +fn from_ssv( + engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, +) -> Result { + let config = stack.get_config()?; + let name = call.head; + + let noheaders = call.has_flag("noheaders"); + let aligned_columns = call.has_flag("aligned-columns"); + let minimum_spaces: Option> = + call.get_flag(engine_state, stack, "minimum-spaces")?; + + let concat_string = input.collect_string("", &config); + let split_at = match minimum_spaces { + Some(number) => number.item, + None => DEFAULT_MINIMUM_SPACES, + }; + + Ok( + from_ssv_string_to_value(&concat_string, noheaders, aligned_columns, split_at, name) + .into_pipeline_data(), + ) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn owned(x: &str, y: &str) -> (String, String) { + (String::from(x), String::from(y)) + } + + #[test] + fn it_trims_empty_and_whitespace_only_lines() { + let input = r#" + + a b + + 1 2 + + 3 4 + "#; + let result = string_to_table(input, false, true, 1); + assert_eq!( + result, + vec![ + vec![owned("a", "1"), owned("b", "2")], + vec![owned("a", "3"), owned("b", "4")] + ] + ); + } + + #[test] + fn it_deals_with_single_column_input() { + let input = r#" + a + 1 + 2 + "#; + let result = string_to_table(input, false, true, 1); + assert_eq!(result, vec![vec![owned("a", "1")], vec![owned("a", "2")]]); + } + + #[test] + fn it_uses_first_row_as_data_when_noheaders() { + let input = r#" + a b + 1 2 + 3 4 + "#; + let result = string_to_table(input, true, true, 1); + assert_eq!( + result, + vec![ + vec![owned("Column1", "a"), owned("Column2", "b")], + vec![owned("Column1", "1"), owned("Column2", "2")], + vec![owned("Column1", "3"), owned("Column2", "4")] + ] + ); + } + + #[test] + fn it_allows_a_predefined_number_of_spaces() { + let input = r#" + column a column b + entry 1 entry number 2 + 3 four + "#; + + let result = string_to_table(input, false, true, 3); + assert_eq!( + result, + vec![ + vec![ + owned("column a", "entry 1"), + owned("column b", "entry number 2") + ], + vec![owned("column a", "3"), owned("column b", "four")] + ] + ); + } + + #[test] + fn it_trims_remaining_separator_space() { + let input = r#" + colA colB colC + val1 val2 val3 + "#; + + let trimmed = |s: &str| s.trim() == s; + + let result = string_to_table(input, false, true, 2); + assert!(result + .iter() + .all(|row| row.iter().all(|(a, b)| trimmed(a) && trimmed(b)))); + } + + #[test] + fn it_keeps_empty_columns() { + let input = r#" + colA col B col C + val2 val3 + val4 val 5 val 6 + val7 val8 + "#; + + let result = string_to_table(input, false, true, 2); + assert_eq!( + result, + vec![ + vec![ + owned("colA", ""), + owned("col B", "val2"), + owned("col C", "val3") + ], + vec![ + owned("colA", "val4"), + owned("col B", "val 5"), + owned("col C", "val 6") + ], + vec![ + owned("colA", "val7"), + owned("col B", ""), + owned("col C", "val8") + ], + ] + ); + } + + #[test] + fn it_can_produce_an_empty_stream_for_header_only_input() { + let input = "colA col B"; + + let result = string_to_table(input, false, true, 2); + let expected: Vec> = vec![]; + assert_eq!(expected, result); + } + + #[test] + fn it_uses_the_full_final_column() { + let input = r#" + colA col B + val1 val2 trailing value that should be included + "#; + + let result = string_to_table(input, false, true, 2); + assert_eq!( + result, + vec![vec![ + owned("colA", "val1"), + owned("col B", "val2 trailing value that should be included"), + ]] + ); + } + + #[test] + fn it_handles_empty_values_when_noheaders_and_aligned_columns() { + let input = r#" + a multi-word value b d + 1 3-3 4 + last + "#; + + let result = string_to_table(input, true, true, 2); + assert_eq!( + result, + vec![ + vec![ + owned("Column1", "a multi-word value"), + owned("Column2", "b"), + owned("Column3", ""), + owned("Column4", "d"), + owned("Column5", "") + ], + vec![ + owned("Column1", "1"), + owned("Column2", ""), + owned("Column3", "3-3"), + owned("Column4", "4"), + owned("Column5", "") + ], + vec![ + owned("Column1", ""), + owned("Column2", ""), + owned("Column3", ""), + owned("Column4", ""), + owned("Column5", "last") + ], + ] + ); + } + + #[test] + fn input_is_parsed_correctly_if_either_option_works() { + let input = r#" + docker-registry docker-registry=default docker-registry=default 172.30.78.158 5000/TCP + kubernetes component=apiserver,provider=kubernetes 172.30.0.2 443/TCP + kubernetes-ro component=apiserver,provider=kubernetes 172.30.0.1 80/TCP + "#; + + let aligned_columns_noheaders = string_to_table(input, true, true, 2); + let separator_noheaders = string_to_table(input, true, false, 2); + let aligned_columns_with_headers = string_to_table(input, false, true, 2); + let separator_with_headers = string_to_table(input, false, false, 2); + assert_eq!(aligned_columns_noheaders, separator_noheaders); + assert_eq!(aligned_columns_with_headers, separator_with_headers); + } + + #[test] + fn test_examples() { + use crate::test_examples; + + test_examples(FromSsv {}) + } +} diff --git a/crates/nu-command/src/formats/from/xml.rs b/crates/nu-command/src/formats/from/xml.rs new file mode 100644 index 0000000000..8c579f677c --- /dev/null +++ b/crates/nu-command/src/formats/from/xml.rs @@ -0,0 +1,379 @@ +use indexmap::map::IndexMap; +use nu_protocol::ast::Call; +use nu_protocol::engine::{Command, EngineState, Stack}; +use nu_protocol::{ + Category, Config, Example, IntoPipelineData, PipelineData, ShellError, Signature, Span, + Spanned, Value, +}; + +#[derive(Clone)] +pub struct FromXml; + +impl Command for FromXml { + fn name(&self) -> &str { + "from xml" + } + + fn signature(&self) -> Signature { + Signature::build("from xml").category(Category::Formats) + } + + fn usage(&self) -> &str { + "Parse text as .xml and create table." + } + + fn run( + &self, + _engine_state: &EngineState, + stack: &mut Stack, + call: &Call, + input: PipelineData, + ) -> Result { + let head = call.head; + let config = stack.get_config()?; + from_xml(input, head, &config) + } + + fn examples(&self) -> Vec { + vec![Example { + example: r#"' + + Event +' | from xml"#, + description: "Converts xml formatted string to table", + result: Some(Value::Record { + cols: vec!["note".to_string()], + vals: vec![Value::Record { + cols: vec!["children".to_string(), "attributes".to_string()], + vals: vec![ + Value::List { + vals: vec![Value::Record { + cols: vec!["remember".to_string()], + vals: vec![Value::Record { + cols: vec!["children".to_string(), "attributes".to_string()], + vals: vec![ + Value::List { + vals: vec![Value::String { + val: "Event".to_string(), + span: Span::unknown(), + }], + span: Span::unknown(), + }, + Value::Record { + cols: vec![], + vals: vec![], + span: Span::unknown(), + }, + ], + span: Span::unknown(), + }], + span: Span::unknown(), + }], + span: Span::unknown(), + }, + Value::Record { + cols: vec![], + vals: vec![], + span: Span::unknown(), + }, + ], + span: Span::unknown(), + }], + span: Span::unknown(), + }), + }] + } +} + +fn from_attributes_to_value(attributes: &[roxmltree::Attribute], span: Span) -> Value { + let mut collected = IndexMap::new(); + for a in attributes { + collected.insert(String::from(a.name()), Value::string(a.value(), span)); + } + + let (cols, vals) = collected + .into_iter() + .fold((vec![], vec![]), |mut acc, (k, v)| { + acc.0.push(k); + acc.1.push(v); + acc + }); + + Value::Record { cols, vals, span } +} + +fn from_node_to_value(n: &roxmltree::Node, span: Span) -> Value { + if n.is_element() { + let name = n.tag_name().name().trim().to_string(); + + let mut children_values = vec![]; + for c in n.children() { + children_values.push(from_node_to_value(&c, span)); + } + + let children_values: Vec = children_values + .into_iter() + .filter(|x| match x { + Value::String { val: f, .. } => { + !f.trim().is_empty() // non-whitespace characters? + } + _ => true, + }) + .collect(); + + let mut collected = IndexMap::new(); + + let attribute_value: Value = from_attributes_to_value(n.attributes(), span); + + let mut row = IndexMap::new(); + row.insert( + String::from("children"), + Value::List { + vals: children_values, + span, + }, + ); + row.insert(String::from("attributes"), attribute_value); + collected.insert(name, Value::from(Spanned { item: row, span })); + + Value::from(Spanned { + item: collected, + span, + }) + } else if n.is_comment() { + Value::String { + val: "".to_string(), + span, + } + } else if n.is_pi() { + Value::String { + val: "".to_string(), + span, + } + } else if n.is_text() { + match n.text() { + Some(text) => Value::String { + val: text.to_string(), + span, + }, + None => Value::String { + val: "".to_string(), + span, + }, + } + } else { + Value::String { + val: "".to_string(), + span, + } + } +} + +fn from_document_to_value(d: &roxmltree::Document, span: Span) -> Value { + from_node_to_value(&d.root_element(), span) +} + +pub fn from_xml_string_to_value(s: String, span: Span) -> Result { + let parsed = roxmltree::Document::parse(&s)?; + Ok(from_document_to_value(&parsed, span)) +} + +fn from_xml(input: PipelineData, head: Span, config: &Config) -> Result { + let concat_string = input.collect_string("", config); + + match from_xml_string_to_value(concat_string, head) { + Ok(x) => Ok(x.into_pipeline_data()), + _ => Err(ShellError::UnsupportedInput( + "Could not parse string as xml".to_string(), + head, + )), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use indexmap::indexmap; + use indexmap::IndexMap; + use nu_protocol::{Spanned, Value}; + + fn string(input: impl Into) -> Value { + Value::String { + val: input.into(), + span: Span::unknown(), + } + } + + fn row(entries: IndexMap) -> Value { + Value::from(Spanned { + item: entries, + span: Span::unknown(), + }) + } + + fn table(list: &[Value]) -> Value { + Value::List { + vals: list.to_vec(), + span: Span::unknown(), + } + } + + fn parse(xml: &str) -> Result { + from_xml_string_to_value(xml.to_string(), Span::unknown()) + } + + #[test] + fn parses_empty_element() -> Result<(), roxmltree::Error> { + let source = ""; + + assert_eq!( + parse(source)?, + row(indexmap! { + "nu".into() => row(indexmap! { + "children".into() => table(&[]), + "attributes".into() => row(indexmap! {}) + }) + }) + ); + + Ok(()) + } + + #[test] + fn parses_element_with_text() -> Result<(), roxmltree::Error> { + let source = "La era de los tres caballeros"; + + assert_eq!( + parse(source)?, + row(indexmap! { + "nu".into() => row(indexmap! { + "children".into() => table(&[string("La era de los tres caballeros")]), + "attributes".into() => row(indexmap! {}) + }) + }) + ); + + Ok(()) + } + + #[test] + fn parses_element_with_elements() -> Result<(), roxmltree::Error> { + let source = "\ + + Andrés + Jonathan + Yehuda +"; + + assert_eq!( + parse(source)?, + row(indexmap! { + "nu".into() => row(indexmap! { + "children".into() => table(&[ + row(indexmap! { + "dev".into() => row(indexmap! { + "children".into() => table(&[string("Andrés")]), + "attributes".into() => row(indexmap! {}) + }) + }), + row(indexmap! { + "dev".into() => row(indexmap! { + "children".into() => table(&[string("Jonathan")]), + "attributes".into() => row(indexmap! {}) + }) + }), + row(indexmap! { + "dev".into() => row(indexmap! { + "children".into() => table(&[string("Yehuda")]), + "attributes".into() => row(indexmap! {}) + }) + }) + ]), + "attributes".into() => row(indexmap! {}) + }) + }) + ); + + Ok(()) + } + + #[test] + fn parses_element_with_attribute() -> Result<(), roxmltree::Error> { + let source = "\ + +"; + + assert_eq!( + parse(source)?, + row(indexmap! { + "nu".into() => row(indexmap! { + "children".into() => table(&[]), + "attributes".into() => row(indexmap! { + "version".into() => string("2.0") + }) + }) + }) + ); + + Ok(()) + } + + #[test] + fn parses_element_with_attribute_and_element() -> Result<(), roxmltree::Error> { + let source = "\ + + 2.0 +"; + + assert_eq!( + parse(source)?, + row(indexmap! { + "nu".into() => row(indexmap! { + "children".into() => table(&[ + row(indexmap! { + "version".into() => row(indexmap! { + "children".into() => table(&[string("2.0")]), + "attributes".into() => row(indexmap! {}) + }) + }) + ]), + "attributes".into() => row(indexmap! { + "version".into() => string("2.0") + }) + }) + }) + ); + + Ok(()) + } + + #[test] + fn parses_element_with_multiple_attributes() -> Result<(), roxmltree::Error> { + let source = "\ + +"; + + assert_eq!( + parse(source)?, + row(indexmap! { + "nu".into() => row(indexmap! { + "children".into() => table(&[]), + "attributes".into() => row(indexmap! { + "version".into() => string("2.0"), + "age".into() => string("25") + }) + }) + }) + ); + + Ok(()) + } + + #[test] + fn test_examples() { + use crate::test_examples; + + test_examples(FromXml {}) + } +}