Refactoring and more group-by flexibility.

This commit is contained in:
Andrés N. Robalino
2020-06-17 12:33:50 -05:00
parent 6914099e28
commit 778e497903
10 changed files with 266 additions and 189 deletions

View File

@@ -4,6 +4,7 @@ use indexmap::indexmap;
use nu_errors::ShellError;
use nu_protocol::{ReturnSuccess, Signature, SyntaxShape, UntaggedValue, Value};
use nu_source::Tagged;
use nu_value_ext::as_string;
pub struct GroupBy;
@@ -71,6 +72,10 @@ impl WholeStreamCommand for GroupBy {
}
}
/// Strategy `group` uses to derive a group key from each row.
enum Grouper {
/// `Some(column)`: the key is the string form of the row's value under
/// that column. `None`: the key is the string form of the row itself.
ByColumn(Option<Tagged<String>>),
}
pub async fn group_by(
args: CommandArgs,
registry: &CommandRegistry,
@@ -81,30 +86,84 @@ pub async fn group_by(
let values: Vec<Value> = input.collect().await;
if values.is_empty() {
Err(ShellError::labeled_error(
return Err(ShellError::labeled_error(
"Expected table from pipeline",
"requires a table input",
name,
))
));
}
let values = UntaggedValue::table(&values).into_value(&name);
match group(&column_name, &values, name) {
Ok(grouped) => Ok(OutputStream::one(ReturnSuccess::value(grouped))),
Err(reason) => Err(reason),
}
}
/// Builds the "Unknown column" error for a column name (`tried`) that
/// `for_value` does not contain.
///
/// The value's column names (`data_descriptors()`) are paired with their
/// Levenshtein distance to `tried` and sorted; the closest one is offered as a
/// "did you mean" hint. If there are no candidates, the label says the row
/// does not contain the column. The error is labeled at `tried`'s tag.
pub fn suggestions(tried: Tagged<&str>, for_value: &Value) -> ShellError {
let possibilities = for_value.data_descriptors();
let mut possible_matches: Vec<_> = possibilities
.iter()
.map(|x| (natural::distance::levenshtein_distance(x, &tried), x))
.collect();
// Sorting the (distance, name) pairs puts the closest candidate first.
possible_matches.sort();
if !possible_matches.is_empty() {
ShellError::labeled_error(
"Unknown column",
format!("did you mean '{}'?", possible_matches[0].1),
tried.tag(),
)
} else {
// NOTE(review): the next four lines use `column_name`, `values` and `name`,
// none of which are parameters or locals of this function; they look like
// pre-change lines of `group_by` interleaved by the diff view -- confirm.
match crate::utils::data::group(column_name, &values, None, &name) {
Ok(grouped) => Ok(OutputStream::one(ReturnSuccess::value(grouped))),
Err(err) => Err(err),
}
ShellError::labeled_error(
"Unknown column",
"row does not contain this column",
tried.tag(),
)
}
}
/// Groups `values` by an optional column, delegating to
/// `crate::utils::data::group` with a per-row key closure.
///
/// * `column_name` is `Some`: each row's key is the string form of the value
///   found under that column; a row without the column yields the
///   `suggestions` error.
/// * `column_name` is `None`: each row's key is the string form of the row
///   itself.
///
/// `tag` is converted into a `Tag` and passed through to the grouping helper.
///
/// # Errors
/// Returns whatever `ShellError` the key closure or
/// `crate::utils::data::group` produces.
pub fn group(
// NOTE(review): the two parameter lines below share names with the two that
// follow them; they appear to be the pre-change signature kept by the diff
// view -- confirm.
column_name: &Tagged<String>,
values: Vec<Value>,
column_name: &Option<Tagged<String>>,
values: &Value,
tag: impl Into<Tag>,
) -> Result<Value, ShellError> {
// NOTE(review): the call below uses the older four-argument form of the helper
// and is not the function's tail expression; presumably the pre-change body.
crate::utils::data::group(Some(column_name.clone()), &values, None, tag)
let name = tag.into();
// Take an owned copy of the column name so the `move` closure below can own it.
let grouper = if let Some(column_name) = column_name {
Grouper::ByColumn(Some(column_name.clone()))
} else {
Grouper::ByColumn(None)
};
match grouper {
Grouper::ByColumn(Some(column_name)) => {
// Key = string form of the row's value under `column_name`.
let block = Box::new(move |row: &Value| {
match row.get_data_by_key(column_name.borrow_spanned()) {
Some(group_key) => Ok(as_string(&group_key)?),
None => Err(suggestions(column_name.borrow_tagged(), &row)),
}
});
crate::utils::data::group(&values, &Some(block), &name)
}
Grouper::ByColumn(None) => {
// Key = string form of the row itself; the `as_string` result is passed
// through unchanged.
let block = Box::new(move |row: &Value| match as_string(row) {
Ok(group_key) => Ok(group_key),
Err(reason) => Err(reason),
});
crate::utils::data::group(&values, &Some(block), &name)
}
}
}
#[cfg(test)]
mod tests {
use crate::commands::group_by::group;
use super::group;
use indexmap::IndexMap;
use nu_errors::ShellError;
use nu_protocol::{UntaggedValue, Value};
@@ -122,7 +181,7 @@ mod tests {
UntaggedValue::table(list).into_untagged_value()
}
fn nu_releases_commiters() -> Vec<Value> {
fn nu_releases_committers() -> Vec<Value> {
vec![
row(
indexmap! {"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("August 23-2019")},
@@ -156,10 +215,11 @@ mod tests {
#[test]
fn groups_table_by_date_column() -> Result<(), ShellError> {
let for_key = String::from("date").tagged_unknown();
let for_key = Some(String::from("date").tagged_unknown());
let sample = table(&nu_releases_committers());
assert_eq!(
group(&for_key, nu_releases_commiters(), Tag::unknown())?,
group(&for_key, &sample, Tag::unknown())?,
row(indexmap! {
"August 23-2019".into() => table(&[
row(indexmap!{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("August 23-2019")}),
@@ -184,10 +244,11 @@ mod tests {
#[test]
fn groups_table_by_country_column() -> Result<(), ShellError> {
let for_key = String::from("country").tagged_unknown();
let for_key = Some(String::from("country").tagged_unknown());
let sample = table(&nu_releases_committers());
assert_eq!(
group(&for_key, nu_releases_commiters(), Tag::unknown())?,
group(&for_key, &sample, Tag::unknown())?,
row(indexmap! {
"EC".into() => table(&[
row(indexmap!{"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("August 23-2019")}),

View File

@@ -1,7 +1,7 @@
use crate::commands::WholeStreamCommand;
use crate::prelude::*;
use nu_errors::ShellError;
use nu_protocol::{ReturnSuccess, Signature, SyntaxShape, Value};
use nu_protocol::{ReturnSuccess, Signature, SyntaxShape, UntaggedValue, Value};
use nu_source::Tagged;
pub struct GroupByDate;
@@ -55,7 +55,11 @@ impl WholeStreamCommand for GroupByDate {
}
/// Date-format selection for `group_by_date`.
///
/// `ByDate(Some(fmt))` formats each group key with `fmt`; `ByDate(None)`
/// falls back to the `"%Y-%b-%d"` format.
enum Grouper {
// NOTE(review): two `ByDate` lines are visible; the first looks like the
// pre-change variant (plain `String`) kept by the diff view -- confirm.
ByDate(Option<String>),
ByDate(Option<Tagged<String>>),
}
/// Where `group_by_date` reads the value to format as a date.
enum GroupByColumn {
/// `Some(column)`: format the row's value under that column.
/// `None`: format the row itself.
Name(Option<Tagged<String>>),
}
pub async fn group_by_date(
@@ -80,31 +84,63 @@ pub async fn group_by_date(
name,
))
} else {
let grouper = if let Some(Tagged { item: fmt, tag: _ }) = format {
Grouper::ByDate(Some(fmt))
let values = UntaggedValue::table(&values).into_value(&name);
let grouper_column = if let Some(column_name) = column_name {
GroupByColumn::Name(Some(column_name))
} else {
GroupByColumn::Name(None)
};
let grouper_date = if let Some(date_format) = format {
Grouper::ByDate(Some(date_format))
} else {
Grouper::ByDate(None)
};
match grouper {
Grouper::ByDate(None) => {
match crate::utils::data::group(
column_name,
&values,
Some(Box::new(|row: &Value| row.format("%Y-%b-%d"))),
&name,
) {
match (grouper_date, grouper_column) {
(Grouper::ByDate(None), GroupByColumn::Name(None)) => {
let block = Box::new(move |row: &Value| row.format("%Y-%b-%d"));
match crate::utils::data::group(&values, &Some(block), &name) {
Ok(grouped) => Ok(OutputStream::one(ReturnSuccess::value(grouped))),
Err(err) => Err(err),
}
}
Grouper::ByDate(Some(fmt)) => {
match crate::utils::data::group(
column_name,
&values,
Some(Box::new(move |row: &Value| row.format(&fmt))),
&name,
) {
(Grouper::ByDate(None), GroupByColumn::Name(Some(column_name))) => {
let block = Box::new(move |row: &Value| {
let group_key = match row.get_data_by_key(column_name.borrow_spanned()) {
Some(group_key) => Ok(group_key),
None => Err(suggestions(column_name.borrow_tagged(), &row)),
};
group_key?.format("%Y-%b-%d")
});
match crate::utils::data::group(&values, &Some(block), &name) {
Ok(grouped) => Ok(OutputStream::one(ReturnSuccess::value(grouped))),
Err(err) => Err(err),
}
}
(Grouper::ByDate(Some(fmt)), GroupByColumn::Name(None)) => {
let block = Box::new(move |row: &Value| row.format(&fmt));
match crate::utils::data::group(&values, &Some(block), &name) {
Ok(grouped) => Ok(OutputStream::one(ReturnSuccess::value(grouped))),
Err(err) => Err(err),
}
}
(Grouper::ByDate(Some(fmt)), GroupByColumn::Name(Some(column_name))) => {
let block = Box::new(move |row: &Value| {
let group_key = match row.get_data_by_key(column_name.borrow_spanned()) {
Some(group_key) => Ok(group_key),
None => Err(suggestions(column_name.borrow_tagged(), &row)),
};
group_key?.format(&fmt)
});
match crate::utils::data::group(&values, &Some(block), &name) {
Ok(grouped) => Ok(OutputStream::one(ReturnSuccess::value(grouped))),
Err(err) => Err(err),
}
@@ -113,6 +149,31 @@ pub async fn group_by_date(
}
}
/// Produces the "Unknown column" error for a column name a row lacks.
///
/// Every column name exposed by `for_value` is ranked by its Levenshtein
/// distance to `tried`, and the closest one is offered in a "did you mean"
/// label. When the value exposes no columns, the label states that the row
/// does not contain the column. The error is always labeled at `tried`'s tag.
pub fn suggestions(tried: Tagged<&str>, for_value: &Value) -> ShellError {
    let columns = for_value.data_descriptors();

    // The smallest (distance, name) pair is exactly the element a full sort
    // would place first, ties on distance included.
    let closest = columns
        .iter()
        .map(|candidate| {
            (
                natural::distance::levenshtein_distance(candidate, &tried),
                candidate,
            )
        })
        .min();

    let label = match closest {
        Some((_, best)) => format!("did you mean '{}'?", best),
        None => String::from("row does not contain this column"),
    };

    ShellError::labeled_error("Unknown column", label, tried.tag())
}
#[cfg(test)]
mod tests {
use super::GroupByDate;

View File

@@ -76,14 +76,14 @@ pub async fn histogram(
) -> Result<OutputStream, ShellError> {
let registry = registry.clone();
let name = args.call_info.name_tag.clone();
let (HistogramArgs { column_name, rest }, input) = args.process(&registry).await?;
let values: Vec<Value> = input.collect().await;
let values = UntaggedValue::table(&values).into_value(&name);
let Tagged { item: group_by, .. } = column_name.clone();
let groups = group(&column_name, values, &name)?;
let group_labels = columns_sorted(Some(group_by.clone()), &groups, &name);
let sorted = t_sort(Some(group_by), None, &groups, &name)?;
let groups = group(&Some(column_name.clone()), &values, &name)?;
let group_labels = columns_sorted(Some(column_name.clone()), &groups, &name);
let sorted = t_sort(Some(column_name.clone()), None, &groups, &name)?;
let evaled = evaluate(&sorted, None, &name)?;
let reduced = reduce(&evaled, None, &name)?;
let maxima = map_max(&reduced, None, &name)?;

View File

@@ -1,16 +1,15 @@
use crate::commands::WholeStreamCommand;
use crate::prelude::*;
use nu_errors::ShellError;
use nu_protocol::{
Signature, SpannedTypeName, SyntaxShape, TaggedDictBuilder, UntaggedValue, Value,
};
use nu_protocol::{ReturnSuccess, Signature, SyntaxShape, Value};
use nu_source::Tagged;
use nu_value_ext::as_string;
pub struct SplitBy;
/// Arguments deserialized for the `split-by` command.
#[derive(Deserialize)]
pub struct SplitByArgs {
// Name of the column within the nested table to split by.
// NOTE(review): two `column_name` lines are visible; the first looks like
// the pre-change (non-optional) field kept by the diff view -- confirm.
column_name: Tagged<String>,
column_name: Option<Tagged<String>>,
}
#[async_trait]
@@ -20,7 +19,7 @@ impl WholeStreamCommand for SplitBy {
}
fn signature(&self) -> Signature {
Signature::build("split-by").required(
Signature::build("split-by").optional(
"column_name",
SyntaxShape::String,
"the name of the column within the nested table to split by",
@@ -53,108 +52,84 @@ pub async fn split_by(
return Err(ShellError::labeled_error(
"Expected table from pipeline",
"requires a table input",
column_name.span(),
name,
));
}
match split(&column_name, &values[0], name) {
Ok(split) => Ok(OutputStream::one(split)),
match split(&column_name, &values[0], &name) {
Ok(splits) => Ok(OutputStream::one(ReturnSuccess::value(splits))),
Err(err) => Err(err),
}
}
/// Strategy `split` uses to derive a split key from each row.
enum Grouper {
/// `Some(column)`: the key is the string form of the row's value under
/// that column. `None`: the key is the string form of the row itself.
ByColumn(Option<Tagged<String>>),
}
/// Splits `values` by an optional column, delegating to
/// `crate::utils::data::split` with a per-row key closure.
///
/// * `column_name` is `Some`: each row's key is the string form of the value
///   found under that column; a row without the column yields the
///   `suggestions` error.
/// * `column_name` is `None`: each row's key is the string form of the row
///   itself.
///
/// NOTE(review): this view interleaves the pre-change body (the nested
/// `match value { .. }` walk that builds `splits` and a `TaggedDictBuilder`)
/// with the post-change body (the `match grouper { .. }` dispatch); the
/// description above covers the `match grouper` lines only -- confirm against
/// the actual file.
pub fn split(
column_name: &Tagged<String>,
value: &Value,
column_name: &Option<Tagged<String>>,
values: &Value,
tag: impl Into<Tag>,
) -> Result<Value, ShellError> {
let origin_tag = tag.into();
let name = tag.into();
let mut splits = indexmap::IndexMap::new();
// Take an owned copy of the column name so the `move` closure below can own it.
let grouper = if let Some(column_name) = column_name {
Grouper::ByColumn(Some(column_name.clone()))
} else {
Grouper::ByColumn(None)
};
match value {
Value {
value: UntaggedValue::Row(group_sets),
..
} => {
for (group_key, group_value) in group_sets.entries.iter() {
match *group_value {
Value {
value: UntaggedValue::Table(ref dataset),
..
} => {
let group = crate::commands::group_by::group(
&column_name,
dataset.to_vec(),
&origin_tag,
)?;
match group {
Value {
value: UntaggedValue::Row(o),
..
} => {
for (split_label, subset) in o.entries.into_iter() {
match subset {
Value {
value: UntaggedValue::Table(subset),
tag,
} => {
let s = splits
.entry(split_label.clone())
.or_insert(indexmap::IndexMap::new());
s.insert(
group_key.clone(),
UntaggedValue::table(&subset).into_value(tag),
);
}
other => {
return Err(ShellError::type_error(
"a table value",
other.spanned_type_name(),
))
}
}
}
}
_ => {
return Err(ShellError::type_error(
"a table value",
group.spanned_type_name(),
))
}
}
}
ref other => {
return Err(ShellError::type_error(
"a table value",
other.spanned_type_name(),
))
}
match grouper {
Grouper::ByColumn(Some(column_name)) => {
// Key = string form of the row's value under `column_name`.
let block = Box::new(move |row: &Value| {
match row.get_data_by_key(column_name.borrow_spanned()) {
Some(group_key) => Ok(as_string(&group_key)?),
None => Err(suggestions(column_name.borrow_tagged(), &row)),
}
}
});
crate::utils::data::split(&values, &Some(block), &name)
}
_ => {
return Err(ShellError::type_error(
"a table value",
value.spanned_type_name(),
))
Grouper::ByColumn(None) => {
// Key = string form of the row itself; the `as_string` result is passed
// through unchanged.
let block = Box::new(move |row: &Value| match as_string(row) {
Ok(group_key) => Ok(group_key),
Err(reason) => Err(reason),
});
crate::utils::data::split(&values, &Some(block), &name)
}
}
let mut out = TaggedDictBuilder::new(&origin_tag);
for (k, v) in splits.into_iter() {
out.insert_untagged(k, UntaggedValue::row(v));
}
Ok(out.into_value())
}
/// Builds the "Unknown column" error reported when a row has no `tried` column.
///
/// The column names exposed by `for_value` are ranked by Levenshtein distance
/// to `tried`. The closest one is offered in a "did you mean" label; when the
/// value exposes no columns, the label states that the row does not contain
/// the column. The error is always labeled at `tried`'s tag.
pub fn suggestions(tried: Tagged<&str>, for_value: &Value) -> ShellError {
    let possibilities = for_value.data_descriptors();

    // Only the closest candidate is ever reported, so take the minimum
    // (distance, name) pair directly instead of sorting the whole list and
    // indexing into it. The result is the same element a sort would put first.
    let closest = possibilities
        .iter()
        .map(|x| (natural::distance::levenshtein_distance(x, &tried), x))
        .min();

    // Tail expression rather than `return` in each branch, matching the
    // sibling `suggestions` helpers in the other group-by commands.
    match closest {
        Some((_, name)) => ShellError::labeled_error(
            "Unknown column",
            format!("did you mean '{}'?", name),
            tried.tag(),
        ),
        None => ShellError::labeled_error(
            "Unknown column",
            "row does not contain this column",
            tried.tag(),
        ),
    }
}
#[cfg(test)]
mod tests {
use super::split;
use crate::commands::group_by::group;
use crate::commands::split_by::split;
use indexmap::IndexMap;
use nu_errors::ShellError;
use nu_protocol::{UntaggedValue, Value};
@@ -173,11 +148,12 @@ mod tests {
}
fn nu_releases_grouped_by_date() -> Result<Value, ShellError> {
let key = String::from("date").tagged_unknown();
group(&key, nu_releases_commiters(), Tag::unknown())
let key = Some(String::from("date").tagged_unknown());
let sample = table(&nu_releases_committers());
group(&key, &sample, Tag::unknown())
}
fn nu_releases_commiters() -> Vec<Value> {
fn nu_releases_committers() -> Vec<Value> {
vec![
row(
indexmap! {"name".into() => string("AR"), "country".into() => string("EC"), "date".into() => string("August 23-2019")},
@@ -211,7 +187,7 @@ mod tests {
#[test]
fn splits_inner_tables_by_key() -> Result<(), ShellError> {
let for_key = String::from("country").tagged_unknown();
let for_key = Some(String::from("country").tagged_unknown());
assert_eq!(
split(&for_key, &nu_releases_grouped_by_date()?, Tag::unknown())?,
@@ -257,7 +233,7 @@ mod tests {
#[test]
fn errors_if_key_within_some_inner_table_is_missing() {
let for_key = String::from("country").tagged_unknown();
let for_key = Some(String::from("country").tagged_unknown());
let nu_releases = row(indexmap! {
"August 23-2019".into() => table(&[

View File

@@ -78,7 +78,7 @@ async fn t_sort_by(
let values: Vec<Value> = input.collect().await;
let column_grouped_by_name = if let Some(grouped_by) = group_by {
Some(grouped_by.item().clone())
Some(grouped_by)
} else {
None
};