mirror of
https://github.com/nushell/nushell.git
synced 2025-05-17 16:30:47 +02:00
Polars cut (#15431)
- fixes #15366 # Description Introducing binning commands, `polars cut` and `polars qcut` # User-Facing Changes - New command `polars cut` - New command `polars qcut`
This commit is contained in:
parent
e76586ede4
commit
eaf522b41f
@ -32,7 +32,7 @@ serde = { version = "1.0", features = ["derive"] }
|
|||||||
sqlparser = { version = "0.53"}
|
sqlparser = { version = "0.53"}
|
||||||
polars-io = { version = "0.46", features = ["avro", "cloud", "aws"]}
|
polars-io = { version = "0.46", features = ["avro", "cloud", "aws"]}
|
||||||
polars-arrow = { version = "0.46"}
|
polars-arrow = { version = "0.46"}
|
||||||
polars-ops = { version = "0.46", features = ["pivot"]}
|
polars-ops = { version = "0.46", features = ["pivot", "cutqcut"]}
|
||||||
polars-plan = { version = "0.46", features = ["regex"]}
|
polars-plan = { version = "0.46", features = ["regex"]}
|
||||||
polars-utils = { version = "0.46"}
|
polars-utils = { version = "0.46"}
|
||||||
typetag = "0.2"
|
typetag = "0.2"
|
||||||
|
89
crates/nu_plugin_polars/src/dataframe/command/data/cut.rs
Normal file
89
crates/nu_plugin_polars/src/dataframe/command/data/cut.rs
Normal file
@ -0,0 +1,89 @@
|
|||||||
|
use nu_plugin::{EngineInterface, EvaluatedCall, PluginCommand};
|
||||||
|
use nu_protocol::{Category, Example, PipelineData, ShellError, Signature, SyntaxShape, Type};
|
||||||
|
use polars::prelude::PlSmallStr;
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
values::{CustomValueSupport, NuDataFrame},
|
||||||
|
PolarsPlugin,
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Plugin command `polars cut`: bins the values of a series into discrete
/// categories delimited by caller-supplied break points.
pub struct CutSeries;
|
||||||
|
|
||||||
|
impl PluginCommand for CutSeries {
|
||||||
|
type Plugin = PolarsPlugin;
|
||||||
|
fn name(&self) -> &str {
|
||||||
|
"polars cut"
|
||||||
|
}
|
||||||
|
|
||||||
|
fn description(&self) -> &str {
|
||||||
|
"Bin continuous values into discrete categories for a series."
|
||||||
|
}
|
||||||
|
|
||||||
|
fn signature(&self) -> nu_protocol::Signature {
|
||||||
|
Signature::build(self.name())
|
||||||
|
.required("breaks", SyntaxShape::Any, "Dataframe that contains a series of unique cut points.")
|
||||||
|
.named(
|
||||||
|
"labels",
|
||||||
|
SyntaxShape::List(Box::new(SyntaxShape::String)),
|
||||||
|
"Names of the categories. The number of labels must be equal to the number of cut points plus one.",
|
||||||
|
Some('l'),
|
||||||
|
)
|
||||||
|
.switch("left_closed", "Set the intervals to be left-closed instead of right-closed.", Some('c'))
|
||||||
|
.switch("include_breaks", "Include a column with the right endpoint of the bin each observation falls in. This will change the data type of the output from a Categorical to a Struct.", Some('b'))
|
||||||
|
.input_output_type(
|
||||||
|
Type::Custom("dataframe".into()),
|
||||||
|
Type::Custom("dataframe".into()),
|
||||||
|
)
|
||||||
|
.category(Category::Custom("dataframe".into()))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn examples(&self) -> Vec<Example> {
|
||||||
|
vec![Example {
|
||||||
|
description: "Divide the column into three categories.",
|
||||||
|
example: r#"[-2, -1, 0, 1, 2] | polars into-df | polars cut [-1, 1] --labels ["a", "b", "c"]"#,
|
||||||
|
result: None,
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
|
||||||
|
fn run(
|
||||||
|
&self,
|
||||||
|
plugin: &Self::Plugin,
|
||||||
|
engine: &EngineInterface,
|
||||||
|
call: &EvaluatedCall,
|
||||||
|
input: PipelineData,
|
||||||
|
) -> Result<PipelineData, nu_protocol::LabeledError> {
|
||||||
|
command(plugin, engine, call, input).map_err(|e| e.into())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn command(
|
||||||
|
plugin: &PolarsPlugin,
|
||||||
|
engine: &EngineInterface,
|
||||||
|
call: &EvaluatedCall,
|
||||||
|
input: PipelineData,
|
||||||
|
) -> Result<PipelineData, ShellError> {
|
||||||
|
let df = NuDataFrame::try_from_pipeline_coerce(plugin, input, call.head)?;
|
||||||
|
let series = df.as_series(call.head)?;
|
||||||
|
|
||||||
|
let breaks = call.req::<Vec<f64>>(0)?;
|
||||||
|
|
||||||
|
let labels: Option<Vec<PlSmallStr>> = call.get_flag::<Vec<String>>("labels")?.map(|l| {
|
||||||
|
l.into_iter()
|
||||||
|
.map(PlSmallStr::from)
|
||||||
|
.collect::<Vec<PlSmallStr>>()
|
||||||
|
});
|
||||||
|
|
||||||
|
let left_closed = call.has_flag("left_closed")?;
|
||||||
|
let include_breaks = call.has_flag("include_breaks")?;
|
||||||
|
|
||||||
|
let new_series = polars_ops::series::cut(&series, breaks, labels, left_closed, include_breaks)
|
||||||
|
.map_err(|e| ShellError::GenericError {
|
||||||
|
error: "Error cutting series".into(),
|
||||||
|
msg: e.to_string(),
|
||||||
|
span: Some(call.head),
|
||||||
|
help: None,
|
||||||
|
inner: vec![],
|
||||||
|
})?;
|
||||||
|
|
||||||
|
NuDataFrame::try_from_series(new_series, call.head)?.to_pipeline_data(plugin, engine, call.head)
|
||||||
|
}
|
@ -5,6 +5,7 @@ mod cast;
|
|||||||
mod col;
|
mod col;
|
||||||
mod collect;
|
mod collect;
|
||||||
mod concat;
|
mod concat;
|
||||||
|
mod cut;
|
||||||
mod drop;
|
mod drop;
|
||||||
mod drop_duplicates;
|
mod drop_duplicates;
|
||||||
mod drop_nulls;
|
mod drop_nulls;
|
||||||
@ -22,6 +23,7 @@ mod last;
|
|||||||
mod len;
|
mod len;
|
||||||
mod lit;
|
mod lit;
|
||||||
mod pivot;
|
mod pivot;
|
||||||
|
mod qcut;
|
||||||
mod query_df;
|
mod query_df;
|
||||||
mod rename;
|
mod rename;
|
||||||
mod reverse;
|
mod reverse;
|
||||||
@ -75,6 +77,7 @@ pub(crate) fn data_commands() -> Vec<Box<dyn PluginCommand<Plugin = PolarsPlugin
|
|||||||
vec![
|
vec![
|
||||||
Box::new(AppendDF),
|
Box::new(AppendDF),
|
||||||
Box::new(CastDF),
|
Box::new(CastDF),
|
||||||
|
Box::new(cut::CutSeries),
|
||||||
Box::new(DropDF),
|
Box::new(DropDF),
|
||||||
Box::new(concat::ConcatDF),
|
Box::new(concat::ConcatDF),
|
||||||
Box::new(DropDuplicates),
|
Box::new(DropDuplicates),
|
||||||
@ -108,6 +111,7 @@ pub(crate) fn data_commands() -> Vec<Box<dyn PluginCommand<Plugin = PolarsPlugin
|
|||||||
Box::new(LazySortBy),
|
Box::new(LazySortBy),
|
||||||
Box::new(LazyFilter),
|
Box::new(LazyFilter),
|
||||||
Box::new(Shift),
|
Box::new(Shift),
|
||||||
|
Box::new(qcut::QCutSeries),
|
||||||
Box::new(Unique),
|
Box::new(Unique),
|
||||||
Box::new(unnest::UnnestDF),
|
Box::new(unnest::UnnestDF),
|
||||||
]
|
]
|
||||||
|
98
crates/nu_plugin_polars/src/dataframe/command/data/qcut.rs
Normal file
98
crates/nu_plugin_polars/src/dataframe/command/data/qcut.rs
Normal file
@ -0,0 +1,98 @@
|
|||||||
|
use nu_plugin::{EngineInterface, EvaluatedCall, PluginCommand};
|
||||||
|
use nu_protocol::{Category, Example, PipelineData, ShellError, Signature, SyntaxShape, Type};
|
||||||
|
use polars::prelude::PlSmallStr;
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
values::{CustomValueSupport, NuDataFrame},
|
||||||
|
PolarsPlugin,
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Plugin command `polars qcut`: bins the values of a series into discrete
/// categories based on their quantiles.
pub struct QCutSeries;
|
||||||
|
|
||||||
|
impl PluginCommand for QCutSeries {
|
||||||
|
type Plugin = PolarsPlugin;
|
||||||
|
fn name(&self) -> &str {
|
||||||
|
"polars qcut"
|
||||||
|
}
|
||||||
|
|
||||||
|
fn description(&self) -> &str {
|
||||||
|
"Bin continuous values into discrete categories based on their quantiles for a series."
|
||||||
|
}
|
||||||
|
|
||||||
|
fn signature(&self) -> nu_protocol::Signature {
|
||||||
|
Signature::build(self.name())
|
||||||
|
.required("quantiles", SyntaxShape::Any, "Either a list of quantile probabilities between 0 and 1 or a positive integer determining the number of bins with uniform probability.")
|
||||||
|
.named(
|
||||||
|
"labels",
|
||||||
|
SyntaxShape::List(Box::new(SyntaxShape::String)),
|
||||||
|
"Names of the categories. The number of labels must be equal to the number of cut points plus one.",
|
||||||
|
Some('l'),
|
||||||
|
)
|
||||||
|
.switch("left_closed", "Set the intervals to be left-closed instead of right-closed.", Some('c'))
|
||||||
|
.switch("include_breaks", "Include a column with the right endpoint of the bin each observation falls in. This will change the data type of the output from a Categorical to a Struct.", Some('b'))
|
||||||
|
.switch("allow_duplicates", "If set, duplicates in the resulting quantiles are dropped, rather than raising an error. This can happen even with unique probabilities, depending on the data.", Some('d'))
|
||||||
|
.input_output_type(
|
||||||
|
Type::Custom("dataframe".into()),
|
||||||
|
Type::Custom("dataframe".into()),
|
||||||
|
)
|
||||||
|
.category(Category::Custom("dataframe".into()))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn examples(&self) -> Vec<Example> {
|
||||||
|
vec![Example {
|
||||||
|
description: "Divide a column into three categories according to pre-defined quantile probabilities.",
|
||||||
|
example: r#"[-2, -1, 0, 1, 2] | polars into-df | polars qcut [0.25, 0.75] --labels ["a", "b", "c"]"#,
|
||||||
|
result: None,
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
|
||||||
|
fn run(
|
||||||
|
&self,
|
||||||
|
plugin: &Self::Plugin,
|
||||||
|
engine: &EngineInterface,
|
||||||
|
call: &EvaluatedCall,
|
||||||
|
input: PipelineData,
|
||||||
|
) -> Result<PipelineData, nu_protocol::LabeledError> {
|
||||||
|
command(plugin, engine, call, input).map_err(|e| e.into())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn command(
|
||||||
|
plugin: &PolarsPlugin,
|
||||||
|
engine: &EngineInterface,
|
||||||
|
call: &EvaluatedCall,
|
||||||
|
input: PipelineData,
|
||||||
|
) -> Result<PipelineData, ShellError> {
|
||||||
|
let df = NuDataFrame::try_from_pipeline_coerce(plugin, input, call.head)?;
|
||||||
|
let series = df.as_series(call.head)?;
|
||||||
|
|
||||||
|
let quantiles = call.req::<Vec<f64>>(0)?;
|
||||||
|
|
||||||
|
let labels: Option<Vec<PlSmallStr>> = call.get_flag::<Vec<String>>("labels")?.map(|l| {
|
||||||
|
l.into_iter()
|
||||||
|
.map(PlSmallStr::from)
|
||||||
|
.collect::<Vec<PlSmallStr>>()
|
||||||
|
});
|
||||||
|
|
||||||
|
let left_closed = call.has_flag("left_closed")?;
|
||||||
|
let include_breaks = call.has_flag("include_breaks")?;
|
||||||
|
let allow_duplicates = call.has_flag("allow_duplicates")?;
|
||||||
|
|
||||||
|
let new_series = polars_ops::series::qcut(
|
||||||
|
&series,
|
||||||
|
quantiles,
|
||||||
|
labels,
|
||||||
|
left_closed,
|
||||||
|
allow_duplicates,
|
||||||
|
include_breaks,
|
||||||
|
)
|
||||||
|
.map_err(|e| ShellError::GenericError {
|
||||||
|
error: "Error cutting series".into(),
|
||||||
|
msg: e.to_string(),
|
||||||
|
span: Some(call.head),
|
||||||
|
help: None,
|
||||||
|
inner: vec![],
|
||||||
|
})?;
|
||||||
|
|
||||||
|
NuDataFrame::try_from_series(new_series, call.head)?.to_pipeline_data(plugin, engine, call.head)
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user