Polars cut (#15431)

- fixes #15366 

# Description
Introducing binning commands, `polars cut` and `polars qcut`

# User-Facing Changes
- New command `polars cut`
- New command `polars qcut`
This commit is contained in:
Jack Wright 2025-03-27 04:58:34 -07:00 committed by GitHub
parent e76586ede4
commit eaf522b41f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 192 additions and 1 deletions

View File

@ -32,7 +32,7 @@ serde = { version = "1.0", features = ["derive"] }
sqlparser = { version = "0.53"} sqlparser = { version = "0.53"}
polars-io = { version = "0.46", features = ["avro", "cloud", "aws"]} polars-io = { version = "0.46", features = ["avro", "cloud", "aws"]}
polars-arrow = { version = "0.46"} polars-arrow = { version = "0.46"}
polars-ops = { version = "0.46", features = ["pivot"]} polars-ops = { version = "0.46", features = ["pivot", "cutqcut"]}
polars-plan = { version = "0.46", features = ["regex"]} polars-plan = { version = "0.46", features = ["regex"]}
polars-utils = { version = "0.46"} polars-utils = { version = "0.46"}
typetag = "0.2" typetag = "0.2"

View File

@ -0,0 +1,89 @@
use nu_plugin::{EngineInterface, EvaluatedCall, PluginCommand};
use nu_protocol::{Category, Example, PipelineData, ShellError, Signature, SyntaxShape, Type};
use polars::prelude::PlSmallStr;
use crate::{
values::{CustomValueSupport, NuDataFrame},
PolarsPlugin,
};
/// Plugin command registered as `polars cut`.
///
/// Bins the values of the input series into discrete categories delimited by
/// the break points given as the first positional argument.
pub struct CutSeries;

impl PluginCommand for CutSeries {
    type Plugin = PolarsPlugin;

    /// Name under which the command is exposed.
    fn name(&self) -> &str {
        "polars cut"
    }

    /// One-line summary shown in the command help.
    fn description(&self) -> &str {
        "Bin continuous values into discrete categories for a series."
    }

    /// Declares the positional argument, flags and input/output types.
    fn signature(&self) -> nu_protocol::Signature {
        Signature::build(self.name())
            // `command` decodes this argument as a list of floats, so the help
            // text must describe a list of numbers rather than a dataframe.
            .required(
                "breaks",
                SyntaxShape::Any,
                "List of unique cut points.",
            )
            .named(
                "labels",
                SyntaxShape::List(Box::new(SyntaxShape::String)),
                "Names of the categories. The number of labels must be equal to the number of cut points plus one.",
                Some('l'),
            )
            .switch(
                "left_closed",
                "Set the intervals to be left-closed instead of right-closed.",
                Some('c'),
            )
            .switch(
                "include_breaks",
                "Include a column with the right endpoint of the bin each observation falls in. This will change the data type of the output from a Categorical to a Struct.",
                Some('b'),
            )
            .input_output_type(
                Type::Custom("dataframe".into()),
                Type::Custom("dataframe".into()),
            )
            .category(Category::Custom("dataframe".into()))
    }

    /// Usage examples shown in the command help; no expected result is attached.
    fn examples(&self) -> Vec<Example> {
        vec![Example {
            description: "Divide the column into three categories.",
            example: r#"[-2, -1, 0, 1, 2] | polars into-df | polars cut [-1, 1] --labels ["a", "b", "c"]"#,
            result: None,
        }]
    }

    /// Entry point invoked by the plugin host; delegates to `command` and
    /// converts any `ShellError` into the `LabeledError` the host expects.
    fn run(
        &self,
        plugin: &Self::Plugin,
        engine: &EngineInterface,
        call: &EvaluatedCall,
        input: PipelineData,
    ) -> Result<PipelineData, nu_protocol::LabeledError> {
        command(plugin, engine, call, input).map_err(|e| e.into())
    }
}
/// Implementation of `polars cut`.
///
/// Reads the input as a single series, collects the break points (first
/// positional argument) together with the `labels`, `left_closed` and
/// `include_breaks` flags, and returns the binned series as a dataframe.
///
/// Any error reported by `polars_ops::series::cut` is surfaced as a
/// `ShellError::GenericError` pointing at the call head.
fn command(
    plugin: &PolarsPlugin,
    engine: &EngineInterface,
    call: &EvaluatedCall,
    input: PipelineData,
) -> Result<PipelineData, ShellError> {
    let head = call.head;

    let frame = NuDataFrame::try_from_pipeline_coerce(plugin, input, head)?;
    let source = frame.as_series(head)?;

    let cut_points: Vec<f64> = call.req(0)?;
    let category_names: Option<Vec<PlSmallStr>> = call
        .get_flag::<Vec<String>>("labels")?
        .map(|names| names.into_iter().map(PlSmallStr::from).collect());
    let closed_on_left = call.has_flag("left_closed")?;
    let with_breaks = call.has_flag("include_breaks")?;

    let binned = match polars_ops::series::cut(
        &source,
        cut_points,
        category_names,
        closed_on_left,
        with_breaks,
    ) {
        Ok(series) => series,
        Err(err) => {
            return Err(ShellError::GenericError {
                error: "Error cutting series".into(),
                msg: err.to_string(),
                span: Some(head),
                help: None,
                inner: vec![],
            });
        }
    };

    let result = NuDataFrame::try_from_series(binned, head)?;
    result.to_pipeline_data(plugin, engine, head)
}

View File

@ -5,6 +5,7 @@ mod cast;
mod col; mod col;
mod collect; mod collect;
mod concat; mod concat;
mod cut;
mod drop; mod drop;
mod drop_duplicates; mod drop_duplicates;
mod drop_nulls; mod drop_nulls;
@ -22,6 +23,7 @@ mod last;
mod len; mod len;
mod lit; mod lit;
mod pivot; mod pivot;
mod qcut;
mod query_df; mod query_df;
mod rename; mod rename;
mod reverse; mod reverse;
@ -75,6 +77,7 @@ pub(crate) fn data_commands() -> Vec<Box<dyn PluginCommand<Plugin = PolarsPlugin
vec![ vec![
Box::new(AppendDF), Box::new(AppendDF),
Box::new(CastDF), Box::new(CastDF),
Box::new(cut::CutSeries),
Box::new(DropDF), Box::new(DropDF),
Box::new(concat::ConcatDF), Box::new(concat::ConcatDF),
Box::new(DropDuplicates), Box::new(DropDuplicates),
@ -108,6 +111,7 @@ pub(crate) fn data_commands() -> Vec<Box<dyn PluginCommand<Plugin = PolarsPlugin
Box::new(LazySortBy), Box::new(LazySortBy),
Box::new(LazyFilter), Box::new(LazyFilter),
Box::new(Shift), Box::new(Shift),
Box::new(qcut::QCutSeries),
Box::new(Unique), Box::new(Unique),
Box::new(unnest::UnnestDF), Box::new(unnest::UnnestDF),
] ]

View File

@ -0,0 +1,98 @@
use nu_plugin::{EngineInterface, EvaluatedCall, PluginCommand};
use nu_protocol::{Category, Example, PipelineData, ShellError, Signature, SyntaxShape, Type};
use polars::prelude::PlSmallStr;
use crate::{
values::{CustomValueSupport, NuDataFrame},
PolarsPlugin,
};
pub struct QCutSeries;
impl PluginCommand for QCutSeries {
type Plugin = PolarsPlugin;
fn name(&self) -> &str {
"polars qcut"
}
fn description(&self) -> &str {
"Bin continuous values into discrete categories based on their quantiles for a series."
}
fn signature(&self) -> nu_protocol::Signature {
Signature::build(self.name())
.required("quantiles", SyntaxShape::Any, "Either a list of quantile probabilities between 0 and 1 or a positive integer determining the number of bins with uniform probability.")
.named(
"labels",
SyntaxShape::List(Box::new(SyntaxShape::String)),
"Names of the categories. The number of labels must be equal to the number of cut points plus one.",
Some('l'),
)
.switch("left_closed", "Set the intervals to be left-closed instead of right-closed.", Some('c'))
.switch("include_breaks", "Include a column with the right endpoint of the bin each observation falls in. This will change the data type of the output from a Categorical to a Struct.", Some('b'))
.switch("allow_duplicates", "If set, duplicates in the resulting quantiles are dropped, rather than raising an error. This can happen even with unique probabilities, depending on the data.", Some('d'))
.input_output_type(
Type::Custom("dataframe".into()),
Type::Custom("dataframe".into()),
)
.category(Category::Custom("dataframe".into()))
}
fn examples(&self) -> Vec<Example> {
vec![Example {
description: "Divide a column into three categories according to pre-defined quantile probabilities.",
example: r#"[-2, -1, 0, 1, 2] | polars into-df | polars qcut [0.25, 0.75] --labels ["a", "b", "c"]"#,
result: None,
}]
}
fn run(
&self,
plugin: &Self::Plugin,
engine: &EngineInterface,
call: &EvaluatedCall,
input: PipelineData,
) -> Result<PipelineData, nu_protocol::LabeledError> {
command(plugin, engine, call, input).map_err(|e| e.into())
}
}
fn command(
plugin: &PolarsPlugin,
engine: &EngineInterface,
call: &EvaluatedCall,
input: PipelineData,
) -> Result<PipelineData, ShellError> {
let df = NuDataFrame::try_from_pipeline_coerce(plugin, input, call.head)?;
let series = df.as_series(call.head)?;
let quantiles = call.req::<Vec<f64>>(0)?;
let labels: Option<Vec<PlSmallStr>> = call.get_flag::<Vec<String>>("labels")?.map(|l| {
l.into_iter()
.map(PlSmallStr::from)
.collect::<Vec<PlSmallStr>>()
});
let left_closed = call.has_flag("left_closed")?;
let include_breaks = call.has_flag("include_breaks")?;
let allow_duplicates = call.has_flag("allow_duplicates")?;
let new_series = polars_ops::series::qcut(
&series,
quantiles,
labels,
left_closed,
allow_duplicates,
include_breaks,
)
.map_err(|e| ShellError::GenericError {
error: "Error cutting series".into(),
msg: e.to_string(),
span: Some(call.head),
help: None,
inner: vec![],
})?;
NuDataFrame::try_from_series(new_series, call.head)?.to_pipeline_data(plugin, engine, call.head)
}