2024-04-10 02:31:43 +02:00
|
|
|
[package]
|
|
|
|
authors = ["The Nushell Project Developers"]
|
|
|
|
description = "Nushell dataframe plugin commands based on polars."
|
|
|
|
edition = "2021"
|
|
|
|
license = "MIT"
|
|
|
|
name = "nu_plugin_polars"
|
2024-05-20 19:22:08 +02:00
|
|
|
repository = "https://github.com/nushell/nushell/tree/main/crates/nu_plugin_polars"
|
2024-12-24 23:47:00 +01:00
|
|
|
version = "0.101.1"
|
2024-04-10 02:31:43 +02:00
|
|
|
|
|
|
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
|
|
|
|
|
|
|
[[bin]]
|
|
|
|
name = "nu_plugin_polars"
|
|
|
|
bench = false
|
|
|
|
|
|
|
|
[lib]
|
|
|
|
bench = false
|
|
|
|
|
|
|
|
[dependencies]
|
2024-12-24 23:47:00 +01:00
|
|
|
nu-protocol = { path = "../nu-protocol", version = "0.101.1" }
|
|
|
|
nu-plugin = { path = "../nu-plugin", version = "0.101.1" }
|
|
|
|
nu-path = { path = "../nu-path", version = "0.101.1" }
|
|
|
|
nu-utils = { path = "../nu-utils", version = "0.101.1" }
|
2024-04-10 02:31:43 +02:00
|
|
|
|
|
|
|
# Potential dependencies for extras
|
|
|
|
chrono = { workspace = true, features = ["std", "unstable-locales"], default-features = false }
|
2024-10-23 19:14:24 +02:00
|
|
|
chrono-tz = "0.10"
|
2024-04-10 02:31:43 +02:00
|
|
|
fancy-regex = { workspace = true }
|
2024-12-04 02:57:05 +01:00
|
|
|
indexmap = "2.7"
|
make polars plugin use mimalloc (#12967)
# Description
@maxim-uvarov did a ton of research and work with the dply-rs author and
Ritchie from polars, and found out that the allocator matters on macOS:
it seems to be what was hurting the performance of the polars plugin.
Ritchie suggested using jemalloc, but I switched it to mimalloc to match
nushell, and it seems to run better.
## Before (default allocator)
Note: this uses 1..10 instead of 1..100 because each run takes so long. Also
notice how high the `max` timings are compared to mimalloc below.
```nushell
❯ 1..10 | each {timeit {polars open Data7602DescendingYearOrder.csv | polars group-by year | polars agg (polars col geo_count | polars sum) | polars collect | null}} | | {mean: ($in | math avg), min: ($in | math min), max: ($in | math max), stddev: ($in | into int | into float | math stddev | into int | $'($in)ns' | into duration)}
╭────────┬─────────────────────────╮
│ mean │ 4sec 999ms 605µs 995ns │
│ min │ 983ms 627µs 42ns │
│ max │ 13sec 398ms 135µs 791ns │
│ stddev │ 3sec 476ms 479µs 939ns │
╰────────┴─────────────────────────╯
❯ use std bench
❯ bench { polars open Data7602DescendingYearOrder.csv | polars group-by year | polars agg (polars col geo_count | polars sum) | polars collect | null } -n 10
╭───────┬────────────────────────╮
│ mean │ 6sec 220ms 783µs 983ns │
│ min │ 1sec 184ms 997µs 708ns │
│ max │ 18sec 882ms 81µs 708ns │
│ std │ 5sec 350ms 375µs 697ns │
│ times │ [list 10 items] │
╰───────┴────────────────────────╯
```
## After (using mimalloc)
```nushell
❯ 1..100 | each {timeit {polars open Data7602DescendingYearOrder.csv | polars group-by year | polars agg (polars col geo_count | polars sum) | polars collect | null}} | | {mean: ($in | math avg), min: ($in | math min), max: ($in | math max), stddev: ($in | into int | into float | math stddev | into int | $'($in)ns' | into duration)}
╭────────┬───────────────────╮
│ mean │ 103ms 728µs 902ns │
│ min │ 97ms 107µs 42ns │
│ max │ 149ms 430µs 84ns │
│ stddev │ 5ms 690µs 664ns │
╰────────┴───────────────────╯
❯ use std bench
❯ bench { polars open Data7602DescendingYearOrder.csv | polars group-by year | polars agg (polars col geo_count | polars sum) | polars collect | null } -n 100
╭───────┬───────────────────╮
│ mean │ 103ms 620µs 195ns │
│ min │ 97ms 541µs 166ns │
│ max │ 130ms 262µs 166ns │
│ std │ 4ms 948µs 654ns │
│ times │ [list 100 items] │
╰───────┴───────────────────╯
```
## After (using jemalloc - just for comparison)
```nushell
❯ 1..100 | each {timeit {polars open Data7602DescendingYearOrder.csv | polars group-by year | polars agg (polars col geo_count | polars sum) | polars collect | null}} | | {mean: ($in | math avg), min: ($in | math min), max: ($in | math max), stddev: ($in | into int | into float | math stddev | into int | $'($in)ns' | into duration)}
╭────────┬───────────────────╮
│ mean │ 113ms 939µs 777ns │
│ min │ 108ms 337µs 333ns │
│ max │ 166ms 467µs 458ns │
│ stddev │ 6ms 175µs 618ns │
╰────────┴───────────────────╯
❯ use std bench
❯ bench { polars open Data7602DescendingYearOrder.csv | polars group-by year | polars agg (polars col geo_count | polars sum) | polars collect | null } -n 100
╭───────┬───────────────────╮
│ mean │ 114ms 363µs 530ns │
│ min │ 108ms 804µs 833ns │
│ max │ 143ms 521µs 459ns │
│ std │ 5ms 88µs 56ns │
│ times │ [list 100 items] │
╰───────┴───────────────────╯
```
## After (using parquet + mimalloc)
```nushell
❯ 1..100 | each {timeit {polars open data.parquet | polars group-by year | polars agg (polars col geo_count | polars sum) | polars collect | null}} | | {mean: ($in | math avg), min: ($in | math min), max: ($in | math max), stddev: ($in | into int | into float | math stddev | into int | $'($in)ns' | into duration)}
╭────────┬──────────────────╮
│ mean │ 34ms 255µs 492ns │
│ min │ 31ms 787µs 250ns │
│ max │ 76ms 408µs 416ns │
│ stddev │ 4ms 472µs 916ns │
╰────────┴──────────────────╯
❯ use std bench
❯ bench { polars open data.parquet | polars group-by year | polars agg (polars col geo_count | polars sum) | polars collect | null } -n 100
╭───────┬──────────────────╮
│ mean │ 34ms 897µs 562ns │
│ min │ 31ms 518µs 542ns │
│ max │ 65ms 943µs 625ns │
│ std │ 3ms 450µs 741ns │
│ times │ [list 100 items] │
╰───────┴──────────────────╯
```
# User-Facing Changes
<!-- List of all changes that impact the user experience here. This
helps us keep track of breaking changes. -->
# Tests + Formatting
<!--
Don't forget to add tests that cover your changes.
Make sure you've run and fixed any issues with these commands:
- `cargo fmt --all -- --check` to check standard code formatting (`cargo
fmt --all` applies these changes)
- `cargo clippy --workspace -- -D warnings -D clippy::unwrap_used` to
check that you're using the standard code style
- `cargo test --workspace` to check that all tests pass (on Windows make
sure to [enable developer
mode](https://learn.microsoft.com/en-us/windows/apps/get-started/developer-mode-features-and-debugging))
- `cargo run -- -c "use toolkit.nu; toolkit test stdlib"` to run the
tests for the standard library
> **Note**
> from `nushell` you can also use the `toolkit` as follows
> ```bash
> use toolkit.nu # or use an `env_change` hook to activate it
automatically
> toolkit check pr
> ```
-->
# After Submitting
<!-- If your PR had any user-facing changes, update [the
documentation](https://github.com/nushell/nushell.github.io) after the
PR is merged, if necessary. This will help us keep the docs up to date.
-->
2024-05-25 16:10:01 +02:00
|
|
|
mimalloc = "0.1.42"
|
2024-04-10 02:31:43 +02:00
|
|
|
num = "0.4"
|
|
|
|
serde = { version = "1.0", features = ["derive"] }
|
2024-10-23 19:14:24 +02:00
|
|
|
sqlparser = "0.49"
|
2024-12-25 13:15:50 +01:00
|
|
|
polars-io = { version = "0.44", features = ["avro", "aws", "cloud"] }
|
2024-11-30 02:39:07 +01:00
|
|
|
polars-arrow = "0.44"
|
|
|
|
polars-ops = { features = ["pivot"], version = "0.44" }
|
|
|
|
polars-plan = { features = ["regex"], version = "0.44" }
|
|
|
|
polars-utils = "0.44"
|
2024-04-10 02:31:43 +02:00
|
|
|
typetag = "0.2"
|
2024-06-28 01:56:56 +02:00
|
|
|
env_logger = "0.11.3"
|
|
|
|
log.workspace = true
|
2024-10-23 07:54:16 +02:00
|
|
|
uuid = { version = "1.11", features = ["v4", "serde"] }
|
2024-04-10 02:31:43 +02:00
|
|
|
|
2024-10-09 02:07:21 +02:00
|
|
|
# Due to a compile error with polars, this is included to force the raw dependency
|
|
|
|
hashbrown = { version = "0.14", features = ["rayon", "ahash", "serde", "raw"] }
|
|
|
|
|
2024-12-25 13:15:50 +01:00
|
|
|
# Cloud support
|
|
|
|
aws-config = { version = "1.5", features = ["sso"] }
|
|
|
|
aws-credential-types = "1.2"
|
|
|
|
tokio = { version = "1.41", features = ["full"] }
|
|
|
|
object_store = { version = "0.10", default-features = false }
|
|
|
|
url.workspace = true
|
|
|
|
|
2024-04-10 02:31:43 +02:00
|
|
|
[dependencies.polars]
|
|
|
|
features = [
|
2024-10-23 19:14:24 +02:00
|
|
|
"arg_where",
|
|
|
|
"checked_arithmetic",
|
2024-12-25 13:15:50 +01:00
|
|
|
"cloud",
|
2024-10-23 19:14:24 +02:00
|
|
|
"concat_str",
|
|
|
|
"cross_join",
|
|
|
|
"csv",
|
|
|
|
"cum_agg",
|
|
|
|
"default",
|
|
|
|
"dtype-categorical",
|
|
|
|
"dtype-datetime",
|
|
|
|
"dtype-struct",
|
|
|
|
"dtype-decimal",
|
|
|
|
"dtype-i8",
|
|
|
|
"dtype-i16",
|
|
|
|
"dtype-u8",
|
|
|
|
"dtype-u16",
|
|
|
|
"dynamic_group_by",
|
|
|
|
"ipc",
|
|
|
|
"is_in",
|
|
|
|
"json",
|
|
|
|
"lazy",
|
|
|
|
"object",
|
|
|
|
"parquet",
|
|
|
|
"pivot",
|
|
|
|
"random",
|
|
|
|
"rolling_window",
|
|
|
|
"rows",
|
|
|
|
"serde",
|
|
|
|
"serde-lazy",
|
|
|
|
"strings",
|
|
|
|
"string_to_integer",
|
|
|
|
"streaming",
|
2024-12-25 13:15:50 +01:00
|
|
|
"timezones",
|
2024-10-23 19:14:24 +02:00
|
|
|
"temporal",
|
|
|
|
"to_dummies",
|
2024-04-10 02:31:43 +02:00
|
|
|
]
|
|
|
|
optional = false
|
2024-11-30 02:39:07 +01:00
|
|
|
version = "0.44"
|
2024-04-10 02:31:43 +02:00
|
|
|
|
|
|
|
[dev-dependencies]
|
2024-12-24 23:47:00 +01:00
|
|
|
nu-cmd-lang = { path = "../nu-cmd-lang", version = "0.101.1" }
|
|
|
|
nu-engine = { path = "../nu-engine", version = "0.101.1" }
|
|
|
|
nu-parser = { path = "../nu-parser", version = "0.101.1" }
|
|
|
|
nu-command = { path = "../nu-command", version = "0.101.1" }
|
|
|
|
nu-plugin-test-support = { path = "../nu-plugin-test-support", version = "0.101.1" }
|
2024-12-22 15:10:19 +01:00
|
|
|
tempfile.workspace = true
|