Add well-optimized string types (#16446)

This commit is contained in:
Piepmatz
2025-08-16 18:01:38 +02:00
committed by GitHub
parent 36427a7434
commit 0b8531ed9d
7 changed files with 565 additions and 0 deletions

32
Cargo.lock generated
View File

@ -815,6 +815,15 @@ dependencies = [
"serde",
]
[[package]]
name = "buf-trait"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "21eaafc770e8c073d6c3facafe7617e774305d4954aa6351b9c452eb37ee17b4"
dependencies = [
"zerocopy 0.7.35",
]
[[package]]
name = "bumpalo"
version = "3.16.0"
@ -878,6 +887,15 @@ version = "1.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2e93abca9e28e0a1b9877922aacb20576e05d4679ffa78c3d6dc22a26a216659"
[[package]]
name = "byteyarn"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b93e51d26468a15ea59f8525e0c13dc405db43e644a0b1e6d44346c72cf4cf7b"
dependencies = [
"buf-trait",
]
[[package]]
name = "calamine"
version = "0.28.0"
@ -3048,6 +3066,18 @@ version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
[[package]]
name = "lean_string"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75e49998bd00bfb567a44a6d3c3d1a970ce35665b5ebf68d64e5e96410be58ae"
dependencies = [
"castaway",
"itoa",
"ryu",
"serde",
]
[[package]]
name = "libc"
version = "0.2.174"
@ -4234,9 +4264,11 @@ dependencies = [
name = "nu-utils"
version = "0.106.2"
dependencies = [
"byteyarn",
"crossterm",
"crossterm_winapi",
"fancy-regex",
"lean_string",
"log",
"lscolors",
"nix 0.29.0",

View File

@ -72,6 +72,7 @@ brotli = "7.0"
byteorder = "1.5"
bytes = "1"
bytesize = "1.3.3"
byteyarn = "0.5"
calamine = "0.28"
chardetng = "0.1.17"
chrono = { default-features = false, version = "0.4.34" }
@ -100,6 +101,7 @@ indicatif = "0.17"
interprocess = "2.2.0"
is_executable = "1.0"
itertools = "0.14"
lean_string = { version = "0.5", features = ["serde"] }
libc = "0.2"
libproc = "0.14"
log = "0.4"

View File

@ -17,8 +17,10 @@ bench = false
bench = false
[dependencies]
byteyarn.workspace = true
crossterm = { workspace = true, optional = true }
fancy-regex = { workspace = true }
lean_string.workspace = true
lscolors = { workspace = true, default-features = false, features = ["nu-ansi-term"] }
log = { workspace = true }
num-format = { workspace = true }

View File

@ -8,6 +8,7 @@ pub mod float;
pub mod locale;
mod quoting;
mod shared_cow;
pub mod strings;
pub mod utils;
pub use locale::get_system_locale;

View File

@ -0,0 +1,40 @@
//! String utility types with specific semantics.
//!
//! This module provides additional string types optimized for certain use-cases, offering
//! alternatives to standard [`String`] or [`&str`](str) when specific performance characteristics
//! are desired.
//!
//! The pipeline-based, functional programming model we use leads to frequent string operations
//! that involve immutability and often copying.
//! These specialized string types can provide performance and memory benefits for these cases.
//!
//! ## Choosing a String Type: `SharedString` vs. `UniqueString`
//!
//! ### Use [`SharedString`] when:
//!
//! `SharedString` is an owned, immutable string type optimized for **frequent cloning and sharing**.
//! Cloning it is very inexpensive (a pointer copy and atomic reference count increment), avoiding
//! deep copies of the string data.
//! It also benefits from Small String Optimization (SSO) and static string re-use.
//!
//! **Ideal for:** Strings that need to be duplicated or passed by value frequently across
//! pipeline stages or within complex data structures, where many references to the same string
//! data are expected.
//!
//! ### Use [`UniqueString`] when:
//!
//! `UniqueString` is an owned, immutable string type optimized for
//! **strings that are primarily unique** or rarely cloned.
//! Cloning a `UniqueString` always involves copying the underlying string data.
//! Its advantage lies in avoiding the overhead of atomic reference counting.
//! It also benefits from Small String Optimization (SSO) and static string re-use.
//!
//! **Ideal for:** Strings that are created and consumed locally, or represent unique identifiers
//! that are not expected to be duplicated across many ownership boundaries.
//! It is a good fit when the cost of copying upon infrequent clones is acceptable.
mod shared;
mod unique;
pub use shared::SharedString;
pub use unique::UniqueString;

View File

@ -0,0 +1,245 @@
use std::{
borrow::Borrow,
fmt::{Arguments, Debug, Display},
hash::Hash,
ops::Deref,
};
use serde::{Deserialize, Serialize};
/// An owned, immutable string with compact storage and efficient cloning.
///
/// `SharedString` is designed for owned, immutable strings that are cloned frequently.
/// It offers similar characteristics to [`Arc<str>`](std::sync::Arc) but with several key
/// optimizations for improved efficiency and memory usage:
///
/// - **Efficient Cloning:**
///   Cloning a `SharedString` is very inexpensive.
///   It typically involves just a pointer copy and an atomic reference count increment, without
///   copying the actual string data.
///
/// - **Small String Optimization (SSO):**
///   For shorter strings, the data is stored directly within the `SharedString` value itself,
///   avoiding a separate allocation and pointer indirection.
///
/// - **Static String Re-use:**
///   Strings with a `'static` lifetime are directly referenced, avoiding unnecessary copies or
///   allocations.
///
/// - **Niche Optimization:**
///   `SharedString` allows niche-optimization, meaning that [`Option<SharedString>`] has the same
///   memory footprint as `SharedString`.
///
/// - **Compact Size:**
///   `SharedString` occupies two machine words (16 bytes on 64-bit systems).
///   This is achieved by disregarding the capacity of a `String` since we only hold the string as
///   immutable.
///
/// The size and niche properties are checked at compile time by the constant assertions that
/// follow this definition.
///
/// Internally, `SharedString` is powered by [`lean_string::LeanString`], which provides the
/// underlying implementation for these optimizations.
pub struct SharedString(lean_string::LeanString);
// Compile-time layout checks: `SharedString` must be exactly two machine words wide and must
// leave a niche so that `Option<SharedString>` costs no extra space.
const _: () = {
    let own_size = size_of::<SharedString>();
    assert!(own_size == size_of::<[usize; 2]>());
    assert!(own_size == size_of::<Option<SharedString>>());
};
impl SharedString {
    /// Borrows the full contents as a `&str`.
    #[inline]
    pub fn as_str(&self) -> &str {
        self.0.as_str()
    }

    /// Borrows the contents as raw bytes.
    #[inline]
    pub fn as_bytes(&self) -> &[u8] {
        self.as_str().as_bytes()
    }

    /// Returns the length of the string in bytes.
    #[inline]
    pub fn len(&self) -> usize {
        self.as_str().len()
    }

    /// Returns `true` when the string contains no bytes.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.as_str().is_empty()
    }

    /// Creates a `SharedString` by converting an owned [`String`].
    #[inline]
    pub fn from_string(string: String) -> Self {
        let inner: lean_string::LeanString = string.into();
        Self(inner)
    }

    /// Creates a `SharedString` that refers to the given static slice.
    ///
    /// Prefer this over [`from_string`](Self::from_string) for `'static` data: the slice is handed
    /// to [`lean_string::LeanString::from_static_str`] as-is instead of going through an owned
    /// `String`.
    #[inline]
    pub fn from_static_str(string: &'static str) -> Self {
        let inner = lean_string::LeanString::from_static_str(string);
        Self(inner)
    }

    /// Creates a `SharedString` from pre-built formatting arguments.
    ///
    /// Obtain an [`Arguments`] value via [`format_args!`]. The [`sformat!`](crate::sformat) macro
    /// calls this function so that it can be used like [`format!`] while producing a
    /// `SharedString`.
    #[inline]
    pub fn from_fmt(arguments: Arguments) -> Self {
        // When the arguments boil down to a single static string there is nothing to format,
        // so the static slice can be referenced directly.
        if let Some(static_str) = arguments.as_str() {
            Self::from_static_str(static_str)
        } else {
            Self::from_string(std::fmt::format(arguments))
        }
    }
}
/// Cheap reference conversion to `&str`.
impl AsRef<str> for SharedString {
    #[inline]
    fn as_ref(&self) -> &str {
        Self::as_str(self)
    }
}
/// Lets a `SharedString` be borrowed as a plain `str`.
impl Borrow<str> for SharedString {
    #[inline]
    fn borrow(&self) -> &str {
        Self::as_str(self)
    }
}
impl Clone for SharedString {
#[inline]
fn clone(&self) -> Self {
Self(self.0.clone())
}
}
impl Debug for SharedString {
#[inline]
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
Debug::fmt(&self.0, f)
}
}
impl Default for SharedString {
#[inline]
fn default() -> Self {
Self(Default::default())
}
}
/// Dereferences to `str`, making all `&str` methods available on a `SharedString`.
impl Deref for SharedString {
    type Target = str;

    #[inline]
    fn deref(&self) -> &Self::Target {
        Self::as_str(self)
    }
}
impl Display for SharedString {
#[inline]
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
Display::fmt(&self.0, f)
}
}
// Marker impl: equality itself is defined by the `PartialEq` impl, which compares the wrapped
// strings.
impl Eq for SharedString {}
impl From<String> for SharedString {
#[inline]
fn from(string: String) -> Self {
Self::from_string(string)
}
}
impl From<&'static str> for SharedString {
#[inline]
fn from(string: &'static str) -> Self {
Self::from_static_str(string)
}
}
impl Hash for SharedString {
#[inline]
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
self.0.hash(state);
}
}
impl Ord for SharedString {
#[inline]
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
self.0.cmp(&other.0)
}
}
/// Two `SharedString`s are equal when their wrapped strings are equal.
impl PartialEq for SharedString {
    #[inline]
    fn eq(&self, other: &Self) -> bool {
        PartialEq::eq(&self.0, &other.0)
    }
}
/// Partial ordering always succeeds and agrees with the `Ord` impl.
impl PartialOrd for SharedString {
    #[inline]
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        let ordering = Ord::cmp(self, other);
        Some(ordering)
    }
}
impl Serialize for SharedString {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
self.0.serialize(serializer)
}
}
/// Deserializes a `LeanString` and wraps it; any deserializer error is passed through.
impl<'de> Deserialize<'de> for SharedString {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        lean_string::LeanString::deserialize(deserializer).map(Self)
    }
}
/// Builds a [`SharedString`] from a format string and its arguments.
///
/// Usage mirrors [`format!`], but the result is a [`SharedString`] rather than a [`String`].
/// The arguments are passed to [`SharedString::from_fmt`], which references a plain `'static`
/// string directly instead of formatting it into a new `String`.
#[macro_export]
macro_rules! sformat {
    // Format string only.
    ($template:expr) => {
        $crate::strings::SharedString::from_fmt(::std::format_args!($template))
    };
    // Format string followed by its arguments.
    ($template:expr, $($rest:tt)*) => {
        $crate::strings::SharedString::from_fmt(::std::format_args!($template, $($rest)*))
    };
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `sformat!` must produce the same value as constructing the string directly.
    #[test]
    fn macro_works() {
        let empty = sformat!("");
        assert!(empty.is_empty());

        let expected = SharedString::from_static_str("Hello World");
        assert_eq!(sformat!("Hello World"), expected);
        assert_eq!(sformat!("Hello {}", "World"), expected);
    }
}

View File

@ -0,0 +1,243 @@
use std::{
borrow::Borrow,
fmt::{Arguments, Debug, Display},
hash::Hash,
ops::Deref,
};
use serde::{Deserialize, Serialize};
/// An owned, immutable string with compact storage.
///
/// `UniqueString` is designed for owned, immutable strings that are not cloned frequently.
/// It offers similar characteristics to `Box<str>` but with several key
/// optimizations for improved efficiency and memory usage:
///
/// - **Efficient for Unique Strings:**
///   When strings are not frequently cloned, `UniqueString` can be more performant than
///   reference-counted alternatives like [`SharedString`](super::SharedString) as it avoids the
///   overhead of atomic reference counting.
///
/// - **Small String Optimization (SSO):**
///   For shorter strings, the data is stored directly within the `UniqueString` value itself,
///   avoiding a separate allocation and pointer indirection.
///
/// - **Static String Re-use:**
///   Strings with a `'static` lifetime are directly referenced, avoiding unnecessary copies or
///   allocations.
///
/// - **Niche Optimization:**
///   `UniqueString` allows niche-optimization, meaning that [`Option<UniqueString>`] has the same
///   memory footprint as `UniqueString`.
///
/// - **Compact Size:**
///   `UniqueString` occupies two machine words (16 bytes on 64-bit systems).
///   This is achieved by disregarding the capacity of a `String` since we only hold the string as
///   immutable.
///
/// The size and niche properties are checked at compile time by the constant assertions that
/// follow this definition.
///
/// Internally, `UniqueString` is powered by [`byteyarn::Yarn`], which provides the
/// underlying implementation for these optimizations.
pub struct UniqueString(byteyarn::Yarn);
// Compile-time layout checks: `UniqueString` must be exactly two machine words wide and must
// leave a niche so that `Option<UniqueString>` costs no extra space.
const _: () = {
    let own_size = size_of::<UniqueString>();
    assert!(own_size == size_of::<[usize; 2]>());
    assert!(own_size == size_of::<Option<UniqueString>>());
};
impl UniqueString {
    /// Borrows the full contents as a `&str`.
    #[inline]
    pub fn as_str(&self) -> &str {
        self.0.as_str()
    }

    /// Borrows the contents as raw bytes.
    #[inline]
    pub fn as_bytes(&self) -> &[u8] {
        self.as_str().as_bytes()
    }

    /// Returns the length of the string in bytes.
    #[inline]
    pub fn len(&self) -> usize {
        self.as_str().len()
    }

    /// Returns `true` when the string contains no bytes.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.as_str().is_empty()
    }

    /// Creates a `UniqueString` by handing an owned [`String`] to [`byteyarn::Yarn::from_string`].
    #[inline]
    pub fn from_string(string: String) -> Self {
        let inner = byteyarn::Yarn::from_string(string);
        Self(inner)
    }

    /// Creates a `UniqueString` that refers to the given static slice.
    ///
    /// Prefer this over [`from_string`](Self::from_string) for `'static` data: the slice is handed
    /// to [`byteyarn::Yarn::from_static`] as-is instead of going through an owned `String`.
    #[inline]
    pub fn from_static_str(string: &'static str) -> Self {
        let inner = byteyarn::Yarn::from_static(string);
        Self(inner)
    }

    /// Creates a `UniqueString` from pre-built formatting arguments.
    ///
    /// Obtain an [`Arguments`] value via [`format_args!`]. The [`uformat!`](crate::uformat) macro
    /// calls this function so that it can be used like [`format!`] while producing a
    /// `UniqueString`. The actual construction is delegated to [`byteyarn::Yarn::from_fmt`].
    #[inline]
    pub fn from_fmt(arguments: Arguments) -> Self {
        let inner = byteyarn::Yarn::from_fmt(arguments);
        Self(inner)
    }
}
/// Cheap reference conversion to `&str`.
impl AsRef<str> for UniqueString {
    #[inline]
    fn as_ref(&self) -> &str {
        Self::as_str(self)
    }
}
/// Lets a `UniqueString` be borrowed as a plain `str`.
impl Borrow<str> for UniqueString {
    #[inline]
    fn borrow(&self) -> &str {
        Self::as_str(self)
    }
}
impl Clone for UniqueString {
#[inline]
fn clone(&self) -> Self {
Self(self.0.clone())
}
}
impl Debug for UniqueString {
#[inline]
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
Debug::fmt(&self.0, f)
}
}
impl Default for UniqueString {
#[inline]
fn default() -> Self {
Self(Default::default())
}
}
/// Dereferences to `str`, making all `&str` methods available on a `UniqueString`.
impl Deref for UniqueString {
    type Target = str;

    #[inline]
    fn deref(&self) -> &Self::Target {
        Self::as_str(self)
    }
}
impl Display for UniqueString {
#[inline]
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
Display::fmt(&self.0, f)
}
}
// Marker impl: equality itself is defined by the `PartialEq` impl, which compares the wrapped
// strings.
impl Eq for UniqueString {}
impl From<String> for UniqueString {
#[inline]
fn from(string: String) -> Self {
Self::from_string(string)
}
}
impl From<&'static str> for UniqueString {
#[inline]
fn from(string: &'static str) -> Self {
Self::from_static_str(string)
}
}
impl Hash for UniqueString {
#[inline]
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
self.0.hash(state);
}
}
impl Ord for UniqueString {
#[inline]
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
self.0.cmp(&other.0)
}
}
/// Two `UniqueString`s are equal when their wrapped strings are equal.
impl PartialEq for UniqueString {
    #[inline]
    fn eq(&self, other: &Self) -> bool {
        PartialEq::eq(&self.0, &other.0)
    }
}
/// Partial ordering always succeeds and agrees with the `Ord` impl.
impl PartialOrd for UniqueString {
    #[inline]
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        let ordering = Ord::cmp(self, other);
        Some(ordering)
    }
}
/// Serializes the contents as a plain string.
impl Serialize for UniqueString {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        // Go through `&str` explicitly rather than relying on auto-deref of the wrapped value.
        let text: &str = self.as_str();
        text.serialize(serializer)
    }
}
/// Deserializes an owned `String` and converts it; any deserializer error is passed through.
impl<'de> Deserialize<'de> for UniqueString {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        String::deserialize(deserializer).map(Self::from_string)
    }
}
/// Builds a [`UniqueString`] from a format string and its arguments.
///
/// Usage mirrors [`format!`], but the result is a [`UniqueString`] rather than a [`String`].
/// The arguments are passed to [`UniqueString::from_fmt`], which is intended to optimize for
/// `'static` string literals.
#[macro_export]
macro_rules! uformat {
    // Format string only.
    ($template:expr) => {
        $crate::strings::UniqueString::from_fmt(::std::format_args!($template))
    };
    // Format string followed by its arguments.
    ($template:expr, $($rest:tt)*) => {
        $crate::strings::UniqueString::from_fmt(::std::format_args!($template, $($rest)*))
    };
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `uformat!` must produce the same value as constructing the string directly.
    #[test]
    fn macro_works() {
        let empty = uformat!("");
        assert!(empty.is_empty());

        let expected = UniqueString::from_static_str("Hello World");
        assert_eq!(uformat!("Hello World"), expected);
        assert_eq!(uformat!("Hello {}", "World"), expected);
    }
}