diff --git a/Cargo.lock b/Cargo.lock
index 1bbd8fa3f6..c589f13d91 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -815,6 +815,15 @@ dependencies = [
  "serde",
 ]
 
+[[package]]
+name = "buf-trait"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "21eaafc770e8c073d6c3facafe7617e774305d4954aa6351b9c452eb37ee17b4"
+dependencies = [
+ "zerocopy 0.7.35",
+]
+
 [[package]]
 name = "bumpalo"
 version = "3.16.0"
@@ -878,6 +887,15 @@ version = "1.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2e93abca9e28e0a1b9877922aacb20576e05d4679ffa78c3d6dc22a26a216659"
 
+[[package]]
+name = "byteyarn"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b93e51d26468a15ea59f8525e0c13dc405db43e644a0b1e6d44346c72cf4cf7b"
+dependencies = [
+ "buf-trait",
+]
+
 [[package]]
 name = "calamine"
 version = "0.28.0"
@@ -3048,6 +3066,18 @@ version = "1.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
 
+[[package]]
+name = "lean_string"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75e49998bd00bfb567a44a6d3c3d1a970ce35665b5ebf68d64e5e96410be58ae"
+dependencies = [
+ "castaway",
+ "itoa",
+ "ryu",
+ "serde",
+]
+
 [[package]]
 name = "libc"
 version = "0.2.174"
@@ -4234,9 +4264,11 @@ dependencies = [
 name = "nu-utils"
 version = "0.106.2"
 dependencies = [
+ "byteyarn",
  "crossterm",
  "crossterm_winapi",
  "fancy-regex",
+ "lean_string",
  "log",
  "lscolors",
  "nix 0.29.0",
diff --git a/Cargo.toml b/Cargo.toml
index 8db830be13..592279dae1 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -72,6 +72,7 @@ brotli = "7.0"
 byteorder = "1.5"
 bytes = "1"
 bytesize = "1.3.3"
+byteyarn = "0.5"
 calamine = "0.28"
 chardetng = "0.1.17"
 chrono = { default-features = false, version = "0.4.34" }
@@ -100,6 +101,7 @@ indicatif = "0.17"
 interprocess = "2.2.0"
 is_executable = "1.0"
 itertools = "0.14"
+lean_string = { version = "0.5", features = ["serde"] }
 libc = "0.2"
 libproc = "0.14"
 log = "0.4"
diff --git a/crates/nu-utils/Cargo.toml b/crates/nu-utils/Cargo.toml
index ac602878ed..359984fa7a 100644
--- a/crates/nu-utils/Cargo.toml
+++ b/crates/nu-utils/Cargo.toml
@@ -17,8 +17,10 @@ bench = false
 bench = false
 
 [dependencies]
+byteyarn.workspace = true
 crossterm = { workspace = true, optional = true }
 fancy-regex = { workspace = true }
+lean_string.workspace = true
 lscolors = { workspace = true, default-features = false, features = ["nu-ansi-term"] }
 log = { workspace = true }
 num-format = { workspace = true }
diff --git a/crates/nu-utils/src/lib.rs b/crates/nu-utils/src/lib.rs
index ff52f557a1..d8a7db6975 100644
--- a/crates/nu-utils/src/lib.rs
+++ b/crates/nu-utils/src/lib.rs
@@ -8,6 +8,7 @@ pub mod float;
 pub mod locale;
 mod quoting;
 mod shared_cow;
+pub mod strings;
 pub mod utils;
 
 pub use locale::get_system_locale;
diff --git a/crates/nu-utils/src/strings/mod.rs b/crates/nu-utils/src/strings/mod.rs
new file mode 100644
index 0000000000..450e5eb8be
--- /dev/null
+++ b/crates/nu-utils/src/strings/mod.rs
@@ -0,0 +1,40 @@
+//! String utility types with specific semantics.
+//!
+//! This module provides additional string types optimized for certain use-cases, offering
+//! alternatives to standard [`String`] or [`&str`](str) when specific performance characteristics
+//! are desired.
+//!
+//! The pipeline-based, functional programming model we use leads to frequent string operations
+//! that involve immutability and often copying.
+//! These specialized string types can provide performance and memory benefits for these cases.
+//!
+//! ## Choosing a String Type: `SharedString` vs. `UniqueString`
+//!
+//! ### Use [`SharedString`] when:
+//!
+//! `SharedString` is an owned, immutable string type optimized for **frequent cloning and sharing**.
+//! Cloning it is very inexpensive (a pointer copy and atomic reference count increment), avoiding
+//! deep copies of the string data.
+//! It also benefits from Small String Optimization (SSO) and static string re-use.
+//!
+//! **Ideal for:** Strings that need to be duplicated or passed by value frequently across
+//! pipeline stages or within complex data structures, where many references to the same string
+//! data are expected.
+//!
+//! ### Use [`UniqueString`] when:
+//!
+//! `UniqueString` is an owned, immutable string type optimized for
+//! **strings that are primarily unique** or rarely cloned.
+//! Cloning a `UniqueString` always involves copying the underlying string data.
+//! Its advantage lies in avoiding the overhead of atomic reference counting.
+//! It also benefits from Small String Optimization (SSO) and static string re-use.
+//!
+//! **Ideal for:** Strings that are created and consumed locally, or represent unique identifiers
+//! that are not expected to be duplicated across many ownership boundaries.
+//! Choose it when the cost of copying on infrequent clones is acceptable.
+
+mod shared;
+mod unique;
+
+pub use shared::SharedString;
+pub use unique::UniqueString;
diff --git a/crates/nu-utils/src/strings/shared.rs b/crates/nu-utils/src/strings/shared.rs
new file mode 100644
index 0000000000..3496dc7e03
--- /dev/null
+++ b/crates/nu-utils/src/strings/shared.rs
@@ -0,0 +1,245 @@
+use std::{
+    borrow::Borrow,
+    fmt::{Arguments, Debug, Display},
+    hash::Hash,
+    ops::Deref,
+};
+
+use serde::{Deserialize, Serialize};
+
+/// An owned, immutable string with compact storage and efficient cloning.
+///
+/// `SharedString` is designed for owned, immutable strings that are frequently cloned.
+/// It offers similar characteristics to [`Arc<str>`](std::sync::Arc) but with several key
+/// optimizations for improved efficiency and memory usage:
+///
+/// - **Efficient Cloning:**
+///   Cloning a `SharedString` is very inexpensive.
+///   It typically involves just a pointer copy and an atomic reference count increment, without
+///   copying the actual string data.
+///
+/// - **Small String Optimization (SSO):**
+///   For shorter strings, the data is stored directly within the `SharedString` struct, keeping
+///   the data on the stack and avoiding indirection.
+///
+/// - **Static String Re-use:**
+///   Strings with a `'static` lifetime are directly referenced, avoiding unnecessary copies or
+///   allocations.
+///
+/// - **Niche Optimization:**
+///   `SharedString` allows niche-optimization, meaning that [`Option<SharedString>`] has the same
+///   memory footprint as `SharedString`.
+///
+/// - **Compact Size:**
+///   On 64-bit systems, `SharedString` is 16 bytes.
+///   This is achieved by disregarding the capacity of a `String` since we only hold the string as
+///   immutable.
+///
+/// Internally, `SharedString` is powered by [`lean_string::LeanString`], which provides the
+/// underlying implementation for these optimizations.
+pub struct SharedString(lean_string::LeanString);
+
+const _: () = const {
+    assert!(size_of::<SharedString>() == size_of::<[usize; 2]>());
+    assert!(size_of::<SharedString>() == size_of::<Option<SharedString>>());
+};
+
+impl SharedString {
+    /// Returns a string slice containing the entire `SharedString`.
+    #[inline]
+    pub fn as_str(&self) -> &str {
+        self.0.as_str()
+    }
+
+    /// Returns a byte slice of this `SharedString`'s contents.
+    #[inline]
+    pub fn as_bytes(&self) -> &[u8] {
+        self.0.as_bytes()
+    }
+
+    /// Returns the length of this `SharedString`, in bytes.
+    #[inline]
+    pub fn len(&self) -> usize {
+        self.0.len()
+    }
+
+    /// Returns `true` if the `SharedString` has a length of 0, `false` otherwise.
+    #[inline]
+    pub fn is_empty(&self) -> bool {
+        self.0.is_empty()
+    }
+
+    /// Returns a `SharedString` by taking ownership of an allocation.
+    #[inline]
+    pub fn from_string(string: String) -> Self {
+        Self(lean_string::LeanString::from(string))
+    }
+
+    /// Returns a `SharedString` pointing to the given slice, without copying.
+    ///
+    /// By using this function instead of [`from_string`](Self::from_string), we can avoid any
+    /// copying and always refer to the provided static string slice.
+    #[inline]
+    pub fn from_static_str(string: &'static str) -> Self {
+        Self(lean_string::LeanString::from_static_str(string))
+    }
+
+    /// Builds a new `SharedString` from the given formatting arguments.
+    ///
+    /// You can get an [`Arguments`] instance by calling [`format_args!`].
+    /// This function is used when using [`sformat!`](crate::sformat) instead of [`format!`] to
+    /// create a `SharedString`.
+    #[inline]
+    pub fn from_fmt(arguments: Arguments<'_>) -> Self {
+        match arguments.as_str() {
+            Some(static_str) => Self::from_static_str(static_str),
+            None => Self::from_string(std::fmt::format(arguments)),
+        }
+    }
+}
+
+impl AsRef<str> for SharedString {
+    #[inline]
+    fn as_ref(&self) -> &str {
+        self.as_str()
+    }
+}
+
+impl Borrow<str> for SharedString {
+    #[inline]
+    fn borrow(&self) -> &str {
+        self.as_str()
+    }
+}
+
+impl Clone for SharedString {
+    #[inline]
+    fn clone(&self) -> Self {
+        Self(self.0.clone())
+    }
+}
+
+impl Debug for SharedString {
+    #[inline]
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        Debug::fmt(&self.0, f)
+    }
+}
+
+impl Default for SharedString {
+    #[inline]
+    fn default() -> Self {
+        Self(Default::default())
+    }
+}
+
+impl Deref for SharedString {
+    type Target = str;
+
+    #[inline]
+    fn deref(&self) -> &Self::Target {
+        self.as_str()
+    }
+}
+
+impl Display for SharedString {
+    #[inline]
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        Display::fmt(&self.0, f)
+    }
+}
+
+impl Eq for SharedString {}
+
+impl From<String> for SharedString {
+    #[inline]
+    fn from(string: String) -> Self {
+        Self::from_string(string)
+    }
+}
+
+impl From<&'static str> for SharedString {
+    #[inline]
+    fn from(string: &'static str) -> Self {
+        Self::from_static_str(string)
+    }
+}
+
+impl Hash for SharedString {
+    #[inline]
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        self.0.hash(state);
+    }
+}
+
+impl Ord for SharedString {
+    #[inline]
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        self.0.cmp(&other.0)
+    }
+}
+
+impl PartialEq for SharedString {
+    #[inline]
+    fn eq(&self, other: &Self) -> bool {
+        self.0 == other.0
+    }
+}
+
+impl PartialOrd for SharedString {
+    #[inline]
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Serialize for SharedString {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        self.0.serialize(serializer)
+    }
+}
+
+impl<'de> Deserialize<'de> for SharedString {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        Ok(Self(lean_string::LeanString::deserialize(deserializer)?))
+    }
+}
+
+/// A macro for creating [`SharedString`] instances from format arguments.
+///
+/// This macro works similarly to [`format!`] but returns a [`SharedString`] instead of a [`String`].
+/// It attempts to optimize for `'static` string literals.
+#[macro_export]
+macro_rules! sformat {
+    ($fmt:expr) => {
+        $crate::strings::SharedString::from_fmt(::std::format_args!($fmt))
+    };
+
+    ($fmt:expr, $($args:tt)*) => {
+        $crate::strings::SharedString::from_fmt(::std::format_args!($fmt, $($args)*))
+    };
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn macro_works() {
+        assert!(sformat!("").is_empty());
+        assert_eq!(
+            sformat!("Hello World"),
+            SharedString::from_static_str("Hello World")
+        );
+        assert_eq!(
+            sformat!("Hello {}", "World"),
+            SharedString::from_static_str("Hello World")
+        );
+    }
+}
diff --git a/crates/nu-utils/src/strings/unique.rs b/crates/nu-utils/src/strings/unique.rs
new file mode 100644
index 0000000000..66a2bd7186
--- /dev/null
+++ b/crates/nu-utils/src/strings/unique.rs
@@ -0,0 +1,243 @@
+use std::{
+    borrow::Borrow,
+    fmt::{Arguments, Debug, Display},
+    hash::Hash,
+    ops::Deref,
+};
+
+use serde::{Deserialize, Serialize};
+
+/// An owned, immutable string with compact storage.
+///
+/// `UniqueString` is designed for owned, immutable strings that are not frequently cloned.
+/// It offers similar characteristics to `Box<str>` but with several key
+/// optimizations for improved efficiency and memory usage:
+///
+/// - **Efficient for Unique Strings:**
+///   When strings are not frequently cloned, `UniqueString` can be more performant than
+///   reference-counted alternatives like [`SharedString`](super::SharedString) as it avoids the
+///   overhead of atomic reference counting.
+///
+/// - **Small String Optimization (SSO):**
+///   For shorter strings, the data is stored directly within the `UniqueString` struct, keeping
+///   the data on the stack and avoiding indirection.
+///
+/// - **Static String Re-use:**
+///   Strings with a `'static` lifetime are directly referenced, avoiding unnecessary copies or
+///   allocations.
+///
+/// - **Niche Optimization:**
+///   `UniqueString` allows niche-optimization, meaning that [`Option<UniqueString>`] has the same
+///   memory footprint as `UniqueString`.
+///
+/// - **Compact Size:**
+///   On 64-bit systems, `UniqueString` is 16 bytes.
+///   This is achieved by disregarding the capacity of a `String` since we only hold the string as
+///   immutable.
+///
+/// Internally, `UniqueString` is powered by [`byteyarn::Yarn`], which provides the
+/// underlying implementation for these optimizations.
+pub struct UniqueString(byteyarn::Yarn);
+
+const _: () = const {
+    assert!(size_of::<UniqueString>() == size_of::<[usize; 2]>());
+    assert!(size_of::<UniqueString>() == size_of::<Option<UniqueString>>());
+};
+
+impl UniqueString {
+    /// Returns a string slice containing the entire `UniqueString`.
+    #[inline]
+    pub fn as_str(&self) -> &str {
+        self.0.as_str()
+    }
+
+    /// Returns a byte slice of this `UniqueString`'s contents.
+    #[inline]
+    pub fn as_bytes(&self) -> &[u8] {
+        self.0.as_bytes()
+    }
+
+    /// Returns the length of this `UniqueString`, in bytes.
+    #[inline]
+    pub fn len(&self) -> usize {
+        self.0.len()
+    }
+
+    /// Returns `true` if the `UniqueString` has a length of 0, `false` otherwise.
+    #[inline]
+    pub fn is_empty(&self) -> bool {
+        self.0.is_empty()
+    }
+
+    /// Returns a `UniqueString` by taking ownership of an allocation.
+    #[inline]
+    pub fn from_string(string: String) -> Self {
+        Self(byteyarn::Yarn::from_string(string))
+    }
+
+    /// Returns a `UniqueString` pointing to the given slice, without copying.
+    ///
+    /// By using this function instead of [`from_string`](Self::from_string), we can avoid any
+    /// copying and always refer to the provided static string slice.
+    #[inline]
+    pub fn from_static_str(string: &'static str) -> Self {
+        Self(byteyarn::Yarn::from_static(string))
+    }
+
+    /// Builds a new `UniqueString` from the given formatting arguments.
+    ///
+    /// You can get an [`Arguments`] instance by calling [`format_args!`].
+    /// This function is used when using [`uformat!`](crate::uformat) instead of [`format!`] to
+    /// create a `UniqueString`.
+    #[inline]
+    pub fn from_fmt(arguments: Arguments<'_>) -> Self {
+        Self(byteyarn::Yarn::from_fmt(arguments))
+    }
+}
+
+impl AsRef<str> for UniqueString {
+    #[inline]
+    fn as_ref(&self) -> &str {
+        self.as_str()
+    }
+}
+
+impl Borrow<str> for UniqueString {
+    #[inline]
+    fn borrow(&self) -> &str {
+        self.as_str()
+    }
+}
+
+impl Clone for UniqueString {
+    #[inline]
+    fn clone(&self) -> Self {
+        Self(self.0.clone())
+    }
+}
+
+impl Debug for UniqueString {
+    #[inline]
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        Debug::fmt(&self.0, f)
+    }
+}
+
+impl Default for UniqueString {
+    #[inline]
+    fn default() -> Self {
+        Self(Default::default())
+    }
+}
+
+impl Deref for UniqueString {
+    type Target = str;
+
+    #[inline]
+    fn deref(&self) -> &Self::Target {
+        self.as_str()
+    }
+}
+
+impl Display for UniqueString {
+    #[inline]
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        Display::fmt(&self.0, f)
+    }
+}
+
+impl Eq for UniqueString {}
+
+impl From<String> for UniqueString {
+    #[inline]
+    fn from(string: String) -> Self {
+        Self::from_string(string)
+    }
+}
+
+impl From<&'static str> for UniqueString {
+    #[inline]
+    fn from(string: &'static str) -> Self {
+        Self::from_static_str(string)
+    }
+}
+
+impl Hash for UniqueString {
+    #[inline]
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        self.0.hash(state);
+    }
+}
+
+impl Ord for UniqueString {
+    #[inline]
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        self.0.cmp(&other.0)
+    }
+}
+
+impl PartialEq for UniqueString {
+    #[inline]
+    fn eq(&self, other: &Self) -> bool {
+        self.0 == other.0
+    }
+}
+
+impl PartialOrd for UniqueString {
+    #[inline]
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Serialize for UniqueString {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        self.0.serialize(serializer)
+    }
+}
+
+impl<'de> Deserialize<'de> for UniqueString {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        let s = String::deserialize(deserializer)?;
+        Ok(Self::from_string(s))
+    }
+}
+
+/// A macro for creating [`UniqueString`] instances from format arguments.
+///
+/// This macro works similarly to [`format!`] but returns a [`UniqueString`] instead of a [`String`].
+/// It attempts to optimize for `'static` string literals.
+#[macro_export]
+macro_rules! uformat {
+    ($fmt:expr) => {
+        $crate::strings::UniqueString::from_fmt(::std::format_args!($fmt))
+    };
+
+    ($fmt:expr, $($args:tt)*) => {
+        $crate::strings::UniqueString::from_fmt(::std::format_args!($fmt, $($args)*))
+    };
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn macro_works() {
+        assert!(uformat!("").is_empty());
+        assert_eq!(
+            uformat!("Hello World"),
+            UniqueString::from_static_str("Hello World")
+        );
+        assert_eq!(
+            uformat!("Hello {}", "World"),
+            UniqueString::from_static_str("Hello World")
+        );
+    }
+}