Refactor Record to use a single backing Vec (#12326)

# Description
This shrinks `Record`'s size in half and and allows you to include it in
`Value` without growing the size.

Changing the `Record` internals may have slightly different performance
characteristics as the cache locality changes on lookups (if you
directly need the value, it should be closer, but in other cases may
blow up the cache line budget)

Also different perf characteristics on creation expected. 
`Record::from_raw_cols_vals` now probably worse.

## Benchmarking

Comparison with the main branch (boxed Record) revealed no significant
change to the creation but an improvement when accessing larger N.
The fact that this was more pronounced for nested access (still cloning
before nushell/nushell#12325) leads to the conclusion that this may
still be dominated by the smaller clone necessary for a 24-byte `Record`
over the previous 48 bytes.

# User-Facing Changes

Reduced memory usage
This commit is contained in:
Stefan Holderbach 2024-03-31 00:47:17 +01:00 committed by GitHub
parent 3b8258ae57
commit 0cf7de598b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -6,8 +6,7 @@ use serde::{Deserialize, Serialize};
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct Record {
cols: Vec<String>,
vals: Vec<Value>,
inner: Vec<(String, Value)>,
}
impl Record {
@ -17,8 +16,7 @@ impl Record {
pub fn with_capacity(capacity: usize) -> Self {
Self {
cols: Vec::with_capacity(capacity),
vals: Vec::with_capacity(capacity),
inner: Vec::with_capacity(capacity),
}
}
@ -35,7 +33,8 @@ impl Record {
creation_site_span: Span,
) -> Result<Self, ShellError> {
if cols.len() == vals.len() {
Ok(Self { cols, vals })
let inner = cols.into_iter().zip(vals).collect();
Ok(Self { inner })
} else {
Err(ShellError::RecordColsValsMismatch {
bad_value: input_span,
@ -53,13 +52,11 @@ impl Record {
}
pub fn is_empty(&self) -> bool {
debug_assert_eq!(self.cols.len(), self.vals.len());
self.cols.is_empty()
self.inner.is_empty()
}
pub fn len(&self) -> usize {
debug_assert_eq!(self.cols.len(), self.vals.len());
self.cols.len()
self.inner.len()
}
/// Naive push to the end of the datastructure.
@ -68,8 +65,7 @@ impl Record {
///
/// Consider to use [`Record::insert`] instead
pub fn push(&mut self, col: impl Into<String>, val: Value) {
self.cols.push(col.into());
self.vals.push(val);
self.inner.push((col.into(), val));
}
/// Insert into the record, replacing preexisting value if found.
@ -79,9 +75,7 @@ impl Record {
where
K: AsRef<str> + Into<String>,
{
if let Some(idx) = self.index_of(&col) {
// Can panic if vals.len() < cols.len()
let curr_val = &mut self.vals[idx];
if let Some(curr_val) = self.get_mut(&col) {
Some(std::mem::replace(curr_val, val))
} else {
self.push(col, val);
@ -98,15 +92,19 @@ impl Record {
}
pub fn get(&self, col: impl AsRef<str>) -> Option<&Value> {
self.index_of(col).and_then(|idx| self.vals.get(idx))
self.inner
.iter()
.find_map(|(k, v)| if k == col.as_ref() { Some(v) } else { None })
}
pub fn get_mut(&mut self, col: impl AsRef<str>) -> Option<&mut Value> {
self.index_of(col).and_then(|idx| self.vals.get_mut(idx))
self.inner
.iter_mut()
.find_map(|(k, v)| if k == col.as_ref() { Some(v) } else { None })
}
pub fn get_index(&self, idx: usize) -> Option<(&String, &Value)> {
Some((self.cols.get(idx)?, self.vals.get(idx)?))
self.inner.get(idx).map(|(col, val): &(_, _)| (col, val))
}
/// Remove single value by key
@ -116,8 +114,8 @@ impl Record {
/// Note: makes strong assumption that keys are unique
pub fn remove(&mut self, col: impl AsRef<str>) -> Option<Value> {
let idx = self.index_of(col)?;
self.cols.remove(idx);
Some(self.vals.remove(idx))
let (_, val) = self.inner.remove(idx);
Some(val)
}
/// Remove elements in-place that do not satisfy `keep`
@ -185,33 +183,7 @@ impl Record {
where
F: FnMut(&str, &mut Value) -> bool,
{
// `Vec::retain` is able to optimize memcopies internally.
// For maximum benefit, `retain` is used on `vals`,
// as `Value` is a larger struct than `String`.
//
// To do a simultaneous retain on the `cols`, three portions of it are tracked:
// [..retained, ..dropped, ..unvisited]
// number of elements keep so far, start of ..dropped and length of ..retained
let mut retained = 0;
// current index of element being checked, start of ..unvisited
let mut idx = 0;
self.vals.retain_mut(|val| {
if keep(&self.cols[idx], val) {
// skip swaps for first consecutive run of kept elements
if idx != retained {
self.cols.swap(idx, retained);
}
retained += 1;
idx += 1;
true
} else {
idx += 1;
false
}
});
self.cols.truncate(retained);
self.inner.retain_mut(|(col, val)| keep(col, val));
}
/// Truncate record to the first `len` elements.
@ -235,25 +207,24 @@ impl Record {
/// assert_eq!(rec.len(), 0);
/// ```
pub fn truncate(&mut self, len: usize) {
self.cols.truncate(len);
self.vals.truncate(len);
self.inner.truncate(len);
}
pub fn columns(&self) -> Columns {
Columns {
iter: self.cols.iter(),
iter: self.inner.iter(),
}
}
pub fn values(&self) -> Values {
Values {
iter: self.vals.iter(),
iter: self.inner.iter(),
}
}
pub fn into_values(self) -> IntoValues {
IntoValues {
iter: self.vals.into_iter(),
iter: self.inner.into_iter(),
}
}
@ -282,19 +253,18 @@ impl Record {
where
R: RangeBounds<usize> + Clone,
{
debug_assert_eq!(self.cols.len(), self.vals.len());
Drain {
keys: self.cols.drain(range.clone()),
values: self.vals.drain(range),
iter: self.inner.drain(range),
}
}
}
impl FromIterator<(String, Value)> for Record {
fn from_iter<T: IntoIterator<Item = (String, Value)>>(iter: T) -> Self {
let (cols, vals) = iter.into_iter().unzip();
// TODO: should this check for duplicate keys/columns?
Self { cols, vals }
Self {
inner: iter.into_iter().collect(),
}
}
}
@ -308,7 +278,7 @@ impl Extend<(String, Value)> for Record {
}
pub struct IntoIter {
iter: std::iter::Zip<std::vec::IntoIter<String>, std::vec::IntoIter<Value>>,
iter: std::vec::IntoIter<(String, Value)>,
}
impl Iterator for IntoIter {
@ -338,26 +308,26 @@ impl IntoIterator for Record {
fn into_iter(self) -> Self::IntoIter {
IntoIter {
iter: self.cols.into_iter().zip(self.vals),
iter: self.inner.into_iter(),
}
}
}
pub struct Iter<'a> {
iter: std::iter::Zip<std::slice::Iter<'a, String>, std::slice::Iter<'a, Value>>,
iter: std::slice::Iter<'a, (String, Value)>,
}
impl<'a> Iterator for Iter<'a> {
type Item = (&'a String, &'a Value);
fn next(&mut self) -> Option<Self::Item> {
self.iter.next()
self.iter.next().map(|(col, val): &(_, _)| (col, val))
}
}
impl<'a> DoubleEndedIterator for Iter<'a> {
fn next_back(&mut self) -> Option<Self::Item> {
self.iter.next_back()
self.iter.next_back().map(|(col, val): &(_, _)| (col, val))
}
}
@ -374,26 +344,26 @@ impl<'a> IntoIterator for &'a Record {
fn into_iter(self) -> Self::IntoIter {
Iter {
iter: self.cols.iter().zip(&self.vals),
iter: self.inner.iter(),
}
}
}
pub struct IterMut<'a> {
iter: std::iter::Zip<std::slice::Iter<'a, String>, std::slice::IterMut<'a, Value>>,
iter: std::slice::IterMut<'a, (String, Value)>,
}
impl<'a> Iterator for IterMut<'a> {
type Item = (&'a String, &'a mut Value);
fn next(&mut self) -> Option<Self::Item> {
self.iter.next()
self.iter.next().map(|(col, val)| (&*col, val))
}
}
impl<'a> DoubleEndedIterator for IterMut<'a> {
fn next_back(&mut self) -> Option<Self::Item> {
self.iter.next_back()
self.iter.next_back().map(|(col, val)| (&*col, val))
}
}
@ -410,20 +380,20 @@ impl<'a> IntoIterator for &'a mut Record {
fn into_iter(self) -> Self::IntoIter {
IterMut {
iter: self.cols.iter().zip(&mut self.vals),
iter: self.inner.iter_mut(),
}
}
}
pub struct Columns<'a> {
iter: std::slice::Iter<'a, String>,
iter: std::slice::Iter<'a, (String, Value)>,
}
impl<'a> Iterator for Columns<'a> {
type Item = &'a String;
fn next(&mut self) -> Option<Self::Item> {
self.iter.next()
self.iter.next().map(|(col, _)| col)
}
fn size_hint(&self) -> (usize, Option<usize>) {
@ -433,7 +403,7 @@ impl<'a> Iterator for Columns<'a> {
impl<'a> DoubleEndedIterator for Columns<'a> {
fn next_back(&mut self) -> Option<Self::Item> {
self.iter.next_back()
self.iter.next_back().map(|(col, _)| col)
}
}
@ -444,14 +414,14 @@ impl<'a> ExactSizeIterator for Columns<'a> {
}
pub struct Values<'a> {
iter: std::slice::Iter<'a, Value>,
iter: std::slice::Iter<'a, (String, Value)>,
}
impl<'a> Iterator for Values<'a> {
type Item = &'a Value;
fn next(&mut self) -> Option<Self::Item> {
self.iter.next()
self.iter.next().map(|(_, val)| val)
}
fn size_hint(&self) -> (usize, Option<usize>) {
@ -461,7 +431,7 @@ impl<'a> Iterator for Values<'a> {
impl<'a> DoubleEndedIterator for Values<'a> {
fn next_back(&mut self) -> Option<Self::Item> {
self.iter.next_back()
self.iter.next_back().map(|(_, val)| val)
}
}
@ -472,14 +442,14 @@ impl<'a> ExactSizeIterator for Values<'a> {
}
pub struct IntoValues {
iter: std::vec::IntoIter<Value>,
iter: std::vec::IntoIter<(String, Value)>,
}
impl Iterator for IntoValues {
type Item = Value;
fn next(&mut self) -> Option<Self::Item> {
self.iter.next()
self.iter.next().map(|(_, val)| val)
}
fn size_hint(&self) -> (usize, Option<usize>) {
@ -489,7 +459,7 @@ impl Iterator for IntoValues {
impl DoubleEndedIterator for IntoValues {
fn next_back(&mut self) -> Option<Self::Item> {
self.iter.next_back()
self.iter.next_back().map(|(_, val)| val)
}
}
@ -500,31 +470,30 @@ impl ExactSizeIterator for IntoValues {
}
pub struct Drain<'a> {
keys: std::vec::Drain<'a, String>,
values: std::vec::Drain<'a, Value>,
iter: std::vec::Drain<'a, (String, Value)>,
}
impl Iterator for Drain<'_> {
type Item = (String, Value);
fn next(&mut self) -> Option<Self::Item> {
Some((self.keys.next()?, self.values.next()?))
self.iter.next()
}
fn size_hint(&self) -> (usize, Option<usize>) {
self.keys.size_hint()
self.iter.size_hint()
}
}
impl DoubleEndedIterator for Drain<'_> {
fn next_back(&mut self) -> Option<Self::Item> {
Some((self.keys.next_back()?, self.values.next_back()?))
self.iter.next_back()
}
}
impl ExactSizeIterator for Drain<'_> {
fn len(&self) -> usize {
self.keys.len()
self.iter.len()
}
}