Add LRU regex cache (#7587)

Closes #7572 by adding a cache for compiled regexes of type
`Arc<Mutex<LruCache<String, Regex>>>` to `EngineState`.

The cache is limited to 100 entries (limit chosen arbitrarily) and
evicts least-recently-used items first.

This PR makes a noticeable difference when using regexes for
`color_config`, e.g.:
```bash
# first set string formatting in config.nu like:
string: { if $in =~ '^#\w{6}$' { $in } else { 'white' } }

# then try displaying and exploring a table with many strings
# this is instant after the PR, but took hundreds of milliseconds before it
['#ff0033', '#0025ee', '#0087aa', 'string', '#4101ff', '#ff0033', '#0025ee', '#0087aa', 'string', '#6103ff', '#ff0033', '#0025ee', '#0087aa', 'string', '#6103ff', '#ff0033', '#0025ee', '#0087aa', 'string', '#6103ff', '#ff0033', '#0025ee', '#0087aa', 'string', '#6103ff','#ff0033', '#0025ee', '#0087aa', 'string', '#6103ff','#ff0033', '#0025ee', '#0087aa', 'string', '#6103ff','#ff0033', '#0025ee', '#0087aa', 'string', '#6103ff','#ff0033', '#0025ee', '#0087aa', 'string', '#6103ff','#ff0033', '#0025ee', '#0087aa', 'string', '#6103ff']
```

## New dependency (`lru`)
This uses [the popular `lru` crate](https://lib.rs/crates/lru). The new
dependency adds 19.8KB to a Linux release build of Nushell. I think this
is OK, especially since the crate can be useful elsewhere in Nu.
This commit is contained in:
Reilly Wood 2022-12-23 14:30:04 -08:00 committed by GitHub
parent 3be7996e79
commit a43e66ef92
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 58 additions and 14 deletions

10
Cargo.lock generated
View File

@ -2110,6 +2110,15 @@ dependencies = [
"cfg-if 1.0.0", "cfg-if 1.0.0",
] ]
[[package]]
name = "lru"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6e8aaa3f231bb4bd57b84b2d5dc3ae7f350265df8aa96492e0bc394a1571909"
dependencies = [
"hashbrown",
]
[[package]] [[package]]
name = "lscolors" name = "lscolors"
version = "0.12.0" version = "0.12.0"
@ -2778,6 +2787,7 @@ dependencies = [
"chrono-humanize", "chrono-humanize",
"fancy-regex", "fancy-regex",
"indexmap", "indexmap",
"lru",
"miette", "miette",
"nu-json", "nu-json",
"nu-test-support", "nu-test-support",

View File

@ -402,9 +402,11 @@ pub fn eval_expression(
Comparison::NotEqual => lhs.ne(op_span, &rhs, expr.span), Comparison::NotEqual => lhs.ne(op_span, &rhs, expr.span),
Comparison::In => lhs.r#in(op_span, &rhs, expr.span), Comparison::In => lhs.r#in(op_span, &rhs, expr.span),
Comparison::NotIn => lhs.not_in(op_span, &rhs, expr.span), Comparison::NotIn => lhs.not_in(op_span, &rhs, expr.span),
Comparison::RegexMatch => lhs.regex_match(op_span, &rhs, false, expr.span), Comparison::RegexMatch => {
lhs.regex_match(engine_state, op_span, &rhs, false, expr.span)
}
Comparison::NotRegexMatch => { Comparison::NotRegexMatch => {
lhs.regex_match(op_span, &rhs, true, expr.span) lhs.regex_match(engine_state, op_span, &rhs, true, expr.span)
} }
Comparison::StartsWith => lhs.starts_with(op_span, &rhs, expr.span), Comparison::StartsWith => lhs.starts_with(op_span, &rhs, expr.span),
Comparison::EndsWith => lhs.ends_with(op_span, &rhs, expr.span), Comparison::EndsWith => lhs.ends_with(op_span, &rhs, expr.span),

View File

@ -18,6 +18,7 @@ chrono = { version="0.4.23", features= ["serde", "std"], default-features = fals
chrono-humanize = "0.2.1" chrono-humanize = "0.2.1"
fancy-regex = "0.10.0" fancy-regex = "0.10.0"
indexmap = { version="1.7" } indexmap = { version="1.7" }
lru = "0.8.1"
miette = { version = "5.1.0", features = ["fancy-no-backtrace"] } miette = { version = "5.1.0", features = ["fancy-no-backtrace"] }
num-format = "0.4.3" num-format = "0.4.3"
serde = {version = "1.0.143", default-features = false } serde = {version = "1.0.143", default-features = false }

View File

@ -1,3 +1,6 @@
use fancy_regex::Regex;
use lru::LruCache;
use super::{Command, EnvVars, OverlayFrame, ScopeFrame, Stack, Visibility, DEFAULT_OVERLAY_NAME}; use super::{Command, EnvVars, OverlayFrame, ScopeFrame, Stack, Visibility, DEFAULT_OVERLAY_NAME};
use crate::Value; use crate::Value;
use crate::{ use crate::{
@ -6,6 +9,7 @@ use crate::{
}; };
use core::panic; use core::panic;
use std::borrow::Borrow; use std::borrow::Borrow;
use std::num::NonZeroUsize;
use std::path::Path; use std::path::Path;
use std::path::PathBuf; use std::path::PathBuf;
use std::{ use std::{
@ -94,8 +98,12 @@ pub struct EngineState {
pub history_session_id: i64, pub history_session_id: i64,
// If Nushell was started, e.g., with `nu spam.nu`, the file's parent is stored here // If Nushell was started, e.g., with `nu spam.nu`, the file's parent is stored here
pub currently_parsed_cwd: Option<PathBuf>, pub currently_parsed_cwd: Option<PathBuf>,
pub regex_cache: Arc<Mutex<LruCache<String, Regex>>>,
} }
// The max number of compiled regexes to keep around in a LRU cache, arbitrarily chosen
const REGEX_CACHE_SIZE: usize = 100; // must be nonzero, otherwise will panic
pub const NU_VARIABLE_ID: usize = 0; pub const NU_VARIABLE_ID: usize = 0;
pub const IN_VARIABLE_ID: usize = 1; pub const IN_VARIABLE_ID: usize = 1;
pub const ENV_VARIABLE_ID: usize = 2; pub const ENV_VARIABLE_ID: usize = 2;
@ -137,6 +145,9 @@ impl EngineState {
config_path: HashMap::new(), config_path: HashMap::new(),
history_session_id: 0, history_session_id: 0,
currently_parsed_cwd: None, currently_parsed_cwd: None,
regex_cache: Arc::new(Mutex::new(LruCache::new(
NonZeroUsize::new(REGEX_CACHE_SIZE).expect("tried to create cache of size zero"),
))),
} }
} }

View File

@ -7,6 +7,7 @@ mod unit;
use crate::ast::{Bits, Boolean, CellPath, Comparison, PathMember}; use crate::ast::{Bits, Boolean, CellPath, Comparison, PathMember};
use crate::ast::{Math, Operator}; use crate::ast::{Math, Operator};
use crate::engine::EngineState;
use crate::ShellError; use crate::ShellError;
use crate::{did_you_mean, BlockId, Config, Span, Spanned, Type, VarId}; use crate::{did_you_mean, BlockId, Config, Span, Spanned, Type, VarId};
use byte_unit::ByteUnit; use byte_unit::ByteUnit;
@ -2627,6 +2628,7 @@ impl Value {
pub fn regex_match( pub fn regex_match(
&self, &self,
engine_state: &EngineState,
op: Span, op: Span,
rhs: &Value, rhs: &Value,
invert: bool, invert: bool,
@ -2640,9 +2642,10 @@ impl Value {
span: rhs_span, span: rhs_span,
}, },
) => { ) => {
// We are leaving some performance on the table by compiling the regex every time. let is_match = match engine_state.regex_cache.try_lock() {
// Small regexes compile in microseconds, and the simplicity of this approach currently Ok(mut cache) => match cache.get(rhs) {
// outweighs the performance costs. Revisit this if it ever becomes a bottleneck. Some(regex) => regex.is_match(lhs),
None => {
let regex = Regex::new(rhs).map_err(|e| { let regex = Regex::new(rhs).map_err(|e| {
ShellError::UnsupportedInput( ShellError::UnsupportedInput(
format!("{e}"), format!("{e}"),
@ -2651,7 +2654,24 @@ impl Value {
*rhs_span, *rhs_span,
) )
})?; })?;
let is_match = regex.is_match(lhs); let ret = regex.is_match(lhs);
cache.put(rhs.clone(), regex);
ret
}
},
Err(_) => {
let regex = Regex::new(rhs).map_err(|e| {
ShellError::UnsupportedInput(
format!("{e}"),
"value originated from here".into(),
span,
*rhs_span,
)
})?;
regex.is_match(lhs)
}
};
Ok(Value::Bool { Ok(Value::Bool {
val: if invert { val: if invert {
!is_match.unwrap_or(false) !is_match.unwrap_or(false)