From fae118a46ba23da5aed9f4436e16ba7677ecbb84 Mon Sep 17 00:00:00 2001 From: Patrick Date: Fri, 18 Mar 2022 12:37:27 +0100 Subject: [PATCH] Improve fuzzy search (#279) * Add SearchMode fzf. Add a new search mode "fzf" that tries to mimic the search syntax of https://github.com/junegunn/fzf#search-syntax This search mode splits the query into terms where each term is matched individually. Terms can have operators like prefix, suffix, exact match only and can be inverted. Additionally, smart-case matching is performed: if a term contains a non-lowercase letter the match will be case-sensitive. * PR feedback. - Use SearchMode::Fuzzy instead of SearchMode::Fzf - update docs - re-order tests so previous fuzzy tests come first, add more tests for each operator * PR comments: remove named arguments, match expression * PR comments: macro -> async func --- Cargo.lock | 1 + atuin-client/Cargo.toml | 1 + atuin-client/src/database.rs | 262 ++++++++++++++++++++++++++--------- atuin-client/src/ordering.rs | 1 + docs/config.md | 26 +++- 5 files changed, 226 insertions(+), 65 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 01e53383..5ef50935 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -116,6 +116,7 @@ dependencies = [ "minspan", "parse_duration", "rand 0.8.5", + "regex", "reqwest", "rmp-serde", "rust-crypto", diff --git a/atuin-client/Cargo.toml b/atuin-client/Cargo.toml index d6ad4ced..54c5127d 100644 --- a/atuin-client/Cargo.toml +++ b/atuin-client/Cargo.toml @@ -47,3 +47,4 @@ sqlx = { version = "0.5", features = [ "sqlite", ] } minspan = "0.1.1" +regex = "1.5.4" diff --git a/atuin-client/src/database.rs b/atuin-client/src/database.rs index 3cb89472..3ca61717 100644 --- a/atuin-client/src/database.rs +++ b/atuin-client/src/database.rs @@ -7,6 +7,7 @@ use chrono::Utc; use eyre::Result; use itertools::Itertools; +use regex::Regex; use sqlx::sqlite::{ SqliteConnectOptions, SqliteJournalMode, SqlitePool, SqlitePoolOptions, SqliteRow, @@ -286,27 +287,89 @@ impl Database for Sqlite { let query = query.to_string().replace('*', "%"); // allow wildcard char let limit = limit.map_or("".to_owned(), |l| format!("limit {}", l)); - let query = match search_mode { - SearchMode::Prefix => query, - SearchMode::FullText => format!("%{}", query), - SearchMode::Fuzzy => query.split("").join("%"), + let (query_sql, query_params) = match search_mode { + SearchMode::Prefix => ("command like ?1".to_string(), vec![format!("{}%", query)]), + SearchMode::FullText => ("command like ?1".to_string(), vec![format!("%{}%", query)]), + SearchMode::Fuzzy => { + let split_regex = Regex::new(r" +").unwrap(); + let terms: Vec<&str> = split_regex.split(query.as_str()).collect(); + let mut query_sql = std::string::String::new(); + let mut query_params = Vec::with_capacity(terms.len()); + let mut was_or = false; + for (i, query_part) in terms.into_iter().enumerate() { + // TODO smart case mode could be made configurable like in fzf + let (operator, glob) = if query_part.contains(char::is_uppercase) { + ("glob", '*') + } else { + ("like", '%') + }; + let (is_inverse, query_part) = match query_part.strip_prefix('!') { + Some(stripped) => (true, stripped), + None => (false, query_part), + }; + match query_part { + "|" => { + if !was_or { + query_sql.push_str(" OR "); + was_or = true; + continue; + } else { + query_params.push(format!("{glob}|{glob}")); + } + } + exact_prefix if query_part.starts_with('^') => query_params.push(format!( + "{term}{glob}", + term = exact_prefix.strip_prefix('^').unwrap() + )), + exact_suffix if query_part.ends_with('$') => query_params.push(format!( + "{glob}{term}", + term = exact_suffix.strip_suffix('$').unwrap() + )), + exact if query_part.starts_with('\'') => query_params.push(format!( + "{glob}{term}{glob}", + term = exact.strip_prefix('\'').unwrap() + )), + exact if is_inverse => { + query_params.push(format!("{glob}{term}{glob}", term = exact)) + } + _ => { + query_params.push(query_part.split("").join(glob.to_string().as_str())) + } + } + if i > 0 && !was_or { + query_sql.push_str(" AND "); + } + if is_inverse { + query_sql.push_str("NOT "); + } + query_sql + .push_str(format!("command {} ?{}", operator, query_params.len()).as_str()); + was_or = false; + } + (query_sql, query_params) + } }; - let res = sqlx::query( - format!( - "select * from history h - where command like ?1 || '%' - group by command - having max(timestamp) - order by timestamp desc {}", - limit.clone() + let res = query_params + .iter() + .fold( + sqlx::query( + format!( + "select * from history h + where {} + group by command + having max(timestamp) + order by timestamp desc {}", + query_sql.as_str(), + limit.clone() + ) + .as_str(), + ), + |query, query_param| query.bind(query_param), ) - .as_str(), - ) - .bind(query) - .map(Self::query_history) - .fetch_all(&self.pool) - .await?; + .map(Self::query_history) + .fetch_all(&self.pool) + .await?; Ok(ordering::reorder_fuzzy(search_mode, orig_query, res)) } @@ -326,6 +389,36 @@ mod test { use super::*; use std::time::{Duration, Instant}; + async fn assert_search_eq<'a>( + db: &impl Database, + mode: SearchMode, + query: &str, + expected: usize, + ) -> Result> { + let results = db.search(None, mode, query).await?; + assert_eq!( + results.len(), + expected, + "query \"{}\", commands: {:?}", + query, + results.iter().map(|a| &a.command).collect::>() + ); + Ok(results) + } + + async fn assert_search_commands( + db: &impl Database, + mode: SearchMode, + query: &str, + expected_commands: Vec<&str>, + ) { + let results = assert_search_eq(db, mode, query, expected_commands.len()) + .await + .unwrap(); + let commands: Vec<&str> = results.iter().map(|a| a.command.as_str()).collect(); + assert_eq!(commands, expected_commands); + } + async fn new_history_item(db: &mut impl Database, cmd: &str) -> Result<()> { let history = History::new( chrono::Utc::now(), @@ -344,14 +437,15 @@ mod test { let mut db = Sqlite::new("sqlite::memory:").await.unwrap(); new_history_item(&mut db, "ls /home/ellie").await.unwrap(); - let mut results = db.search(None, SearchMode::Prefix, "ls").await.unwrap(); - assert_eq!(results.len(), 1); - - results = db.search(None, SearchMode::Prefix, "/home").await.unwrap(); - assert_eq!(results.len(), 0); - - results = db.search(None, SearchMode::Prefix, "ls ").await.unwrap(); - assert_eq!(results.len(), 0); + assert_search_eq(&db, SearchMode::Prefix, "ls", 1) + .await + .unwrap(); + assert_search_eq(&db, SearchMode::Prefix, "/home", 0) + .await + .unwrap(); + assert_search_eq(&db, SearchMode::Prefix, "ls ", 0) + .await + .unwrap(); } #[tokio::test(flavor = "multi_thread")] @@ -359,17 +453,15 @@ mod test { let mut db = Sqlite::new("sqlite::memory:").await.unwrap(); new_history_item(&mut db, "ls /home/ellie").await.unwrap(); - let mut results = db.search(None, SearchMode::FullText, "ls").await.unwrap(); - assert_eq!(results.len(), 1); - - results = db - .search(None, SearchMode::FullText, "/home") + assert_search_eq(&db, SearchMode::FullText, "ls", 1) + .await + .unwrap(); + assert_search_eq(&db, SearchMode::FullText, "/home", 1) + .await + .unwrap(); + assert_search_eq(&db, SearchMode::FullText, "ls ", 0) .await .unwrap(); - assert_eq!(results.len(), 1); - - results = db.search(None, SearchMode::FullText, "ls ").await.unwrap(); - assert_eq!(results.len(), 0); } #[tokio::test(flavor = "multi_thread")] @@ -377,34 +469,77 @@ mod test { let mut db = Sqlite::new("sqlite::memory:").await.unwrap(); new_history_item(&mut db, "ls /home/ellie").await.unwrap(); new_history_item(&mut db, "ls /home/frank").await.unwrap(); - new_history_item(&mut db, "cd /home/ellie").await.unwrap(); + new_history_item(&mut db, "cd /home/Ellie").await.unwrap(); new_history_item(&mut db, "/home/ellie/.bin/rustup") .await .unwrap(); - let mut results = db.search(None, SearchMode::Fuzzy, "ls /").await.unwrap(); - assert_eq!(results.len(), 2); - - results = db.search(None, SearchMode::Fuzzy, "l/h/").await.unwrap(); - assert_eq!(results.len(), 2); - - results = db.search(None, SearchMode::Fuzzy, "/h/e").await.unwrap(); - assert_eq!(results.len(), 3); - - results = db.search(None, SearchMode::Fuzzy, "/hmoe/").await.unwrap(); - assert_eq!(results.len(), 0); - - results = db - .search(None, SearchMode::Fuzzy, "ellie/home") + assert_search_eq(&db, SearchMode::Fuzzy, "ls /", 3) + .await + .unwrap(); + assert_search_eq(&db, SearchMode::Fuzzy, "ls/", 2) + .await + .unwrap(); + assert_search_eq(&db, SearchMode::Fuzzy, "l/h/", 2) + .await + .unwrap(); + assert_search_eq(&db, SearchMode::Fuzzy, "/h/e", 3) + .await + .unwrap(); + assert_search_eq(&db, SearchMode::Fuzzy, "/hmoe/", 0) + .await + .unwrap(); + assert_search_eq(&db, SearchMode::Fuzzy, "ellie/home", 0) + .await + .unwrap(); + assert_search_eq(&db, SearchMode::Fuzzy, "lsellie", 1) + .await + .unwrap(); + assert_search_eq(&db, SearchMode::Fuzzy, " ", 4) .await .unwrap(); - assert_eq!(results.len(), 0); - results = db.search(None, SearchMode::Fuzzy, "lsellie").await.unwrap(); - assert_eq!(results.len(), 1); + // single term operators + assert_search_eq(&db, SearchMode::Fuzzy, "^ls", 2) + .await + .unwrap(); + assert_search_eq(&db, SearchMode::Fuzzy, "'ls", 2) + .await + .unwrap(); + assert_search_eq(&db, SearchMode::Fuzzy, "ellie$", 2) + .await + .unwrap(); + assert_search_eq(&db, SearchMode::Fuzzy, "!^ls", 2) + .await + .unwrap(); + assert_search_eq(&db, SearchMode::Fuzzy, "!ellie", 1) + .await + .unwrap(); + assert_search_eq(&db, SearchMode::Fuzzy, "!ellie$", 2) + .await + .unwrap(); - results = db.search(None, SearchMode::Fuzzy, " ").await.unwrap(); - assert_eq!(results.len(), 3); + // multiple terms + assert_search_eq(&db, SearchMode::Fuzzy, "ls !ellie", 1) + .await + .unwrap(); + assert_search_eq(&db, SearchMode::Fuzzy, "^ls !e$", 1) + .await + .unwrap(); + assert_search_eq(&db, SearchMode::Fuzzy, "home !^ls", 2) + .await + .unwrap(); + assert_search_eq(&db, SearchMode::Fuzzy, "'frank | 'rustup", 2) + .await + .unwrap(); + assert_search_eq(&db, SearchMode::Fuzzy, "'frank | 'rustup 'ls", 1) + .await + .unwrap(); + + // case matching + assert_search_eq(&db, SearchMode::Fuzzy, "Ellie", 1) + .await + .unwrap(); } #[tokio::test(flavor = "multi_thread")] @@ -414,17 +549,16 @@ mod test { new_history_item(&mut db, "curl").await.unwrap(); new_history_item(&mut db, "corburl").await.unwrap(); + // if fuzzy reordering is on, it should come back in a more sensible order - let mut results = db.search(None, SearchMode::Fuzzy, "curl").await.unwrap(); - assert_eq!(results.len(), 2); - let commands: Vec<&String> = results.iter().map(|a| &a.command).collect(); - assert_eq!(commands, vec!["curl", "corburl"]); + assert_search_commands(&db, SearchMode::Fuzzy, "curl", vec!["curl", "corburl"]).await; - results = db.search(None, SearchMode::Fuzzy, "xxxx").await.unwrap(); - assert_eq!(results.len(), 0); - - results = db.search(None, SearchMode::Fuzzy, "").await.unwrap(); - assert_eq!(results.len(), 2); + assert_search_eq(&db, SearchMode::Fuzzy, "xxxx", 0) + .await + .unwrap(); + assert_search_eq(&db, SearchMode::Fuzzy, "", 2) + .await + .unwrap(); } #[tokio::test(flavor = "multi_thread")] diff --git a/atuin-client/src/ordering.rs b/atuin-client/src/ordering.rs index b6051d15..0bb12c6a 100644 --- a/atuin-client/src/ordering.rs +++ b/atuin-client/src/ordering.rs @@ -17,6 +17,7 @@ where let mut r = res.clone(); let qvec = &query.chars().collect(); r.sort_by_cached_key(|h| { + // TODO for fzf search we should sum up scores for each matched term let (from, to) = match minspan::span(qvec, &(f(h).chars().collect())) { Some(x) => x, // this is a little unfortunate: when we are asked to match a query that is found nowhere, diff --git a/docs/config.md b/docs/config.md index 414146f4..405e3b57 100644 --- a/docs/config.md +++ b/docs/config.md @@ -97,7 +97,8 @@ key = "~/.atuin-session" ### `search_mode` Which search mode to use. Atuin supports "prefix", full text and "fuzzy" search -modes. The prefix search for "query\*", fulltext "\*query\*", and fuzzy "\*q\*u\*e\*r\*y\*" +modes. The prefix searches for "query\*", fulltext "\*query\*", and fuzzy applies +the search syntax [described below](#fuzzy-search-syntax). Defaults to "prefix" @@ -105,6 +106,29 @@ Defaults to "prefix" search_mode = "fulltext" ``` +#### `fuzzy` search syntax + +The "fuzzy" search syntax is based on the +[fzf search syntax](https://github.com/junegunn/fzf#search-syntax). + +| Token | Match type | Description | +| --------- | -------------------------- | ------------------------------------ | +| `sbtrkt` | fuzzy-match | Items that match `sbtrkt` | +| `'wild` | exact-match (quoted) | Items that include `wild` | +| `^music` | prefix-exact-match | Items that start with `music` | +| `.mp3$` | suffix-exact-match | Items that end with `.mp3` | +| `!fire` | inverse-exact-match | Items that do not include `fire` | +| `!^music` | inverse-prefix-exact-match | Items that do not start with `music` | +| `!.mp3$` | inverse-suffix-exact-match | Items that do not end with `.mp3` | + +A single bar character term acts as an OR operator. For example, the following +query matches entries that start with `core` and end with either `go`, `rb`, +or `py`. + +``` +^core go$ | rb$ | py$ +``` + ## Server config `// TODO`