diff --git a/Cargo.lock b/Cargo.lock index 7a1e17ed..93aabba5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -145,6 +145,7 @@ dependencies = [ "grep-cli", "home", "indexmap 2.1.0", + "itertools", "nix", "nu-ansi-term", "once_cell", diff --git a/Cargo.toml b/Cargo.toml index da6c7709..d62067d4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -101,6 +101,7 @@ nix = { version = "0.26.4", default-features = false, features = ["term"] } [build-dependencies] anyhow = "1.0.75" indexmap = { version = "2.1.0", features = ["serde"] } +itertools = "0.11.0" serde = { version = "1.0", features = ["derive"] } serde_with = "3.4.0" toml = { version = "0.8.6", features = ["preserve_order"] } diff --git a/build/syntax_mapping.rs b/build/syntax_mapping.rs index 778fc8ad..217a1fea 100644 --- a/build/syntax_mapping.rs +++ b/build/syntax_mapping.rs @@ -1,7 +1,8 @@ use std::{convert::Infallible, env, fs, path::Path, str::FromStr}; -use anyhow::anyhow; +use anyhow::{anyhow, bail}; use indexmap::IndexMap; +use itertools::Itertools; use serde::Deserialize; use serde_with::DeserializeFromStr; use walkdir::WalkDir; @@ -17,7 +18,6 @@ pub enum MappingTarget { } impl FromStr for MappingTarget { type Err = Infallible; - fn from_str(s: &str) -> Result { match s { "MappingTarget::MapToUnknown" => Ok(Self::MapToUnknown), @@ -36,10 +36,136 @@ impl MappingTarget { } } +#[derive(Clone, Debug, DeserializeFromStr)] +/// A single matcher. +/// +/// Corresponds to `syntax_mapping::BuiltinMatcher`. +struct Matcher(Vec); +/// Parse a matcher. +/// +/// Note that this implementation is rather strict: when it sees a '$', '{', or +/// '}' where it does not make sense, it will immediately hard-error. +/// +/// The reason for this strictness is I currently cannot think of a valid reason +/// why you would ever need '$', '{', or '}' as plaintext in a glob pattern. +/// Therefore any such occurrences are likely human errors. +/// +/// If we later discover some edge cases, it's okay to make it more permissive. +impl FromStr for Matcher { + type Err = anyhow::Error; + fn from_str(s: &str) -> Result { + use MatcherSegment as Seg; + + if s.is_empty() { + bail!("Empty string is not a valid glob matcher"); + } + + let mut segments = Vec::new(); + let mut buf = String::new(); + let mut is_in_var = false; + + let mut char_it = s.chars(); + loop { + match char_it.next() { + Some('$') => { + if is_in_var { + bail!(r#"Saw a '$' when already in a variable: "{s}""#); + } + match char_it.next() { + Some('{') => { + // push text unless empty + if !buf.is_empty() { + segments.push(Seg::Text(buf.clone())); + buf.clear(); + } + // open var + is_in_var = true; + } + Some(_) | None => { + bail!(r#"Expected a '{{' after '$': "{s}""#); + } + } + } + Some('{') => { + bail!(r#"Saw a hanging '{{': "{s}""#); + } + Some('}') => { + if !is_in_var { + bail!(r#"Saw a '}}' when not in a variable: "{s}""#); + } + if buf.is_empty() { + // `${}` + bail!(r#"Variable name cannot be empty: "{s}""#); + } + // push variable + segments.push(Seg::Env(buf.clone())); + buf.clear(); + // close var + is_in_var = false; + } + Some(' ') if is_in_var => { + bail!(r#"' ' Cannot be part of a variable's name: "{s}""#); + } + Some(c) => { + // either plaintext or variable name + buf.push(c); + } + None => { + if is_in_var { + bail!(r#"Variable unclosed: "{s}""#); + } + segments.push(Seg::Text(buf.clone())); + break; + } + } + } + + Ok(Self(segments)) + } +} +impl Matcher { + fn codegen(&self) -> String { + match self.0.len() { + 0 => unreachable!("0-length matcher should never be created"), + // if-let guard would be ideal here + // see: https://github.com/rust-lang/rust/issues/51114 + 1 if matches!(self.0[0], MatcherSegment::Text(_)) => { + let MatcherSegment::Text(ref s) = self.0[0] else { + unreachable!() + }; + format!(r###"BuiltinMatcher::Fixed(r#"{s}"#)"###) + } + // parser logic ensures that this case can only happen when there are dynamic segments + _ => { + let segments_codegen = self.0.iter().map(MatcherSegment::codegen).join(", "); + let closure = format!("|| join_segments(&[{segments_codegen}])"); + format!("BuiltinMatcher::Dynamic(Lazy::new({closure}))") + } + } + } +} + +/// A segment in a matcher. +/// +/// Corresponds to `syntax_mapping::MatcherSegment`. +#[derive(Debug, Clone)] +enum MatcherSegment { + Text(String), + Env(String), +} +impl MatcherSegment { + fn codegen(&self) -> String { + match self { + Self::Text(s) => format!(r###"MatcherSegment::Text(r#"{s}"#)"###), + Self::Env(s) => format!(r###"MatcherSegment::Env(r#"{s}"#)"###), + } + } +} + /// A struct that models a single .toml file in /src/syntax_mapping/builtins/. #[derive(Clone, Debug, Deserialize)] struct MappingDefModel { - mappings: IndexMap>, + mappings: IndexMap>, } impl MappingDefModel { fn into_mapping_list(self) -> MappingList { @@ -58,18 +184,20 @@ impl MappingDefModel { } #[derive(Clone, Debug)] -struct MappingList(Vec<(String, MappingTarget)>); +struct MappingList(Vec<(Matcher, MappingTarget)>); impl MappingList { fn codegen(&self) -> String { let array_items: Vec<_> = self .0 .iter() - .map(|(matcher, target)| format!(r###"(r#"{matcher}"#, {t})"###, t = target.codegen())) + .map(|(matcher, target)| { + format!("({m}, {t})", m = matcher.codegen(), t = target.codegen()) + }) .collect(); let len = array_items.len(); format!( - "static STATIC_RULES: [(&str, MappingTarget); {len}] = [\n{items}\n];", + "static STATIC_RULES: [(BuiltinMatcher, MappingTarget); {len}] = [\n{items}\n];", items = array_items.join(",\n") ) } diff --git a/src/syntax_mapping.rs b/src/syntax_mapping.rs index 8b0c1c17..f3c6c0ab 100644 --- a/src/syntax_mapping.rs +++ b/src/syntax_mapping.rs @@ -1,9 +1,10 @@ -use std::path::Path; +use std::{env, path::Path}; use crate::error::Result; use ignored_suffixes::IgnoredSuffixes; use globset::{Candidate, GlobBuilder, GlobMatcher}; +use once_cell::sync::Lazy; pub mod ignored_suffixes; @@ -14,6 +15,60 @@ include!(concat!( "/codegen_static_syntax_mappings.rs" )); +/// A glob matcher generated from analysing the matcher string at compile time. +/// +/// This is so that the string searches are moved from run time to compile time, +/// thus improving startup performance. +#[derive(Debug)] +enum BuiltinMatcher { + /// A plaintext matcher. + Fixed(&'static str), + /// A matcher that needs dynamic environment variable replacement. + /// + /// Evaluates to `None` when any environment variable replacement fails. + Dynamic(Lazy>), +} +impl BuiltinMatcher { + /// Finalise into a glob matcher. + /// + /// Returns `None` if any environment variable replacement fails (only + /// possible for dynamic matchers). + fn to_glob_matcher(&self) -> Option { + let glob_str = match self { + Self::Fixed(s) => *s, + Self::Dynamic(s) => s.as_ref()?.as_str(), + }; + Some(make_glob_matcher(glob_str).expect("A builtin glob matcher failed to compile")) + } +} + +/// Join a list of matcher segments, replacing all environment variables. +/// Returns `None` if any replacement fails. +/// +/// Used internally by `BuiltinMatcher::Dynamic`'s lazy evaluation closure. +fn join_segments(segs: &[MatcherSegment]) -> Option { + let mut buf = String::new(); + for seg in segs { + match seg { + MatcherSegment::Text(s) => buf.push_str(s), + MatcherSegment::Env(var) => { + let replaced = env::var(var).ok()?; + buf.push_str(&replaced); + } + } + } + Some(buf) +} + +/// A segment of a dynamic builtin matcher. +/// +/// Used internally by `BuiltinMatcher::Dynamic`'s lazy evaluation closure. +#[derive(Clone, Debug)] +enum MatcherSegment { + Text(&'static str), + Env(&'static str), +} + #[derive(Debug, Clone, Copy, PartialEq, Eq)] #[non_exhaustive] pub enum MappingTarget<'a> { @@ -34,6 +89,15 @@ pub enum MappingTarget<'a> { MapExtensionToUnknown, } +fn make_glob_matcher(from: &str) -> Result { + let matcher = GlobBuilder::new(from) + .case_insensitive(true) + .literal_separator(true) + .build()? + .compile_matcher(); + Ok(matcher) +} + #[derive(Debug, Clone, Default)] pub struct SyntaxMapping<'a> { mappings: Vec<(GlobMatcher, MappingTarget<'a>)>, @@ -217,11 +281,8 @@ impl<'a> SyntaxMapping<'a> { } pub fn insert(&mut self, from: &str, to: MappingTarget<'a>) -> Result<()> { - let glob = GlobBuilder::new(from) - .case_insensitive(true) - .literal_separator(true) - .build()?; - self.mappings.push((glob.compile_matcher(), to)); + let matcher = make_glob_matcher(from)?; + self.mappings.push((matcher, to)); Ok(()) }