From 388e84e7ef76df7544565c9dea372c86d7f6bdce Mon Sep 17 00:00:00 2001 From: Darren Schroeder <343840+fdncred@users.noreply.github.com> Date: Mon, 8 May 2023 09:07:01 -0500 Subject: [PATCH] update nu-glob based on latest glob 0.3.1 changes (#9099) # Description This PR updates `nu-glob` to add the latest changes and updates from `rust-lang/glob` [v0.3.1](https://github.com/rust-lang/glob). With these changes you can do this type of globbing ```rust /// - `?` matches any single character. /// /// - `*` matches any (possibly empty) sequence of characters. /// /// - `**` matches the current directory and arbitrary subdirectories. This /// sequence **must** form a single path component, so both `**a` and `b**` /// are invalid and will result in an error. A sequence of more than two /// consecutive `*` characters is also invalid. /// /// - `[...]` matches any character inside the brackets. Character sequences /// can also specify ranges of characters, as ordered by Unicode, so e.g. /// `[0-9]` specifies any character between 0 and 9 inclusive. An unclosed /// bracket is invalid. /// /// - `[!...]` is the negation of `[...]`, i.e. it matches any characters /// **not** in the brackets. /// /// - The metacharacters `?`, `*`, `[`, `]` can be matched by using brackets /// (e.g. `[?]`). When a `]` occurs immediately following `[` or `[!` then it /// is interpreted as being part of, rather then ending, the character set, so /// `]` and NOT `]` can be matched by `[]]` and `[!]]` respectively. The `-` /// character can be specified inside a character sequence pattern by placing /// it at the start or the end, e.g. `[abc-]`. ``` Example - with character sequences ![image](https://user-images.githubusercontent.com/343840/236266670-03bf9384-4917-4074-9687-2c1c0d8ef34a.png) Example - with character sequence negation ![image](https://user-images.githubusercontent.com/343840/236266421-73c3ee2c-1d10-4da0-86be-0afb51b50604.png) Example - normal globbing ![image](https://user-images.githubusercontent.com/343840/236267138-60f22228-b8d3-4bf2-911b-a80560fdfa4f.png) Example - with character sequences ![image](https://user-images.githubusercontent.com/343840/236267475-8c38fce9-87fe-4544-9757-34d319ce55b8.png) Not that, if you're using a character sequence by itself, you need to enclose it in quotes, otherwise nushell will think it's a range. But if you already have a type of a bare word already, no quotes are necessary, as in the last example. # User-Facing Changes # Tests + Formatting # After Submitting --- crates/nu-command/src/filesystem/ls.rs | 2 +- crates/nu-glob/src/lib.rs | 300 +++++++++++++++++++++++-- 2 files changed, 288 insertions(+), 14 deletions(-) diff --git a/crates/nu-command/src/filesystem/ls.rs b/crates/nu-command/src/filesystem/ls.rs index 4ae9028599..057cf711a3 100644 --- a/crates/nu-command/src/filesystem/ls.rs +++ b/crates/nu-command/src/filesystem/ls.rs @@ -143,7 +143,7 @@ impl Command for Ls { } else if is_empty_dir(current_dir(engine_state, stack)?) { return Ok(Value::list(vec![], call_span).into_pipeline_data()); } else { - (PathBuf::from("./*"), call_span, false) + (PathBuf::from("*"), call_span, false) } } }; diff --git a/crates/nu-glob/src/lib.rs b/crates/nu-glob/src/lib.rs index f833c5cc2d..9edba6b116 100644 --- a/crates/nu-glob/src/lib.rs +++ b/crates/nu-glob/src/lib.rs @@ -61,7 +61,7 @@ #![doc( html_logo_url = "https://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png", html_favicon_url = "https://www.rust-lang.org/favicon.ico", - html_root_url = "https://docs.rs/glob/0.3.0" + html_root_url = "https://docs.rs/glob/0.3.1" )] #![deny(missing_docs)] @@ -80,8 +80,10 @@ use std::io; use std::path::{self, Component, Path, PathBuf}; use std::str::FromStr; +use CharSpecifier::{CharRange, SingleChar}; use MatchResult::{EntirePatternDoesntMatch, Match, SubPatternDoesntMatch}; -use PatternToken::{AnyChar, AnyRecursiveSequence, AnySequence, Char}; +use PatternToken::AnyExcept; +use PatternToken::{AnyChar, AnyRecursiveSequence, AnySequence, AnyWithin, Char}; /// An iterator that yields `Path`s from the filesystem that match a particular /// pattern. @@ -179,7 +181,10 @@ pub fn glob_with(pattern: &str, options: MatchOptions) -> Result bool { match p.components().next() { - Some(Component::Prefix(ref p)) => p.kind().is_verbatim(), + Some(Component::Prefix(ref p)) => { + // Allow VerbatimDisk paths. std canonicalize() generates them, and they work fine + p.kind().is_verbatim() && !matches!(p.kind(), std::path::Prefix::VerbatimDisk(_)) + } _ => false, } } @@ -297,6 +302,11 @@ impl GlobError { } impl Error for GlobError { + #[allow(deprecated)] + fn description(&self) -> &str { + self.error.description() + } + #[allow(unknown_lints, bare_trait_objects)] fn cause(&self) -> Option<&dyn Error> { Some(&self.error) @@ -488,6 +498,21 @@ impl fmt::Display for PatternError { /// sequence **must** form a single path component, so both `**a` and `b**` /// are invalid and will result in an error. A sequence of more than two /// consecutive `*` characters is also invalid. +/// +/// - `[...]` matches any character inside the brackets. Character sequences +/// can also specify ranges of characters, as ordered by Unicode, so e.g. +/// `[0-9]` specifies any character between 0 and 9 inclusive. An unclosed +/// bracket is invalid. +/// +/// - `[!...]` is the negation of `[...]`, i.e. it matches any characters +/// **not** in the brackets. +/// +/// - The metacharacters `?`, `*`, `[`, `]` can be matched by using brackets +/// (e.g. `[?]`). When a `]` occurs immediately following `[` or `[!` then it +/// is interpreted as being part of, rather then ending, the character set, so +/// `]` and NOT `]` can be matched by `[]]` and `[!]]` respectively. The `-` +/// character can be specified inside a character sequence pattern by placing +/// it at the start or the end, e.g. `[abc-]`. #[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Default, Debug)] pub struct Pattern { original: String, @@ -516,10 +541,17 @@ enum PatternToken { AnyChar, AnySequence, AnyRecursiveSequence, + AnyWithin(Vec), + AnyExcept(Vec), } -#[allow(clippy::enum_variant_names)] -#[derive(Copy, Clone, PartialEq, Eq)] +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] +enum CharSpecifier { + SingleChar(char), + CharRange(char, char), +} + +#[derive(Copy, Clone, PartialEq)] enum MatchResult { Match, SubPatternDoesntMatch, @@ -529,6 +561,7 @@ enum MatchResult { const ERROR_WILDCARDS: &str = "wildcards are either regular `*` or recursive `**`"; const ERROR_RECURSIVE_WILDCARDS: &str = "recursive wildcards must form a single path \ component"; +const ERROR_INVALID_RANGE: &str = "invalid range pattern"; impl Pattern { /// This function compiles Unix shell style patterns. @@ -604,6 +637,36 @@ impl Pattern { tokens.push(AnySequence); } } + '[' => { + if i + 4 <= chars.len() && chars[i + 1] == '!' { + match chars[i + 3..].iter().position(|x| *x == ']') { + None => (), + Some(j) => { + let chars = &chars[i + 2..i + 3 + j]; + let cs = parse_char_specifiers(chars); + tokens.push(AnyExcept(cs)); + i += j + 4; + continue; + } + } + } else if i + 3 <= chars.len() && chars[i + 1] != '!' { + match chars[i + 2..].iter().position(|x| *x == ']') { + None => (), + Some(j) => { + let cs = parse_char_specifiers(&chars[i + 1..i + 2 + j]); + tokens.push(AnyWithin(cs)); + i += j + 3; + continue; + } + } + } + + // if we get here then this is not a valid range pattern + return Err(PatternError { + pos: i, + msg: ERROR_INVALID_RANGE, + }); + } c => { tokens.push(Char(c)); i += 1; @@ -618,6 +681,28 @@ impl Pattern { }) } + /// Escape metacharacters within the given string by surrounding them in + /// brackets. The resulting string will, when compiled into a `Pattern`, + /// match the input string and nothing else. + pub fn escape(s: &str) -> String { + let mut escaped = String::new(); + for c in s.chars() { + match c { + // note that ! does not need escaping because it is only special + // inside brackets + '?' | '*' | '[' | ']' => { + escaped.push('['); + escaped.push(c); + escaped.push(']'); + } + c => { + escaped.push(c); + } + } + } + escaped + } + /// Return if the given `str` matches this `Pattern` using the default /// match options (i.e. `MatchOptions::new()`). /// @@ -627,6 +712,7 @@ impl Pattern { /// use nu_glob::Pattern; /// /// assert!(Pattern::new("c?t").unwrap().matches("cat")); + /// assert!(Pattern::new("k[!e]tteh").unwrap().matches("kitteh")); /// assert!(Pattern::new("d*g").unwrap().matches("doog")); /// ``` pub fn matches(&self, str: &str) -> bool { @@ -715,7 +801,7 @@ impl Pattern { let is_sep = path::is_separator(c); if !match *token { - AnyChar + AnyChar | AnyWithin(..) | AnyExcept(..) if (options.require_literal_separator && is_sep) || (follows_separator && options.require_literal_leading_dot @@ -724,6 +810,8 @@ impl Pattern { false } AnyChar => true, + AnyWithin(ref specifiers) => in_char_specifiers(specifiers, c, options), + AnyExcept(ref specifiers) => !in_char_specifiers(specifiers, c, options), Char(c2) => chars_eq(c, c2, options.case_sensitive), AnySequence | AnyRecursiveSequence => unreachable!(), } { @@ -820,6 +908,16 @@ fn fill_todo( }); match dirs { Ok(mut children) => { + // FIXME: This check messes up a lot of tests for some reason + // if options.require_literal_leading_dot { + // children.retain(|x| { + // !x.file_name() + // .expect("internal error: getting filename") + // .to_str() + // .expect("internal error: filename to_str") + // .starts_with('.') + // }); + // } children.sort_by(|p1, p2| p2.file_name().cmp(&p1.file_name())); todo.extend(children.into_iter().map(|x| Ok((x, idx)))); @@ -850,6 +948,64 @@ fn fill_todo( } } +fn parse_char_specifiers(s: &[char]) -> Vec { + let mut cs = Vec::new(); + let mut i = 0; + while i < s.len() { + if i + 3 <= s.len() && s[i + 1] == '-' { + cs.push(CharRange(s[i], s[i + 2])); + i += 3; + } else { + cs.push(SingleChar(s[i])); + i += 1; + } + } + cs +} + +fn in_char_specifiers(specifiers: &[CharSpecifier], c: char, options: MatchOptions) -> bool { + for &specifier in specifiers.iter() { + match specifier { + SingleChar(sc) => { + if chars_eq(c, sc, options.case_sensitive) { + return true; + } + } + CharRange(start, end) => { + // FIXME: work with non-ascii chars properly (issue #1347) + if !options.case_sensitive && c.is_ascii() && start.is_ascii() && end.is_ascii() { + let start = start.to_ascii_lowercase(); + let end = end.to_ascii_lowercase(); + + let start_up = start + .to_uppercase() + .next() + .expect("internal error: getting start uppercase"); + let end_up = end + .to_uppercase() + .next() + .expect("internal error: getting end uppercase"); + + // only allow case insensitive matching when + // both start and end are within a-z or A-Z + if start != start_up && end != end_up { + let c = c.to_ascii_lowercase(); + if c >= start && c <= end { + return true; + } + } + } + + if c >= start && c <= end { + return true; + } + } + } + } + + false +} + /// A helper function to determine if two chars are (possibly case-insensitively) equal. fn chars_eq(a: char, b: char, case_sensitive: bool) -> bool { if cfg!(windows) && path::is_separator(a) && path::is_separator(b) { @@ -863,6 +1019,7 @@ fn chars_eq(a: char, b: char, case_sensitive: bool) -> bool { } /// Configuration options to modify the behaviour of `Pattern::matches_with(..)`. +#[allow(missing_copy_implementations)] #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] pub struct MatchOptions { /// Whether or not patterns should be matched in a case-sensitive manner. @@ -903,6 +1060,11 @@ impl MatchOptions { /// recursive_match_hidden_dir: true, /// } /// ``` + /// + /// # Note + /// The behavior of this method doesn't match `default()`'s. This returns + /// `case_sensitive` as `true` while `default()` does it as `false`. + // FIXME: Consider unity the behavior with `default()` in a next major release. pub fn new() -> Self { Self { case_sensitive: true, @@ -926,16 +1088,29 @@ mod test { #[test] fn test_wildcard_errors() { - assert_eq!(Pattern::new("a/**b").unwrap_err().pos, 4); - assert_eq!(Pattern::new("a/bc**").unwrap_err().pos, 3); - assert_eq!(Pattern::new("a/*****").unwrap_err().pos, 4); - assert_eq!(Pattern::new("a/b**c**d").unwrap_err().pos, 2); - assert_eq!(Pattern::new("a**b").unwrap_err().pos, 0); + assert!(Pattern::new("a/**b").unwrap_err().pos == 4); + assert!(Pattern::new("a/bc**").unwrap_err().pos == 3); + assert!(Pattern::new("a/*****").unwrap_err().pos == 4); + assert!(Pattern::new("a/b**c**d").unwrap_err().pos == 2); + assert!(Pattern::new("a**b").unwrap_err().pos == 0); + } + + #[test] + fn test_unclosed_bracket_errors() { + assert!(Pattern::new("abc[def").unwrap_err().pos == 3); + assert!(Pattern::new("abc[!def").unwrap_err().pos == 3); + assert!(Pattern::new("abc[").unwrap_err().pos == 3); + assert!(Pattern::new("abc[!").unwrap_err().pos == 3); + assert!(Pattern::new("abc[d").unwrap_err().pos == 3); + assert!(Pattern::new("abc[!d").unwrap_err().pos == 3); + assert!(Pattern::new("abc[]").unwrap_err().pos == 3); + assert!(Pattern::new("abc[!]").unwrap_err().pos == 3); } #[test] fn test_glob_errors() { - assert_eq!(glob("a/**b").err().unwrap().pos, 4); + assert!(glob("a/**b").err().unwrap().pos == 4); + assert!(glob("abc[def").err().unwrap().pos == 3); } // this test assumes that there is a /root directory and that @@ -1019,6 +1194,7 @@ mod test { assert!(Pattern::new("a*a*a*a*a*a*a*a*a") .unwrap() .matches("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")); + assert!(Pattern::new("a*b[xyz]c*d").unwrap().matches("abxcdbxcddd")); } #[test] @@ -1072,7 +1248,57 @@ mod test { #[test] fn test_lots_of_files() { // this is a good test because it touches lots of differently named files - glob("/*/*/*/*").unwrap().nth(10000); + glob("/*/*/*/*").unwrap().skip(10000).next(); + } + + #[test] + fn test_range_pattern() { + let pat = Pattern::new("a[0-9]b").unwrap(); + for i in 0..10 { + assert!(pat.matches(&format!("a{}b", i))); + } + assert!(!pat.matches("a_b")); + + let pat = Pattern::new("a[!0-9]b").unwrap(); + for i in 0..10 { + assert!(!pat.matches(&format!("a{}b", i))); + } + assert!(pat.matches("a_b")); + + let pats = ["[a-z123]", "[1a-z23]", "[123a-z]"]; + for &p in pats.iter() { + let pat = Pattern::new(p).unwrap(); + for c in "abcdefghijklmnopqrstuvwxyz".chars() { + assert!(pat.matches(&c.to_string())); + } + for c in "ABCDEFGHIJKLMNOPQRSTUVWXYZ".chars() { + let options = MatchOptions { + case_sensitive: false, + ..MatchOptions::new() + }; + assert!(pat.matches_with(&c.to_string(), options)); + } + assert!(pat.matches("1")); + assert!(pat.matches("2")); + assert!(pat.matches("3")); + } + + let pats = ["[abc-]", "[-abc]", "[a-c-]"]; + for &p in pats.iter() { + let pat = Pattern::new(p).unwrap(); + assert!(pat.matches("a")); + assert!(pat.matches("b")); + assert!(pat.matches("c")); + assert!(pat.matches("-")); + assert!(!pat.matches("d")); + } + + let pat = Pattern::new("[2-1]").unwrap(); + assert!(!pat.matches("1")); + assert!(!pat.matches("2")); + + assert!(Pattern::new("[-]").unwrap().matches("-")); + assert!(!Pattern::new("[!-]").unwrap().matches("-")); } #[test] @@ -1093,6 +1319,13 @@ mod test { assert!(!dir_pat.matches("some/other/path/to/hello.txt")); } + #[test] + fn test_pattern_escape() { + let s = "_[_]_?_*_!_"; + assert_eq!(Pattern::escape(s), "_[[]_[]]_[?]_[*]_!_".to_string()); + assert!(Pattern::new(&Pattern::escape(s)).unwrap().matches(s)); + } + #[test] fn test_pattern_matches_case_insensitive() { let pat = Pattern::new("aBcDeFg").unwrap(); @@ -1109,6 +1342,33 @@ mod test { assert!(pat.matches_with("AbCdEfG", options)); } + #[test] + fn test_pattern_matches_case_insensitive_range() { + let pat_within = Pattern::new("[a]").unwrap(); + let pat_except = Pattern::new("[!a]").unwrap(); + + let options_case_insensitive = MatchOptions { + case_sensitive: false, + require_literal_separator: false, + require_literal_leading_dot: false, + recursive_match_hidden_dir: false, + }; + let options_case_sensitive = MatchOptions { + case_sensitive: true, + require_literal_separator: false, + require_literal_leading_dot: false, + recursive_match_hidden_dir: false, + }; + + assert!(pat_within.matches_with("a", options_case_insensitive)); + assert!(pat_within.matches_with("A", options_case_insensitive)); + assert!(!pat_within.matches_with("A", options_case_sensitive)); + + assert!(!pat_except.matches_with("a", options_case_insensitive)); + assert!(!pat_except.matches_with("A", options_case_insensitive)); + assert!(pat_except.matches_with("A", options_case_sensitive)); + } + #[test] fn test_pattern_matches_require_literal_separator() { let options_require_literal = MatchOptions { @@ -1133,6 +1393,9 @@ mod test { assert!(!Pattern::new("abc*def") .unwrap() .matches_with("abc/def", options_require_literal)); + assert!(!Pattern::new("abc[/]def") + .unwrap() + .matches_with("abc/def", options_require_literal)); assert!(Pattern::new("abc/def") .unwrap() @@ -1143,6 +1406,9 @@ mod test { assert!(Pattern::new("abc*def") .unwrap() .matches_with("abc/def", options_not_require_literal)); + assert!(Pattern::new("abc[/]def") + .unwrap() + .matches_with("abc/def", options_not_require_literal)); } #[test] @@ -1208,6 +1474,14 @@ mod test { assert!(f(options_not_require_literal_leading_dot)); assert!(!f(options_require_literal_leading_dot)); + let f = |options| { + Pattern::new("aaa/[.]bbb") + .unwrap() + .matches_with("aaa/.bbb", options) + }; + assert!(f(options_not_require_literal_leading_dot)); + assert!(!f(options_require_literal_leading_dot)); + let f = |options| Pattern::new("**/*").unwrap().matches_with(".bbb", options); assert!(f(options_not_require_literal_leading_dot)); assert!(!f(options_require_literal_leading_dot));