From 18067138aa79e9357eead3e4eaae15fa8e878961 Mon Sep 17 00:00:00 2001 From: Darren Schroeder <343840+fdncred@users.noreply.github.com> Date: Mon, 21 Mar 2022 11:56:14 -0500 Subject: [PATCH] created an alternate way to determine line count (#4887) --- crates/nu-command/src/strings/size.rs | 92 +++++++++++++++++---------- crates/nu-glob/src/lib.rs | 3 +- 2 files changed, 58 insertions(+), 37 deletions(-) diff --git a/crates/nu-command/src/strings/size.rs b/crates/nu-command/src/strings/size.rs index 5738708792..b05645b813 100644 --- a/crates/nu-command/src/strings/size.rs +++ b/crates/nu-command/src/strings/size.rs @@ -1,19 +1,11 @@ use nu_protocol::ast::Call; use nu_protocol::engine::{Command, EngineState, Stack}; use nu_protocol::{Category, Example, PipelineData, ShellError, Signature, Span, Value}; -use std::collections::{BTreeMap, HashSet}; +use std::collections::BTreeMap; use std::{fmt, str}; use unicode_segmentation::UnicodeSegmentation; // borrowed liberally from here https://github.com/dead10ck/uwc -const LF: &str = "\n"; // 0xe0000a -const CR: &str = "\r"; // 0xe0000d -const CRLF: &str = "\r\n"; // 0xe00d0a -const NEL: &str = "\u{0085}"; // 0x00c285 -const FF: &str = "\u{000C}"; // 0x00000c -const LS: &str = "\u{2028}"; // 0xe280a8 -const PS: &str = "\u{2029}"; // 0xe280a9 - pub type Counted = BTreeMap; #[derive(Clone)] @@ -254,15 +246,37 @@ pub trait Count { impl Count for Counter { fn count(&self, s: &str) -> usize { - let newlines: HashSet<&'static str> = HashSet::from([CR, LF, CRLF, NEL, FF, LS, PS]); - match *self { Counter::GraphemeClusters => s.graphemes(true).count(), Counter::Bytes => s.len(), - Counter::Lines => s - .graphemes(true) - .filter(|grapheme| newlines.contains(grapheme)) - .count(), + Counter::Lines => { + const LF: &str = "\n"; // 0xe0000a + const CR: &str = "\r"; // 0xe0000d + const CRLF: &str = "\r\n"; // 0xe00d0a + const NEL: &str = "\u{0085}"; // 0x00c285 + const FF: &str = "\u{000C}"; // 0x00000c + const LS: &str = "\u{2028}"; // 0xe280a8 + const PS: &str = "\u{2029}"; // 0xe280a9 + + // use regex here because it can search for CRLF first and not duplicate the count + let line_ending_types = [CRLF, LF, CR, NEL, FF, LS, PS]; + let pattern = &line_ending_types.join("|"); + let newline_pattern = regex::Regex::new(pattern).expect("Unable to create regex"); + let line_endings = newline_pattern + .find_iter(s) + .map(|f| f.as_str().to_string()) + .collect::>(); + + let has_line_ending_suffix = + line_ending_types.iter().any(|&suffix| s.ends_with(suffix)); + // eprintln!("suffix = {}", has_line_ending_suffix); + + if has_line_ending_suffix { + line_endings.len() + } else { + line_endings.len() + 1 + } + } Counter::Words => s.unicode_words().count(), Counter::CodePoints => s.chars().count(), } @@ -318,31 +332,17 @@ where { let mut counts: Counted = counters.into_iter().map(|c| (*c, c.count(s))).collect(); if let Some(lines) = counts.get_mut(&Counter::Lines) { - // this part is all about having things like this return 1 line - // "There are seven words in this sentence" | size if s.is_empty() { + // If s is empty, indeed, the count is 0 *lines = 0; } else if *lines == 0 && !s.is_empty() { + // If s is not empty and the count is 0, it means there + // is a line without a line ending, so let's make it 1 *lines = 1; } else { - // no change + // no change, whatever the count is, is right } } - - // let lines_count = match counts.get(&Counter::Lines) { - // Some(c) => { - // if s.is_empty() { - // 0 - // } else if *c == 0 && !s.is_empty() { - // 1 - // } else { - // *c - // } - // } - // None => 0, - // }; - // counts.remove(&Counter::Lines); - // counts.insert(Counter::Lines, lines_count); counts } @@ -358,8 +358,30 @@ mod test { } } +#[test] +fn test_one_newline() { + let s = "\n".to_string(); + let counts = uwc_count(&ALL_COUNTERS[..], &s); + let mut correct_counts = BTreeMap::new(); + correct_counts.insert(Counter::Lines, 1); + correct_counts.insert(Counter::Words, 0); + correct_counts.insert(Counter::GraphemeClusters, 1); + correct_counts.insert(Counter::Bytes, 1); + correct_counts.insert(Counter::CodePoints, 1); + + assert_eq!(correct_counts, counts); +} + #[test] fn test_count_counts_lines() { + // const LF: &str = "\n"; // 0xe0000a + // const CR: &str = "\r"; // 0xe0000d + // const CRLF: &str = "\r\n"; // 0xe00d0a + const NEL: &str = "\u{0085}"; // 0x00c285 + const FF: &str = "\u{000C}"; // 0x00000c + const LS: &str = "\u{2028}"; // 0xe280a8 + const PS: &str = "\u{2029}"; // 0xe280a9 + // * \r\n is a single graheme cluster // * trailing newlines are counted // * NEL is 2 bytes @@ -378,10 +400,10 @@ fn test_count_counts_lines() { let counts = uwc_count(&ALL_COUNTERS[..], &s); let mut correct_counts = BTreeMap::new(); - correct_counts.insert(Counter::GraphemeClusters, 23); correct_counts.insert(Counter::Lines, 8); - correct_counts.insert(Counter::Bytes, 29); correct_counts.insert(Counter::Words, 5); + correct_counts.insert(Counter::GraphemeClusters, 23); + correct_counts.insert(Counter::Bytes, 29); // one more than grapheme clusters because of \r\n correct_counts.insert(Counter::CodePoints, 24); diff --git a/crates/nu-glob/src/lib.rs b/crates/nu-glob/src/lib.rs index b51767d714..24608f1c44 100644 --- a/crates/nu-glob/src/lib.rs +++ b/crates/nu-glob/src/lib.rs @@ -960,8 +960,7 @@ mod test { .ok() .and_then(|p| match p.components().next().unwrap() { Component::Prefix(prefix_component) => { - let path = Path::new(prefix_component.as_os_str()); - path.join("*"); + let path = Path::new(prefix_component.as_os_str()).join("*"); Some(path.to_path_buf()) } _ => panic!("no prefix in this path"),