mirror of
https://github.com/nushell/nushell.git
synced 2025-06-01 15:46:04 +02:00
created an alternate way to determine line count (#4887)
This commit is contained in:
parent
bd7a506897
commit
18067138aa
@ -1,19 +1,11 @@
|
|||||||
use nu_protocol::ast::Call;
|
use nu_protocol::ast::Call;
|
||||||
use nu_protocol::engine::{Command, EngineState, Stack};
|
use nu_protocol::engine::{Command, EngineState, Stack};
|
||||||
use nu_protocol::{Category, Example, PipelineData, ShellError, Signature, Span, Value};
|
use nu_protocol::{Category, Example, PipelineData, ShellError, Signature, Span, Value};
|
||||||
use std::collections::{BTreeMap, HashSet};
|
use std::collections::BTreeMap;
|
||||||
use std::{fmt, str};
|
use std::{fmt, str};
|
||||||
use unicode_segmentation::UnicodeSegmentation;
|
use unicode_segmentation::UnicodeSegmentation;
|
||||||
|
|
||||||
// borrowed liberally from here https://github.com/dead10ck/uwc
|
// borrowed liberally from here https://github.com/dead10ck/uwc
|
||||||
const LF: &str = "\n"; // 0xe0000a
|
|
||||||
const CR: &str = "\r"; // 0xe0000d
|
|
||||||
const CRLF: &str = "\r\n"; // 0xe00d0a
|
|
||||||
const NEL: &str = "\u{0085}"; // 0x00c285
|
|
||||||
const FF: &str = "\u{000C}"; // 0x00000c
|
|
||||||
const LS: &str = "\u{2028}"; // 0xe280a8
|
|
||||||
const PS: &str = "\u{2029}"; // 0xe280a9
|
|
||||||
|
|
||||||
pub type Counted = BTreeMap<Counter, usize>;
|
pub type Counted = BTreeMap<Counter, usize>;
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
@ -254,15 +246,37 @@ pub trait Count {
|
|||||||
|
|
||||||
impl Count for Counter {
|
impl Count for Counter {
|
||||||
fn count(&self, s: &str) -> usize {
|
fn count(&self, s: &str) -> usize {
|
||||||
let newlines: HashSet<&'static str> = HashSet::from([CR, LF, CRLF, NEL, FF, LS, PS]);
|
|
||||||
|
|
||||||
match *self {
|
match *self {
|
||||||
Counter::GraphemeClusters => s.graphemes(true).count(),
|
Counter::GraphemeClusters => s.graphemes(true).count(),
|
||||||
Counter::Bytes => s.len(),
|
Counter::Bytes => s.len(),
|
||||||
Counter::Lines => s
|
Counter::Lines => {
|
||||||
.graphemes(true)
|
const LF: &str = "\n"; // 0xe0000a
|
||||||
.filter(|grapheme| newlines.contains(grapheme))
|
const CR: &str = "\r"; // 0xe0000d
|
||||||
.count(),
|
const CRLF: &str = "\r\n"; // 0xe00d0a
|
||||||
|
const NEL: &str = "\u{0085}"; // 0x00c285
|
||||||
|
const FF: &str = "\u{000C}"; // 0x00000c
|
||||||
|
const LS: &str = "\u{2028}"; // 0xe280a8
|
||||||
|
const PS: &str = "\u{2029}"; // 0xe280a9
|
||||||
|
|
||||||
|
// use regex here because it can search for CRLF first and not duplicate the count
|
||||||
|
let line_ending_types = [CRLF, LF, CR, NEL, FF, LS, PS];
|
||||||
|
let pattern = &line_ending_types.join("|");
|
||||||
|
let newline_pattern = regex::Regex::new(pattern).expect("Unable to create regex");
|
||||||
|
let line_endings = newline_pattern
|
||||||
|
.find_iter(s)
|
||||||
|
.map(|f| f.as_str().to_string())
|
||||||
|
.collect::<Vec<String>>();
|
||||||
|
|
||||||
|
let has_line_ending_suffix =
|
||||||
|
line_ending_types.iter().any(|&suffix| s.ends_with(suffix));
|
||||||
|
// eprintln!("suffix = {}", has_line_ending_suffix);
|
||||||
|
|
||||||
|
if has_line_ending_suffix {
|
||||||
|
line_endings.len()
|
||||||
|
} else {
|
||||||
|
line_endings.len() + 1
|
||||||
|
}
|
||||||
|
}
|
||||||
Counter::Words => s.unicode_words().count(),
|
Counter::Words => s.unicode_words().count(),
|
||||||
Counter::CodePoints => s.chars().count(),
|
Counter::CodePoints => s.chars().count(),
|
||||||
}
|
}
|
||||||
@ -318,31 +332,17 @@ where
|
|||||||
{
|
{
|
||||||
let mut counts: Counted = counters.into_iter().map(|c| (*c, c.count(s))).collect();
|
let mut counts: Counted = counters.into_iter().map(|c| (*c, c.count(s))).collect();
|
||||||
if let Some(lines) = counts.get_mut(&Counter::Lines) {
|
if let Some(lines) = counts.get_mut(&Counter::Lines) {
|
||||||
// this part is all about having things like this return 1 line
|
|
||||||
// "There are seven words in this sentence" | size
|
|
||||||
if s.is_empty() {
|
if s.is_empty() {
|
||||||
|
// If s is empty, indeed, the count is 0
|
||||||
*lines = 0;
|
*lines = 0;
|
||||||
} else if *lines == 0 && !s.is_empty() {
|
} else if *lines == 0 && !s.is_empty() {
|
||||||
|
// If s is not empty and the count is 0, it means there
|
||||||
|
// is a line without a line ending, so let's make it 1
|
||||||
*lines = 1;
|
*lines = 1;
|
||||||
} else {
|
} else {
|
||||||
// no change
|
// no change, whatever the count is, is right
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// let lines_count = match counts.get(&Counter::Lines) {
|
|
||||||
// Some(c) => {
|
|
||||||
// if s.is_empty() {
|
|
||||||
// 0
|
|
||||||
// } else if *c == 0 && !s.is_empty() {
|
|
||||||
// 1
|
|
||||||
// } else {
|
|
||||||
// *c
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// None => 0,
|
|
||||||
// };
|
|
||||||
// counts.remove(&Counter::Lines);
|
|
||||||
// counts.insert(Counter::Lines, lines_count);
|
|
||||||
counts
|
counts
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -358,8 +358,30 @@ mod test {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_one_newline() {
|
||||||
|
let s = "\n".to_string();
|
||||||
|
let counts = uwc_count(&ALL_COUNTERS[..], &s);
|
||||||
|
let mut correct_counts = BTreeMap::new();
|
||||||
|
correct_counts.insert(Counter::Lines, 1);
|
||||||
|
correct_counts.insert(Counter::Words, 0);
|
||||||
|
correct_counts.insert(Counter::GraphemeClusters, 1);
|
||||||
|
correct_counts.insert(Counter::Bytes, 1);
|
||||||
|
correct_counts.insert(Counter::CodePoints, 1);
|
||||||
|
|
||||||
|
assert_eq!(correct_counts, counts);
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_count_counts_lines() {
|
fn test_count_counts_lines() {
|
||||||
|
// const LF: &str = "\n"; // 0xe0000a
|
||||||
|
// const CR: &str = "\r"; // 0xe0000d
|
||||||
|
// const CRLF: &str = "\r\n"; // 0xe00d0a
|
||||||
|
const NEL: &str = "\u{0085}"; // 0x00c285
|
||||||
|
const FF: &str = "\u{000C}"; // 0x00000c
|
||||||
|
const LS: &str = "\u{2028}"; // 0xe280a8
|
||||||
|
const PS: &str = "\u{2029}"; // 0xe280a9
|
||||||
|
|
||||||
// * \r\n is a single grapheme cluster
|
// * \r\n is a single grapheme cluster
|
||||||
// * trailing newlines are counted
|
// * trailing newlines are counted
|
||||||
// * NEL is 2 bytes
|
// * NEL is 2 bytes
|
||||||
@ -378,10 +400,10 @@ fn test_count_counts_lines() {
|
|||||||
let counts = uwc_count(&ALL_COUNTERS[..], &s);
|
let counts = uwc_count(&ALL_COUNTERS[..], &s);
|
||||||
|
|
||||||
let mut correct_counts = BTreeMap::new();
|
let mut correct_counts = BTreeMap::new();
|
||||||
correct_counts.insert(Counter::GraphemeClusters, 23);
|
|
||||||
correct_counts.insert(Counter::Lines, 8);
|
correct_counts.insert(Counter::Lines, 8);
|
||||||
correct_counts.insert(Counter::Bytes, 29);
|
|
||||||
correct_counts.insert(Counter::Words, 5);
|
correct_counts.insert(Counter::Words, 5);
|
||||||
|
correct_counts.insert(Counter::GraphemeClusters, 23);
|
||||||
|
correct_counts.insert(Counter::Bytes, 29);
|
||||||
|
|
||||||
// one more than grapheme clusters because of \r\n
|
// one more than grapheme clusters because of \r\n
|
||||||
correct_counts.insert(Counter::CodePoints, 24);
|
correct_counts.insert(Counter::CodePoints, 24);
|
||||||
|
@ -960,8 +960,7 @@ mod test {
|
|||||||
.ok()
|
.ok()
|
||||||
.and_then(|p| match p.components().next().unwrap() {
|
.and_then(|p| match p.components().next().unwrap() {
|
||||||
Component::Prefix(prefix_component) => {
|
Component::Prefix(prefix_component) => {
|
||||||
let path = Path::new(prefix_component.as_os_str());
|
let path = Path::new(prefix_component.as_os_str()).join("*");
|
||||||
path.join("*");
|
|
||||||
Some(path.to_path_buf())
|
Some(path.to_path_buf())
|
||||||
}
|
}
|
||||||
_ => panic!("no prefix in this path"),
|
_ => panic!("no prefix in this path"),
|
||||||
|
Loading…
x
Reference in New Issue
Block a user