strings/size: count with Unicode-aware counters (adapted from uwc)

Replaces the hand-rolled `count` loop behind `size` with `counter`, which
delegates to `uwc_count` over `ALL_COUNTERS`. The output record becomes
lines / words / bytes / chars / graphemes, where `chars` is the number of
code points and `graphemes` the number of grapheme clusters. CR, LF, CRLF,
NEL, FF, LS and PS all count as line breaks, and non-empty input always
reports at least one line. Adds an example for CJK input and unit tests.

diff --git a/crates/nu-command/src/strings/size.rs b/crates/nu-command/src/strings/size.rs
index db9faabdd..573870879 100644
--- a/crates/nu-command/src/strings/size.rs
+++ b/crates/nu-command/src/strings/size.rs
@@ -1,10 +1,20 @@
-extern crate unicode_segmentation;
-
-use unicode_segmentation::UnicodeSegmentation;
-
 use nu_protocol::ast::Call;
 use nu_protocol::engine::{Command, EngineState, Stack};
 use nu_protocol::{Category, Example, PipelineData, ShellError, Signature, Span, Value};
+use std::collections::{BTreeMap, HashSet};
+use std::{fmt, str};
+use unicode_segmentation::UnicodeSegmentation;
+
+// borrowed liberally from here https://github.com/dead10ck/uwc
+const LF: &str = "\n"; // 0xe0000a
+const CR: &str = "\r"; // 0xe0000d
+const CRLF: &str = "\r\n"; // 0xe00d0a
+const NEL: &str = "\u{0085}"; // 0x00c285
+const FF: &str = "\u{000C}"; // 0x00000c
+const LS: &str = "\u{2028}"; // 0xe280a8
+const PS: &str = "\u{2029}"; // 0xe280a9
+
+pub type Counted = BTreeMap<Counter, usize>;
 
 #[derive(Clone)]
 pub struct Size;
@@ -41,12 +51,13 @@ impl Command for Size {
                     cols: vec![
                         "lines".into(),
                         "words".into(),
-                        "chars".into(),
                         "bytes".into(),
+                        "chars".into(),
+                        "graphemes".into(),
                     ],
                     vals: vec![
                         Value::Int {
-                            val: 0,
+                            val: 1,
                             span: Span::test_data(),
                         },
                         Value::Int {
@@ -61,6 +72,46 @@ impl Command for Size {
                             val: 38,
                             span: Span::test_data(),
                         },
+                        Value::Int {
+                            val: 38,
+                            span: Span::test_data(),
+                        },
+                    ],
+                    span: Span::test_data(),
+                }),
+            },
+            Example {
+                description: "Counts unicode characters",
+                example: r#"'今天天气真好' | size "#,
+                result: Some(Value::Record {
+                    cols: vec![
+                        "lines".into(),
+                        "words".into(),
+                        "bytes".into(),
+                        "chars".into(),
+                        "graphemes".into(),
+                    ],
+                    vals: vec![
+                        Value::Int {
+                            val: 1,
+                            span: Span::test_data(),
+                        },
+                        Value::Int {
+                            val: 6,
+                            span: Span::test_data(),
+                        },
+                        Value::Int {
+                            val: 18,
+                            span: Span::test_data(),
+                        },
+                        Value::Int {
+                            val: 6,
+                            span: Span::test_data(),
+                        },
+                        Value::Int {
+                            val: 6,
+                            span: Span::test_data(),
+                        },
                     ],
                     span: Span::test_data(),
                 }),
@@ -72,12 +123,13 @@ impl Command for Size {
                     cols: vec![
                         "lines".into(),
                         "words".into(),
-                        "chars".into(),
                         "bytes".into(),
+                        "chars".into(),
+                        "graphemes".into(),
                     ],
                     vals: vec![
                         Value::Int {
-                            val: 0,
+                            val: 1,
                             span: Span::test_data(),
                         },
                         Value::Int {
@@ -85,11 +137,15 @@ impl Command for Size {
                             span: Span::test_data(),
                         },
                         Value::Int {
-                            val: 13,
+                            val: 15,
                             span: Span::test_data(),
                         },
                         Value::Int {
-                            val: 15,
+                            val: 14,
+                            span: Span::test_data(),
+                        },
+                        Value::Int {
+                            val: 13,
                             span: Span::test_data(),
                         },
                     ],
@@ -108,7 +164,7 @@ fn size(
     let span = call.head;
     input.map(
         move |v| match v.as_string() {
-            Ok(s) => count(&s, span),
+            Ok(s) => counter(&s, span),
             Err(_) => Value::Error {
                 error: ShellError::PipelineMismatch("string".into(), span, span),
             },
@@ -117,49 +173,179 @@ fn size(
     )
 }
 
-fn count(contents: &str, span: Span) -> Value {
-    let mut lines: i64 = 0;
-    let mut words: i64 = 0;
-    let mut chars: i64 = 0;
-    let bytes = contents.len() as i64;
-    let mut end_of_word = true;
-
-    for c in UnicodeSegmentation::graphemes(contents, true) {
-        chars += 1;
-
-        match c {
-            "\n" => {
-                lines += 1;
-                end_of_word = true;
-            }
-            " " => end_of_word = true,
-            _ => {
-                if end_of_word {
-                    words += 1;
-                }
-                end_of_word = false;
-            }
-        }
-    }
-
+fn counter(contents: &str, span: Span) -> Value {
+    let counts = uwc_count(&ALL_COUNTERS[..], contents);
     let mut cols = vec![];
     let mut vals = vec![];
 
     cols.push("lines".into());
-    vals.push(Value::Int { val: lines, span });
+    vals.push(Value::Int {
+        val: match counts.get(&Counter::Lines) {
+            Some(c) => *c as i64,
+            None => 0,
+        },
+        span,
+    });
 
     cols.push("words".into());
-    vals.push(Value::Int { val: words, span });
-
-    cols.push("chars".into());
-    vals.push(Value::Int { val: chars, span });
+    vals.push(Value::Int {
+        val: match counts.get(&Counter::Words) {
+            Some(c) => *c as i64,
+            None => 0,
+        },
+        span,
+    });
 
     cols.push("bytes".into());
-    vals.push(Value::Int { val: bytes, span });
+    vals.push(Value::Int {
+        val: match counts.get(&Counter::Bytes) {
+            Some(c) => *c as i64,
+            None => 0,
+        },
+        span,
+    });
+
+    cols.push("chars".into());
+    vals.push(Value::Int {
+        val: match counts.get(&Counter::CodePoints) {
+            Some(c) => *c as i64,
+            None => 0,
+        },
+        span,
+    });
+
+    cols.push("graphemes".into());
+    vals.push(Value::Int {
+        val: match counts.get(&Counter::GraphemeClusters) {
+            Some(c) => *c as i64,
+            None => 0,
+        },
+        span,
+    });
 
     Value::Record { cols, vals, span }
 }
 
+/// Take all the counts in `other_counts` and sum them into `accum`.
+// pub fn sum_counts(accum: &mut Counted, other_counts: &Counted) {
+//     for (counter, count) in other_counts {
+//         let entry = accum.entry(*counter).or_insert(0);
+//         *entry += count;
+//     }
+// }
+
+/// Sums all the `Counted` instances into a new one.
+// pub fn sum_all_counts<'a, I>(counts: I) -> Counted
+// where
+//     I: IntoIterator<Item = &'a Counted>,
+// {
+//     let mut totals = BTreeMap::new();
+//     for counts in counts {
+//         sum_counts(&mut totals, counts);
+//     }
+//     totals
+// }
+
+/// Something that counts things in `&str`s.
+pub trait Count {
+    /// Counts something in the given `&str`.
+    fn count(&self, s: &str) -> usize;
+}
+
+impl Count for Counter {
+    fn count(&self, s: &str) -> usize {
+        let newlines: HashSet<&'static str> = HashSet::from([CR, LF, CRLF, NEL, FF, LS, PS]);
+
+        match *self {
+            Counter::GraphemeClusters => s.graphemes(true).count(),
+            Counter::Bytes => s.len(),
+            Counter::Lines => s
+                .graphemes(true)
+                .filter(|grapheme| newlines.contains(grapheme))
+                .count(),
+            Counter::Words => s.unicode_words().count(),
+            Counter::CodePoints => s.chars().count(),
+        }
+    }
+}
+
+/// Different types of counters.
+#[derive(Debug, Hash, PartialEq, Eq, PartialOrd, Ord, Copy, Clone)]
+pub enum Counter {
+    /// Counts lines.
+    Lines,
+
+    /// Counts words.
+    Words,
+
+    /// Counts the total number of bytes.
+    Bytes,
+
+    /// Counts grapheme clusters. The input is required to be valid UTF-8.
+    GraphemeClusters,
+
+    /// Counts unicode code points
+    CodePoints,
+}
+
+/// A convenience array of all counter types.
+pub const ALL_COUNTERS: [Counter; 5] = [
+    Counter::GraphemeClusters,
+    Counter::Bytes,
+    Counter::Lines,
+    Counter::Words,
+    Counter::CodePoints,
+];
+
+impl fmt::Display for Counter {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let s = match *self {
+            Counter::GraphemeClusters => "graphemes",
+            Counter::Bytes => "bytes",
+            Counter::Lines => "lines",
+            Counter::Words => "words",
+            Counter::CodePoints => "codepoints",
+        };
+
+        write!(f, "{}", s)
+    }
+}
+
+/// Counts the given `Counter`s in the given `&str`.
+pub fn uwc_count<'a, I>(counters: I, s: &str) -> Counted
+where
+    I: IntoIterator<Item = &'a Counter>,
+{
+    let mut counts: Counted = counters.into_iter().map(|c| (*c, c.count(s))).collect();
+    if let Some(lines) = counts.get_mut(&Counter::Lines) {
+        // this part is all about having things like this return 1 line
+        // "There are seven words in this sentence" | size
+        if s.is_empty() {
+            *lines = 0;
+        } else if *lines == 0 && !s.is_empty() {
+            *lines = 1;
+        } else {
+            // no change
+        }
+    }
+
+    // let lines_count = match counts.get(&Counter::Lines) {
+    //     Some(c) => {
+    //         if s.is_empty() {
+    //             0
+    //         } else if *c == 0 && !s.is_empty() {
+    //             1
+    //         } else {
+    //             *c
+    //         }
+    //     }
+    //     None => 0,
+    // };
+    // counts.remove(&Counter::Lines);
+    // counts.insert(Counter::Lines, lines_count);
+    counts
+}
+
 #[cfg(test)]
 mod test {
     use super::*;
@@ -171,3 +357,74 @@ mod test {
         test_examples(Size {})
     }
 }
+
+#[test]
+fn test_count_counts_lines() {
+    // * \r\n is a single grapheme cluster
+    // * trailing newlines are counted
+    // * NEL is 2 bytes
+    // * FF is 1 byte
+    // * LS is 3 bytes
+    // * PS is 3 bytes
+    let mut s = String::from("foo\r\nbar\n\nbaz");
+    s += NEL;
+    s += "quux";
+    s += FF;
+    s += LS;
+    s += "xi";
+    s += PS;
+    s += "\n";
+
+    let counts = uwc_count(&ALL_COUNTERS[..], &s);
+
+    let mut correct_counts = BTreeMap::new();
+    correct_counts.insert(Counter::GraphemeClusters, 23);
+    correct_counts.insert(Counter::Lines, 8);
+    correct_counts.insert(Counter::Bytes, 29);
+    correct_counts.insert(Counter::Words, 5);
+
+    // one more than grapheme clusters because of \r\n
+    correct_counts.insert(Counter::CodePoints, 24);
+
+    assert_eq!(correct_counts, counts);
+}
+
+#[test]
+fn test_count_counts_words() {
+    let i_can_eat_glass = "Μπορῶ νὰ φάω σπασμένα γυαλιὰ χωρὶς νὰ πάθω τίποτα.";
+    let s = String::from(i_can_eat_glass);
+
+    let counts = uwc_count(&ALL_COUNTERS[..], &s);
+
+    let mut correct_counts = BTreeMap::new();
+    correct_counts.insert(Counter::GraphemeClusters, 50);
+    correct_counts.insert(Counter::Lines, 1);
+    correct_counts.insert(Counter::Bytes, i_can_eat_glass.len());
+    correct_counts.insert(Counter::Words, 9);
+    correct_counts.insert(Counter::CodePoints, 50);
+
+    assert_eq!(correct_counts, counts);
+}
+
+#[test]
+fn test_count_counts_codepoints() {
+    // these are NOT the same! `one` is the single codepoint U+00E9; `two` is `e` + combining acute U+0301
+    let one = "\u{e9}";
+    let two = "e\u{301}";
+
+    let counters = [Counter::CodePoints];
+
+    let counts = uwc_count(&counters[..], &one);
+
+    let mut correct_counts = BTreeMap::new();
+    correct_counts.insert(Counter::CodePoints, 1);
+
+    assert_eq!(correct_counts, counts);
+
+    let counts = uwc_count(&counters[..], &two);
+
+    let mut correct_counts = BTreeMap::new();
+    correct_counts.insert(Counter::CodePoints, 2);
+
+    assert_eq!(correct_counts, counts);
+}