Clean up lexer (#2956)

* Document the lexer and lightly improve its names

The bulk of this pull request adds a substantial amount of new inline
documentation for the lexer. Along the way, I made a few minor changes
to the names in the lexer, most of which were internal.

The main change that affects other files is renaming `group` to `block`,
since the function is actually parsing a block (a list of groups).

* Further clean up the lexer

- Consolidate the logic of the various token builders into a single type
- Improve and clean up the event-driven BlockParser
- Clean up comment parsing. Comments now contain their original leading
  whitespace as well as trailing whitespace, and know how to move some
  leading whitespace back into the body based on how the lexer decides
  to dedent the comments. This preserves the original whitespace
  information while still making it straightforward to eliminate leading
  whitespace in help comments.

* Update meta.rs

* WIP

* fix clippy

* remove unwraps

* remove unwraps

Co-authored-by: Jonathan Turner <jonathandturner@users.noreply.github.com>
Co-authored-by: Jonathan Turner <jonathan.d.turner@gmail.com>
This commit is contained in:
Yehuda Katz 2021-02-03 23:20:21 -08:00 committed by GitHub
parent fb1846120d
commit d07789677f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
16 changed files with 1288 additions and 1062 deletions

13
Cargo.lock generated
View File

@ -3125,6 +3125,7 @@ dependencies = [
"derive_is_enum_variant", "derive_is_enum_variant",
"dunce", "dunce",
"indexmap", "indexmap",
"itertools",
"log 0.4.13", "log 0.4.13",
"nu-errors", "nu-errors",
"nu-protocol", "nu-protocol",
@ -3134,6 +3135,7 @@ dependencies = [
"num-traits 0.2.14", "num-traits 0.2.14",
"serde 1.0.120", "serde 1.0.120",
"shellexpand", "shellexpand",
"smart-default",
] ]
[[package]] [[package]]
@ -5086,6 +5088,17 @@ version = "1.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e" checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e"
[[package]]
name = "smart-default"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "133659a15339456eeeb07572eb02a91c91e9815e9cbc89566944d2c8d3efdbf6"
dependencies = [
"proc-macro2",
"quote 1.0.8",
"syn 1.0.58",
]
[[package]] [[package]]
name = "socket2" name = "socket2"
version = "0.3.19" version = "0.3.19"

View File

@ -18,81 +18,115 @@ members = ["crates/*/"]
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies] [dependencies]
nu-cli = {version = "0.26.0", path = "./crates/nu-cli", default-features=false} nu-cli = { version = "0.26.0", path = "./crates/nu-cli", default-features = false }
nu-command = {version = "0.26.0", path = "./crates/nu-command"} nu-command = { version = "0.26.0", path = "./crates/nu-command" }
nu-data = {version = "0.26.0", path = "./crates/nu-data"} nu-data = { version = "0.26.0", path = "./crates/nu-data" }
nu-engine = {version = "0.26.0", path = "./crates/nu-engine"} nu-engine = { version = "0.26.0", path = "./crates/nu-engine" }
nu-errors = {version = "0.26.0", path = "./crates/nu-errors"} nu-errors = { version = "0.26.0", path = "./crates/nu-errors" }
nu-parser = {version = "0.26.0", path = "./crates/nu-parser"} nu-parser = { version = "0.26.0", path = "./crates/nu-parser" }
nu-plugin = {version = "0.26.0", path = "./crates/nu-plugin"} nu-plugin = { version = "0.26.0", path = "./crates/nu-plugin" }
nu-protocol = {version = "0.26.0", path = "./crates/nu-protocol"} nu-protocol = { version = "0.26.0", path = "./crates/nu-protocol" }
nu-source = {version = "0.26.0", path = "./crates/nu-source"} nu-source = { version = "0.26.0", path = "./crates/nu-source" }
nu-value-ext = {version = "0.26.0", path = "./crates/nu-value-ext"} nu-value-ext = { version = "0.26.0", path = "./crates/nu-value-ext" }
nu_plugin_binaryview = {version = "0.26.0", path = "./crates/nu_plugin_binaryview", optional = true} nu_plugin_binaryview = { version = "0.26.0", path = "./crates/nu_plugin_binaryview", optional = true }
nu_plugin_chart = {version = "0.26.0", path = "./crates/nu_plugin_chart", optional = true} nu_plugin_chart = { version = "0.26.0", path = "./crates/nu_plugin_chart", optional = true }
nu_plugin_fetch = {version = "0.26.0", path = "./crates/nu_plugin_fetch", optional = true} nu_plugin_fetch = { version = "0.26.0", path = "./crates/nu_plugin_fetch", optional = true }
nu_plugin_from_bson = {version = "0.26.0", path = "./crates/nu_plugin_from_bson", optional = true} nu_plugin_from_bson = { version = "0.26.0", path = "./crates/nu_plugin_from_bson", optional = true }
nu_plugin_from_sqlite = {version = "0.26.0", path = "./crates/nu_plugin_from_sqlite", optional = true} nu_plugin_from_sqlite = { version = "0.26.0", path = "./crates/nu_plugin_from_sqlite", optional = true }
nu_plugin_inc = {version = "0.26.0", path = "./crates/nu_plugin_inc", optional = true} nu_plugin_inc = { version = "0.26.0", path = "./crates/nu_plugin_inc", optional = true }
nu_plugin_match = {version = "0.26.0", path = "./crates/nu_plugin_match", optional = true} nu_plugin_match = { version = "0.26.0", path = "./crates/nu_plugin_match", optional = true }
nu_plugin_post = {version = "0.26.0", path = "./crates/nu_plugin_post", optional = true} nu_plugin_post = { version = "0.26.0", path = "./crates/nu_plugin_post", optional = true }
nu_plugin_ps = {version = "0.26.0", path = "./crates/nu_plugin_ps", optional = true} nu_plugin_ps = { version = "0.26.0", path = "./crates/nu_plugin_ps", optional = true }
nu_plugin_s3 = {version = "0.26.0", path = "./crates/nu_plugin_s3", optional = true} nu_plugin_s3 = { version = "0.26.0", path = "./crates/nu_plugin_s3", optional = true }
nu_plugin_selector = {version = "0.26.0", path = "./crates/nu_plugin_selector", optional = true} nu_plugin_selector = { version = "0.26.0", path = "./crates/nu_plugin_selector", optional = true }
nu_plugin_start = {version = "0.26.0", path = "./crates/nu_plugin_start", optional = true} nu_plugin_start = { version = "0.26.0", path = "./crates/nu_plugin_start", optional = true }
nu_plugin_sys = {version = "0.26.0", path = "./crates/nu_plugin_sys", optional = true} nu_plugin_sys = { version = "0.26.0", path = "./crates/nu_plugin_sys", optional = true }
nu_plugin_textview = {version = "0.26.0", path = "./crates/nu_plugin_textview", optional = true} nu_plugin_textview = { version = "0.26.0", path = "./crates/nu_plugin_textview", optional = true }
nu_plugin_to_bson = {version = "0.26.0", path = "./crates/nu_plugin_to_bson", optional = true} nu_plugin_to_bson = { version = "0.26.0", path = "./crates/nu_plugin_to_bson", optional = true }
nu_plugin_to_sqlite = {version = "0.26.0", path = "./crates/nu_plugin_to_sqlite", optional = true} nu_plugin_to_sqlite = { version = "0.26.0", path = "./crates/nu_plugin_to_sqlite", optional = true }
nu_plugin_tree = {version = "0.26.0", path = "./crates/nu_plugin_tree", optional = true} nu_plugin_tree = { version = "0.26.0", path = "./crates/nu_plugin_tree", optional = true }
nu_plugin_xpath = {version = "0.26.0", path = "./crates/nu_plugin_xpath", optional = true} nu_plugin_xpath = { version = "0.26.0", path = "./crates/nu_plugin_xpath", optional = true }
# Required to bootstrap the main binary # Required to bootstrap the main binary
clap = "2.33.3" clap = "2.33.3"
ctrlc = {version = "3.1.6", optional = true} ctrlc = { version = "3.1.6", optional = true }
futures = {version = "0.3.5", features = ["compat", "io-compat"]} futures = { version = "0.3.5", features = ["compat", "io-compat"] }
itertools = "0.10.0" itertools = "0.10.0"
log = "0.4.11" log = "0.4.11"
pretty_env_logger = "0.4.0" pretty_env_logger = "0.4.0"
[dev-dependencies] [dev-dependencies]
dunce = "1.0.1" dunce = "1.0.1"
nu-test-support = {version = "0.26.0", path = "./crates/nu-test-support"} nu-test-support = { version = "0.26.0", path = "./crates/nu-test-support" }
[build-dependencies] [build-dependencies]
[features] [features]
ctrlc-support = ["nu-cli/ctrlc", "nu-command/ctrlc"] ctrlc-support = ["nu-cli/ctrlc", "nu-command/ctrlc"]
directories-support = ["nu-cli/directories", "nu-cli/dirs", "nu-command/directories", "nu-command/dirs", "nu-data/directories", "nu-data/dirs", "nu-engine/dirs"] directories-support = [
"nu-cli/directories",
"nu-cli/dirs",
"nu-command/directories",
"nu-command/dirs",
"nu-data/directories",
"nu-data/dirs",
"nu-engine/dirs",
]
ptree-support = ["nu-cli/ptree", "nu-command/ptree"] ptree-support = ["nu-cli/ptree", "nu-command/ptree"]
rustyline-support = ["nu-cli/rustyline-support", "nu-command/rustyline-support"] rustyline-support = ["nu-cli/rustyline-support", "nu-command/rustyline-support"]
term-support = ["nu-cli/term", "nu-command/term"] term-support = ["nu-cli/term", "nu-command/term"]
uuid-support = ["nu-cli/uuid_crate", "nu-command/uuid_crate"] uuid-support = ["nu-cli/uuid_crate", "nu-command/uuid_crate"]
which-support = ["nu-cli/ichwh", "nu-cli/which", "nu-command/ichwh", "nu-command/which"] which-support = [
"nu-cli/ichwh",
"nu-cli/which",
"nu-command/ichwh",
"nu-command/which",
]
default = [ default = [
"nu-cli/shadow-rs", "nu-cli/shadow-rs",
"sys", "sys",
"ps", "ps",
"textview", "textview",
"inc", "inc",
"directories-support", "directories-support",
"ctrlc-support", "ctrlc-support",
"which-support", "which-support",
"ptree-support", "ptree-support",
"term-support", "term-support",
"uuid-support", "uuid-support",
"rustyline-support", "rustyline-support",
"match", "match",
"post", "post",
"fetch", "fetch",
"zip-support", "zip-support",
] ]
extra = ["default", "binaryview", "tree", "clipboard-cli", "trash-support", "start", "bson", "sqlite", "s3", "chart", "xpath", "selector"]
stable = ["default"]
wasi = ["inc", "match", "ptree-support", "match", "tree", "rustyline-support"] stable = ["default"]
extra = [
"default",
"binaryview",
"tree",
"clipboard-cli",
"trash-support",
"start",
"bson",
"sqlite",
"s3",
"chart",
"xpath",
"selector",
]
wasi = [
"inc",
"match",
"ptree-support",
"match",
"tree",
"rustyline-support",
]
trace = ["nu-parser/trace"] trace = ["nu-parser/trace"]

View File

@ -178,7 +178,7 @@ pub fn test_anchors(cmd: Command) -> Result<(), ShellError> {
let block = parse_line(&pipeline_with_anchor, &ctx)?; let block = parse_line(&pipeline_with_anchor, &ctx)?;
if let Some(_) = &sample_pipeline.result { if sample_pipeline.result.is_some() {
let result = block_on(evaluate_block(block, &mut ctx))?; let result = block_on(evaluate_block(block, &mut ctx))?;
ctx.with_errors(|reasons| reasons.iter().cloned().take(1).next()) ctx.with_errors(|reasons| reasons.iter().cloned().take(1).next())

View File

@ -9,22 +9,24 @@ version = "0.26.0"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies] [dependencies]
bigdecimal = {version = "0.2.0", features = ["serde"]} bigdecimal = { version = "0.2.0", features = ["serde"] }
codespan-reporting = "0.11.0" codespan-reporting = "0.11.0"
derive-new = "0.5.8" derive-new = "0.5.8"
derive_is_enum_variant = "0.1.1" derive_is_enum_variant = "0.1.1"
indexmap = {version = "1.6.1", features = ["serde-1"]} indexmap = { version = "1.6.1", features = ["serde-1"] }
log = "0.4.11" log = "0.4.11"
num-bigint = {version = "0.3.1", features = ["serde"]} num-bigint = { version = "0.3.1", features = ["serde"] }
num-traits = "0.2.14" num-traits = "0.2.14"
serde = "1.0.118" serde = "1.0.118"
shellexpand = "2.1.0" shellexpand = "2.1.0"
itertools = "0.10.0"
smart-default = "0.6.0"
dunce = "1.0.1" dunce = "1.0.1"
nu-errors = {version = "0.26.0", path = "../nu-errors"} nu-errors = { version = "0.26.0", path = "../nu-errors" }
nu-protocol = {version = "0.26.0", path = "../nu-protocol"} nu-protocol = { version = "0.26.0", path = "../nu-protocol" }
nu-source = {version = "0.26.0", path = "../nu-source"} nu-source = { version = "0.26.0", path = "../nu-source" }
nu-test-support = {version = "0.26.0", path = "../nu-test-support"} nu-test-support = { version = "0.26.0", path = "../nu-test-support" }
[features] [features]
stable = [] stable = []

View File

@ -1,993 +0,0 @@
use std::str::CharIndices;
use std::{fmt, iter::Peekable};
use nu_source::{Span, Spanned, SpannedItem};
use nu_errors::ParseError;
type Input<'t> = Peekable<CharIndices<'t>>;
/// A single lexed token: its classified contents paired with a source span.
#[derive(Debug)]
pub struct Token {
/// What kind of token this is, along with any text it carries.
pub contents: TokenContents,
/// The span associated with this token.
pub span: Span,
}
impl Token {
pub fn new(contents: TokenContents, span: Span) -> Token {
Token { contents, span }
}
}
/// The classification of a token produced by `lex`.
#[derive(Debug, PartialEq, is_enum_variant)]
pub enum TokenContents {
/// A baseline token is an atomic chunk of source code. This means that the
/// token contains the entirety of string literals, as well as the entirety
/// of sections delimited by paired delimiters.
///
/// For example, if the token begins with `{`, the baseline token continues
/// until the closing `}` (after taking comments and string literals into
/// consideration).
Baseline(String),
/// The text of a line comment. `lex` does not copy the leading `#` into
/// this string.
Comment(String),
/// A single `|`. (`lex` emits `||` as a `Baseline` token instead.)
Pipe,
/// A `;`.
Semicolon,
/// An end-of-line character; `lex` emits one for each `\n` or `\r`.
EOL,
}
impl fmt::Display for TokenContents {
    /// Writes the token's textual form: baseline text as-is, comments with a
    /// leading `#`, and an EOL as the two characters `\` and `n`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Tokens that carry text are formatted directly; the remaining
        // variants each map to a fixed string.
        let fixed = match self {
            TokenContents::Baseline(base) => return write!(f, "{}", base),
            TokenContents::Comment(comm) => return write!(f, "#{}", comm),
            TokenContents::Pipe => "|",
            TokenContents::Semicolon => ";",
            TokenContents::EOL => "\\n",
        };

        f.write_str(fixed)
    }
}
/// A `LiteCommand` is a list of words that will get meaning when processed by
/// the parser.
#[derive(Debug, Clone)]
pub struct LiteCommand {
/// The words of the command, each with its span.
pub parts: Vec<Spanned<String>>,
/// Preceding comments. Each `String` in the vec is one line. The comment
/// literal (`#`) is not included.
pub comments: Option<Vec<Spanned<String>>>,
}
impl LiteCommand {
    /// Creates an empty command with no parts and no attached comments.
    fn new() -> LiteCommand {
        LiteCommand {
            parts: Vec::new(),
            comments: None,
        }
    }

    /// Returns the attached comment lines joined with `\n`, or an empty
    /// string when the command has no comments.
    pub fn comments_joined(&self) -> String {
        match &self.comments {
            None => String::new(),
            // Borrow each line rather than cloning it: `join` only needs
            // `&str` and allocates the single resulting `String` itself.
            Some(lines) => lines
                .iter()
                .map(|line| line.item.as_str())
                .collect::<Vec<_>>()
                .join("\n"),
        }
    }

    /// Returns `true` when the command has no parts.
    pub fn is_empty(&self) -> bool {
        self.parts.is_empty()
    }

    /// Returns `true` when the command has at least one part.
    pub fn has_content(&self) -> bool {
        !self.is_empty()
    }

    /// Appends a word to the command.
    pub fn push(&mut self, item: Spanned<String>) {
        self.parts.push(item)
    }

    /// Returns the span from the start of the first part to the end of the
    /// last part; either endpoint is `0` when the command has no parts.
    pub(crate) fn span(&self) -> Span {
        let start = self.parts.first().map_or(0, |part| part.span.start());
        let end = self.parts.last().map_or(0, |part| part.span.end());

        Span::new(start, end)
    }
}
/// A `LitePipeline` is a series of `LiteCommand`s, separated by `|`.
#[derive(Debug, Clone)]
pub struct LitePipeline {
/// The commands that make up this pipeline.
pub commands: Vec<LiteCommand>,
}
impl Default for LitePipeline {
fn default() -> Self {
Self::new()
}
}
impl LitePipeline {
    /// Creates a pipeline with no commands.
    pub fn new() -> Self {
        Self {
            commands: Vec::new(),
        }
    }

    /// Returns `true` when the pipeline has no commands.
    pub fn is_empty(&self) -> bool {
        self.commands.is_empty()
    }

    /// Returns `true` when the pipeline has at least one command.
    pub fn has_content(&self) -> bool {
        !self.is_empty()
    }

    /// Appends a command to the pipeline.
    pub fn push(&mut self, item: LiteCommand) {
        self.commands.push(item)
    }

    /// Returns the span from the start of the first command to the end of the
    /// last command, or `0..0` when the pipeline is empty.
    pub(crate) fn span(&self) -> Span {
        match (self.commands.first(), self.commands.last()) {
            (Some(first), Some(last)) => Span::new(first.span().start(), last.span().end()),
            _ => Span::new(0, 0),
        }
    }
}
/// A `LiteGroup` is a series of `LitePipeline`s, separated by `;`.
#[derive(Debug, Clone)]
pub struct LiteGroup {
/// The pipelines that make up this group.
pub pipelines: Vec<LitePipeline>,
}
impl Default for LiteGroup {
fn default() -> Self {
Self::new()
}
}
impl LiteGroup {
    /// Creates a group with no pipelines.
    pub fn new() -> Self {
        Self {
            pipelines: Vec::new(),
        }
    }

    /// Returns `true` when the group has no pipelines.
    pub fn is_empty(&self) -> bool {
        self.pipelines.is_empty()
    }

    /// Returns `true` when the group has at least one pipeline.
    pub fn has_content(&self) -> bool {
        !self.is_empty()
    }

    /// Appends a pipeline to the group.
    pub fn push(&mut self, item: LitePipeline) {
        self.pipelines.push(item)
    }

    /// Returns the span from the start of the first pipeline to the end of
    /// the last pipeline, or `0..0` when the group is empty.
    #[cfg(test)]
    pub(crate) fn span(&self) -> Span {
        match (self.pipelines.first(), self.pipelines.last()) {
            (Some(first), Some(last)) => Span::new(first.span().start(), last.span().end()),
            _ => Span::new(0, 0),
        }
    }
}
/// A `LiteBlock` is a series of `LiteGroup`s, separated by newlines.
#[derive(Debug, Clone)]
pub struct LiteBlock {
/// The groups that make up this block.
pub block: Vec<LiteGroup>,
}
impl LiteBlock {
pub fn new(block: Vec<LiteGroup>) -> Self {
Self { block }
}
pub fn is_empty(&self) -> bool {
self.block.is_empty()
}
pub fn push(&mut self, item: LiteGroup) {
self.block.push(item)
}
#[cfg(test)]
pub(crate) fn span(&self) -> Span {
let start = if !self.block.is_empty() {
self.block[0].span().start()
} else {
0
};
if let Some((last, _)) = self.block[..].split_last() {
Span::new(start, last.span().end())
} else {
Span::new(start, 0)
}
}
}
/// The kinds of paired delimiters that `baseline` tracks while it is inside a
/// nested section of a token.
#[derive(Clone, Copy)]
enum BlockKind {
    Paren,
    CurlyBracket,
    SquareBracket,
}

impl BlockKind {
    /// Returns the character that closes this kind of delimiter pair.
    fn closing(self) -> char {
        match self {
            Self::SquareBracket => ']',
            Self::CurlyBracket => '}',
            Self::Paren => ')',
        }
    }
}
/// Finds the extents of a basline token, returning the string with its
/// associated span, along with any parse error that was discovered along the
/// way.
///
/// Baseline tokens are unparsed content separated by spaces or a command
/// separator (like pipe or semicolon) Baseline tokens may be surrounded by
/// quotes (single, double, or backtick) or braces (square, paren, curly)
///
/// Baseline tokens may be further processed based on the needs of the syntax
/// shape that encounters them. They are still lightly lexed. For example, if a
/// baseline token begins with `{`, the entire token will continue until the
/// closing `}`, taking comments into consideration.
pub fn baseline(src: &mut Input, span_offset: usize) -> (Spanned<String>, Option<ParseError>) {
let mut token_contents = String::new();
let start_offset = if let Some((pos, _)) = src.peek() {
*pos
} else {
0
};
// This variable tracks the starting character of a string literal, so that
// we remain inside the string literal lexer mode until we encounter the
// closing quote.
let mut quote_start: Option<char> = None;
// This Vec tracks paired delimiters
let mut block_level: Vec<BlockKind> = vec![];
// A baseline token is terminated if it's not nested inside of a paired
// delimiter and the next character is one of: `|`, `;`, `#` or any
// whitespace.
fn is_termination(block_level: &[BlockKind], c: char) -> bool {
block_level.is_empty() && (c.is_whitespace() || c == '|' || c == ';' || c == '#')
}
// The process of slurping up a baseline token repeats:
//
// - String literal, which begins with `'`, `"` or `\``, and continues until
// the same character is encountered again.
// - Delimiter pair, which begins with `[`, `(`, or `{`, and continues until
// the matching closing delimiter is found, skipping comments and string
// literals.
// - When not nested inside of a delimiter pair, when a terminating
// character (whitespace, `|`, `;` or `#`) is encountered, the baseline
// token is done.
// - Otherwise, accumulate the character into the current baseline token.
while let Some((_, c)) = src.peek() {
let c = *c;
if quote_start.is_some() {
// If we encountered the closing quote character for the current
// string, we're done with the current string.
if Some(c) == quote_start {
quote_start = None;
}
} else if c == '\'' || c == '"' || c == '`' {
// We encountered the opening quote of a string literal.
quote_start = Some(c);
} else if c == '[' {
// We encountered an opening `[` delimiter.
block_level.push(BlockKind::SquareBracket);
} else if c == ']' {
// We encountered a closing `]` delimiter. Pop off the opening `[`
// delimiter.
if let Some(BlockKind::SquareBracket) = block_level.last() {
let _ = block_level.pop();
}
} else if c == '{' {
// We encountered an opening `{` delimiter.
block_level.push(BlockKind::CurlyBracket);
} else if c == '}' {
// We encountered a closing `}` delimiter. Pop off the opening `{`.
if let Some(BlockKind::CurlyBracket) = block_level.last() {
let _ = block_level.pop();
}
} else if c == '(' {
// We enceountered an opening `(` delimiter.
block_level.push(BlockKind::Paren);
} else if c == ')' {
// We encountered a closing `)` delimiter. Pop off the opening `(`.
if let Some(BlockKind::Paren) = block_level.last() {
let _ = block_level.pop();
}
} else if is_termination(&block_level, c) {
break;
}
// Otherwise, accumulate the character into the current token.
token_contents.push(c);
// Consume the character.
let _ = src.next();
}
let span = Span::new(
start_offset + span_offset,
start_offset + span_offset + token_contents.len(),
);
// If there is still unclosed opening delimiters, close them and add
// synthetic closing characters to the accumulated token.
if let Some(block) = block_level.last() {
let delim: char = (*block).closing();
let cause = ParseError::unexpected_eof(delim.to_string(), span);
while let Some(bk) = block_level.pop() {
token_contents.push(bk.closing());
}
return (token_contents.spanned(span), Some(cause));
}
if let Some(delimiter) = quote_start {
// The non-lite parse trims quotes on both sides, so we add the expected quote so that
// anyone wanting to consume this partial parse (e.g., completions) will be able to get
// correct information from the non-lite parse.
token_contents.push(delimiter);
return (
token_contents.spanned(span),
Some(ParseError::unexpected_eof(delimiter.to_string(), span)),
);
}
// If we didn't accumulate any characters, it's an unexpected error.
if token_contents.is_empty() {
return (
token_contents.spanned(span),
Some(ParseError::unexpected_eof("command".to_string(), span)),
);
}
(token_contents.spanned(span), None)
}
/// Try to parse a list of tokens into a block.
///
/// Tokens are rolled up bottom-to-top: baseline tokens accumulate into the
/// current command, `|` closes the command into the current pipeline, `;`
/// closes the pipeline into the current group, and an end of line closes the
/// group into the block. Comments seen before a command are attached to the
/// next command that is finished.
///
/// Returns the block built so far together with an error if a `|` was
/// encountered while the current command was empty; in that case parsing
/// stops at that token.
pub fn block(tokens: Vec<Token>) -> (LiteBlock, Option<ParseError>) {
    // Accumulate chunks of tokens into groups.
    let mut groups = vec![];

    // The current group
    let mut group = LiteGroup::new();

    // The current pipeline
    let mut pipeline = LitePipeline::new();

    // The current command
    let mut command = LiteCommand::new();

    // Comments waiting to be attached to the next finished command, and the
    // indentation (in leading spaces) of the first comment of that run.
    let mut prev_comments = None;
    let mut prev_comment_indent = 0;

    let mut prev_token: Option<Token> = None;

    // Moves the current command (with any pending comments attached) into the
    // pipeline and leaves a fresh, empty command in its place.
    fn finish_command(
        prev_comments: &mut Option<Vec<Spanned<String>>>,
        command: &mut LiteCommand,
        pipeline: &mut LitePipeline,
    ) {
        // `take` hands over the pending comments and resets them to `None`
        // in one step, without cloning the vector.
        if let Some(comments) = prev_comments.take() {
            command.comments = Some(comments);
        }

        // Move the finished command out instead of cloning it.
        pipeline.push(std::mem::replace(command, LiteCommand::new()));
    }

    for token in tokens {
        match &token.contents {
            TokenContents::EOL => {
                // We encountered a newline character. If the last token on the
                // current line is a `|`, continue the current group on the next
                // line. Otherwise, close up the current group by rolling up the
                // current command into the current pipeline, and then roll up
                // the current pipeline into the group.
                if let Some(prev) = &prev_token {
                    // Note that `continue` skips the `prev_token` update
                    // below, so the pipe stays the "previous" token.
                    if let TokenContents::Pipe = prev.contents {
                        continue;
                    }

                    if let TokenContents::EOL = prev.contents {
                        // If we have an empty line we discard previous
                        // comments as they are not part of a command.
                        // Example nu code:
                        //
                        //   #I am a comment getting discarded
                        //
                        //   def e [] {echo hi}
                        prev_comments = None
                    }
                }

                // If we have an open command, push it into the current
                // pipeline.
                if command.has_content() {
                    finish_command(&mut prev_comments, &mut command, &mut pipeline);
                }

                // If we have an open pipeline, push it into the current group.
                if pipeline.has_content() {
                    group.push(std::mem::take(&mut pipeline));
                }

                // If we have an open group, accumulate it into `groups`.
                if group.has_content() {
                    groups.push(std::mem::take(&mut group));
                }
            }
            TokenContents::Pipe => {
                // We encountered a pipe (`|`) character, which terminates a
                // command.
                if command.has_content() {
                    // Accumulate the current command into the current
                    // pipeline and start a new command.
                    finish_command(&mut prev_comments, &mut command, &mut pipeline);
                } else {
                    // If the current command doesn't have content, return an
                    // error that indicates that the `|` was unexpected.
                    return (
                        LiteBlock::new(groups),
                        Some(ParseError::extra_tokens(
                            "|".to_string().spanned(token.span),
                        )),
                    );
                }
            }
            TokenContents::Semicolon => {
                // We encountered a semicolon (`;`) character, which terminates
                // a pipeline.

                // If the current command has content, accumulate it into the
                // current pipeline and start a new command.
                if command.has_content() {
                    finish_command(&mut prev_comments, &mut command, &mut pipeline);
                }

                // If the current pipeline has content, accumulate it into the
                // current group and start a new pipeline.
                if pipeline.has_content() {
                    group.push(std::mem::take(&mut pipeline));
                }
            }
            TokenContents::Baseline(bare) => {
                // We encountered an unclassified token. Accumulate it into
                // the current command as a string.
                command.push(bare.to_string().spanned(token.span));
            }
            TokenContents::Comment(comment) => {
                if prev_comments.is_none() {
                    // First comment of a run: remember how many leading
                    // spaces it has. A comment with nothing but spaces keeps
                    // the previously recorded indent.
                    if let Some(indent) = comment.chars().position(|ch| ch != ' ') {
                        prev_comment_indent = indent;
                    }
                }

                // Strip at most `prev_comment_indent` leading spaces.
                let comment: String = comment
                    .chars()
                    .enumerate()
                    .skip_while(|(i, ch)| *i < prev_comment_indent && *ch == ' ')
                    .map(|(_, ch)| ch)
                    .collect();

                // Because we skipped some spaces at the start, the span needs
                // to be adjusted so that it still ends where the token ends.
                let comment_span = Span::new(token.span.end() - comment.len(), token.span.end());

                prev_comments
                    .get_or_insert_with(Vec::new)
                    .push(comment.spanned(comment_span));
            }
        }
        prev_token = Some(token);
    }

    // If the current command has content, accumulate it into the current pipeline.
    if command.has_content() {
        finish_command(&mut prev_comments, &mut command, &mut pipeline)
    }

    // If the current pipeline has content, accumulate it into the current group.
    if pipeline.has_content() {
        group.push(pipeline);
    }

    // If the current group has content, accumulate it into the list of groups.
    if group.has_content() {
        groups.push(group);
    }

    // Return a new LiteBlock with the accumulated list of groups.
    (LiteBlock::new(groups), None)
}
/// Breaks the input string into a vector of tokens. This tokenization only
/// tries to classify separators like semicolons, pipes, etc. from external
/// bare values (values that haven't been classified further).
///
/// Takes in a string and an offset, which is used to offset the spans created
/// (for when this function is used to parse inner strings).
///
/// Returns every token that was lexed together with the first error that was
/// encountered, if any. Lexing continues after an error.
pub fn lex(input: &str, span_offset: usize) -> (Vec<Token>, Option<ParseError>) {
    // Break the input slice into an iterator of Unicode characters.
    let mut char_indices = input.char_indices().peekable();
    let mut error = None;

    let mut output = vec![];

    // Set to `false` after a lone `|` and back to `true` after a baseline
    // token; a `;` seen while this is `false` is reported as an extra token.
    let mut is_complete = true;

    // The lexing process repeats. One character of lookahead is sufficient to
    // decide what to do next.
    //
    // - `|`: the token is either a `|` token or a `||` token
    // - `;`: the token is a semicolon
    // - `\n` or `\r`: the token is an EOL (end of line) token
    // - `#`: the token starts a line comment, which contains all of the
    //   subsequent characters until the next `\n`
    // - other whitespace: ignored
    // - anything else: the start of a baseline token
    while let Some(&(idx, c)) = char_indices.peek() {
        if c == '|' {
            // If the next character is `|`, it's either `|` or `||`.
            let _ = char_indices.next();

            // If the character after it is also `|`, we're looking at a `||`,
            // which is passed through as a baseline token.
            if let Some(&(next_idx, '|')) = char_indices.peek() {
                let _ = char_indices.next();
                output.push(Token::new(
                    TokenContents::Baseline("||".into()),
                    Span::new(span_offset + idx, span_offset + next_idx + 1),
                ));
                continue;
            }

            // Otherwise, it's just a regular `|` token.
            output.push(Token::new(
                TokenContents::Pipe,
                Span::new(span_offset + idx, span_offset + idx + 1),
            ));
            is_complete = false;
        } else if c == ';' {
            // If the next character is a `;`, we're looking at a semicolon token.
            let semicolon_span = Span::new(span_offset + idx, span_offset + idx + 1);

            if !is_complete && error.is_none() {
                // The error span includes `span_offset`, like every other
                // span produced by this function.
                error = Some(ParseError::extra_tokens(
                    ";".to_string().spanned(semicolon_span),
                ));
            }

            let _ = char_indices.next();
            output.push(Token::new(TokenContents::Semicolon, semicolon_span));
        } else if c == '\n' || c == '\r' {
            // If the next character is a newline, we're looking at an EOL (end of line) token.
            let _ = char_indices.next();
            output.push(Token::new(
                TokenContents::EOL,
                Span::new(span_offset + idx, span_offset + idx + 1),
            ));
        } else if c == '#' {
            // The comment text starts right after the `#`, which is not
            // copied into the comment string.
            let comment_start = idx + 1;
            let mut comment = String::new();
            char_indices.next();

            // Everything up to (but not including) the next `\n` belongs to
            // the comment.
            while let Some(&(_, c)) = char_indices.peek() {
                if c == '\n' {
                    break;
                }
                comment.push(c);
                let _ = char_indices.next();
            }

            // Compute the span before moving the text into the token, so the
            // comment string does not need to be cloned.
            let comment_span = Span::new(
                span_offset + comment_start,
                span_offset + comment_start + comment.len(),
            );
            output.push(Token::new(TokenContents::Comment(comment), comment_span));
        } else if c.is_whitespace() {
            // If the next character is non-newline whitespace, skip it.
            let _ = char_indices.next();
        } else {
            // Otherwise, try to consume an unclassified token.
            let (result, err) = baseline(&mut char_indices, span_offset);
            if error.is_none() {
                error = err;
            }
            is_complete = true;
            let Spanned { item, span } = result;
            output.push(Token::new(TokenContents::Baseline(item), span));
        }
    }

    (output, error)
}
#[cfg(test)]
mod tests {
use super::*;
/// Shorthand for building the `Span` expected by a test assertion.
fn span(left: usize, right: usize) -> Span {
    Span::new(left, right)
}
mod bare {
use super::*;
#[test]
fn simple_1() {
let input = "foo bar baz";
let (result, err) = lex(input, 0);
assert!(err.is_none());
assert_eq!(result[0].span, span(0, 3));
}
#[test]
fn simple_2() {
let input = "'foo bar' baz";
let (result, err) = lex(input, 0);
assert!(err.is_none());
assert_eq!(result[0].span, span(0, 9));
}
#[test]
fn simple_3() {
let input = "'foo\" bar' baz";
let (result, err) = lex(input, 0);
assert!(err.is_none());
assert_eq!(result[0].span, span(0, 10));
}
#[test]
fn simple_4() {
let input = "[foo bar] baz";
let (result, err) = lex(input, 0);
assert!(err.is_none());
assert_eq!(result[0].span, span(0, 9));
}
#[test]
fn simple_5() {
let input = "'foo 'bar baz";
let (result, err) = lex(input, 0);
assert!(err.is_none());
assert_eq!(result[0].span, span(0, 9));
}
#[test]
fn simple_6() {
let input = "''foo baz";
let (result, err) = lex(input, 0);
assert!(err.is_none());
assert_eq!(result[0].span, span(0, 5));
}
#[test]
fn simple_7() {
let input = "'' foo";
let (result, err) = lex(input, 0);
assert!(err.is_none());
assert_eq!(result[0].span, span(0, 2));
}
#[test]
fn simple_8() {
let input = " '' foo";
let (result, err) = lex(input, 0);
assert!(err.is_none());
assert_eq!(result[0].span, span(1, 3));
}
#[test]
fn simple_9() {
let input = " 'foo' foo";
let (result, err) = lex(input, 0);
assert!(err.is_none());
assert_eq!(result[0].span, span(1, 6));
}
#[test]
fn simple_10() {
let input = "[foo, bar]";
let (result, err) = lex(input, 0);
assert!(err.is_none());
assert_eq!(result[0].span, span(0, 10));
}
#[test]
fn lex_comment() {
let input = r#"
#A comment
def e [] {echo hi}
"#;
let (result, err) = lex(input, 0);
assert!(err.is_none());
//result[0] == EOL
assert_eq!(result[1].span, span(2, 11));
assert_eq!(
result[1].contents,
TokenContents::Comment("A comment".to_string())
);
}
#[test]
fn ignore_future() {
let input = "foo 'bar";
let (result, _) = lex(input, 0);
assert_eq!(result[0].span, span(0, 3));
}
#[test]
fn invalid_1() {
let input = "'foo bar";
let (_, err) = lex(input, 0);
assert!(err.is_some());
}
#[test]
fn invalid_2() {
let input = "'bar";
let (_, err) = lex(input, 0);
assert!(err.is_some());
}
#[test]
fn invalid_4() {
let input = " 'bar";
let (_, err) = lex(input, 0);
assert!(err.is_some());
}
}
mod lite_parse {
use super::*;
#[test]
fn pipeline() {
    // Two pipelines separated by `;`, the first made of two piped commands.
    let (tokens, lex_err) = lex("cmd1 | cmd2 ; deploy", 0);
    assert!(lex_err.is_none());

    let (parsed, parse_err) = block(tokens);
    assert!(parse_err.is_none());

    assert_eq!(parsed.span(), span(0, 20));

    let pipelines = &parsed.block[0].pipelines;
    assert_eq!(pipelines[0].span(), span(0, 11));
    assert_eq!(pipelines[1].span(), span(14, 20));
}
#[test]
fn simple_1() {
    // A single word becomes one block > one pipeline > one command > one part.
    let (tokens, lex_err) = lex("foo", 0);
    assert!(lex_err.is_none());

    let (parsed, parse_err) = block(tokens);
    assert!(parse_err.is_none());

    assert_eq!(parsed.block.len(), 1);

    let pipelines = &parsed.block[0].pipelines;
    assert_eq!(pipelines.len(), 1);

    let commands = &pipelines[0].commands;
    assert_eq!(commands.len(), 1);

    let parts = &commands[0].parts;
    assert_eq!(parts.len(), 1);
    assert_eq!(parts[0].span, span(0, 3));
}
#[test]
fn simple_offset() {
let (result, err) = lex("foo", 10);
assert!(err.is_none());
let (result, err) = block(result);
assert!(err.is_none());
assert_eq!(result.block[0].pipelines.len(), 1);
assert_eq!(result.block[0].pipelines[0].commands.len(), 1);
assert_eq!(result.block[0].pipelines[0].commands[0].parts.len(), 1);
assert_eq!(
result.block[0].pipelines[0].commands[0].parts[0].span,
span(10, 13)
);
}
#[test]
fn incomplete_result() {
let (result, err) = lex("my_command \"foo' --test", 10);
assert!(matches!(err.unwrap().reason(), nu_errors::ParseErrorReason::Eof { .. }));
let (result, _) = block(result);
assert_eq!(result.block.len(), 1);
assert_eq!(result.block[0].pipelines.len(), 1);
assert_eq!(result.block[0].pipelines[0].commands.len(), 1);
assert_eq!(result.block[0].pipelines[0].commands[0].parts.len(), 2);
assert_eq!(
result.block[0].pipelines[0].commands[0].parts[0].item,
"my_command"
);
assert_eq!(
result.block[0].pipelines[0].commands[0].parts[1].item,
"\"foo' --test\""
);
}
#[test]
fn command_with_comment() {
let code = r#"
# My echo
# * It's much better :)
def my_echo [arg] { echo $arg }
"#;
let (result, err) = lex(code, 0);
assert!(err.is_none());
let (result, err) = block(result);
assert!(err.is_none());
assert_eq!(result.block.len(), 1);
assert_eq!(result.block[0].pipelines.len(), 1);
assert_eq!(result.block[0].pipelines[0].commands.len(), 1);
assert_eq!(result.block[0].pipelines[0].commands[0].parts.len(), 4);
assert_eq!(
result.block[0].pipelines[0].commands[0].comments,
Some(vec![
//Leading space is trimmed
"My echo".to_string().spanned(Span::new(3, 10)),
"* It's much better :)"
.to_string()
.spanned(Span::new(13, 34))
])
);
}
#[test]
fn discarded_comment() {
let code = r#"
# This comment gets discarded, because of the following empty line
echo 42
"#;
let (result, err) = lex(code, 0);
assert!(err.is_none());
// assert_eq!(format!("{:?}", result), "");
let (result, err) = block(result);
assert!(err.is_none());
assert_eq!(result.block.len(), 1);
assert_eq!(result.block[0].pipelines.len(), 1);
assert_eq!(result.block[0].pipelines[0].commands.len(), 1);
assert_eq!(result.block[0].pipelines[0].commands[0].parts.len(), 2);
assert_eq!(result.block[0].pipelines[0].commands[0].comments, None);
}
}
#[test]
fn no_discarded_white_space_start_of_comment() {
    // The first comment line has no leading whitespace, so nothing is
    // stripped from the following lines either. The second line keeps its
    // three leading spaces, which is what makes its span 34 bytes (60..94).
    let code = r#"
#No white_space at firt line ==> No white_space discarded
#   Starting space is not discarded
echo 42
"#;
    let (tokens, err) = lex(code, 0);
    assert!(err.is_none());
    let (parsed, err) = block(tokens);
    assert!(err.is_none());
    assert_eq!(parsed.block.len(), 1);
    assert_eq!(parsed.block[0].pipelines.len(), 1);
    assert_eq!(parsed.block[0].pipelines[0].commands.len(), 1);
    assert_eq!(parsed.block[0].pipelines[0].commands[0].parts.len(), 2);
    assert_eq!(
        parsed.block[0].pipelines[0].commands[0].comments,
        Some(vec![
            "No white_space at firt line ==> No white_space discarded"
                .to_string()
                .spanned(Span::new(2, 58)),
            "   Starting space is not discarded"
                .to_string()
                .spanned(Span::new(60, 94)),
        ])
    );
}
#[test]
fn multiple_discarded_white_space_start_of_comment() {
    // Lines one and three start with two spaces after `#`, line two with
    // one; the asserted spans (bodies at 4, 23 and 42) depend on exactly
    // this amount of whitespace.
    let code = r#"
#  Discard 2 spaces
# Discard 1 space
#  Discard 2 spaces
echo 42
"#;
    let (tokens, err) = lex(code, 0);
    assert!(err.is_none());
    let (parsed, err) = block(tokens);
    assert!(err.is_none());
    assert_eq!(parsed.block.len(), 1);
    assert_eq!(parsed.block[0].pipelines.len(), 1);
    assert_eq!(parsed.block[0].pipelines[0].commands.len(), 1);
    assert_eq!(parsed.block[0].pipelines[0].commands[0].parts.len(), 2);
    assert_eq!(
        parsed.block[0].pipelines[0].commands[0].comments,
        Some(vec![
            "Discard 2 spaces".to_string().spanned(Span::new(4, 20)),
            "Discard 1 space".to_string().spanned(Span::new(23, 38)),
            "Discard 2 spaces".to_string().spanned(Span::new(42, 58)),
        ])
    );
}
}

View File

@ -0,0 +1,497 @@
use smart_default::SmartDefault;
use std::iter::Peekable;
use std::str::CharIndices;
use nu_errors::ParseError;
use nu_source::{HasSpan, Span, Spanned, SpannedItem};
use super::token_group::TokenBuilder;
use super::tokens::{
CommandBuilder, CommentsBuilder, GroupBuilder, LiteBlock, LiteCommand, LiteComment,
PipelineBuilder, TokenContents,
};
type Input<'t> = Peekable<CharIndices<'t>>;
/// A single lexed token: its classified contents plus the span recorded for it.
#[derive(Debug, Clone)]
pub struct Token {
    /// The kind of token, together with any payload it carries.
    pub contents: TokenContents,
    /// The span recorded for this token by the lexer.
    pub span: Span,
}

impl Token {
    /// Builds a token from its contents and span.
    pub fn new(contents: TokenContents, span: Span) -> Token {
        Self { contents, span }
    }
}
/// The kinds of paired delimiters tracked while lexing a baseline token.
#[derive(Clone, Copy)]
enum BlockKind {
    Paren,
    CurlyBracket,
    SquareBracket,
}

impl BlockKind {
    /// Returns the character that closes this kind of delimiter.
    fn closing(self) -> char {
        match self {
            Self::CurlyBracket => '}',
            Self::SquareBracket => ']',
            Self::Paren => ')',
        }
    }
}
/// Finds the extents of a basline token, returning the string with its
/// associated span, along with any parse error that was discovered along the
/// way.
///
/// Baseline tokens are unparsed content separated by spaces or a command
/// separator (like pipe or semicolon) Baseline tokens may be surrounded by
/// quotes (single, double, or backtick) or braces (square, paren, curly)
///
/// Baseline tokens may be further processed based on the needs of the syntax
/// shape that encounters them. They are still lightly lexed. For example, if a
/// baseline token begins with `{`, the entire token will continue until the
/// closing `}`, taking comments into consideration.
pub fn baseline(src: &mut Input, span_offset: usize) -> (Spanned<String>, Option<ParseError>) {
let mut token_contents = String::new();
let start_offset = if let Some((pos, _)) = src.peek() {
*pos
} else {
0
};
// This variable tracks the starting character of a string literal, so that
// we remain inside the string literal lexer mode until we encounter the
// closing quote.
let mut quote_start: Option<char> = None;
// This Vec tracks paired delimiters
let mut block_level: Vec<BlockKind> = vec![];
// A baseline token is terminated if it's not nested inside of a paired
// delimiter and the next character is one of: `|`, `;`, `#` or any
// whitespace.
fn is_termination(block_level: &[BlockKind], c: char) -> bool {
block_level.is_empty() && (c.is_whitespace() || c == '|' || c == ';' || c == '#')
}
// The process of slurping up a baseline token repeats:
//
// - String literal, which begins with `'`, `"` or `\``, and continues until
// the same character is encountered again.
// - Delimiter pair, which begins with `[`, `(`, or `{`, and continues until
// the matching closing delimiter is found, skipping comments and string
// literals.
// - When not nested inside of a delimiter pair, when a terminating
// character (whitespace, `|`, `;` or `#`) is encountered, the baseline
// token is done.
// - Otherwise, accumulate the character into the current baseline token.
while let Some((_, c)) = src.peek() {
let c = *c;
if quote_start.is_some() {
// If we encountered the closing quote character for the current
// string, we're done with the current string.
if Some(c) == quote_start {
quote_start = None;
}
} else if c == '\'' || c == '"' || c == '`' {
// We encountered the opening quote of a string literal.
quote_start = Some(c);
} else if c == '[' {
// We encountered an opening `[` delimiter.
block_level.push(BlockKind::SquareBracket);
} else if c == ']' {
// We encountered a closing `]` delimiter. Pop off the opening `[`
// delimiter.
if let Some(BlockKind::SquareBracket) = block_level.last() {
let _ = block_level.pop();
}
} else if c == '{' {
// We encountered an opening `{` delimiter.
block_level.push(BlockKind::CurlyBracket);
} else if c == '}' {
// We encountered a closing `}` delimiter. Pop off the opening `{`.
if let Some(BlockKind::CurlyBracket) = block_level.last() {
let _ = block_level.pop();
}
} else if c == '(' {
// We enceountered an opening `(` delimiter.
block_level.push(BlockKind::Paren);
} else if c == ')' {
// We encountered a closing `)` delimiter. Pop off the opening `(`.
if let Some(BlockKind::Paren) = block_level.last() {
let _ = block_level.pop();
}
} else if is_termination(&block_level, c) {
break;
}
// Otherwise, accumulate the character into the current token.
token_contents.push(c);
// Consume the character.
let _ = src.next();
}
let span = Span::new(
start_offset + span_offset,
start_offset + span_offset + token_contents.len(),
);
// If there is still unclosed opening delimiters, close them and add
// synthetic closing characters to the accumulated token.
if let Some(block) = block_level.last() {
let delim: char = (*block).closing();
let cause = ParseError::unexpected_eof(delim.to_string(), span);
while let Some(bk) = block_level.pop() {
token_contents.push(bk.closing());
}
return (token_contents.spanned(span), Some(cause));
}
if let Some(delimiter) = quote_start {
// The non-lite parse trims quotes on both sides, so we add the expected quote so that
// anyone wanting to consume this partial parse (e.g., completions) will be able to get
// correct information from the non-lite parse.
token_contents.push(delimiter);
return (
token_contents.spanned(span),
Some(ParseError::unexpected_eof(delimiter.to_string(), span)),
);
}
// If we didn't accumulate any characters, it's an unexpected error.
if token_contents.is_empty() {
return (
token_contents.spanned(span),
Some(ParseError::unexpected_eof("command".to_string(), span)),
);
}
(token_contents.spanned(span), None)
}
/// Lexes a line comment. The input must be positioned on the `#` character,
/// which is consumed; everything up to (but not including) the next `\n` is
/// the comment text.
///
/// `hash_offset` is the offset of the `#`; the returned spans start one past
/// it. Leading whitespace, if any, is stored separately from the body.
fn parse_comment(input: &mut Input, hash_offset: usize) -> LiteComment {
    // Skip the `#` itself.
    input.next();

    let mut text = String::new();
    while let Some(&(_, ch)) = input.peek() {
        if ch == '\n' {
            break;
        }
        text.push(ch);
        input.next();
    }

    // Byte length of the whitespace prefix of the comment text.
    let ws_len: usize = text
        .chars()
        .take_while(|ch| ch.is_whitespace())
        .map(char::len_utf8)
        .sum();

    let text_start = hash_offset + 1;

    if ws_len == 0 {
        let text_end = text_start + text.len();
        return LiteComment::new(text.spanned(Span::new(text_start, text_end)));
    }

    let (ws, body) = text.split_at(ws_len);
    let body_start = text_start + ws_len;

    LiteComment::new_with_ws(
        ws.to_string().spanned(Span::new(text_start, body_start)),
        body.to_string()
            .spanned(Span::new(body_start, body_start + body.len())),
    )
}
#[derive(SmartDefault)]
struct BlockParser {
groups: TokenBuilder<GroupBuilder>,
group: GroupBuilder,
pipeline: PipelineBuilder,
command: CommandBuilder,
prev_token: Option<Token>,
prev_comments: CommentsBuilder,
prev_comment_indent: usize,
}
impl BlockParser {
fn consumed(&mut self, token: Token) {
self.prev_token = Some(token);
}
fn success(mut self) -> (LiteBlock, Option<ParseError>) {
self.close_group();
(LiteBlock::new(self.groups.map(|g| g.into())), None)
}
fn fail(self, error: ParseError) -> (LiteBlock, Option<ParseError>) {
(LiteBlock::new(self.groups.map(|g| g.into())), Some(error))
}
fn comment(&mut self, token: &LiteComment) {
if self.prev_comments.is_empty() {
self.prev_comment_indent = token.ws_len();
}
self.prev_comments
.push(token.unindent(self.prev_comment_indent));
}
fn eoleol(&mut self) {
self.prev_comment_indent = 0;
self.prev_comments.take();
self.eol();
}
fn eol(&mut self) {
// If the last token on the current line is a `|`, the group
// continues on the next line.
if let Some(prev) = &self.prev_token {
if let TokenContents::Pipe = prev.contents {
return;
}
}
self.close_group();
}
fn pipe(&mut self) -> Result<(), ()> {
// If the current command has content, accumulate it into
// the current pipeline and start a new command.
match self.close_command() {
None => Err(()),
Some(command) => {
self.pipeline.push(command);
Ok(())
}
}
}
fn semicolon(&mut self) {
self.close_pipeline();
}
fn baseline(&mut self, part: Spanned<String>) {
// We encountered an unclassified character. Accumulate it into
// the current command as a string.
self.command.push(part);
}
fn close_command(&mut self) -> Option<LiteCommand> {
let command = self.command.take()?;
let command = LiteCommand {
parts: command.into(),
comments: self.prev_comments.take().map(|c| c.into()),
};
self.prev_comment_indent = 0;
Some(command)
}
fn close_pipeline(&mut self) {
if let Some(command) = self.close_command() {
self.pipeline.push(command);
}
if let Some(pipeline) = self.pipeline.take() {
self.group.push(pipeline);
}
}
fn close_group(&mut self) {
self.close_pipeline();
if let Some(group) = self.group.take() {
self.groups.push(group);
}
}
}
/// Try to parse a list of tokens into a block.
///
/// Each token is dispatched to the matching `BlockParser` event. The only
/// error produced here is a `|` that has no command in front of it; in that
/// case the groups closed so far are returned alongside the error.
pub fn block(tokens: Vec<Token>) -> (LiteBlock, Option<ParseError>) {
    let mut parser = BlockParser::default();
    let mut stream = tokens.iter().peekable();

    while let Some(token) = stream.next() {
        match &token.contents {
            TokenContents::EOL => {
                // Two EOL tokens in a row form a blank line, which detaches
                // any pending comments from the item that follows them.
                let blank_line = matches!(
                    stream.peek(),
                    Some(Token {
                        contents: TokenContents::EOL,
                        ..
                    })
                );

                if blank_line {
                    stream.next();
                    parser.eoleol();
                } else {
                    // A single line end closes the current group, unless
                    // the line ended with a `|`.
                    parser.eol();
                }
            }
            TokenContents::Pipe => {
                // `|` terminates a command; without a command before it the
                // pipe is reported as an unexpected extra token.
                if let Err(()) = parser.pipe() {
                    let unexpected = "|".to_string().spanned(token.span);
                    return parser.fail(ParseError::extra_tokens(unexpected));
                }
            }
            // `;` terminates a pipeline.
            TokenContents::Semicolon => parser.semicolon(),
            // Unclassified content becomes a part of the current command.
            TokenContents::Baseline(part) => {
                parser.baseline(part.to_string().spanned(token.span));
            }
            TokenContents::Comment(comment) => parser.comment(comment),
        }

        parser.consumed(token.clone());
    }

    parser.success()
}
/// Breaks the input string into a vector of tokens. This tokenization only tries to classify separators like
/// semicolons, pipes, etc from external bare values (values that haven't been classified further)
/// Takes in a string and and offset, which is used to offset the spans created (for when this function is used to parse inner strings)
pub fn lex(input: &str, span_offset: usize) -> (Vec<Token>, Option<ParseError>) {
// Break the input slice into an iterator of Unicode characters.
let mut char_indices = input.char_indices().peekable();
let mut error = None;
let mut output = vec![];
let mut is_complete = true;
// The lexing process repeats. One character of lookahead is sufficient to decide what to do next.
//
// - `|`: the token is either `|` token or a `||` token
// - `;`: the token is a semicolon
// - `\n` or `\r`: the token is an EOL (end of line) token
// - other whitespace: ignored
// - `#` the token starts a line comment, which contains all of the subsequent characters until the next EOL
// -
while let Some((idx, c)) = char_indices.peek() {
if *c == '|' {
// If the next character is `|`, it's either `|` or `||`.
let idx = *idx;
let prev_idx = idx;
let _ = char_indices.next();
// If the next character is `|`, we're looking at a `||`.
if let Some((idx, c)) = char_indices.peek() {
if *c == '|' {
let idx = *idx;
let _ = char_indices.next();
output.push(Token::new(
TokenContents::Baseline("||".into()),
Span::new(span_offset + prev_idx, span_offset + idx + 1),
));
continue;
}
}
// Otherwise, it's just a regular `|` token.
output.push(Token::new(
TokenContents::Pipe,
Span::new(span_offset + idx, span_offset + idx + 1),
));
is_complete = false;
} else if *c == ';' {
// If the next character is a `;`, we're looking at a semicolon token.
if !is_complete && error.is_none() {
error = Some(ParseError::extra_tokens(
";".to_string().spanned(Span::new(*idx, idx + 1)),
));
}
let idx = *idx;
let _ = char_indices.next();
output.push(Token::new(
TokenContents::Semicolon,
Span::new(span_offset + idx, span_offset + idx + 1),
));
} else if *c == '\n' || *c == '\r' {
// If the next character is a newline, we're looking at an EOL (end of line) token.
let idx = *idx;
let _ = char_indices.next();
output.push(Token::new(
TokenContents::EOL,
Span::new(span_offset + idx, span_offset + idx + 1),
));
} else if *c == '#' {
// If the next character is `#`, we're at the beginning of a line
// comment. The comment continues until the next newline.
let idx = *idx;
let comment = parse_comment(&mut char_indices, idx);
let span = comment.span();
output.push(Token::new(TokenContents::Comment(comment), span));
} else if c.is_whitespace() {
// If the next character is non-newline whitespace, skip it.
let _ = char_indices.next();
} else {
// Otherwise, try to consume an unclassified token.
let (result, err) = baseline(&mut char_indices, span_offset);
if error.is_none() {
error = err;
}
is_complete = true;
let Spanned { item, span } = result;
output.push(Token::new(TokenContents::Baseline(item), span));
}
}
(output, error)
}

View File

@ -0,0 +1,6 @@
pub mod lexer;
mod token_group;
pub mod tokens;
#[cfg(test)]
mod tests;

View File

@ -0,0 +1,358 @@
use nu_source::{Span, SpannedItem};
use super::lexer::*;
use super::tokens::*;
/// Test shorthand for `Span::new(left, right)`.
fn span(left: usize, right: usize) -> Span {
    Span::new(left, right)
}
mod bare {
use super::*;
#[test]
fn simple_1() {
let input = "foo bar baz";
let (result, err) = lex(input, 0);
assert!(err.is_none());
assert_eq!(result[0].span, span(0, 3));
}
#[test]
fn simple_2() {
let input = "'foo bar' baz";
let (result, err) = lex(input, 0);
assert!(err.is_none());
assert_eq!(result[0].span, span(0, 9));
}
#[test]
fn simple_3() {
let input = "'foo\" bar' baz";
let (result, err) = lex(input, 0);
assert!(err.is_none());
assert_eq!(result[0].span, span(0, 10));
}
#[test]
fn simple_4() {
let input = "[foo bar] baz";
let (result, err) = lex(input, 0);
assert!(err.is_none());
assert_eq!(result[0].span, span(0, 9));
}
#[test]
fn simple_5() {
let input = "'foo 'bar baz";
let (result, err) = lex(input, 0);
assert!(err.is_none());
assert_eq!(result[0].span, span(0, 9));
}
#[test]
fn simple_6() {
let input = "''foo baz";
let (result, err) = lex(input, 0);
assert!(err.is_none());
assert_eq!(result[0].span, span(0, 5));
}
#[test]
fn simple_7() {
let input = "'' foo";
let (result, err) = lex(input, 0);
assert!(err.is_none());
assert_eq!(result[0].span, span(0, 2));
}
#[test]
fn simple_8() {
let input = " '' foo";
let (result, err) = lex(input, 0);
assert!(err.is_none());
assert_eq!(result[0].span, span(1, 3));
}
#[test]
fn simple_9() {
let input = " 'foo' foo";
let (result, err) = lex(input, 0);
assert!(err.is_none());
assert_eq!(result[0].span, span(1, 6));
}
#[test]
fn simple_10() {
let input = "[foo, bar]";
let (result, err) = lex(input, 0);
assert!(err.is_none());
assert_eq!(result[0].span, span(0, 10));
}
#[test]
fn lex_comment() {
let input = r#"
#A comment
def e [] {echo hi}
"#;
let (result, err) = lex(input, 0);
assert!(err.is_none());
//result[0] == EOL
assert_eq!(result[1].span, span(2, 11));
assert_eq!(
result[1].contents,
TokenContents::Comment(LiteComment::new(
"A comment".to_string().spanned(Span::new(2, 11))
))
);
}
#[test]
fn ignore_future() {
let input = "foo 'bar";
let (result, _) = lex(input, 0);
assert_eq!(result[0].span, span(0, 3));
}
#[test]
fn invalid_1() {
let input = "'foo bar";
let (_, err) = lex(input, 0);
assert!(err.is_some());
}
#[test]
fn invalid_2() {
let input = "'bar";
let (_, err) = lex(input, 0);
assert!(err.is_some());
}
#[test]
fn invalid_4() {
let input = " 'bar";
let (_, err) = lex(input, 0);
assert!(err.is_some());
}
}
/// Tests that run the lexer and then the block parser over small inputs.
mod lite_parse {
    use nu_source::HasSpan;

    use super::*;

    #[test]
    fn pipeline() {
        let (tokens, err) = lex("cmd1 | cmd2 ; deploy", 0);
        assert!(err.is_none());
        let (parsed, err) = block(tokens);
        assert!(err.is_none());
        assert_eq!(parsed.span(), span(0, 20));
        assert_eq!(parsed.block[0].pipelines[0].span(), span(0, 11));
        assert_eq!(parsed.block[0].pipelines[1].span(), span(14, 20));
    }

    #[test]
    fn simple_1() {
        let (tokens, err) = lex("foo", 0);
        assert!(err.is_none());
        let (parsed, err) = block(tokens);
        assert!(err.is_none());
        assert_eq!(parsed.block.len(), 1);
        assert_eq!(parsed.block[0].pipelines.len(), 1);
        assert_eq!(parsed.block[0].pipelines[0].commands.len(), 1);
        assert_eq!(parsed.block[0].pipelines[0].commands[0].parts.len(), 1);
        assert_eq!(
            parsed.block[0].pipelines[0].commands[0].parts[0].span,
            span(0, 3)
        );
    }

    #[test]
    fn simple_offset() {
        // The span offset handed to the lexer must show up in the parsed spans.
        let (tokens, err) = lex("foo", 10);
        assert!(err.is_none());
        let (parsed, err) = block(tokens);
        assert!(err.is_none());
        assert_eq!(parsed.block[0].pipelines.len(), 1);
        assert_eq!(parsed.block[0].pipelines[0].commands.len(), 1);
        assert_eq!(parsed.block[0].pipelines[0].commands[0].parts.len(), 1);
        assert_eq!(
            parsed.block[0].pipelines[0].commands[0].parts[0].span,
            span(10, 13)
        );
    }

    #[test]
    fn incomplete_result() {
        // An unterminated string reports an EOF error but still yields a
        // usable partial parse with the missing quote appended.
        let (tokens, err) = lex("my_command \"foo' --test", 10);
        assert!(matches!(err.unwrap().reason(), nu_errors::ParseErrorReason::Eof { .. }));
        let (parsed, _) = block(tokens);
        assert_eq!(parsed.block.len(), 1);
        assert_eq!(parsed.block[0].pipelines.len(), 1);
        assert_eq!(parsed.block[0].pipelines[0].commands.len(), 1);
        assert_eq!(parsed.block[0].pipelines[0].commands[0].parts.len(), 2);
        assert_eq!(
            parsed.block[0].pipelines[0].commands[0].parts[0].item,
            "my_command"
        );
        assert_eq!(
            parsed.block[0].pipelines[0].commands[0].parts[1].item,
            "\"foo' --test\""
        );
    }

    #[test]
    fn command_with_comment() {
        let code = r#"
# My echo
# * It's much better :)
def my_echo [arg] { echo $arg }
"#;
        let (tokens, err) = lex(code, 0);
        assert!(err.is_none());
        let (parsed, err) = block(tokens);
        assert!(err.is_none());
        assert_eq!(parsed.block.len(), 1);
        assert_eq!(parsed.block[0].pipelines.len(), 1);
        assert_eq!(parsed.block[0].pipelines[0].commands.len(), 1);
        assert_eq!(parsed.block[0].pipelines[0].commands[0].parts.len(), 4);
        assert_eq!(
            parsed.block[0].pipelines[0].commands[0].comments,
            Some(vec![
                //Leading space is trimmed
                LiteComment::new_with_ws(
                    " ".to_string().spanned(Span::new(2, 3)),
                    "My echo".to_string().spanned(Span::new(3, 10))
                ),
                LiteComment::new_with_ws(
                    " ".to_string().spanned(Span::new(12, 13)),
                    "* It's much better :)"
                        .to_string()
                        .spanned(Span::new(13, 34))
                )
            ])
        );
    }

    #[test]
    fn discarded_comment() {
        // The empty line between the comment and the command is what this
        // test is about: two EOL tokens in a row make the block parser drop
        // the pending comment.
        let code = r#"
# This comment gets discarded, because of the following empty line

echo 42
"#;
        let (tokens, err) = lex(code, 0);
        assert!(err.is_none());
        let (parsed, err) = block(tokens);
        assert!(err.is_none());
        assert_eq!(parsed.block.len(), 1);
        assert_eq!(parsed.block[0].pipelines.len(), 1);
        assert_eq!(parsed.block[0].pipelines[0].commands.len(), 1);
        assert_eq!(parsed.block[0].pipelines[0].commands[0].parts.len(), 2);
        assert_eq!(parsed.block[0].pipelines[0].commands[0].comments, None);
    }
}
#[test]
fn no_discarded_white_space_start_of_comment() {
    // The first comment line has no leading whitespace, so the indent to
    // strip is zero and the second line's three leading spaces are folded
    // back into its body. That makes the body span 34 bytes (60..94).
    let code = r#"
#No white_space at firt line ==> No white_space discarded
#   Starting space is not discarded
echo 42
"#;
    let (tokens, err) = lex(code, 0);
    assert!(err.is_none());
    let (parsed, err) = block(tokens);
    assert!(err.is_none());
    assert_eq!(parsed.block.len(), 1);
    assert_eq!(parsed.block[0].pipelines.len(), 1);
    assert_eq!(parsed.block[0].pipelines[0].commands.len(), 1);
    assert_eq!(parsed.block[0].pipelines[0].commands[0].parts.len(), 2);
    assert_eq!(
        parsed.block[0].pipelines[0].commands[0].comments,
        Some(vec![
            LiteComment::new(
                "No white_space at firt line ==> No white_space discarded"
                    .to_string()
                    .spanned(Span::new(2, 58))
            ),
            LiteComment::new(
                "   Starting space is not discarded"
                    .to_string()
                    .spanned(Span::new(60, 94))
            ),
        ])
    );
}
#[test]
fn multiple_discarded_white_space_start_of_comment() {
    // Lines one and three start with two spaces after `#`, line two with
    // one. The first line sets the indent to two; the second line has less
    // whitespace than that and is left as lexed.
    let code = r#"
#  Discard 2 spaces
# Discard 1 space
#  Discard 2 spaces
echo 42
"#;
    let (tokens, err) = lex(code, 0);
    assert!(err.is_none());
    let (parsed, err) = block(tokens);
    assert!(err.is_none());
    assert_eq!(parsed.block.len(), 1);
    assert_eq!(parsed.block[0].pipelines.len(), 1);
    assert_eq!(parsed.block[0].pipelines[0].commands.len(), 1);
    assert_eq!(parsed.block[0].pipelines[0].commands[0].parts.len(), 2);
    assert_eq!(
        parsed.block[0].pipelines[0].commands[0].comments,
        Some(vec![
            LiteComment::new_with_ws(
                "  ".to_string().spanned(Span::new(2, 4)),
                "Discard 2 spaces".to_string().spanned(Span::new(4, 20))
            ),
            LiteComment::new_with_ws(
                " ".to_string().spanned(Span::new(22, 23)),
                "Discard 1 space".to_string().spanned(Span::new(23, 38))
            ),
            LiteComment::new_with_ws(
                "  ".to_string().spanned(Span::new(40, 42)),
                "Discard 2 spaces".to_string().spanned(Span::new(42, 58))
            ),
        ])
    );
}

View File

@ -0,0 +1,76 @@
use smart_default::SmartDefault;
use std::iter::FromIterator;
use derive_new::new;
use nu_source::{HasSpan, Span};
/// An accumulator for a list of spanned items.
///
/// The builder distinguishes "nothing pushed yet" (`None`) from a list of
/// items: `push` creates the list on first use, and `take` hands the list
/// out and resets the builder to `None`.
#[derive(Debug, Clone, SmartDefault, new)]
pub struct TokenBuilder<T: HasSpan> {
    /// The items pushed so far, or `None` if nothing has been pushed.
    #[default(None)]
    contents: Option<Vec<T>>,
}
/// Unwraps a builder into the items it accumulated; a builder that never
/// received an item becomes an empty `Vec`.
///
/// Implemented as `From` so that the standard blanket impl supplies the
/// matching `Into<Vec<T>>` that existing `.into()` call sites rely on.
impl<T> From<TokenBuilder<T>> for Vec<T>
where
    T: HasSpan,
{
    fn from(builder: TokenBuilder<T>) -> Vec<T> {
        builder.contents.unwrap_or_default()
    }
}
impl<T> HasSpan for TokenBuilder<T>
where
    T: HasSpan,
{
    /// The span from the start of the first item to the end of the last one.
    /// A builder without contents yields `Span::new(0, 0)`; an empty item
    /// list yields `Span::default()`.
    fn span(&self) -> Span {
        let items = match &self.contents {
            Some(items) => items,
            None => return Span::new(0, 0),
        };

        match (items.first(), items.last()) {
            (Some(first), Some(last)) => Span::new(first.span().start(), last.span().end()),
            _ => Span::default(),
        }
    }
}
impl<T> TokenBuilder<T>
where
    T: HasSpan,
{
    /// Returns `true` if nothing has been pushed since construction or the
    /// last `take`.
    pub fn is_empty(&self) -> bool {
        self.contents.is_none()
    }

    /// Moves the accumulated items into a new builder and resets this one.
    /// Returns `None` if nothing had been pushed.
    pub fn take(&mut self) -> Option<TokenBuilder<T>> {
        let taken = self.contents.take()?;
        Some(TokenBuilder::new(Some(taken)))
    }

    /// Consumes the builder, applying `mapper` to every item and collecting
    /// the results. A builder without contents collects from no items.
    pub fn map<I, U>(self, mapper: impl Fn(T) -> U) -> I
    where
        I: FromIterator<U>,
    {
        self.contents.into_iter().flatten().map(mapper).collect()
    }

    /// Appends `item`, creating the item list if this is the first push.
    pub fn push(&mut self, item: T) {
        self.contents.get_or_insert_with(Vec::new).push(item);
    }
}

View File

@ -0,0 +1,212 @@
use derive_new::new;
use itertools::Itertools;
use std::fmt;
use nu_source::{HasSpan, Span, Spanned, SpannedItem};
use super::token_group::TokenBuilder;
/// The classification of a lexed token, together with any payload it carries.
#[derive(Debug, Clone, PartialEq, is_enum_variant)]
pub enum TokenContents {
    /// A baseline token is an atomic chunk of source code. This means that the
    /// token contains the entirety of string literals, as well as the entirety
    /// of sections delimited by paired delimiters.
    ///
    /// For example, if the token begins with `{`, the baseline token continues
    /// until the closing `}` (after taking comments and string literals into
    /// consideration).
    Baseline(String),
    /// A line comment.
    Comment(LiteComment),
    /// A `|` separator.
    Pipe,
    /// A `;` separator.
    Semicolon,
    /// An end-of-line marker.
    EOL,
}
impl fmt::Display for TokenContents {
    /// Writes the token's text. An `EOL` is written as the two characters
    /// `\n` rather than as an actual line break.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            TokenContents::Comment(comm) => write!(f, "{}", comm),
            TokenContents::Baseline(base) => f.write_str(base),
            TokenContents::Pipe => f.write_str("|"),
            TokenContents::Semicolon => f.write_str(";"),
            TokenContents::EOL => f.write_str("\\n"),
        }
    }
}
/// Accumulates the parts of a single command.
pub type CommandBuilder = TokenBuilder<Spanned<String>>;
/// Accumulates a run of line comments.
pub type CommentsBuilder = TokenBuilder<LiteComment>;
/// Accumulates the commands of a single pipeline.
pub type PipelineBuilder = TokenBuilder<LiteCommand>;
/// Accumulates the pipelines of a single group.
pub type GroupBuilder = TokenBuilder<PipelineBuilder>;
/// A LiteComment is a line comment. It begins with `#` and continues until (but not including) the
/// next newline.
///
/// It remembers any leading whitespace, which is used in later processing steps to strip off
/// leading whitespace for an entire comment block when it is associated with a definition.
#[derive(Debug, PartialEq, Clone)]
pub struct LiteComment {
    /// Whitespace between the `#` and the body that is kept out of the body.
    leading_ws: Option<Spanned<String>>,
    /// The comment body.
    rest: Spanned<String>,
}

impl LiteComment {
    /// Creates a comment that has no separately tracked leading whitespace.
    pub fn new(string: impl Into<Spanned<String>>) -> LiteComment {
        LiteComment {
            leading_ws: None,
            rest: string.into(),
        }
    }

    /// Creates a comment from its leading whitespace and its body.
    pub fn new_with_ws(
        ws: impl Into<Spanned<String>>,
        comment: impl Into<Spanned<String>>,
    ) -> LiteComment {
        LiteComment {
            leading_ws: Some(ws.into()),
            rest: comment.into(),
        }
    }

    /// Returns a copy of this comment in which only the first
    /// `excluded_spaces` bytes of leading whitespace stay excluded; any
    /// remaining whitespace is moved to the front of the body.
    ///
    /// The comment is returned unchanged when it has no leading whitespace,
    /// when it has less whitespace than `excluded_spaces`, or when
    /// `excluded_spaces` does not fall on a character boundary of the
    /// whitespace.
    pub fn unindent(&self, excluded_spaces: usize) -> LiteComment {
        let Spanned { item, span } = match &self.leading_ws {
            // If there's no leading whitespace, there's no whitespace to exclude
            None => return self.clone(),
            Some(ws) => ws,
        };

        // `excluded_spaces` is a byte count taken from another comment's
        // whitespace, so it can exceed ours or (with multi-byte whitespace)
        // land inside a character. Neither case can be split; keep as is.
        if excluded_spaces > item.len() || !item.is_char_boundary(excluded_spaces) {
            return self.clone();
        }

        // If there are no spaces to exclude, prepend all of the leading
        // whitespace to the comment body.
        if excluded_spaces == 0 {
            let rest = format!("{}{}", item, self.rest.item).spanned(self.span());
            return LiteComment {
                leading_ws: None,
                rest,
            };
        }

        // Split the whitespace: the first part stays excluded, the remainder
        // becomes the start of the body.
        let (excluded, included) = item.split_at(excluded_spaces);
        let rest_start = span.start() + excluded_spaces;

        let excluded_ws = excluded
            .to_string()
            .spanned(Span::new(span.start(), rest_start));

        // The new body covers the retained whitespace as well as the
        // original body, so it ends where the original body ended.
        let rest = format!("{}{}", included, self.rest.item)
            .spanned(Span::new(rest_start, self.rest.span().end()));

        LiteComment {
            leading_ws: Some(excluded_ws),
            rest,
        }
    }

    /// Length in bytes of the separately tracked leading whitespace.
    pub fn ws_len(&self) -> usize {
        match &self.leading_ws {
            None => 0,
            Some(ws) => ws.item.len(),
        }
    }

    /// Returns the body with surrounding whitespace removed, spanned over
    /// the part of the body that remains.
    pub(crate) fn trim(&self) -> Spanned<String> {
        let body = self.rest.item.as_str();
        let without_leading = body.trim_start();
        // Whitespace trimmed from the front shifts the start of the span.
        let start = self.rest.span().start() + (body.len() - without_leading.len());
        let trimmed = without_leading.trim_end();

        trimmed
            .to_string()
            .spanned(Span::new(start, start + trimmed.len()))
    }
}
impl fmt::Display for LiteComment {
    /// Writes `#`, then the leading whitespace (if any), then the body.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str("#")?;

        if let Some(leading) = &self.leading_ws {
            f.write_str(&leading.item)?;
        }

        f.write_str(&self.rest.item)
    }
}
impl HasSpan for LiteComment {
    /// The span of the body, extended backwards over the leading whitespace
    /// when there is any.
    fn span(&self) -> Span {
        let body_span = self.rest.span();

        if let Some(leading) = &self.leading_ws {
            leading.span().until(body_span)
        } else {
            body_span
        }
    }
}
/// A `LiteCommand` is a list of words that will get meaning when processed by
/// the parser.
#[derive(Debug, Default, Clone)]
pub struct LiteCommand {
    /// The words making up the command.
    pub parts: Vec<Spanned<String>>,
    /// Preceding comments.
    pub comments: Option<Vec<LiteComment>>,
}

impl HasSpan for LiteCommand {
    /// The span covering all parts of the command.
    fn span(&self) -> Span {
        Span::from_list(&self.parts)
    }
}

impl LiteCommand {
    /// Joins the trimmed bodies of the preceding comments with newlines.
    /// Returns an empty string when the command has no comments.
    pub fn comments_joined(&self) -> String {
        self.comments.as_ref().map_or_else(String::new, |lines| {
            lines.iter().map(|line| line.trim().item).join("\n")
        })
    }
}
/// A `LitePipeline` is a series of `LiteCommand`s, separated by `|`.
#[derive(Debug, Clone, new)]
pub struct LitePipeline {
    /// The commands of the pipeline, in source order.
    pub commands: Vec<LiteCommand>,
}

impl HasSpan for LitePipeline {
    /// The span covering every command in the pipeline.
    fn span(&self) -> Span {
        let commands = &self.commands;
        Span::from_list(commands)
    }
}
/// A `LiteGroup` is a series of `LitePipeline`s, separated by `;`.
#[derive(Debug, Clone, new)]
pub struct LiteGroup {
    /// The pipelines of the group, in source order.
    pub pipelines: Vec<LitePipeline>,
}

impl From<GroupBuilder> for LiteGroup {
    /// Turns every accumulated pipeline builder into a `LitePipeline`.
    fn from(group: GroupBuilder) -> Self {
        let pipelines: Vec<LitePipeline> =
            group.map(|builder| LitePipeline::new(builder.into()));
        Self::new(pipelines)
    }
}

impl HasSpan for LiteGroup {
    /// The span covering every pipeline in the group.
    fn span(&self) -> Span {
        let pipelines = &self.pipelines;
        Span::from_list(pipelines)
    }
}
/// A `LiteBlock` is a series of `LiteGroup`s, separated by newlines.
#[derive(Debug, Clone, new)]
pub struct LiteBlock {
    /// The groups of the block, in source order.
    pub block: Vec<LiteGroup>,
}

impl HasSpan for LiteBlock {
    /// The span covering every group in the block.
    fn span(&self) -> Span {
        let groups = &self.block;
        Span::from_list(groups)
    }
}

View File

@ -9,7 +9,8 @@ mod scope;
mod shapes; mod shapes;
mod signature; mod signature;
pub use lex::{block, lex, LiteBlock, LiteCommand, LiteGroup, LitePipeline}; pub use lex::lexer::{block, lex};
pub use lex::tokens::{LiteBlock, LiteCommand, LiteGroup, LitePipeline};
pub use parse::{classify_block, garbage, parse, parse_full_column_path, parse_math_expression}; pub use parse::{classify_block, garbage, parse, parse_full_column_path, parse_math_expression};
pub use path::expand_ndots; pub use path::expand_ndots;
pub use scope::ParserScope; pub use scope::ParserScope;

View File

@ -9,10 +9,11 @@ use nu_protocol::hir::{
Unit, Unit,
}; };
use nu_protocol::{NamedType, PositionalType, Signature, SyntaxShape, UnspannedPathMember}; use nu_protocol::{NamedType, PositionalType, Signature, SyntaxShape, UnspannedPathMember};
use nu_source::{Span, Spanned, SpannedItem}; use nu_source::{HasSpan, Span, Spanned, SpannedItem};
use num_bigint::BigInt; use num_bigint::BigInt;
use crate::lex::{block, lex, LiteBlock, LiteCommand, LitePipeline}; use crate::lex::lexer::{block, lex};
use crate::lex::tokens::{LiteBlock, LiteCommand, LitePipeline};
use crate::path::expand_path; use crate::path::expand_path;
use crate::scope::ParserScope; use crate::scope::ParserScope;
use bigdecimal::BigDecimal; use bigdecimal::BigDecimal;
@ -2142,7 +2143,7 @@ fn unit_parse_byte_units() -> Result<(), ParseError> {
string: String, string: String,
value: i64, value: i64,
unit: Unit, unit: Unit,
}; }
let cases = [ let cases = [
TestCase { TestCase {

View File

@ -1,12 +1,15 @@
use crate::parse::{classify_block, util::trim_quotes}; use crate::{
lex::tokens::LiteCommand,
parse::{classify_block, util::trim_quotes},
};
use indexmap::IndexMap; use indexmap::IndexMap;
use nu_errors::ParseError; use nu_errors::ParseError;
use nu_protocol::hir::Block; use nu_protocol::hir::Block;
use nu_source::SpannedItem; use nu_source::{HasSpan, SpannedItem};
//use crate::errors::{ParseError, ParseResult}; //use crate::errors::{ParseError, ParseResult};
use crate::lex::{block, lex, LiteCommand}; use crate::lex::lexer::{block, lex};
use crate::ParserScope; use crate::ParserScope;

View File

@ -14,7 +14,10 @@
use log::debug; use log::debug;
use crate::{ use crate::{
lex::{lex, Token, TokenContents}, lex::{
lexer::{lex, Token},
tokens::TokenContents,
},
parse::util::token_to_spanned_string, parse::util::token_to_spanned_string,
}; };
use nu_errors::ParseError; use nu_errors::ParseError;

View File

@ -2,7 +2,7 @@ use nu_errors::ParseError;
use nu_protocol::hir::{Expression, SpannedExpression}; use nu_protocol::hir::{Expression, SpannedExpression};
use nu_source::{Span, Spanned, SpannedItem}; use nu_source::{Span, Spanned, SpannedItem};
use crate::lex::Token; use crate::lex::lexer::Token;
pub(crate) fn token_to_spanned_string(token: &Token) -> Spanned<String> { pub(crate) fn token_to_spanned_string(token: &Token) -> Spanned<String> {
token.contents.to_string().spanned(token.span) token.contents.to_string().spanned(token.span)

View File

@ -500,6 +500,19 @@ impl Span {
Span::new(0, 0) Span::new(0, 0)
} }
/// Creates a `Span` running from the start of the first item in `list` to the
/// end of the last item.
///
/// An empty list yields `Span::new(0, 0)`; a single item supplies both ends.
pub fn from_list(list: &[impl HasSpan]) -> Span {
    match (list.first(), list.last()) {
        (Some(head), Some(tail)) => Span::new(head.span().start, tail.span().end),
        _ => Span::new(0, 0),
    }
}
/// Creates a new `Span` from start and end inputs. The end parameter must be greater than or equal to the start parameter. /// Creates a new `Span` from start and end inputs. The end parameter must be greater than or equal to the start parameter.
pub fn new(start: usize, end: usize) -> Span { pub fn new(start: usize, end: usize) -> Span {
assert!( assert!(