mirror of
https://github.com/nushell/nushell.git
synced 2025-05-05 02:24:25 +02:00
514 lines
17 KiB
Rust
514 lines
17 KiB
Rust
use smart_default::SmartDefault;
|
|
use std::iter::Peekable;
|
|
use std::str::CharIndices;
|
|
|
|
use nu_errors::ParseError;
|
|
use nu_source::{HasSpan, Span, Spanned, SpannedItem};
|
|
|
|
use super::token_group::TokenBuilder;
|
|
|
|
use super::tokens::{
|
|
CommandBuilder, CommentsBuilder, GroupBuilder, LiteBlock, LiteCommand, LiteComment,
|
|
PipelineBuilder, TokenContents,
|
|
};
|
|
|
|
type Input<'t> = Peekable<CharIndices<'t>>;
|
|
|
|
#[derive(Debug, Clone)]
|
|
pub struct Token {
|
|
pub contents: TokenContents,
|
|
pub span: Span,
|
|
}
|
|
|
|
impl Token {
|
|
pub fn new(contents: TokenContents, span: Span) -> Token {
|
|
Token { contents, span }
|
|
}
|
|
}
|
|
|
|
#[derive(Clone, Copy)]
|
|
enum BlockKind {
|
|
Paren,
|
|
CurlyBracket,
|
|
SquareBracket,
|
|
}
|
|
|
|
impl BlockKind {
|
|
fn closing(self) -> char {
|
|
match self {
|
|
BlockKind::Paren => ')',
|
|
BlockKind::SquareBracket => ']',
|
|
BlockKind::CurlyBracket => '}',
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Finds the extents of a basline token, returning the string with its
|
|
/// associated span, along with any parse error that was discovered along the
|
|
/// way.
|
|
///
|
|
/// Baseline tokens are unparsed content separated by spaces or a command
|
|
/// separator (like pipe or semicolon) Baseline tokens may be surrounded by
|
|
/// quotes (single, double, or backtick) or braces (square, paren, curly)
|
|
///
|
|
/// Baseline tokens may be further processed based on the needs of the syntax
|
|
/// shape that encounters them. They are still lightly lexed. For example, if a
|
|
/// baseline token begins with `{`, the entire token will continue until the
|
|
/// closing `}`, taking comments into consideration.
|
|
pub fn baseline(src: &mut Input, span_offset: usize) -> (Spanned<String>, Option<ParseError>) {
|
|
let mut token_contents = String::new();
|
|
let start_offset = if let Some((pos, _)) = src.peek() {
|
|
*pos
|
|
} else {
|
|
0
|
|
};
|
|
|
|
// This variable tracks the starting character of a string literal, so that
|
|
// we remain inside the string literal lexer mode until we encounter the
|
|
// closing quote.
|
|
let mut quote_start: Option<char> = None;
|
|
|
|
let mut in_comment = false;
|
|
|
|
// This Vec tracks paired delimiters
|
|
let mut block_level: Vec<BlockKind> = vec![];
|
|
|
|
// A baseline token is terminated if it's not nested inside of a paired
|
|
// delimiter and the next character is one of: `|`, `;`, `#` or any
|
|
// whitespace.
|
|
fn is_termination(block_level: &[BlockKind], c: char) -> bool {
|
|
block_level.is_empty() && (c.is_whitespace() || c == '|' || c == ';' || c == '#')
|
|
}
|
|
|
|
// The process of slurping up a baseline token repeats:
|
|
//
|
|
// - String literal, which begins with `'`, `"` or `\``, and continues until
|
|
// the same character is encountered again.
|
|
// - Delimiter pair, which begins with `[`, `(`, or `{`, and continues until
|
|
// the matching closing delimiter is found, skipping comments and string
|
|
// literals.
|
|
// - When not nested inside of a delimiter pair, when a terminating
|
|
// character (whitespace, `|`, `;` or `#`) is encountered, the baseline
|
|
// token is done.
|
|
// - Otherwise, accumulate the character into the current baseline token.
|
|
while let Some((_, c)) = src.peek() {
|
|
let c = *c;
|
|
|
|
if quote_start.is_some() {
|
|
// If we encountered the closing quote character for the current
|
|
// string, we're done with the current string.
|
|
if Some(c) == quote_start {
|
|
quote_start = None;
|
|
}
|
|
} else if c == '#' {
|
|
if is_termination(&block_level, c) {
|
|
break;
|
|
}
|
|
in_comment = true;
|
|
} else if c == '\n' {
|
|
in_comment = false;
|
|
if is_termination(&block_level, c) {
|
|
break;
|
|
}
|
|
} else if in_comment {
|
|
if is_termination(&block_level, c) {
|
|
break;
|
|
}
|
|
} else if c == '\'' || c == '"' || c == '`' {
|
|
// We encountered the opening quote of a string literal.
|
|
quote_start = Some(c);
|
|
} else if c == '[' {
|
|
// We encountered an opening `[` delimiter.
|
|
block_level.push(BlockKind::SquareBracket);
|
|
} else if c == ']' {
|
|
// We encountered a closing `]` delimiter. Pop off the opening `[`
|
|
// delimiter.
|
|
if let Some(BlockKind::SquareBracket) = block_level.last() {
|
|
let _ = block_level.pop();
|
|
}
|
|
} else if c == '{' {
|
|
// We encountered an opening `{` delimiter.
|
|
block_level.push(BlockKind::CurlyBracket);
|
|
} else if c == '}' {
|
|
// We encountered a closing `}` delimiter. Pop off the opening `{`.
|
|
if let Some(BlockKind::CurlyBracket) = block_level.last() {
|
|
let _ = block_level.pop();
|
|
}
|
|
} else if c == '(' {
|
|
// We enceountered an opening `(` delimiter.
|
|
block_level.push(BlockKind::Paren);
|
|
} else if c == ')' {
|
|
// We encountered a closing `)` delimiter. Pop off the opening `(`.
|
|
if let Some(BlockKind::Paren) = block_level.last() {
|
|
let _ = block_level.pop();
|
|
}
|
|
} else if is_termination(&block_level, c) {
|
|
break;
|
|
}
|
|
|
|
// Otherwise, accumulate the character into the current token.
|
|
token_contents.push(c);
|
|
|
|
// Consume the character.
|
|
let _ = src.next();
|
|
}
|
|
|
|
let span = Span::new(
|
|
start_offset + span_offset,
|
|
start_offset + span_offset + token_contents.len(),
|
|
);
|
|
|
|
// If there is still unclosed opening delimiters, close them and add
|
|
// synthetic closing characters to the accumulated token.
|
|
if let Some(block) = block_level.last() {
|
|
let delim: char = (*block).closing();
|
|
let cause = ParseError::unexpected_eof(delim.to_string(), span);
|
|
|
|
while let Some(bk) = block_level.pop() {
|
|
token_contents.push(bk.closing());
|
|
}
|
|
|
|
return (token_contents.spanned(span), Some(cause));
|
|
}
|
|
|
|
if let Some(delimiter) = quote_start {
|
|
// The non-lite parse trims quotes on both sides, so we add the expected quote so that
|
|
// anyone wanting to consume this partial parse (e.g., completions) will be able to get
|
|
// correct information from the non-lite parse.
|
|
token_contents.push(delimiter);
|
|
|
|
return (
|
|
token_contents.spanned(span),
|
|
Some(ParseError::unexpected_eof(delimiter.to_string(), span)),
|
|
);
|
|
}
|
|
|
|
// If we didn't accumulate any characters, it's an unexpected error.
|
|
if token_contents.is_empty() {
|
|
return (
|
|
token_contents.spanned(span),
|
|
Some(ParseError::unexpected_eof("command".to_string(), span)),
|
|
);
|
|
}
|
|
|
|
(token_contents.spanned(span), None)
|
|
}
|
|
|
|
/// We encountered a `#` character. Keep consuming characters until we encounter
|
|
/// a newline character (but don't consume it).
|
|
fn parse_comment(input: &mut Input, hash_offset: usize) -> LiteComment {
|
|
let mut comment = String::new();
|
|
let mut in_ws = true;
|
|
let mut body_start = 0;
|
|
|
|
input.next();
|
|
|
|
while let Some((_, c)) = input.peek() {
|
|
if *c == '\n' {
|
|
break;
|
|
}
|
|
|
|
if in_ws && c.is_whitespace() {
|
|
body_start += c.len_utf8();
|
|
} else if in_ws && !c.is_whitespace() {
|
|
in_ws = false;
|
|
}
|
|
|
|
comment.push(*c);
|
|
input.next();
|
|
}
|
|
|
|
if body_start == 0 {
|
|
let len = comment.len();
|
|
|
|
LiteComment::new(comment.spanned(Span::new(hash_offset + 1, hash_offset + 1 + len)))
|
|
} else {
|
|
let ws = comment[..body_start].to_string();
|
|
let body = comment[body_start..].to_string();
|
|
|
|
let body_len = body.len();
|
|
|
|
LiteComment::new_with_ws(
|
|
ws.spanned(Span::new(hash_offset + 1, hash_offset + 1 + body_start)),
|
|
body.spanned(Span::new(
|
|
hash_offset + 1 + body_start,
|
|
hash_offset + 1 + body_start + body_len,
|
|
)),
|
|
)
|
|
}
|
|
}
|
|
|
|
#[derive(SmartDefault)]
|
|
struct BlockParser {
|
|
groups: TokenBuilder<GroupBuilder>,
|
|
group: GroupBuilder,
|
|
pipeline: PipelineBuilder,
|
|
command: CommandBuilder,
|
|
prev_token: Option<Token>,
|
|
prev_comments: CommentsBuilder,
|
|
prev_comment_indent: usize,
|
|
}
|
|
|
|
impl BlockParser {
|
|
fn consumed(&mut self, token: Token) {
|
|
self.prev_token = Some(token);
|
|
}
|
|
|
|
fn success(mut self) -> (LiteBlock, Option<ParseError>) {
|
|
self.close_group();
|
|
|
|
(LiteBlock::new(self.groups.map(|g| g.into())), None)
|
|
}
|
|
|
|
fn fail(self, error: ParseError) -> (LiteBlock, Option<ParseError>) {
|
|
(LiteBlock::new(self.groups.map(|g| g.into())), Some(error))
|
|
}
|
|
|
|
fn comment(&mut self, token: &LiteComment) {
|
|
if self.prev_comments.is_empty() {
|
|
self.prev_comment_indent = token.ws_len();
|
|
}
|
|
|
|
self.prev_comments
|
|
.push(token.unindent(self.prev_comment_indent));
|
|
}
|
|
|
|
fn eoleol(&mut self) {
|
|
self.prev_comment_indent = 0;
|
|
self.prev_comments.take();
|
|
|
|
self.eol();
|
|
}
|
|
|
|
fn eol(&mut self) {
|
|
// If the last token on the current line is a `|`, the group
|
|
// continues on the next line.
|
|
if let Some(prev) = &self.prev_token {
|
|
if let TokenContents::Pipe = prev.contents {
|
|
return;
|
|
}
|
|
}
|
|
|
|
self.close_group();
|
|
}
|
|
|
|
fn pipe(&mut self) -> Result<(), ()> {
|
|
// If the current command has content, accumulate it into
|
|
// the current pipeline and start a new command.
|
|
|
|
match self.close_command() {
|
|
None => Err(()),
|
|
Some(command) => {
|
|
self.pipeline.push(command);
|
|
Ok(())
|
|
}
|
|
}
|
|
}
|
|
|
|
fn semicolon(&mut self) {
|
|
self.close_pipeline();
|
|
}
|
|
|
|
fn baseline(&mut self, part: Spanned<String>) {
|
|
// We encountered an unclassified character. Accumulate it into
|
|
// the current command as a string.
|
|
|
|
self.command.push(part);
|
|
}
|
|
|
|
fn close_command(&mut self) -> Option<LiteCommand> {
|
|
let command = self.command.take()?;
|
|
let command = LiteCommand {
|
|
parts: command.into(),
|
|
comments: self.prev_comments.take().map(|c| c.into()),
|
|
};
|
|
|
|
self.prev_comment_indent = 0;
|
|
|
|
Some(command)
|
|
}
|
|
|
|
fn close_pipeline(&mut self) {
|
|
if let Some(command) = self.close_command() {
|
|
self.pipeline.push(command);
|
|
}
|
|
|
|
if let Some(pipeline) = self.pipeline.take() {
|
|
self.group.push(pipeline);
|
|
}
|
|
}
|
|
|
|
fn close_group(&mut self) {
|
|
self.close_pipeline();
|
|
|
|
if let Some(group) = self.group.take() {
|
|
self.groups.push(group);
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Try to parse a list of tokens into a block.
|
|
pub fn parse_block(tokens: Vec<Token>) -> (LiteBlock, Option<ParseError>) {
|
|
let mut parser = BlockParser::default();
|
|
|
|
let mut tokens = tokens.iter().peekable();
|
|
|
|
// The parsing process repeats:
|
|
//
|
|
// - newline (`\n` or `\r`)
|
|
// - pipes (`|`)
|
|
// - semicolon
|
|
while let Some(token) = tokens.next() {
|
|
match &token.contents {
|
|
TokenContents::Eol => {
|
|
// If we encounter two newline characters in a row, use a special eoleol event,
|
|
// which allows the parser to discard comments that shouldn't be treated as
|
|
// documentation for the following item.
|
|
if let Some(Token {
|
|
contents: TokenContents::Eol,
|
|
..
|
|
}) = tokens.peek()
|
|
{
|
|
tokens.next();
|
|
parser.eoleol();
|
|
} else {
|
|
// We encountered a newline character. If the last token on the
|
|
// current line is a `|`, continue the current group on the next
|
|
// line. Otherwise, close up the current group by rolling up the
|
|
// current command into the current pipeline, and then roll up
|
|
// the current pipeline into the group.
|
|
parser.eol();
|
|
}
|
|
}
|
|
TokenContents::Pipe => {
|
|
// We encountered a pipe (`|`) character, which terminates a
|
|
// command.
|
|
|
|
if parser.pipe().is_err() {
|
|
// If the current command doesn't have content, return an
|
|
// error that indicates that the `|` was unexpected.
|
|
return parser.fail(ParseError::extra_tokens(
|
|
"|".to_string().spanned(token.span),
|
|
));
|
|
}
|
|
// match parser.pipe() {}
|
|
}
|
|
TokenContents::Semicolon => {
|
|
// We encountered a semicolon (`;`) character, which terminates
|
|
// a pipeline.
|
|
|
|
parser.semicolon();
|
|
}
|
|
TokenContents::Baseline(part) => {
|
|
// We encountered an unclassified character. Accumulate it into
|
|
// the current command as a string.
|
|
|
|
parser.baseline(part.to_string().spanned(token.span));
|
|
}
|
|
TokenContents::Comment(comment) => parser.comment(comment),
|
|
}
|
|
|
|
parser.consumed(token.clone());
|
|
}
|
|
|
|
parser.success()
|
|
}
|
|
|
|
/// Breaks the input string into a vector of tokens. This tokenization only tries to classify separators like
|
|
/// semicolons, pipes, etc from external bare values (values that haven't been classified further)
|
|
/// Takes in a string and and offset, which is used to offset the spans created (for when this function is used to parse inner strings)
|
|
pub fn lex(input: &str, span_offset: usize) -> (Vec<Token>, Option<ParseError>) {
|
|
// Break the input slice into an iterator of Unicode characters.
|
|
let mut char_indices = input.char_indices().peekable();
|
|
let mut error = None;
|
|
|
|
let mut output = vec![];
|
|
let mut is_complete = true;
|
|
|
|
// The lexing process repeats. One character of lookahead is sufficient to decide what to do next.
|
|
//
|
|
// - `|`: the token is either `|` token or a `||` token
|
|
// - `;`: the token is a semicolon
|
|
// - `\n` or `\r`: the token is an EOL (end of line) token
|
|
// - other whitespace: ignored
|
|
// - `#` the token starts a line comment, which contains all of the subsequent characters until the next EOL
|
|
// -
|
|
while let Some((idx, c)) = char_indices.peek() {
|
|
if *c == '|' {
|
|
// If the next character is `|`, it's either `|` or `||`.
|
|
|
|
let idx = *idx;
|
|
let prev_idx = idx;
|
|
let _ = char_indices.next();
|
|
|
|
// If the next character is `|`, we're looking at a `||`.
|
|
if let Some((idx, c)) = char_indices.peek() {
|
|
if *c == '|' {
|
|
let idx = *idx;
|
|
let _ = char_indices.next();
|
|
output.push(Token::new(
|
|
TokenContents::Baseline("||".into()),
|
|
Span::new(span_offset + prev_idx, span_offset + idx + 1),
|
|
));
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// Otherwise, it's just a regular `|` token.
|
|
output.push(Token::new(
|
|
TokenContents::Pipe,
|
|
Span::new(span_offset + idx, span_offset + idx + 1),
|
|
));
|
|
is_complete = false;
|
|
} else if *c == ';' {
|
|
// If the next character is a `;`, we're looking at a semicolon token.
|
|
|
|
if !is_complete && error.is_none() {
|
|
error = Some(ParseError::extra_tokens(
|
|
";".to_string().spanned(Span::new(*idx, idx + 1)),
|
|
));
|
|
}
|
|
let idx = *idx;
|
|
let _ = char_indices.next();
|
|
output.push(Token::new(
|
|
TokenContents::Semicolon,
|
|
Span::new(span_offset + idx, span_offset + idx + 1),
|
|
));
|
|
} else if *c == '\n' || *c == '\r' {
|
|
// If the next character is a newline, we're looking at an EOL (end of line) token.
|
|
|
|
let idx = *idx;
|
|
let _ = char_indices.next();
|
|
output.push(Token::new(
|
|
TokenContents::Eol,
|
|
Span::new(span_offset + idx, span_offset + idx + 1),
|
|
));
|
|
} else if *c == '#' {
|
|
// If the next character is `#`, we're at the beginning of a line
|
|
// comment. The comment continues until the next newline.
|
|
let idx = *idx;
|
|
|
|
let comment = parse_comment(&mut char_indices, idx);
|
|
let span = comment.span();
|
|
|
|
output.push(Token::new(TokenContents::Comment(comment), span));
|
|
} else if c.is_whitespace() {
|
|
// If the next character is non-newline whitespace, skip it.
|
|
|
|
let _ = char_indices.next();
|
|
} else {
|
|
// Otherwise, try to consume an unclassified token.
|
|
|
|
let (result, err) = baseline(&mut char_indices, span_offset);
|
|
if error.is_none() {
|
|
error = err;
|
|
}
|
|
is_complete = true;
|
|
let Spanned { item, span } = result;
|
|
output.push(Token::new(TokenContents::Baseline(item), span));
|
|
}
|
|
}
|
|
|
|
(output, error)
|
|
}
|