// nushell/crates/nu-parser/src/lex/lexer.rs
use smart_default::SmartDefault;
use std::iter::Peekable;
use std::str::CharIndices;
use nu_errors::ParseError;
use nu_source::{HasSpan, Span, Spanned, SpannedItem};
use super::token_group::TokenBuilder;
use super::tokens::{
CommandBuilder, CommentsBuilder, GroupBuilder, LiteBlock, LiteCommand, LiteComment,
PipelineBuilder, TokenContents,
};
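/// A peekable stream of `(byte index, char)` pairs, as produced by
/// `str::char_indices`.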
type Input<'t> = Peekable<CharIndices<'t>>;
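/// A single lexed token: its classified contents together with the span of
/// source text it covers.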
#[derive(Debug, Clone)]
pub struct Token {
pub contents: TokenContents,
pub span: Span,
}
impl Token {
pub fn new(contents: TokenContents, span: Span) -> Token {
Token { contents, span }
}
}
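/// The kind of paired delimiter that a baseline token may be nested inside.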
#[derive(Clone, Copy)]
enum BlockKind {
Paren,
CurlyBracket,
SquareBracket,
}
impl BlockKind {
fn closing(self) -> char {
match self {
BlockKind::Paren => ')',
BlockKind::SquareBracket => ']',
BlockKind::CurlyBracket => '}',
}
}
}
/// Finds the extents of a baseline token, returning the string with its
/// associated span, along with any parse error that was discovered along the
/// way.
///
/// Baseline tokens are unparsed content separated by spaces or a command
/// separator (like a pipe or semicolon). Baseline tokens may be surrounded by
/// quotes (single, double, or backtick) or braces (square, paren, curly).
///
/// Baseline tokens may be further processed based on the needs of the syntax
/// shape that encounters them. They are still lightly lexed. For example, if a
/// baseline token begins with `{`, the entire token will continue until the
/// closing `}`, taking comments into consideration.
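///
/// # Example
///
/// A rough sketch of the intended behavior (`ignore`d because `baseline` and
/// `Input` are private to this module; the input below is hypothetical):
///
/// ```ignore
/// let mut input = "[1 2] | first".char_indices().peekable();
/// let (token, err) = baseline(&mut input, 0);
/// // The bracketed block lexes as one token; the trailing ` | first` is
/// // left in the input.
/// assert_eq!(token.item, "[1 2]");
/// assert!(err.is_none());
/// ```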
pub fn baseline(src: &mut Input, span_offset: usize) -> (Spanned<String>, Option<ParseError>) {
let mut token_contents = String::new();
let start_offset = if let Some((pos, _)) = src.peek() {
*pos
} else {
0
};
// This variable tracks the starting character of a string literal, so that
// we remain inside the string literal lexer mode until we encounter the
// closing quote.
let mut quote_start: Option<char> = None;
let mut in_comment = false;
    // This Vec tracks the currently open paired delimiters
let mut block_level: Vec<BlockKind> = vec![];
// A baseline token is terminated if it's not nested inside of a paired
// delimiter and the next character is one of: `|`, `;`, `#` or any
// whitespace.
fn is_termination(block_level: &[BlockKind], c: char) -> bool {
block_level.is_empty() && (c.is_whitespace() || c == '|' || c == ';' || c == '#')
}
// The process of slurping up a baseline token repeats:
//
// - String literal, which begins with `'`, `"` or `\``, and continues until
// the same character is encountered again.
// - Delimiter pair, which begins with `[`, `(`, or `{`, and continues until
// the matching closing delimiter is found, skipping comments and string
// literals.
// - When not nested inside of a delimiter pair, when a terminating
// character (whitespace, `|`, `;` or `#`) is encountered, the baseline
// token is done.
// - Otherwise, accumulate the character into the current baseline token.
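    // For example, given the input `{ echo "a|b" }`, this loop consumes the
    // entire braced block as a single baseline token: the `|` sits inside
    // both quotes and braces, so it never terminates the token.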
while let Some((_, c)) = src.peek() {
let c = *c;
if quote_start.is_some() {
// If we encountered the closing quote character for the current
// string, we're done with the current string.
if Some(c) == quote_start {
quote_start = None;
}
} else if c == '#' {
if is_termination(&block_level, c) {
break;
}
in_comment = true;
} else if c == '\n' {
in_comment = false;
if is_termination(&block_level, c) {
break;
}
} else if in_comment {
if is_termination(&block_level, c) {
break;
}
} else if c == '\'' || c == '"' || c == '`' {
// We encountered the opening quote of a string literal.
quote_start = Some(c);
} else if c == '[' {
// We encountered an opening `[` delimiter.
block_level.push(BlockKind::SquareBracket);
} else if c == ']' {
// We encountered a closing `]` delimiter. Pop off the opening `[`
// delimiter.
if let Some(BlockKind::SquareBracket) = block_level.last() {
let _ = block_level.pop();
}
} else if c == '{' {
// We encountered an opening `{` delimiter.
block_level.push(BlockKind::CurlyBracket);
} else if c == '}' {
// We encountered a closing `}` delimiter. Pop off the opening `{`.
if let Some(BlockKind::CurlyBracket) = block_level.last() {
let _ = block_level.pop();
}
} else if c == '(' {
            // We encountered an opening `(` delimiter.
block_level.push(BlockKind::Paren);
} else if c == ')' {
// We encountered a closing `)` delimiter. Pop off the opening `(`.
if let Some(BlockKind::Paren) = block_level.last() {
let _ = block_level.pop();
}
} else if is_termination(&block_level, c) {
break;
}
// Otherwise, accumulate the character into the current token.
token_contents.push(c);
// Consume the character.
let _ = src.next();
}
let span = Span::new(
start_offset + span_offset,
start_offset + span_offset + token_contents.len(),
);
    // If there are still unclosed opening delimiters, close them by pushing
    // synthetic closing characters onto the accumulated token.
if let Some(block) = block_level.last() {
let delim: char = (*block).closing();
let cause = ParseError::unexpected_eof(delim.to_string(), span);
while let Some(bk) = block_level.pop() {
token_contents.push(bk.closing());
}
return (token_contents.spanned(span), Some(cause));
}
if let Some(delimiter) = quote_start {
// The non-lite parse trims quotes on both sides, so we add the expected quote so that
// anyone wanting to consume this partial parse (e.g., completions) will be able to get
// correct information from the non-lite parse.
token_contents.push(delimiter);
return (
token_contents.spanned(span),
Some(ParseError::unexpected_eof(delimiter.to_string(), span)),
);
}
    // If we didn't accumulate any characters, we hit the end of input where a
    // command was expected.
if token_contents.is_empty() {
return (
token_contents.spanned(span),
Some(ParseError::unexpected_eof("command".to_string(), span)),
);
}
(token_contents.spanned(span), None)
}
/// We encountered a `#` character. Keep consuming characters until we encounter
/// a newline character (but don't consume it).
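///
/// # Example
///
/// A rough sketch of the intended behavior (`ignore`d because `parse_comment`
/// is private to this module; the input below is hypothetical):
///
/// ```ignore
/// let mut input = "#  indented comment\nnext".char_indices().peekable();
/// let comment = parse_comment(&mut input, 0);
/// // The leading whitespace and the comment body are tracked separately,
/// // and the trailing newline is left in the input for the caller.
/// assert_eq!(input.peek().map(|(_, c)| *c), Some('\n'));
/// ```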
fn parse_comment(input: &mut Input, hash_offset: usize) -> LiteComment {
let mut comment = String::new();
let mut in_ws = true;
let mut body_start = 0;
input.next();
while let Some((_, c)) = input.peek() {
if *c == '\n' {
break;
}
if in_ws && c.is_whitespace() {
body_start += c.len_utf8();
} else if in_ws && !c.is_whitespace() {
in_ws = false;
}
comment.push(*c);
input.next();
}
if body_start == 0 {
let len = comment.len();
LiteComment::new(comment.spanned(Span::new(hash_offset + 1, hash_offset + 1 + len)))
} else {
let ws = comment[..body_start].to_string();
let body = comment[body_start..].to_string();
let body_len = body.len();
LiteComment::new_with_ws(
ws.spanned(Span::new(hash_offset + 1, hash_offset + 1 + body_start)),
body.spanned(Span::new(
hash_offset + 1 + body_start,
hash_offset + 1 + body_start + body_len,
)),
)
}
}
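/// Incrementally assembles a `LiteBlock` from a stream of tokens. The builder
/// mirrors the output's shape: a block holds groups, a group holds pipelines
/// (separated by `;`), a pipeline holds commands (separated by `|`), and a
/// command holds its baseline parts along with any doc comments that
/// immediately precede it.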
#[derive(SmartDefault)]
struct BlockParser {
groups: TokenBuilder<GroupBuilder>,
group: GroupBuilder,
pipeline: PipelineBuilder,
command: CommandBuilder,
prev_token: Option<Token>,
prev_comments: CommentsBuilder,
prev_comment_indent: usize,
}
impl BlockParser {
fn consumed(&mut self, token: Token) {
self.prev_token = Some(token);
}
fn success(mut self) -> (LiteBlock, Option<ParseError>) {
self.close_group();
(LiteBlock::new(self.groups.map(|g| g.into())), None)
}
fn fail(self, error: ParseError) -> (LiteBlock, Option<ParseError>) {
(LiteBlock::new(self.groups.map(|g| g.into())), Some(error))
}
fn comment(&mut self, token: &LiteComment) {
if self.prev_comments.is_empty() {
self.prev_comment_indent = token.ws_len();
}
self.prev_comments
.push(token.unindent(self.prev_comment_indent));
}
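    // Two consecutive EOLs end any run of comments: the pending comments are
    // discarded rather than attached to the next command, and the current
    // group is closed.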
fn eoleol(&mut self) {
self.prev_comment_indent = 0;
self.prev_comments.take();
self.eol();
}
fn eol(&mut self) {
// If the last token on the current line is a `|`, the group
// continues on the next line.
if let Some(prev) = &self.prev_token {
if let TokenContents::Pipe = prev.contents {
return;
}
}
self.close_group();
}
fn pipe(&mut self) -> Result<(), ()> {
// If the current command has content, accumulate it into
// the current pipeline and start a new command.
match self.close_command() {
None => Err(()),
Some(command) => {
self.pipeline.push(command);
Ok(())
}
}
}
fn semicolon(&mut self) {
self.close_pipeline();
}
fn baseline(&mut self, part: Spanned<String>) {
        // We encountered an unclassified token. Accumulate it into the
        // current command as a string.
self.command.push(part);
}
fn close_command(&mut self) -> Option<LiteCommand> {
let command = self.command.take()?;
let command = LiteCommand {
parts: command.into(),
comments: self.prev_comments.take().map(|c| c.into()),
};
self.prev_comment_indent = 0;
Some(command)
}
fn close_pipeline(&mut self) {
if let Some(command) = self.close_command() {
self.pipeline.push(command);
}
if let Some(pipeline) = self.pipeline.take() {
self.group.push(pipeline);
}
}
fn close_group(&mut self) {
self.close_pipeline();
if let Some(group) = self.group.take() {
self.groups.push(group);
}
}
}
/// Try to parse a list of tokens into a block.
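///
/// # Example
///
/// A rough sketch of how `lex` and `parse_block` compose (the input is
/// hypothetical, and the doctest is `ignore`d since it depends on this
/// module's internals):
///
/// ```ignore
/// let (tokens, err) = lex("echo hi | str trim", 0);
/// assert!(err.is_none());
/// let (block, err) = parse_block(tokens);
/// assert!(err.is_none());
/// // `block` contains one group holding one pipeline of two commands.
/// ```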
pub fn parse_block(tokens: Vec<Token>) -> (LiteBlock, Option<ParseError>) {
let mut parser = BlockParser::default();
let mut tokens = tokens.iter().peekable();
    // The parsing process repeats, dispatching on the next token:
    //
    // - newline (`\n` or `\r`): ends the line, possibly closing the current group
    // - pipe (`|`): ends the current command and extends the current pipeline
    // - semicolon (`;`): ends the current pipeline
    // - baseline: accumulated into the current command
    // - comment: saved as documentation for the command that follows
while let Some(token) = tokens.next() {
match &token.contents {
TokenContents::Eol => {
// If we encounter two newline characters in a row, use a special eoleol event,
// which allows the parser to discard comments that shouldn't be treated as
// documentation for the following item.
if let Some(Token {
contents: TokenContents::Eol,
..
}) = tokens.peek()
{
tokens.next();
parser.eoleol();
} else {
// We encountered a newline character. If the last token on the
// current line is a `|`, continue the current group on the next
// line. Otherwise, close up the current group by rolling up the
// current command into the current pipeline, and then roll up
// the current pipeline into the group.
parser.eol();
}
}
TokenContents::Pipe => {
// We encountered a pipe (`|`) character, which terminates a
// command.
if parser.pipe().is_err() {
// If the current command doesn't have content, return an
// error that indicates that the `|` was unexpected.
return parser.fail(ParseError::extra_tokens(
"|".to_string().spanned(token.span),
));
}
}
TokenContents::Semicolon => {
// We encountered a semicolon (`;`) character, which terminates
// a pipeline.
parser.semicolon();
}
TokenContents::Baseline(part) => {
                // We encountered an unclassified token. Accumulate it into
                // the current command as a string.
parser.baseline(part.to_string().spanned(token.span));
}
TokenContents::Comment(comment) => parser.comment(comment),
}
parser.consumed(token.clone());
}
parser.success()
}
/// Breaks the input string into a vector of tokens. This tokenization only tries to classify
/// separators like semicolons, pipes, etc. from external bare values (values that haven't been
/// classified further).
///
/// Takes in a string and an offset, which is used to offset the spans created (for when this
/// function is used to parse inner strings).
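///
/// # Example
///
/// A rough sketch of the expected token stream (the input is hypothetical,
/// and the doctest is `ignore`d since it depends on how this module is
/// re-exported):
///
/// ```ignore
/// let (tokens, err) = lex("echo hi; ls", 0);
/// assert!(err.is_none());
/// // tokens: Baseline("echo"), Baseline("hi"), Semicolon, Baseline("ls")
/// ```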
pub fn lex(input: &str, span_offset: usize) -> (Vec<Token>, Option<ParseError>) {
// Break the input slice into an iterator of Unicode characters.
let mut char_indices = input.char_indices().peekable();
let mut error = None;
let mut output = vec![];
let mut is_complete = true;
    // The lexing process repeats. One character of lookahead is sufficient to decide what to do next.
    //
    // - `|`: the token is either a `|` token or a `||` token
    // - `;`: the token is a semicolon
    // - `\n` or `\r`: the token is an EOL (end of line) token
    // - other whitespace: ignored
    // - `#`: the token starts a line comment, which contains all of the subsequent characters until the next EOL
    // - anything else: the start of a baseline token, handled by `baseline`
while let Some((idx, c)) = char_indices.peek() {
if *c == '|' {
// If the next character is `|`, it's either `|` or `||`.
let idx = *idx;
let prev_idx = idx;
let _ = char_indices.next();
// If the next character is `|`, we're looking at a `||`.
if let Some((idx, c)) = char_indices.peek() {
if *c == '|' {
let idx = *idx;
let _ = char_indices.next();
output.push(Token::new(
TokenContents::Baseline("||".into()),
Span::new(span_offset + prev_idx, span_offset + idx + 1),
));
continue;
}
}
// Otherwise, it's just a regular `|` token.
output.push(Token::new(
TokenContents::Pipe,
Span::new(span_offset + idx, span_offset + idx + 1),
));
is_complete = false;
} else if *c == ';' {
// If the next character is a `;`, we're looking at a semicolon token.
if !is_complete && error.is_none() {
error = Some(ParseError::extra_tokens(
";".to_string().spanned(Span::new(*idx, idx + 1)),
));
}
let idx = *idx;
let _ = char_indices.next();
output.push(Token::new(
TokenContents::Semicolon,
Span::new(span_offset + idx, span_offset + idx + 1),
));
} else if *c == '\n' || *c == '\r' {
// If the next character is a newline, we're looking at an EOL (end of line) token.
let idx = *idx;
let _ = char_indices.next();
output.push(Token::new(
TokenContents::Eol,
Span::new(span_offset + idx, span_offset + idx + 1),
));
} else if *c == '#' {
// If the next character is `#`, we're at the beginning of a line
// comment. The comment continues until the next newline.
let idx = *idx;
let comment = parse_comment(&mut char_indices, idx);
let span = comment.span();
output.push(Token::new(TokenContents::Comment(comment), span));
} else if c.is_whitespace() {
// If the next character is non-newline whitespace, skip it.
let _ = char_indices.next();
} else {
// Otherwise, try to consume an unclassified token.
let (result, err) = baseline(&mut char_indices, span_offset);
if error.is_none() {
error = err;
}
is_complete = true;
let Spanned { item, span } = result;
output.push(Token::new(TokenContents::Baseline(item), span));
}
}
(output, error)
}