first commit

JT 2021-06-30 13:42:56 +12:00
commit 29d2449fb3
9 changed files with 871 additions and 0 deletions

Cargo.toml (Normal file, 8 lines added)

@@ -0,0 +1,8 @@
[package]
name = "engine-q"
version = "0.1.0"
edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]

src/lex.rs (Normal file, 319 lines added)

@@ -0,0 +1,319 @@
use crate::{ParseError, Span};
#[derive(Debug, PartialEq, Eq)]
pub enum TokenContents {
Item,
Comment,
Pipe,
Semicolon,
Eol,
}
#[derive(Debug, PartialEq, Eq)]
pub struct Token {
pub contents: TokenContents,
pub span: Span,
}
impl Token {
pub fn new(contents: TokenContents, span: Span) -> Token {
Token { contents, span }
}
}
#[derive(Clone, Copy, Debug)]
pub enum BlockKind {
Paren,
CurlyBracket,
SquareBracket,
}
impl BlockKind {
fn closing(self) -> u8 {
match self {
BlockKind::Paren => b')',
BlockKind::SquareBracket => b']',
BlockKind::CurlyBracket => b'}',
}
}
}
#[derive(PartialEq, Eq, Debug)]
pub enum LexMode {
Normal,
}
// A baseline token is terminated if it's not nested inside of a paired
// delimiter and the next character is one of: `|`, `;`, `#` or any
// whitespace.
fn is_item_terminator(block_level: &[BlockKind], c: u8) -> bool {
block_level.is_empty()
&& (c == b' ' || c == b'\t' || c == b'\n' || c == b'|' || c == b';' || c == b'#')
}
pub fn lex_item(
input: &[u8],
curr_offset: &mut usize,
file_id: usize,
) -> (Span, Option<ParseError>) {
// This variable tracks the starting character of a string literal, so that
// we remain inside the string literal lexer mode until we encounter the
// closing quote.
let mut quote_start: Option<u8> = None;
let mut in_comment = false;
let token_start = *curr_offset;
// This Vec tracks paired delimiters
let mut block_level: Vec<BlockKind> = vec![];
// The process of slurping up a baseline token repeats:
//
// - String literal, which begins with `'`, `"` or `\``, and continues until
// the same character is encountered again.
// - Delimiter pair, which begins with `[`, `(`, or `{`, and continues until
// the matching closing delimiter is found, skipping comments and string
// literals.
// - When not nested inside of a delimiter pair, when a terminating
// character (whitespace, `|`, `;` or `#`) is encountered, the baseline
// token is done.
// - Otherwise, accumulate the character into the current baseline token.
while let Some(c) = input.get(*curr_offset) {
let c = *c;
if quote_start.is_some() {
// If we encountered the closing quote character for the current
// string, we're done with the current string.
if Some(c) == quote_start {
quote_start = None;
}
} else if c == b'#' {
if is_item_terminator(&block_level, c) {
break;
}
in_comment = true;
} else if c == b'\n' {
in_comment = false;
if is_item_terminator(&block_level, c) {
break;
}
} else if in_comment {
if is_item_terminator(&block_level, c) {
break;
}
} else if c == b'\'' || c == b'"' {
// We encountered the opening quote of a string literal.
quote_start = Some(c);
} else if c == b'[' {
// We encountered an opening `[` delimiter.
block_level.push(BlockKind::SquareBracket);
} else if c == b']' {
// We encountered a closing `]` delimiter. Pop off the opening `[`
// delimiter.
if let Some(BlockKind::SquareBracket) = block_level.last() {
let _ = block_level.pop();
}
} else if c == b'{' {
// We encountered an opening `{` delimiter.
block_level.push(BlockKind::CurlyBracket);
} else if c == b'}' {
// We encountered a closing `}` delimiter. Pop off the opening `{`.
if let Some(BlockKind::CurlyBracket) = block_level.last() {
let _ = block_level.pop();
}
} else if c == b'(' {
// We encountered an opening `(` delimiter.
block_level.push(BlockKind::Paren);
} else if c == b')' {
// We encountered a closing `)` delimiter. Pop off the opening `(`.
if let Some(BlockKind::Paren) = block_level.last() {
let _ = block_level.pop();
}
} else if is_item_terminator(&block_level, c) {
break;
}
*curr_offset += 1;
}
let span = Span::new(token_start, *curr_offset, file_id);
// If there are still unclosed opening delimiters, report an unexpected
// end of input, naming the closing delimiter we expected to find.
if let Some(block) = block_level.last() {
let delim = block.closing();
let cause = ParseError::UnexpectedEof((delim as char).to_string(), span);
return (span, Some(cause));
}
if let Some(delim) = quote_start {
// The string literal was left unterminated, so report an unexpected end of
// input that names the expected closing quote. Anyone consuming this partial
// parse (e.g., completions) can use it to tell which character was missing.
return (
span,
Some(ParseError::UnexpectedEof((delim as char).to_string(), span)),
);
}
// If we didn't accumulate any characters, report an unexpected end of input.
if *curr_offset - token_start == 0 {
return (
span,
Some(ParseError::UnexpectedEof("command".to_string(), span)),
);
}
(span, None)
}
pub fn lex(
input: &[u8],
file_id: usize,
span_offset: usize,
lex_mode: LexMode,
) -> (Vec<Token>, Option<ParseError>) {
let mut error = None;
let mut curr_offset = span_offset;
let mut output = vec![];
let mut is_complete = true;
while let Some(c) = input.get(curr_offset) {
let c = *c;
if c == b'|' {
// If the next character is `|`, it's either `|` or `||`.
let idx = curr_offset;
let prev_idx = idx;
curr_offset += 1;
// If the next character is `|`, we're looking at a `||`.
if let Some(c) = input.get(curr_offset) {
if *c == b'|' {
let idx = curr_offset;
curr_offset += 1;
output.push(Token::new(
TokenContents::Item,
Span::new(span_offset + prev_idx, span_offset + idx + 1, file_id),
));
continue;
}
}
// Otherwise, it's just a regular `|` token.
output.push(Token::new(
TokenContents::Pipe,
Span::new(span_offset + idx, span_offset + idx + 1, file_id),
));
is_complete = false;
} else if c == b';' {
// If the next character is a `;`, we're looking at a semicolon token.
if !is_complete && error.is_none() {
error = Some(ParseError::ExtraTokens(Span::new(
curr_offset,
curr_offset + 1,
file_id,
)));
}
let idx = curr_offset;
curr_offset += 1;
output.push(Token::new(
TokenContents::Semicolon,
Span::new(idx, idx + 1, file_id),
));
} else if c == b'\n' || c == b'\r' {
// If the next character is a newline, we're looking at an EOL (end of line) token.
let idx = curr_offset;
curr_offset += 1;
if lex_mode == LexMode::Normal {
output.push(Token::new(
TokenContents::Eol,
Span::new(idx, idx + 1, file_id),
));
}
} else if c == b'#' {
// If the next character is `#`, we're at the beginning of a line
// comment. The comment continues until the next newline.
let mut start = curr_offset;
while let Some(input) = input.get(curr_offset) {
curr_offset += 1;
if *input == b'\n' {
output.push(Token::new(
TokenContents::Comment,
Span::new(start, curr_offset, file_id),
));
start = curr_offset;
break;
}
}
if start != curr_offset {
output.push(Token::new(
TokenContents::Comment,
Span::new(start, curr_offset, file_id),
));
}
} else if c == b' ' || c == b'\t' {
// If the next character is non-newline whitespace, skip it.
curr_offset += 1;
} else {
// Otherwise, try to consume an unclassified token.
let (span, err) = lex_item(input, &mut curr_offset, file_id);
if error.is_none() {
error = err;
}
is_complete = true;
output.push(Token::new(TokenContents::Item, span));
}
}
(output, error)
}
#[cfg(test)]
mod lex_tests {
use super::*;
#[test]
fn lex_basic() {
let file = b"let x = 4";
let output = lex(file, 0, 0, LexMode::Normal);
assert!(output.1.is_none());
}
#[test]
fn lex_newline() {
let file = b"let x = 300\nlet y = 500;";
let output = lex(file, 0, 0, LexMode::Normal);
println!("{:#?}", output.0);
assert!(output.0.contains(&Token {
contents: TokenContents::Eol,
span: Span {
start: 11,
end: 12,
file_id: 0
}
}));
}
#[test]
fn lex_empty() {
let file = b"";
let output = lex(file, 0, 0, LexMode::Normal);
assert!(output.0.is_empty());
assert!(output.1.is_none());
}
}
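
As a rough illustration of the lexer above (not part of the commit), the sketch below feeds a short pipeline through lex and prints each token kind; the crate path engine_q is an assumption based on the name field in Cargo.toml.

// Sketch only, not part of the commit; assumes the library crate builds as `engine_q`.
use engine_q::{lex, LexMode, TokenContents};

fn main() {
    let source = b"ls | where size > 10; echo done";
    // Same arguments as the unit tests above: file_id 0, span_offset 0.
    let (tokens, err) = lex(source, 0, 0, LexMode::Normal);
    assert!(err.is_none());
    for token in &tokens {
        match &token.contents {
            TokenContents::Item => println!("item {}..{}", token.span.start, token.span.end),
            TokenContents::Pipe => println!("pipe"),
            TokenContents::Semicolon => println!("semicolon"),
            TokenContents::Eol => println!("eol"),
            TokenContents::Comment => println!("comment"),
        }
    }
}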

src/lib.rs (Normal file, 12 lines added)

@@ -0,0 +1,12 @@
mod lex;
mod lite_parse;
mod parse_error;
mod parser;
mod parser_state;
mod span;
pub use lex::{lex, LexMode, Token, TokenContents};
pub use lite_parse::{lite_parse, LiteBlock, LiteCommand, LiteStatement};
pub use parse_error::ParseError;
pub use parser_state::{ParserState, ParserWorkingSet, VarLocation};
pub use span::Span;

src/lite_parse.rs (Normal file, 207 lines added)

@@ -0,0 +1,207 @@
use crate::{ParseError, Span, Token, TokenContents};
#[derive(Debug)]
pub struct LiteCommand {
pub comments: Vec<Span>,
pub parts: Vec<Span>,
}
impl Default for LiteCommand {
fn default() -> Self {
Self::new()
}
}
impl LiteCommand {
pub fn new() -> Self {
Self {
comments: vec![],
parts: vec![],
}
}
pub fn push(&mut self, span: Span) {
self.parts.push(span);
}
pub fn is_empty(&self) -> bool {
self.parts.is_empty()
}
}
#[derive(Debug)]
pub struct LiteStatement {
pub commands: Vec<LiteCommand>,
}
impl Default for LiteStatement {
fn default() -> Self {
Self::new()
}
}
impl LiteStatement {
pub fn new() -> Self {
Self { commands: vec![] }
}
pub fn push(&mut self, command: LiteCommand) {
self.commands.push(command);
}
pub fn is_empty(&self) -> bool {
self.commands.is_empty()
}
}
#[derive(Debug)]
pub struct LiteBlock {
pub block: Vec<LiteStatement>,
}
impl Default for LiteBlock {
fn default() -> Self {
Self::new()
}
}
impl LiteBlock {
pub fn new() -> Self {
Self { block: vec![] }
}
pub fn push(&mut self, pipeline: LiteStatement) {
self.block.push(pipeline);
}
pub fn is_empty(&self) -> bool {
self.block.is_empty()
}
}
pub fn lite_parse(tokens: &[Token]) -> (LiteBlock, Option<ParseError>) {
let mut curr_token = 0;
let mut block = LiteBlock::new();
let mut curr_pipeline = LiteStatement::new();
let mut curr_command = LiteCommand::new();
while let Some(token) = tokens.get(curr_token) {
match &token.contents {
TokenContents::Item => curr_command.push(token.span),
TokenContents::Pipe => {
if !curr_command.is_empty() {
curr_pipeline.push(curr_command);
curr_command = LiteCommand::new();
}
}
TokenContents::Eol | TokenContents::Semicolon => {
if !curr_command.is_empty() {
curr_pipeline.push(curr_command);
}
curr_command = LiteCommand::new();
if !curr_pipeline.is_empty() {
block.push(curr_pipeline);
}
curr_pipeline = LiteStatement::new();
}
TokenContents::Comment => {
curr_command.comments.push(token.span);
}
}
curr_token += 1;
}
if !curr_command.is_empty() {
curr_pipeline.push(curr_command);
}
if !curr_pipeline.is_empty() {
block.push(curr_pipeline);
}
(block, None)
}
#[cfg(test)]
mod tests {
use crate::{lex, lite_parse, LiteBlock, ParseError, Span};
fn lite_parse_helper(input: &[u8]) -> Result<LiteBlock, ParseError> {
let (output, err) = lex(input, 0, 0, crate::LexMode::Normal);
if let Some(err) = err {
return Err(err);
}
let (output, err) = lite_parse(&output);
if let Some(err) = err {
return Err(err);
}
Ok(output)
}
#[test]
fn comment_before() -> Result<(), ParseError> {
let input = b"# this is a comment\ndef foo bar";
let lite_block = lite_parse_helper(input)?;
assert_eq!(lite_block.block.len(), 1);
assert_eq!(lite_block.block[0].commands.len(), 1);
assert_eq!(lite_block.block[0].commands[0].comments.len(), 1);
assert_eq!(lite_block.block[0].commands[0].parts.len(), 3);
Ok(())
}
#[test]
fn comment_beside() -> Result<(), ParseError> {
let input = b"def foo bar # this is a comment";
let lite_block = lite_parse_helper(input)?;
assert_eq!(lite_block.block.len(), 1);
assert_eq!(lite_block.block[0].commands.len(), 1);
assert_eq!(lite_block.block[0].commands[0].comments.len(), 1);
assert_eq!(lite_block.block[0].commands[0].parts.len(), 3);
Ok(())
}
#[test]
fn comments_stack() -> Result<(), ParseError> {
let input = b"# this is a comment\n# another comment\ndef foo bar ";
let lite_block = lite_parse_helper(input)?;
assert_eq!(lite_block.block.len(), 1);
assert_eq!(lite_block.block[0].commands.len(), 1);
assert_eq!(lite_block.block[0].commands[0].comments.len(), 2);
assert_eq!(lite_block.block[0].commands[0].parts.len(), 3);
Ok(())
}
#[test]
fn separated_comments_dont_stack() -> Result<(), ParseError> {
let input = b"# this is a comment\n\n# another comment\ndef foo bar ";
let lite_block = lite_parse_helper(input)?;
assert_eq!(lite_block.block.len(), 1);
assert_eq!(lite_block.block[0].commands.len(), 1);
assert_eq!(lite_block.block[0].commands[0].comments.len(), 1);
assert_eq!(
lite_block.block[0].commands[0].comments[0],
Span {
start: 21,
end: 39,
file_id: 0
}
);
assert_eq!(lite_block.block[0].commands[0].parts.len(), 3);
Ok(())
}
}
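
A minimal sketch (not part of the commit) of the lex-then-lite_parse pipeline that the tests above rely on; as before, the crate path engine_q is assumed from Cargo.toml.

// Sketch only, not part of the commit; assumes the library crate builds as `engine_q`.
use engine_q::{lex, lite_parse, LexMode};

fn main() {
    let source = b"# comment attached to the next command\nls | sort-by size; echo done";
    let (tokens, lex_err) = lex(source, 0, 0, LexMode::Normal);
    assert!(lex_err.is_none());

    let (lite_block, parse_err) = lite_parse(&tokens);
    assert!(parse_err.is_none());

    // A LiteBlock is a list of statements; each statement is a list of piped
    // commands; each command holds its part spans plus any preceding comments.
    for (i, statement) in lite_block.block.iter().enumerate() {
        for command in &statement.commands {
            println!(
                "statement {}: {} parts, {} comments",
                i,
                command.parts.len(),
                command.comments.len()
            );
        }
    }
}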

src/main.rs (Normal file, 25 lines added)

@@ -0,0 +1,25 @@
use engine_q::{lex, lite_parse, LexMode, ParserWorkingSet};
fn main() -> std::io::Result<()> {
if let Some(path) = std::env::args().nth(1) {
let file = std::fs::read(&path)?;
// let (output, err) = lex(&file, 0, 0, LexMode::Normal);
// println!("{:?} tokens, error: {:?}", output, err);
// let (output, err) = lite_parse(&output);
// println!("{:?}, error: {:?}", output, err);
let mut working_set = ParserWorkingSet::new(None);
let (output, err) = working_set.parse_file(&path, &file);
println!("{:?} {:?}", output, err);
Ok(())
} else {
println!("specify file to lex");
Ok(())
}
}

src/parse_error.rs (Normal file, 7 lines added)

@@ -0,0 +1,7 @@
pub use crate::Span;
#[derive(Debug)]
pub enum ParseError {
ExtraTokens(Span),
UnexpectedEof(String, Span),
}

src/parser.rs (Normal file, 102 lines added)

@@ -0,0 +1,102 @@
use std::str::Utf8Error;
use crate::{lex, lite_parse, LiteBlock, LiteStatement, ParseError, ParserWorkingSet, Span};
#[derive(Debug)]
pub enum Expression {}
#[derive(Debug)]
pub enum Import {}
#[derive(Debug)]
pub struct Block {
stmts: Vec<Statement>,
}
impl Block {
pub fn new() -> Self {
Self { stmts: vec![] }
}
}
#[derive(Debug)]
pub struct VarDecl {
name: String,
value: Expression,
}
#[derive(Debug)]
pub enum Statement {
Pipeline(Pipeline),
VarDecl(VarDecl),
Import(Import),
None,
}
#[derive(Debug)]
pub struct Pipeline {}
impl Pipeline {
pub fn new() -> Self {
Self {}
}
}
impl ParserWorkingSet {
fn parse_statement(
&mut self,
block: &mut Block,
lite_pipeline: &LiteStatement,
) -> Option<ParseError> {
match lite_pipeline.commands.len() {
0 => None,
1 => {
let command_name = self.get_span_contents(lite_pipeline.commands[0].parts[0]);
println!("{:?}", command_name);
if command_name == b"let" {
println!("found let")
}
None
}
_ => {
// pipeline
None
}
}
}
pub fn parse_block(&mut self, lite_block: &LiteBlock) -> (Block, Option<ParseError>) {
let mut error = None;
self.enter_scope();
let mut block = Block::new();
for pipeline in &lite_block.block {
let err = self.parse_statement(&mut block, pipeline);
error = error.or(err);
}
self.exit_scope();
(block, error)
}
pub fn parse_file(&mut self, fname: &str, contents: &[u8]) -> (Block, Option<ParseError>) {
let mut error = None;
let file_id = self.add_file(fname.into(), contents.into());
let (output, err) = lex(contents, file_id, 0, crate::LexMode::Normal);
error = error.or(err);
let (output, err) = lite_parse(&output);
error = error.or(err);
println!("{:?}", output);
let (output, err) = self.parse_block(&output);
error = error.or(err);
(output, error)
}
}
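
A minimal sketch (not part of the commit) of driving parse_file on an in-memory buffer rather than on a file read from disk as main.rs does; the crate path engine_q and the file name inline.nu are assumptions.

// Sketch only, not part of the commit; assumes the library crate builds as `engine_q`.
use engine_q::ParserWorkingSet;

fn main() {
    let source = b"let x = 3";
    let mut working_set = ParserWorkingSet::new(None);
    // parse_file registers the buffer as a new file, then runs lex, lite_parse,
    // and parse_block over it, carrying along the first error it hits.
    let (block, err) = working_set.parse_file("inline.nu", source);
    println!("{:?} {:?}", block, err);
}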

src/parser_state.rs (Normal file, 175 lines added)

@@ -0,0 +1,175 @@
use crate::Span;
use std::{collections::HashMap, sync::Arc};
pub struct ParserState {
files: Vec<(String, Vec<u8>)>,
}
pub enum VarLocation {
CurrentScope,
OuterScope,
}
#[derive(Clone, Copy)]
pub enum Type {}
struct ScopeFrame {
vars: HashMap<String, Type>,
}
impl ScopeFrame {
pub fn new() -> Self {
Self {
vars: HashMap::new(),
}
}
}
pub struct ParserWorkingSet {
files: Vec<(String, Vec<u8>)>,
permanent_state: Option<Arc<ParserState>>,
scope: Vec<ScopeFrame>,
}
impl Default for ParserState {
fn default() -> Self {
Self::new()
}
}
impl ParserState {
pub fn new() -> Self {
Self { files: vec![] }
}
pub fn merge_working_set(this: &mut Arc<ParserState>, mut working_set: ParserWorkingSet) {
// Remove the working set's reference to the permanent state so we can safely take a mutable reference
working_set.permanent_state = None;
// Take the mutable reference and extend the permanent state from the working set
if let Some(this) = std::sync::Arc::<ParserState>::get_mut(this) {
this.files.extend(working_set.files);
} else {
panic!("Internal error: merging working set should always succeed");
}
}
pub fn num_files(&self) -> usize {
self.files.len()
}
pub(crate) fn add_file(&mut self, filename: String, contents: Vec<u8>) -> usize {
self.files.push((filename, contents));
self.num_files() - 1
}
pub(crate) fn get_file_contents(&self, idx: usize) -> &[u8] {
&self.files[idx].1
}
}
impl ParserWorkingSet {
pub fn new(permanent_state: Option<Arc<ParserState>>) -> Self {
Self {
files: vec![],
permanent_state,
scope: vec![],
}
}
pub fn num_files(&self) -> usize {
let parent_len = if let Some(permanent_state) = &self.permanent_state {
permanent_state.num_files()
} else {
0
};
self.files.len() + parent_len
}
pub fn add_file(&mut self, filename: String, contents: Vec<u8>) -> usize {
self.files.push((filename, contents));
self.num_files() - 1
}
pub fn get_span_contents(&self, span: Span) -> &[u8] {
if let Some(permanent_state) = &self.permanent_state {
let num_permanent_files = permanent_state.num_files();
if span.file_id < num_permanent_files {
&permanent_state.get_file_contents(span.file_id)[span.start..span.end]
} else {
&self.files[span.file_id - num_permanent_files].1[span.start..span.end]
}
} else {
&self.files[span.file_id].1[span.start..span.end]
}
}
pub fn enter_scope(&mut self) {
self.scope.push(ScopeFrame::new());
}
pub fn exit_scope(&mut self) {
self.scope.pop();
}
pub fn find_variable(&self, name: &str) -> Option<(VarLocation, Type)> {
for scope in self.scope.iter().rev().enumerate() {
if let Some(result) = scope.1.vars.get(name) {
if scope.0 == 0 {
// Top level
return Some((VarLocation::CurrentScope, result.clone()));
} else {
return Some((VarLocation::OuterScope, result.clone()));
}
}
}
None
}
}
fn main() {}
#[cfg(test)]
mod parser_state_tests {
use super::*;
#[test]
fn add_file_gives_id() {
let mut parser_state = ParserWorkingSet::new(Some(Arc::new(ParserState::new())));
let id = parser_state.add_file("test.nu".into(), vec![]);
assert_eq!(id, 0);
}
#[test]
fn add_file_gives_id_including_parent() {
let mut parser_state = ParserState::new();
let parent_id = parser_state.add_file("test.nu".into(), vec![]);
let mut working_set = ParserWorkingSet::new(Some(Arc::new(parser_state)));
let working_set_id = working_set.add_file("child.nu".into(), vec![]);
assert_eq!(parent_id, 0);
assert_eq!(working_set_id, 1);
}
#[test]
fn merge_states() {
let mut parser_state = ParserState::new();
let parent_id = parser_state.add_file("test.nu".into(), vec![]);
let mut parser_state = Arc::new(parser_state);
let mut working_set = ParserWorkingSet::new(Some(parser_state.clone()));
let working_set_id = working_set.add_file("child.nu".into(), vec![]);
ParserState::merge_working_set(&mut parser_state, working_set);
assert_eq!(parser_state.num_files(), 2);
assert_eq!(&parser_state.files[0].0, "test.nu");
assert_eq!(&parser_state.files[1].0, "child.nu");
}
}
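
A minimal sketch (not part of the commit) of span resolution and variable lookup on a working set, using only the public methods defined above; the crate path engine_q and the file name script.nu are assumptions.

// Sketch only, not part of the commit; assumes the library crate builds as `engine_q`.
use engine_q::{ParserWorkingSet, Span};

fn main() {
    // With no permanent state, every file lives directly in the working set.
    let mut working_set = ParserWorkingSet::new(None);
    let file_id = working_set.add_file("script.nu".into(), b"let x = 100".to_vec());

    // A Span indexes into the bytes of the file it names; byte 4 is the `x`.
    let name = working_set.get_span_contents(Span::new(4, 5, file_id));
    assert_eq!(name, b"x");

    // Nothing has declared a variable yet, so lookup comes back empty.
    working_set.enter_scope();
    assert!(working_set.find_variable("x").is_none());
}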

src/span.rs (Normal file, 16 lines added)

@@ -0,0 +1,16 @@
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct Span {
pub start: usize,
pub end: usize,
pub file_id: usize,
}
impl Span {
pub fn new(start: usize, end: usize, file_id: usize) -> Span {
Span {
start,
end,
file_id,
}
}
}