From 29d2449fb3b9b1afe5b7efb12ef5ad16d9f72c4b Mon Sep 17 00:00:00 2001
From: JT
Date: Wed, 30 Jun 2021 13:42:56 +1200
Subject: [PATCH] first commit

---
 Cargo.toml          |   8 ++
 src/lex.rs          | 319 ++++++++++++++++++++++++++++++++++++++++++++
 src/lib.rs          |  12 ++
 src/lite_parse.rs   | 207 ++++++++++++++++++++++++++++
 src/main.rs         |  25 ++++
 src/parse_error.rs  |   7 +
 src/parser.rs       | 102 ++++++++++++++
 src/parser_state.rs | 175 ++++++++++++++++++++++++
 src/span.rs         |  16 +++
 9 files changed, 871 insertions(+)
 create mode 100644 Cargo.toml
 create mode 100644 src/lex.rs
 create mode 100644 src/lib.rs
 create mode 100644 src/lite_parse.rs
 create mode 100644 src/main.rs
 create mode 100644 src/parse_error.rs
 create mode 100644 src/parser.rs
 create mode 100644 src/parser_state.rs
 create mode 100644 src/span.rs

diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 000000000..5f6558be8
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,8 @@
+[package]
+name = "engine-q"
+version = "0.1.0"
+edition = "2018"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
diff --git a/src/lex.rs b/src/lex.rs
new file mode 100644
index 000000000..84f91f8cc
--- /dev/null
+++ b/src/lex.rs
@@ -0,0 +1,319 @@
+use crate::{ParseError, Span};
+
+#[derive(Debug, PartialEq, Eq)]
+pub enum TokenContents {
+    Item,
+    Comment,
+    Pipe,
+    Semicolon,
+    Eol,
+}
+
+#[derive(Debug, PartialEq, Eq)]
+pub struct Token {
+    pub contents: TokenContents,
+    pub span: Span,
+}
+
+impl Token {
+    pub fn new(contents: TokenContents, span: Span) -> Token {
+        Token { contents, span }
+    }
+}
+
+#[derive(Clone, Copy, Debug)]
+pub enum BlockKind {
+    Paren,
+    CurlyBracket,
+    SquareBracket,
+}
+
+impl BlockKind {
+    fn closing(self) -> u8 {
+        match self {
+            BlockKind::Paren => b')',
+            BlockKind::SquareBracket => b']',
+            BlockKind::CurlyBracket => b'}',
+        }
+    }
+}
+
+#[derive(PartialEq, Eq, Debug)]
+pub enum LexMode {
+    Normal,
+}
+
+// A baseline token is terminated if it's not nested inside of a paired
+// delimiter and the next character is one of: `|`, `;`, `#`, or any
+// whitespace.
+fn is_item_terminator(block_level: &[BlockKind], c: u8) -> bool {
+    block_level.is_empty()
+        && (c == b' ' || c == b'\t' || c == b'\n' || c == b'|' || c == b';' || c == b'#')
+}
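+
+// A small illustrative check (a sketch, not exhaustive): a `|` terminates a
+// bare token only at the top level, never while nested inside a pair of
+// delimiters.
+#[cfg(test)]
+mod terminator_tests {
+    use super::*;
+
+    #[test]
+    fn pipe_terminates_only_at_top_level() {
+        assert!(is_item_terminator(&[], b'|'));
+        assert!(!is_item_terminator(&[BlockKind::Paren], b'|'));
+    }
+}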
+
+pub fn lex_item(
+    input: &[u8],
+    curr_offset: &mut usize,
+    file_id: usize,
+) -> (Span, Option<ParseError>) {
+    // This variable tracks the starting character of a string literal, so that
+    // we remain inside the string literal lexer mode until we encounter the
+    // closing quote.
+    let mut quote_start: Option<u8> = None;
+
+    let mut in_comment = false;
+
+    let token_start = *curr_offset;
+
+    // This Vec tracks paired delimiters
+    let mut block_level: Vec<BlockKind> = vec![];
+
+    // The process of slurping up a baseline token repeats:
+    //
+    // - String literal, which begins with `'` or `"`, and continues until
+    //   the same character is encountered again.
+    // - Delimiter pair, which begins with `[`, `(`, or `{`, and continues until
+    //   the matching closing delimiter is found, skipping comments and string
+    //   literals.
+    // - When not nested inside of a delimiter pair, a terminating character
+    //   (whitespace, `|`, `;`, or `#`) ends the baseline token.
+    // - Otherwise, accumulate the character into the current baseline token.
+    while let Some(c) = input.get(*curr_offset) {
+        let c = *c;
+
+        if quote_start.is_some() {
+            // If we encountered the closing quote character for the current
+            // string, we're done with the current string.
+            if Some(c) == quote_start {
+                quote_start = None;
+            }
+        } else if c == b'#' {
+            if is_item_terminator(&block_level, c) {
+                break;
+            }
+            in_comment = true;
+        } else if c == b'\n' {
+            in_comment = false;
+            if is_item_terminator(&block_level, c) {
+                break;
+            }
+        } else if in_comment {
+            if is_item_terminator(&block_level, c) {
+                break;
+            }
+        } else if c == b'\'' || c == b'"' {
+            // We encountered the opening quote of a string literal.
+            quote_start = Some(c);
+        } else if c == b'[' {
+            // We encountered an opening `[` delimiter.
+            block_level.push(BlockKind::SquareBracket);
+        } else if c == b']' {
+            // We encountered a closing `]` delimiter. Pop off the opening `[`
+            // delimiter.
+            if let Some(BlockKind::SquareBracket) = block_level.last() {
+                let _ = block_level.pop();
+            }
+        } else if c == b'{' {
+            // We encountered an opening `{` delimiter.
+            block_level.push(BlockKind::CurlyBracket);
+        } else if c == b'}' {
+            // We encountered a closing `}` delimiter. Pop off the opening `{`.
+            if let Some(BlockKind::CurlyBracket) = block_level.last() {
+                let _ = block_level.pop();
+            }
+        } else if c == b'(' {
+            // We encountered an opening `(` delimiter.
+            block_level.push(BlockKind::Paren);
+        } else if c == b')' {
+            // We encountered a closing `)` delimiter. Pop off the opening `(`.
+            if let Some(BlockKind::Paren) = block_level.last() {
+                let _ = block_level.pop();
+            }
+        } else if is_item_terminator(&block_level, c) {
+            break;
+        }
+
+        *curr_offset += 1;
+    }
+
+    let span = Span::new(token_start, *curr_offset, file_id);
+
+    // If there are still unclosed opening delimiters, report an unexpected
+    // end-of-file, naming the closing delimiter we expected to find.
+    if let Some(block) = block_level.last() {
+        let delim = block.closing();
+        let cause = ParseError::UnexpectedEof((delim as char).to_string(), span);
+
+        return (span, Some(cause));
+    }
+
+    if let Some(delim) = quote_start {
+        // The non-lite parse trims quotes on both sides, so we report the
+        // expected closing quote in the error so that anyone consuming this
+        // partial parse (e.g., completions) can still get correct information
+        // from the non-lite parse.
+        return (
+            span,
+            Some(ParseError::UnexpectedEof((delim as char).to_string(), span)),
+        );
+    }
+
+    // If we didn't accumulate any characters, we hit the end of input while
+    // still expecting a command.
+    if *curr_offset - token_start == 0 {
+        return (
+            span,
+            Some(ParseError::UnexpectedEof("command".to_string(), span)),
+        );
+    }
+
+    (span, None)
+}
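+
+// A sketch of `lex_item` on a delimited token: the entire bracketed run is
+// slurped into a single item span, with no error reported.
+#[cfg(test)]
+mod lex_item_tests {
+    use super::*;
+
+    #[test]
+    fn lex_item_slurps_delimited_token() {
+        let input = b"[1 2 3]";
+        let mut offset = 0;
+
+        let (span, err) = lex_item(input, &mut offset, 0);
+
+        assert!(err.is_none());
+        assert_eq!((span.start, span.end), (0, 7));
+    }
+}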
+
+pub fn lex(
+    input: &[u8],
+    file_id: usize,
+    span_offset: usize,
+    lex_mode: LexMode,
+) -> (Vec<Token>, Option<ParseError>) {
+    let mut error = None;
+
+    let mut curr_offset = span_offset;
+
+    let mut output = vec![];
+    let mut is_complete = true;
+
+    while let Some(c) = input.get(curr_offset) {
+        let c = *c;
+        if c == b'|' {
+            // If the next character is `|`, it's either `|` or `||`.
+            let idx = curr_offset;
+            curr_offset += 1;
+
+            // If the character after that is also `|`, we're looking at a `||`.
+            if let Some(c) = input.get(curr_offset) {
+                if *c == b'|' {
+                    curr_offset += 1;
+                    output.push(Token::new(
+                        TokenContents::Item,
+                        Span::new(idx, idx + 2, file_id),
+                    ));
+                    continue;
+                }
+            }
+
+            // Otherwise, it's just a regular `|` token.
+            output.push(Token::new(
+                TokenContents::Pipe,
+                Span::new(idx, idx + 1, file_id),
+            ));
+            is_complete = false;
+        } else if c == b';' {
+            // If the next character is a `;`, we're looking at a semicolon token.
+
+            if !is_complete && error.is_none() {
+                error = Some(ParseError::ExtraTokens(Span::new(
+                    curr_offset,
+                    curr_offset + 1,
+                    file_id,
+                )));
+            }
+            let idx = curr_offset;
+            curr_offset += 1;
+            output.push(Token::new(
+                TokenContents::Semicolon,
+                Span::new(idx, idx + 1, file_id),
+            ));
+        } else if c == b'\n' || c == b'\r' {
+            // If the next character is a newline, we're looking at an EOL (end of line) token.
+
+            let idx = curr_offset;
+            curr_offset += 1;
+            if lex_mode == LexMode::Normal {
+                output.push(Token::new(
+                    TokenContents::Eol,
+                    Span::new(idx, idx + 1, file_id),
+                ));
+            }
+        } else if c == b'#' {
+            // If the next character is `#`, we're at the beginning of a line
+            // comment. The comment continues until the next newline.
+            let mut start = curr_offset;
+
+            while let Some(byte) = input.get(curr_offset) {
+                curr_offset += 1;
+                if *byte == b'\n' {
+                    output.push(Token::new(
+                        TokenContents::Comment,
+                        Span::new(start, curr_offset, file_id),
+                    ));
+                    start = curr_offset;
+
+                    break;
+                }
+            }
+            if start != curr_offset {
+                output.push(Token::new(
+                    TokenContents::Comment,
+                    Span::new(start, curr_offset, file_id),
+                ));
+            }
+        } else if c == b' ' || c == b'\t' {
+            // If the next character is non-newline whitespace, skip it.
+            curr_offset += 1;
+        } else {
+            // Otherwise, try to consume an unclassified token.
+            let (span, err) = lex_item(input, &mut curr_offset, file_id);
+            if error.is_none() {
+                error = err;
+            }
+            is_complete = true;
+            output.push(Token::new(TokenContents::Item, span));
+        }
+    }
+    (output, error)
+}
+
+#[cfg(test)]
+mod lex_tests {
+    use super::*;
+
+    #[test]
+    fn lex_basic() {
+        let file = b"let x = 4";
+
+        let output = lex(file, 0, 0, LexMode::Normal);
+
+        assert!(output.1.is_none());
+    }
+
+    #[test]
+    fn lex_newline() {
+        let file = b"let x = 300\nlet y = 500;";
+
+        let output = lex(file, 0, 0, LexMode::Normal);
+
+        println!("{:#?}", output.0);
+        assert!(output.0.contains(&Token {
+            contents: TokenContents::Eol,
+            span: Span {
+                start: 11,
+                end: 12,
+                file_id: 0
+            }
+        }));
+    }
+
+    #[test]
+    fn lex_empty() {
+        let file = b"";
+
+        let output = lex(file, 0, 0, LexMode::Normal);
+
+        assert!(output.0.is_empty());
+        assert!(output.1.is_none());
+    }
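+
+    // Sketch: a line comment is lexed as a single Comment token whose span
+    // includes the terminating newline.
+    #[test]
+    fn lex_comment() {
+        let file = b"# comment\nfoo";
+
+        let output = lex(file, 0, 0, LexMode::Normal);
+
+        assert!(output.0.contains(&Token {
+            contents: TokenContents::Comment,
+            span: Span {
+                start: 0,
+                end: 10,
+                file_id: 0
+            }
+        }));
+    }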
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 000000000..36d336dd8
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,12 @@
+mod lex;
+mod lite_parse;
+mod parse_error;
+mod parser;
+mod parser_state;
+mod span;
+
+pub use lex::{lex, LexMode, Token, TokenContents};
+pub use lite_parse::{lite_parse, LiteBlock, LiteCommand, LiteStatement};
+pub use parse_error::ParseError;
+pub use parser_state::{ParserState, ParserWorkingSet, VarLocation};
+pub use span::Span;
diff --git a/src/lite_parse.rs b/src/lite_parse.rs
new file mode 100644
index 000000000..a3f2d9e0c
--- /dev/null
+++ b/src/lite_parse.rs
@@ -0,0 +1,207 @@
+use crate::{ParseError, Span, Token, TokenContents};
+
+#[derive(Debug)]
+pub struct LiteCommand {
+    pub comments: Vec<Span>,
+    pub parts: Vec<Span>,
+}
+
+impl Default for LiteCommand {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl LiteCommand {
+    pub fn new() -> Self {
+        Self {
+            comments: vec![],
+            parts: vec![],
+        }
+    }
+
+    pub fn push(&mut self, span: Span) {
+        self.parts.push(span);
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.parts.is_empty()
+    }
+}
+
+#[derive(Debug)]
+pub struct LiteStatement {
+    pub commands: Vec<LiteCommand>,
+}
+
+impl Default for LiteStatement {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl LiteStatement {
+    pub fn new() -> Self {
+        Self { commands: vec![] }
+    }
+
+    pub fn push(&mut self, command: LiteCommand) {
+        self.commands.push(command);
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.commands.is_empty()
+    }
+}
+
+#[derive(Debug)]
+pub struct LiteBlock {
+    pub block: Vec<LiteStatement>,
+}
+
+impl Default for LiteBlock {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl LiteBlock {
+    pub fn new() -> Self {
+        Self { block: vec![] }
+    }
+
+    pub fn push(&mut self, pipeline: LiteStatement) {
+        self.block.push(pipeline);
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.block.is_empty()
+    }
+}
+
+pub fn lite_parse(tokens: &[Token]) -> (LiteBlock, Option<ParseError>) {
+    let mut curr_token = 0;
+
+    let mut block = LiteBlock::new();
+    let mut curr_pipeline = LiteStatement::new();
+    let mut curr_command = LiteCommand::new();
+
+    while let Some(token) = tokens.get(curr_token) {
+        match &token.contents {
+            TokenContents::Item => curr_command.push(token.span),
+            TokenContents::Pipe => {
+                if !curr_command.is_empty() {
+                    curr_pipeline.push(curr_command);
+                    curr_command = LiteCommand::new();
+                }
+            }
+            TokenContents::Eol | TokenContents::Semicolon => {
+                if !curr_command.is_empty() {
+                    curr_pipeline.push(curr_command);
+                }
+                curr_command = LiteCommand::new();
+
+                if !curr_pipeline.is_empty() {
+                    block.push(curr_pipeline);
+                }
+                curr_pipeline = LiteStatement::new();
+            }
+            TokenContents::Comment => {
+                curr_command.comments.push(token.span);
+            }
+        }
+        curr_token += 1;
+    }
+    if !curr_command.is_empty() {
+        curr_pipeline.push(curr_command);
+    }
+
+    if !curr_pipeline.is_empty() {
+        block.push(curr_pipeline);
+    }
+
+    (block, None)
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::{lex, lite_parse, LiteBlock, ParseError, Span};
+
+    fn lite_parse_helper(input: &[u8]) -> Result<LiteBlock, ParseError> {
+        let (output, err) = lex(input, 0, 0, crate::LexMode::Normal);
+        if let Some(err) = err {
+            return Err(err);
+        }
+
+        let (output, err) = lite_parse(&output);
+        if let Some(err) = err {
+            return Err(err);
+        }
+
+        Ok(output)
+    }
+
+    #[test]
+    fn comment_before() -> Result<(), ParseError> {
+        let input = b"# this is a comment\ndef foo bar";
+
+        let lite_block = lite_parse_helper(input)?;
+
+        assert_eq!(lite_block.block.len(), 1);
+        assert_eq!(lite_block.block[0].commands.len(), 1);
+        assert_eq!(lite_block.block[0].commands[0].comments.len(), 1);
+        assert_eq!(lite_block.block[0].commands[0].parts.len(), 3);
+
+        Ok(())
+    }
+
+    #[test]
+    fn comment_beside() -> Result<(), ParseError> {
+        let input = b"def foo bar # this is a comment";
+
+        let lite_block = lite_parse_helper(input)?;
+
+        assert_eq!(lite_block.block.len(), 1);
+        assert_eq!(lite_block.block[0].commands.len(), 1);
+        assert_eq!(lite_block.block[0].commands[0].comments.len(), 1);
+        assert_eq!(lite_block.block[0].commands[0].parts.len(), 3);
+
+        Ok(())
+    }
+
+    #[test]
+    fn comments_stack() -> Result<(), ParseError> {
+        let input = b"# this is a comment\n# another comment\ndef foo bar ";
+
+        let lite_block = lite_parse_helper(input)?;
+
+        assert_eq!(lite_block.block.len(), 1);
+        assert_eq!(lite_block.block[0].commands.len(), 1);
+        assert_eq!(lite_block.block[0].commands[0].comments.len(), 2);
+        assert_eq!(lite_block.block[0].commands[0].parts.len(), 3);
+
+        Ok(())
+    }
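+
+    // Sketch: a pipe joins commands into a single LiteStatement, while a
+    // semicolon closes the current statement and starts a new one.
+    #[test]
+    fn pipes_and_semicolons_group() -> Result<(), ParseError> {
+        let input = b"foo | bar; baz";
+
+        let lite_block = lite_parse_helper(input)?;
+
+        assert_eq!(lite_block.block.len(), 2);
+        assert_eq!(lite_block.block[0].commands.len(), 2);
+        assert_eq!(lite_block.block[1].commands.len(), 1);
+
+        Ok(())
+    }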
+
+    #[test]
+    fn separated_comments_dont_stack() -> Result<(), ParseError> {
+        let input = b"# this is a comment\n\n# another comment\ndef foo bar ";
+
+        let lite_block = lite_parse_helper(input)?;
+
+        assert_eq!(lite_block.block.len(), 1);
+        assert_eq!(lite_block.block[0].commands.len(), 1);
+        assert_eq!(lite_block.block[0].commands[0].comments.len(), 1);
+        assert_eq!(
+            lite_block.block[0].commands[0].comments[0],
+            Span {
+                start: 21,
+                end: 39,
+                file_id: 0
+            }
+        );
+        assert_eq!(lite_block.block[0].commands[0].parts.len(), 3);
+
+        Ok(())
+    }
+}
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 000000000..227ba3126
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,25 @@
+use engine_q::{lex, lite_parse, LexMode, ParserWorkingSet};
+
+fn main() -> std::io::Result<()> {
+    if let Some(path) = std::env::args().nth(1) {
+        let file = std::fs::read(&path)?;
+
+        // let (output, err) = lex(&file, 0, 0, LexMode::Normal);
+
+        // println!("{:?} tokens, error: {:?}", output, err);
+
+        // let (output, err) = lite_parse(&output);
+
+        // println!("{:?}, error: {:?}", output, err);
+
+        let mut working_set = ParserWorkingSet::new(None);
+
+        let (output, err) = working_set.parse_file(&path, &file);
+        println!("{:?} {:?}", output, err);
+
+        Ok(())
+    } else {
+        println!("specify file to lex");
+        Ok(())
+    }
+}
diff --git a/src/parse_error.rs b/src/parse_error.rs
new file mode 100644
index 000000000..40cc6ed67
--- /dev/null
+++ b/src/parse_error.rs
@@ -0,0 +1,7 @@
+pub use crate::Span;
+
+#[derive(Debug)]
+pub enum ParseError {
+    ExtraTokens(Span),
+    UnexpectedEof(String, Span),
+}
diff --git a/src/parser.rs b/src/parser.rs
new file mode 100644
index 000000000..fd86d624b
--- /dev/null
+++ b/src/parser.rs
@@ -0,0 +1,102 @@
+use crate::{lex, lite_parse, LiteBlock, LiteStatement, ParseError, ParserWorkingSet};
+
+#[derive(Debug)]
+pub enum Expression {}
+
+#[derive(Debug)]
+pub enum Import {}
+
+#[derive(Debug)]
+pub struct Block {
+    stmts: Vec<Statement>,
+}
+
+impl Block {
+    pub fn new() -> Self {
+        Self { stmts: vec![] }
+    }
+}
+
+#[derive(Debug)]
+pub struct VarDecl {
+    name: String,
+    value: Expression,
+}
+
+#[derive(Debug)]
+pub enum Statement {
+    Pipeline(Pipeline),
+    VarDecl(VarDecl),
+    Import(Import),
+    None,
+}
+
+#[derive(Debug)]
+pub struct Pipeline {}
+
+impl Pipeline {
+    pub fn new() -> Self {
+        Self {}
+    }
+}
+
+impl ParserWorkingSet {
+    fn parse_statement(
+        &mut self,
+        block: &mut Block,
+        lite_pipeline: &LiteStatement,
+    ) -> Option<ParseError> {
+        match lite_pipeline.commands.len() {
+            0 => None,
+            1 => {
+                let command_name = self.get_span_contents(lite_pipeline.commands[0].parts[0]);
+                println!("{:?}", command_name);
+                if command_name == b"let" {
+                    println!("found let")
+                }
+                None
+            }
+            _ => {
+                // pipeline
+                None
+            }
+        }
+    }
+
+    pub fn parse_block(&mut self, lite_block: &LiteBlock) -> (Block, Option<ParseError>) {
+        let mut error = None;
+        self.enter_scope();
+
+        let mut block = Block::new();
+
+        for pipeline in &lite_block.block {
+            let err = self.parse_statement(&mut block, pipeline);
+            error = error.or(err);
+        }
+
+        self.exit_scope();
+
+        (block, error)
+    }
+
+    pub fn parse_file(&mut self, fname: &str, contents: &[u8]) -> (Block, Option<ParseError>) {
+        let mut error = None;
+
+        let file_id = self.add_file(fname.into(), contents.into());
+
+        let (output, err) = lex(contents, file_id, 0, crate::LexMode::Normal);
+        error = error.or(err);
+
+        let (output, err) = lite_parse(&output);
+        error = error.or(err);
+
+        println!("{:?}", output);
+
+        let (output, err) = self.parse_block(&output);
+        error = error.or(err);
+
+        (output, error)
+    }
+}
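+
+// A sketch of the whole pipeline (lex -> lite_parse -> parse_block) driven
+// through `parse_file`: a bare command should parse without error.
+#[cfg(test)]
+mod parser_tests {
+    use crate::ParserWorkingSet;
+
+    #[test]
+    fn parse_file_basic() {
+        let mut working_set = ParserWorkingSet::new(None);
+
+        let (_block, err) = working_set.parse_file("test.nu", b"foo bar");
+
+        assert!(err.is_none());
+    }
+}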
diff --git a/src/parser_state.rs b/src/parser_state.rs
new file mode 100644
index 000000000..5f0aa82f7
--- /dev/null
+++ b/src/parser_state.rs
@@ -0,0 +1,175 @@
+use crate::Span;
+use std::{collections::HashMap, sync::Arc};
+
+pub struct ParserState {
+    files: Vec<(String, Vec<u8>)>,
+}
+
+pub enum VarLocation {
+    CurrentScope,
+    OuterScope,
+}
+
+#[derive(Clone, Copy)]
+pub enum Type {}
+
+struct ScopeFrame {
+    vars: HashMap<String, Type>,
+}
+
+impl ScopeFrame {
+    pub fn new() -> Self {
+        Self {
+            vars: HashMap::new(),
+        }
+    }
+}
+
+pub struct ParserWorkingSet {
+    files: Vec<(String, Vec<u8>)>,
+    permanent_state: Option<Arc<ParserState>>,
+    scope: Vec<ScopeFrame>,
+}
+
+impl Default for ParserState {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl ParserState {
+    pub fn new() -> Self {
+        Self { files: vec![] }
+    }
+
+    pub fn merge_working_set(this: &mut Arc<ParserState>, mut working_set: ParserWorkingSet) {
+        // Remove the working set's reference to the permanent state so we can
+        // safely take a mutable reference to it below
+        working_set.permanent_state = None;
+
+        // Take the mutable reference and extend the permanent state with the
+        // contents of the working set
+        if let Some(this) = std::sync::Arc::<ParserState>::get_mut(this) {
+            this.files.extend(working_set.files);
+        } else {
+            panic!("Internal error: merging working set should always succeed");
+        }
+    }
+
+    pub fn num_files(&self) -> usize {
+        self.files.len()
+    }
+
+    pub(crate) fn add_file(&mut self, filename: String, contents: Vec<u8>) -> usize {
+        self.files.push((filename, contents));
+
+        self.num_files() - 1
+    }
+
+    pub(crate) fn get_file_contents(&self, idx: usize) -> &[u8] {
+        &self.files[idx].1
+    }
+}
+
+impl ParserWorkingSet {
+    pub fn new(permanent_state: Option<Arc<ParserState>>) -> Self {
+        Self {
+            files: vec![],
+            permanent_state,
+            scope: vec![],
+        }
+    }
+
+    pub fn num_files(&self) -> usize {
+        let parent_len = if let Some(permanent_state) = &self.permanent_state {
+            permanent_state.num_files()
+        } else {
+            0
+        };
+
+        self.files.len() + parent_len
+    }
+
+    pub fn add_file(&mut self, filename: String, contents: Vec<u8>) -> usize {
+        self.files.push((filename, contents));
+
+        self.num_files() - 1
+    }
+
+    pub fn get_span_contents(&self, span: Span) -> &[u8] {
+        if let Some(permanent_state) = &self.permanent_state {
+            let num_permanent_files = permanent_state.num_files();
+            if span.file_id < num_permanent_files {
+                &permanent_state.get_file_contents(span.file_id)[span.start..span.end]
+            } else {
+                &self.files[span.file_id - num_permanent_files].1[span.start..span.end]
+            }
+        } else {
+            &self.files[span.file_id].1[span.start..span.end]
+        }
+    }
+
+    pub fn enter_scope(&mut self) {
+        self.scope.push(ScopeFrame::new());
+    }
+
+    pub fn exit_scope(&mut self) {
+        self.scope.pop();
+    }
+
+    pub fn find_variable(&self, name: &str) -> Option<(VarLocation, Type)> {
+        for (idx, scope) in self.scope.iter().rev().enumerate() {
+            if let Some(result) = scope.vars.get(name) {
+                if idx == 0 {
+                    // Innermost (current) scope
+                    return Some((VarLocation::CurrentScope, *result));
+                } else {
+                    return Some((VarLocation::OuterScope, *result));
+                }
+            }
+        }
+
+        None
+    }
+}
+
+#[cfg(test)]
+mod parser_state_tests {
+    use super::*;
+
+    #[test]
+    fn add_file_gives_id() {
+        let mut working_set = ParserWorkingSet::new(Some(Arc::new(ParserState::new())));
+        let id = working_set.add_file("test.nu".into(), vec![]);
+
+        assert_eq!(id, 0);
+    }
+
+    #[test]
+    fn add_file_gives_id_including_parent() {
+        let mut parser_state = ParserState::new();
+        let parent_id = parser_state.add_file("test.nu".into(), vec![]);
+
+        let mut working_set = ParserWorkingSet::new(Some(Arc::new(parser_state)));
+        let working_set_id = working_set.add_file("child.nu".into(), vec![]);
+
+        assert_eq!(parent_id, 0);
+        assert_eq!(working_set_id, 1);
+    }
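+
+    // Sketch: a Span indexes into the working set's files, so the text a
+    // span covers can be recovered after the file has been added.
+    #[test]
+    fn span_contents_round_trip() {
+        let mut working_set = ParserWorkingSet::new(None);
+        let file_id = working_set.add_file("test.nu".into(), b"let x = 4".to_vec());
+
+        let contents = working_set.get_span_contents(Span::new(4, 5, file_id));
+
+        assert_eq!(contents, b"x");
+    }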
+
+    #[test]
+    fn merge_states() {
+        let mut parser_state = ParserState::new();
+        let parent_id = parser_state.add_file("test.nu".into(), vec![]);
+        let mut parser_state = Arc::new(parser_state);
+
+        let mut working_set = ParserWorkingSet::new(Some(parser_state.clone()));
+        let working_set_id = working_set.add_file("child.nu".into(), vec![]);
+
+        ParserState::merge_working_set(&mut parser_state, working_set);
+
+        assert_eq!(parent_id, 0);
+        assert_eq!(working_set_id, 1);
+        assert_eq!(parser_state.num_files(), 2);
+        assert_eq!(&parser_state.files[0].0, "test.nu");
+        assert_eq!(&parser_state.files[1].0, "child.nu");
+    }
+}
diff --git a/src/span.rs b/src/span.rs
new file mode 100644
index 000000000..344344a4b
--- /dev/null
+++ b/src/span.rs
@@ -0,0 +1,16 @@
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub struct Span {
+    pub start: usize,
+    pub end: usize,
+    pub file_id: usize,
+}
+
+impl Span {
+    pub fn new(start: usize, end: usize, file_id: usize) -> Span {
+        Span {
+            start,
+            end,
+            file_id,
+        }
+    }
+}
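+
+// Sketch: a Span is a half-open byte range, [start, end), tagged with the id
+// of the file it indexes into.
+#[cfg(test)]
+mod span_tests {
+    use super::*;
+
+    #[test]
+    fn span_new_sets_fields() {
+        let span = Span::new(1, 4, 0);
+
+        assert_eq!((span.start, span.end, span.file_id), (1, 4, 0));
+    }
+}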