2020-06-08 21:51:45 +02:00
|
|
|
use std::iter::Peekable;
|
2020-05-28 23:46:35 +02:00
|
|
|
use std::str;
|
2020-06-08 21:51:45 +02:00
|
|
|
use std::str::Chars;
|
2020-05-28 23:46:35 +02:00
|
|
|
|
2020-06-05 13:36:11 +02:00
|
|
|
/// The category of a [`Token`] produced by the lexer.
///
/// Operator and delimiter variants correspond to a single source character;
/// `Literal`, `Identifier` and the keyword variants carry their text in
/// `Token::value`.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum TokenKind {
    /// A character that is neither a digit, an identifier character, nor one
    /// of the recognized operators/delimiters.
    Unknown,
    /// A numeric literal (digits and `.`).
    Literal,
    /// An identifier that is not a keyword.
    Identifier,

    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `^`
    Power,
    /// `=`
    Equals,
    /// `!`
    Exclamation,

    /// The keyword `unit`.
    UnitKeyword,
    /// The keyword `to`.
    ToKeyword,

    /// `|`
    Pipe,
    /// `⌈`
    OpenCeil,
    /// `⌉`
    ClosedCeil,
    /// `⌊`
    OpenFloor,
    /// `⌋`
    ClosedFloor,
    /// `(`
    OpenParenthesis,
    /// `)`
    ClosedParenthesis,
    /// `,`
    Comma,
    /// `;`
    Semicolon,

    /// End of input; always the last token emitted.
    EOF,
}
|
|
|
|
|
2020-06-05 13:36:11 +02:00
|
|
|
#[derive(Clone, Debug, PartialEq)]
|
2020-05-28 23:46:35 +02:00
|
|
|
pub struct Token {
|
|
|
|
pub kind: TokenKind,
|
|
|
|
pub value: String,
|
2020-06-06 20:15:32 +02:00
|
|
|
pub span: (usize, usize),
|
2020-05-28 23:46:35 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Streaming tokenizer over a borrowed source string.
pub struct Lexer<'a> {
    /// Remaining input, with one character of lookahead.
    chars: Peekable<Chars<'a>>,
    /// Number of characters consumed so far; used to compute token spans.
    index: usize,
}
|
|
|
|
|
|
|
|
impl<'a> Lexer<'a> {
|
|
|
|
pub fn lex(source: &str) -> Vec<Token> {
|
|
|
|
let mut lexer = Lexer {
|
2020-06-08 21:51:45 +02:00
|
|
|
chars: source.chars().peekable(),
|
2020-05-28 23:46:35 +02:00
|
|
|
index: 0,
|
|
|
|
};
|
|
|
|
let mut tokens = Vec::new();
|
|
|
|
|
2020-06-08 21:51:45 +02:00
|
|
|
loop {
|
|
|
|
let next = lexer.next();
|
2020-05-28 23:46:35 +02:00
|
|
|
|
2020-06-08 21:51:45 +02:00
|
|
|
if let TokenKind::EOF = next.kind {
|
|
|
|
tokens.push(next);
|
|
|
|
break;
|
|
|
|
} else {
|
|
|
|
tokens.push(next);
|
|
|
|
}
|
2020-05-28 23:46:35 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
tokens
|
|
|
|
}
|
|
|
|
|
|
|
|
fn next(&mut self) -> Token {
|
2020-06-08 21:51:45 +02:00
|
|
|
let mut c = if let Some(c) = self.peek() {
|
|
|
|
*c
|
|
|
|
} else {
|
|
|
|
return build(TokenKind::EOF, "", (self.index, self.index));
|
|
|
|
};
|
2020-05-28 23:46:35 +02:00
|
|
|
|
|
|
|
while c == ' ' || c == '\t' || c == '\r' || c == '\n' {
|
2020-06-09 10:34:39 +02:00
|
|
|
if let None = self.advance() {
|
2020-06-08 21:51:45 +02:00
|
|
|
return build(TokenKind::EOF, "", (self.index, self.index));
|
2020-05-28 23:46:35 +02:00
|
|
|
}
|
2020-06-09 10:34:39 +02:00
|
|
|
|
|
|
|
c = *self.peek().unwrap();
|
2020-05-28 23:46:35 +02:00
|
|
|
}
|
|
|
|
|
2020-06-04 20:09:43 +02:00
|
|
|
if c.is_digit(10) {
|
|
|
|
return self.next_number_literal();
|
|
|
|
}
|
|
|
|
|
2020-06-08 21:51:45 +02:00
|
|
|
if is_valid_identifier(Some(&c)) {
|
2020-06-04 20:09:43 +02:00
|
|
|
return self.next_identifier();
|
|
|
|
}
|
|
|
|
|
2020-06-06 20:15:32 +02:00
|
|
|
let span = (self.index, self.index + 1);
|
2020-05-28 23:46:35 +02:00
|
|
|
let token = match c {
|
2020-06-06 20:15:32 +02:00
|
|
|
'+' => build(TokenKind::Plus, "", span),
|
|
|
|
'-' => build(TokenKind::Minus, "", span),
|
|
|
|
'*' => build(TokenKind::Star, "", span),
|
|
|
|
'/' => build(TokenKind::Slash, "", span),
|
|
|
|
'^' => build(TokenKind::Power, "", span),
|
2020-06-08 21:51:45 +02:00
|
|
|
'|' => build(TokenKind::Pipe, "", span),
|
|
|
|
'⌈' => build(TokenKind::OpenCeil, "", span),
|
|
|
|
'⌉' => build(TokenKind::ClosedCeil, "", span),
|
|
|
|
'⌊' => build(TokenKind::OpenFloor, "", span),
|
|
|
|
'⌋' => build(TokenKind::ClosedFloor, "", span),
|
2020-06-06 20:15:32 +02:00
|
|
|
'(' => build(TokenKind::OpenParenthesis, "", span),
|
|
|
|
')' => build(TokenKind::ClosedParenthesis, "", span),
|
|
|
|
'=' => build(TokenKind::Equals, "", span),
|
|
|
|
'!' => build(TokenKind::Exclamation, "", span),
|
|
|
|
',' => build(TokenKind::Comma, "", span),
|
2020-06-14 22:03:22 +02:00
|
|
|
';' => build(TokenKind::Semicolon, "", span),
|
2020-06-06 20:15:32 +02:00
|
|
|
_ => build(TokenKind::Unknown, "", span),
|
2020-05-28 23:46:35 +02:00
|
|
|
};
|
|
|
|
|
2020-06-04 20:09:43 +02:00
|
|
|
self.advance();
|
|
|
|
|
2020-05-28 23:46:35 +02:00
|
|
|
token
|
|
|
|
}
|
|
|
|
|
|
|
|
fn next_number_literal(&mut self) -> Token {
|
|
|
|
let start = self.index;
|
|
|
|
let mut end = start;
|
2020-06-08 21:51:45 +02:00
|
|
|
let mut value = String::new();
|
|
|
|
|
|
|
|
loop {
|
|
|
|
let c = if let Some(c) = self.peek() {
|
|
|
|
*c
|
|
|
|
} else {
|
|
|
|
break;
|
|
|
|
};
|
|
|
|
|
|
|
|
if !c.is_digit(10) && c != '.' && !c.is_whitespace() {
|
|
|
|
break;
|
|
|
|
}
|
2020-05-28 23:46:35 +02:00
|
|
|
|
|
|
|
end += 1;
|
2020-06-08 21:51:45 +02:00
|
|
|
value.push(c);
|
2020-05-28 23:46:35 +02:00
|
|
|
self.advance();
|
|
|
|
}
|
|
|
|
|
2020-06-08 21:51:45 +02:00
|
|
|
build(TokenKind::Literal, &value, (start, end))
|
2020-05-28 23:46:35 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
fn next_identifier(&mut self) -> Token {
|
|
|
|
let start = self.index;
|
|
|
|
let mut end = start;
|
2020-06-09 14:59:29 +02:00
|
|
|
let letter_reg = regex::Regex::new(r"[A-z'_]").unwrap();
|
2020-06-08 21:51:45 +02:00
|
|
|
let mut value = String::new();
|
2020-05-28 23:46:35 +02:00
|
|
|
|
2020-06-08 21:51:45 +02:00
|
|
|
while is_valid_identifier(self.peek()) {
|
|
|
|
let c = *self.peek().unwrap();
|
2020-06-07 19:48:52 +02:00
|
|
|
|
2020-06-09 14:59:29 +02:00
|
|
|
// If the current character is an underscore, expect a number next.
|
|
|
|
// This is to allow the notation like the following: x_1
|
|
|
|
if c == '_' {
|
|
|
|
self.advance();
|
|
|
|
let num = self.next_number_literal().value;
|
|
|
|
value.push('_');
|
|
|
|
value.push_str(&num.trim_end()); // Trim, since the number_literal function allows whitespace, which identifiers should not contain.
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2020-06-08 21:51:45 +02:00
|
|
|
// Only allow identifiers with a special character to have *one* character. No more.
|
|
|
|
// Break the loop if it isn't the first run and the current character is a special character.
|
|
|
|
if end - start > 0 && !letter_reg.is_match(&c.to_string()) {
|
2020-06-07 19:48:52 +02:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2020-05-28 23:46:35 +02:00
|
|
|
end += 1;
|
2020-06-08 21:51:45 +02:00
|
|
|
value.push(c);
|
2020-05-28 23:46:35 +02:00
|
|
|
self.advance();
|
|
|
|
}
|
|
|
|
|
2020-06-08 21:51:45 +02:00
|
|
|
let kind = match value.as_ref() {
|
2020-06-13 16:19:32 +02:00
|
|
|
"unit" => TokenKind::UnitKeyword,
|
2020-06-15 19:10:55 +02:00
|
|
|
"to" => TokenKind::ToKeyword,
|
2020-06-08 21:51:45 +02:00
|
|
|
_ => TokenKind::Identifier,
|
|
|
|
};
|
2020-05-28 23:46:35 +02:00
|
|
|
|
2020-06-08 21:51:45 +02:00
|
|
|
build(kind, &value, (start, end))
|
2020-05-28 23:46:35 +02:00
|
|
|
}
|
|
|
|
|
2020-06-08 21:51:45 +02:00
|
|
|
fn peek(&mut self) -> Option<&char> {
|
|
|
|
self.chars.peek()
|
2020-06-07 19:48:52 +02:00
|
|
|
}
|
|
|
|
|
2020-06-08 21:51:45 +02:00
|
|
|
fn advance(&mut self) -> Option<char> {
|
2020-06-05 13:47:39 +02:00
|
|
|
self.index += 1;
|
2020-06-08 21:51:45 +02:00
|
|
|
self.chars.next()
|
2020-05-28 23:46:35 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-06-06 20:15:32 +02:00
|
|
|
fn build(kind: TokenKind, value: &str, span: (usize, usize)) -> Token {
|
2020-06-04 14:46:45 +02:00
|
|
|
Token {
|
|
|
|
kind,
|
|
|
|
value: value.to_string(),
|
2020-06-06 20:15:32 +02:00
|
|
|
span,
|
2020-06-04 14:46:45 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-06-08 21:51:45 +02:00
|
|
|
/// Reports whether `c` may appear in an identifier.
///
/// A character qualifies when it is not whitespace, not an ASCII digit and
/// not one of the characters reserved for operators and delimiters.
/// `None` (end of input) is never valid.
///
/// This is a plain character test: it matches exactly what the former
/// pattern `[^\s\n\r0-9\+-/\*\^!\(\)=\.,;|⌊⌋⌈⌉]` accepted, without compiling
/// a regular expression for every character inspected. Note that `\+-/` in
/// that pattern was a range, i.e. the five characters `+ , - . /`.
fn is_valid_identifier(c: Option<&char>) -> bool {
    const RESERVED: [char; 17] = [
        '+', ',', '-', '.', '/', '*', '^', '!', '(', ')', '=', ';', '|', '⌊', '⌋', '⌈', '⌉',
    ];

    match c {
        Some(&c) => !(c.is_whitespace() || c.is_ascii_digit() || RESERVED.contains(&c)),
        None => false,
    }
}
|
2020-06-04 21:53:45 +02:00
|
|
|
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use test_case::test_case;

    /// Asserts that `tokens` has exactly the kinds listed in `expected`, in
    /// order. Comparing whole sequences also catches missing or surplus
    /// tokens, not just mismatched ones.
    fn match_tokens(tokens: Vec<Token>, expected: Vec<TokenKind>) {
        let actual: Vec<TokenKind> = tokens.into_iter().map(|token| token.kind).collect();

        assert_eq!(actual, expected);
    }

    /// Every single-character operator maps to its own token kind.
    #[test]
    fn test_token_kinds() {
        let tokens = Lexer::lex("+-*/^()|=!,");
        let expected = vec![
            TokenKind::Plus,
            TokenKind::Minus,
            TokenKind::Star,
            TokenKind::Slash,
            TokenKind::Power,
            TokenKind::OpenParenthesis,
            TokenKind::ClosedParenthesis,
            TokenKind::Pipe,
            TokenKind::Equals,
            TokenKind::Exclamation,
            TokenKind::Comma,
            TokenKind::EOF,
        ];

        match_tokens(tokens, expected);
    }

    /// Integer and decimal literals are lexed as one token with their text.
    #[test_case("1")]
    #[test_case("24")]
    #[test_case("56.4")]
    fn test_number_literal(input: &str) {
        let tokens = Lexer::lex(input);
        let expected = vec![TokenKind::Literal, TokenKind::EOF];

        assert_eq!(&tokens[0].value, input);
        match_tokens(tokens, expected);
    }

    /// Alphabetic identifiers are lexed as one token with their text.
    #[test_case("x")]
    #[test_case("xy")]
    fn test_identifier(input: &str) {
        let tokens = Lexer::lex(input);
        let expected = vec![TokenKind::Identifier, TokenKind::EOF];

        assert_eq!(&tokens[0].value, input);
        match_tokens(tokens, expected);
    }

    /// A call expression splits into name, parentheses and argument.
    #[test]
    fn test_function_call() {
        let tokens = Lexer::lex("f(x)");
        let expected = vec![
            TokenKind::Identifier,
            TokenKind::OpenParenthesis,
            TokenKind::Identifier,
            TokenKind::ClosedParenthesis,
            TokenKind::EOF,
        ];

        match_tokens(tokens, expected);
    }
}
|