// kalker/kalk/src/lexer.rs (451 lines, 13 KiB, Rust)

use crate::text_utils::{is_subscript, is_superscript};
use std::iter::Peekable;
use std::str;
use std::str::Chars;

/// The category of a lexed [`Token`].
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum TokenKind {
    /// A character the lexer has no rule for.
    Unknown,
    /// A number literal.
    Literal,
    /// A name, or a special symbol that is lexed as a name (eg. `π` as `pi`).
    Identifier,

    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*` or `×`
    Star,
    /// `/` or `÷`
    Slash,
    /// `^` or `**`
    Power,
    /// `!`
    Exclamation,
    /// `%`
    Percent,
    /// `'`
    Tick,
    /// `>`
    GreaterThan,
    /// `<`
    LessThan,
    /// `=`
    Equals,
    /// `≠` or `!=`
    NotEquals,
    /// `≥` or `>=`
    GreaterOrEquals,
    /// `≤` or `<=`
    LessOrEquals,

    /// The word `unit`.
    UnitKeyword,
    /// The word `to`.
    ToKeyword,
    /// The word `if`.
    IfKeyword,
    /// The word `otherwise`.
    OtherwiseKeyword,

    /// `|`
    Pipe,
    /// `⌈`
    OpenCeil,
    /// `⌉`
    ClosedCeil,
    /// `⌊`
    OpenFloor,
    /// `⌋`
    ClosedFloor,
    /// `(`
    OpenParenthesis,
    /// `)`
    ClosedParenthesis,
    /// `[`
    OpenBracket,
    /// `]`
    ClosedBracket,
    /// `{`
    OpenBrace,
    /// `}`
    ClosedBrace,
    /// `,`
    Comma,
    /// `;`
    Semicolon,

    /// End of input; always the last token produced.
    EOF,
}
#[derive(Clone, Debug, PartialEq)]
2020-05-28 23:46:35 +02:00
pub struct Token {
pub kind: TokenKind,
pub value: String,
2020-06-06 20:15:32 +02:00
pub span: (usize, usize),
2020-05-28 23:46:35 +02:00
}
/// Turns source text into a sequence of tokens.
pub struct Lexer<'a> {
    /// The remaining input, with one character of lookahead.
    chars: Peekable<Chars<'a>>,
    /// Number of characters consumed so far; used to compute token spans.
    index: usize,
}
impl<'a> Lexer<'a> {
pub fn lex(source: &str) -> Vec<Token> {
let mut lexer = Lexer {
chars: source.chars().peekable(),
2020-05-28 23:46:35 +02:00
index: 0,
};
let mut tokens = Vec::new();
loop {
let next = lexer.next();
2020-05-28 23:46:35 +02:00
if let TokenKind::EOF = next.kind {
tokens.push(next);
break;
} else {
tokens.push(next);
}
2020-05-28 23:46:35 +02:00
}
tokens
}
fn next(&mut self) -> Token {
2020-12-09 09:36:49 +01:00
let eof = build(TokenKind::EOF, "", (self.index, self.index));
let mut c = if let Some(c) = self.peek() {
*c
} else {
2020-12-09 09:36:49 +01:00
return eof;
};
2020-05-28 23:46:35 +02:00
while c == ' ' || c == '\t' || c == '\r' || c == '\n' {
if let None = self.advance() {
2020-12-09 09:36:49 +01:00
return eof;
2020-05-28 23:46:35 +02:00
}
2020-12-09 09:36:49 +01:00
c = if let Some(c) = self.peek() {
*c
} else {
return eof;
}
2020-05-28 23:46:35 +02:00
}
if c.is_digit(10) {
return self.next_number_literal();
}
if is_valid_identifier(Some(&c)) {
return self.next_identifier();
}
2020-06-06 20:15:32 +02:00
let span = (self.index, self.index + 1);
2020-05-28 23:46:35 +02:00
let token = match c {
2020-06-06 20:15:32 +02:00
'+' => build(TokenKind::Plus, "", span),
'-' => build(TokenKind::Minus, "", span),
'*' | '×' => build(TokenKind::Star, "", span),
'/' | '÷' => build(TokenKind::Slash, "", span),
2020-06-06 20:15:32 +02:00
'^' => build(TokenKind::Power, "", span),
'|' => build(TokenKind::Pipe, "", span),
'⌈' => build(TokenKind::OpenCeil, "", span),
'⌉' => build(TokenKind::ClosedCeil, "", span),
'⌊' => build(TokenKind::OpenFloor, "", span),
'⌋' => build(TokenKind::ClosedFloor, "", span),
2020-06-06 20:15:32 +02:00
'(' => build(TokenKind::OpenParenthesis, "", span),
')' => build(TokenKind::ClosedParenthesis, "", span),
'[' => build(TokenKind::OpenBracket, "", span),
']' => build(TokenKind::ClosedBracket, "", span),
2021-05-31 18:55:37 +02:00
'{' => build(TokenKind::OpenBrace, "", span),
'}' => build(TokenKind::ClosedBrace, "", span),
2020-06-06 20:15:32 +02:00
'!' => build(TokenKind::Exclamation, "", span),
'=' => build(TokenKind::Equals, "", span),
'>' => build(TokenKind::GreaterThan, "", span),
'<' => build(TokenKind::LessThan, "", span),
2020-06-06 20:15:32 +02:00
',' => build(TokenKind::Comma, "", span),
';' => build(TokenKind::Semicolon, "", span),
2020-12-09 22:18:00 +01:00
'%' => build(TokenKind::Percent, "", span),
'\'' => build(TokenKind::Tick, "", span),
'≠' => build(TokenKind::NotEquals, "", span),
'≥' => build(TokenKind::GreaterOrEquals, "", span),
'≤' => build(TokenKind::LessOrEquals, "", span),
2021-05-18 17:49:31 +02:00
// Some of the special symbols will be lexed here,
// so that they don't merge with other symbols.
2021-10-01 09:21:11 +02:00
'π' => build(TokenKind::Identifier, "pi", span),
'√' => build(TokenKind::Identifier, "sqrt", span),
'τ' => build(TokenKind::Identifier, "tau", span),
'ϕ' => build(TokenKind::Identifier, "phi", span),
'Γ' => build(TokenKind::Identifier, "gamma", span),
'∏' => build(TokenKind::Identifier, "prod", span),
2020-06-06 20:15:32 +02:00
_ => build(TokenKind::Unknown, "", span),
2020-05-28 23:46:35 +02:00
};
self.advance();
// Handle tokens with two characters
match (token.kind, self.peek()) {
(TokenKind::Star, Some('*')) => {
2021-05-17 18:14:48 +02:00
self.advance();
return build(TokenKind::Power, "", span);
}
(TokenKind::Exclamation, Some('=')) => {
self.advance();
return build(TokenKind::NotEquals, "", span);
}
(TokenKind::GreaterThan, Some('=')) => {
self.advance();
return build(TokenKind::GreaterOrEquals, "", span);
}
(TokenKind::LessThan, Some('=')) => {
self.advance();
return build(TokenKind::LessOrEquals, "", span);
}
_ => (),
2021-05-17 18:14:48 +02:00
}
2020-05-28 23:46:35 +02:00
token
}
fn next_number_literal(&mut self) -> Token {
2021-12-30 02:05:04 +01:00
let mut start = self.index;
2020-05-28 23:46:35 +02:00
let mut end = start;
let mut value = String::new();
2021-12-30 02:05:04 +01:00
let mut leading_zero = self.peek().unwrap_or(&'\0') == &'0';
let mut base = 10u32;
loop {
let c = if let Some(c) = self.peek() {
*c
} else {
break;
};
2021-12-30 02:05:04 +01:00
// If at the second character and
// the first character is a zero,
// allow a letter
if end - start == 1 && leading_zero {
base = match c {
'b' => 2,
'o' => 8,
'x' => 16,
_ => 10,
};
// Don't include eg. 0x in the value
if base != 10 {
start += 2;
end += 1;
self.advance();
value.clear();
leading_zero = false;
continue;
}
2021-12-30 02:05:04 +01:00
}
if !c.is_digit(base) && c != '.' && c != '_' && !c.is_whitespace()
|| c == '\n'
|| c == '\r'
{
break;
}
2020-05-28 23:46:35 +02:00
end += 1;
value.push(c);
2020-05-28 23:46:35 +02:00
self.advance();
}
2021-12-30 02:05:04 +01:00
// Subscript unicode symbols after the literal, eg. 11₂
let mut base_str = String::new();
while crate::text_utils::is_subscript(self.peek().unwrap_or(&'\0')) {
base_str.push(*self.peek().unwrap());
self.advance();
}
if base_str != "" {
base = crate::text_utils::subscript_to_digits(base_str.chars())
.parse::<u32>()
.unwrap_or(10);
}
if base != 10 {
value.push_str(&format!("_{}", base));
}
build(TokenKind::Literal, &value, (start, end))
2020-05-28 23:46:35 +02:00
}
fn next_identifier(&mut self) -> Token {
let start = self.index;
let mut end = start;
let mut value = String::new();
2020-05-28 23:46:35 +02:00
while is_valid_identifier(self.peek()) {
let c = *self.peek().unwrap();
// If the current character is an underscore, allow a number next.
2020-06-09 14:59:29 +02:00
// This is to allow the notation like the following: x_1
if c == '_' {
self.advance();
let num = self.next().value;
2020-06-09 14:59:29 +02:00
value.push('_');
value.push_str(&num.trim_end()); // Trim, since the number_literal function allows whitespace, which identifiers should not contain.
break;
}
// Only allow identifiers with a special character to have *one* character. No more.
// Break the loop if it isn't the first run and the current character is a special character.
if end - start > 0
&& !(c.is_ascii_alphabetic()
|| c == '\''
|| c == '_'
|| is_superscript(&c)
|| is_subscript(&c))
{
break;
}
2020-05-28 23:46:35 +02:00
end += 1;
value.push(c);
2020-05-28 23:46:35 +02:00
self.advance();
}
let kind = match value.as_ref() {
"unit" => TokenKind::UnitKeyword,
"to" => TokenKind::ToKeyword,
2021-05-31 18:55:37 +02:00
"if" => TokenKind::IfKeyword,
"otherwise" => TokenKind::OtherwiseKeyword,
_ => TokenKind::Identifier,
};
2020-05-28 23:46:35 +02:00
2021-10-01 09:21:11 +02:00
let value = match value.as_ref() {
"Σ" | "" => String::from("sum"),
"" => String::from("prod"),
"" | "integral" => String::from("integrate"),
"sin⁻¹" => String::from("asin"),
"cos⁻¹" => String::from("acos"),
"tan⁻¹" => String::from("atan"),
"cot⁻¹" => String::from("acot"),
"cosec⁻¹" => String::from("acosec"),
"sec⁻¹" => String::from("asec"),
"sinh⁻¹" => String::from("asinh"),
"cosh⁻¹" => String::from("acosh"),
"tanh⁻¹" => String::from("atanh"),
"coth⁻¹" => String::from("acoth"),
"cosech⁻¹" => String::from("acosech"),
"sech⁻¹" => String::from("asech"),
"" => String::from("cbrt"),
2021-10-01 09:21:11 +02:00
"°" => String::from("deg"),
_ => value, // things like log₂ are handled in the parser
2021-10-01 09:21:11 +02:00
};
2020-06-18 18:06:17 +02:00
build(kind, &value, (start, end))
2020-05-28 23:46:35 +02:00
}
fn peek(&mut self) -> Option<&char> {
self.chars.peek()
}
fn advance(&mut self) -> Option<char> {
self.index += 1;
self.chars.next()
2020-05-28 23:46:35 +02:00
}
}
2020-06-06 20:15:32 +02:00
fn build(kind: TokenKind, value: &str, span: (usize, usize)) -> Token {
2020-06-04 14:46:45 +02:00
Token {
kind,
value: value.to_string(),
2020-06-06 20:15:32 +02:00
span,
2020-06-04 14:46:45 +02:00
}
}
fn is_valid_identifier(c: Option<&char>) -> bool {
if let Some(c) = c {
match c {
'+' | '-' | '/' | '*' | '%' | '^' | '!' | '(' | ')' | '=' | '.' | ',' | ';' | '|'
2021-05-31 18:55:37 +02:00
| '⌊' | '⌋' | '⌈' | '⌉' | '[' | ']' | '{' | '}' | 'π' | '√' | 'τ' | 'ϕ' | 'Γ' | '<'
| '>' | '≠' | '≥' | '≤' | '×' | '÷' => false,
_ => !c.is_digit(10) || is_superscript(c) || is_subscript(c),
}
} else {
false
}
2020-05-28 23:46:35 +02:00
}
2020-06-04 21:53:45 +02:00
#[cfg(test)]
mod tests {
    use super::*;
    use test_case::test_case;
    use wasm_bindgen_test::*;

    wasm_bindgen_test::wasm_bindgen_test_configure!(run_in_browser);

    /// Asserts that `tokens` consists of exactly the kinds in `expected`, in order.
    ///
    /// Comparing the whole sequence also catches the case where fewer tokens
    /// than expected were produced.
    fn match_tokens(tokens: Vec<Token>, expected: Vec<TokenKind>) {
        let kinds: Vec<TokenKind> = tokens.iter().map(|token| token.kind).collect();
        assert_eq!(kinds, expected);
    }

    #[test]
    #[wasm_bindgen_test]
    fn test_token_kinds() {
        let tokens = Lexer::lex("+-*/%^()|=!,");
        let expected = vec![
            TokenKind::Plus,
            TokenKind::Minus,
            TokenKind::Star,
            TokenKind::Slash,
            TokenKind::Percent,
            TokenKind::Power,
            TokenKind::OpenParenthesis,
            TokenKind::ClosedParenthesis,
            TokenKind::Pipe,
            TokenKind::Equals,
            TokenKind::Exclamation,
            TokenKind::Comma,
            TokenKind::EOF,
        ];

        match_tokens(tokens, expected);
    }

    #[test]
    #[wasm_bindgen_test]
    fn test_brackets() {
        let tokens = Lexer::lex("[1 < 2]");
        let expected = vec![
            TokenKind::OpenBracket,
            TokenKind::Literal,
            TokenKind::LessThan,
            TokenKind::Literal,
            TokenKind::ClosedBracket,
            TokenKind::EOF,
        ];

        match_tokens(tokens, expected);
    }

    #[test]
    #[wasm_bindgen_test]
    fn test_empty() {
        // test_case macro doesn't seem to work with spaces.
        let test_cases = vec![" ", " ", "test ", " test "];

        for input in test_cases {
            let tokens = Lexer::lex(input);

            // Whitespace-only input should produce nothing but EOF.
            let expected = if input.trim().is_empty() {
                vec![TokenKind::EOF]
            } else {
                vec![TokenKind::Identifier, TokenKind::EOF]
            };

            match_tokens(tokens, expected);
        }
    }

    #[test_case("1")]
    #[test_case("24")]
    #[test_case("56.4")]
    fn test_number_literal(input: &str) {
        let tokens = Lexer::lex(input);
        let expected = vec![TokenKind::Literal, TokenKind::EOF];

        assert_eq!(&tokens[0].value, input);
        match_tokens(tokens, expected);
    }

    #[test_case("x")]
    #[test_case("xy")]
    fn test_identifier(input: &str) {
        let tokens = Lexer::lex(input);
        let expected = vec![TokenKind::Identifier, TokenKind::EOF];

        assert_eq!(&tokens[0].value, input);
        match_tokens(tokens, expected);
    }

    #[test]
    fn test_function_call() {
        let tokens = Lexer::lex("f(x)");
        let expected = vec![
            TokenKind::Identifier,
            TokenKind::OpenParenthesis,
            TokenKind::Identifier,
            TokenKind::ClosedParenthesis,
            TokenKind::EOF,
        ];

        match_tokens(tokens, expected);
    }
}