mirror of
https://github.com/PaddiM8/kalker.git
synced 2024-12-15 11:01:10 +01:00
449 lines
13 KiB
Rust
449 lines
13 KiB
Rust
use crate::text_utils::{is_subscript, is_superscript};
|
||
use std::iter::Peekable;
|
||
use std::str;
|
||
use std::str::Chars;
|
||
|
||
/// The category of a lexed token.
///
/// The enum has no payload, so besides `Clone`/`Copy`/`PartialEq` it also derives
/// `Eq` and `Hash`, which lets kinds be used as keys in hash maps and sets.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum TokenKind {
    /// A symbol character the lexer has no dedicated kind for.
    Unknown,
    /// A number literal.
    Literal,
    /// An identifier, including symbols such as `π` that are spelled out as names.
    Identifier,

    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*` or `×`
    Star,
    /// `/` or `÷`
    Slash,
    /// `^` or `**`
    Power,
    /// `!`
    Exclamation,
    /// `%`
    Percent,
    /// `'`
    Tick,
    /// `>`
    GreaterThan,
    /// `<`
    LessThan,
    /// `=`
    Equals,
    /// `≠` or `!=`
    NotEquals,
    /// `≥` or `>=`
    GreaterOrEquals,
    /// `≤` or `<=`
    LessOrEquals,

    /// The word `unit`.
    UnitKeyword,
    /// The word `to`.
    ToKeyword,
    /// The word `if`.
    IfKeyword,
    /// The word `otherwise`.
    OtherwiseKeyword,

    /// `|`
    Pipe,
    /// `⌈`
    OpenCeil,
    /// `⌉`
    ClosedCeil,
    /// `⌊`
    OpenFloor,
    /// `⌋`
    ClosedFloor,
    /// `(`
    OpenParenthesis,
    /// `)`
    ClosedParenthesis,
    /// `[`
    OpenBracket,
    /// `]`
    ClosedBracket,
    /// `{`
    OpenBrace,
    /// `}`
    ClosedBrace,
    /// `,`
    Comma,
    /// `;`
    Semicolon,

    /// Marks the end of the input; always the last token produced.
    EOF,
}
|
||
|
||
#[derive(Clone, Debug, PartialEq)]
|
||
pub struct Token {
|
||
pub kind: TokenKind,
|
||
pub value: String,
|
||
pub span: (usize, usize),
|
||
}
|
||
|
||
/// Turns source text into a sequence of tokens, one character at a time.
pub struct Lexer<'a> {
    /// Remaining input, with one character of lookahead.
    chars: Peekable<Chars<'a>>,
    /// Number of characters consumed so far; used to compute token spans.
    index: usize,
}
|
||
|
||
impl<'a> Lexer<'a> {
|
||
pub fn lex(source: &str) -> Vec<Token> {
|
||
let mut lexer = Lexer {
|
||
chars: source.chars().peekable(),
|
||
index: 0,
|
||
};
|
||
let mut tokens = Vec::new();
|
||
|
||
loop {
|
||
let next = lexer.next();
|
||
|
||
if let TokenKind::EOF = next.kind {
|
||
tokens.push(next);
|
||
break;
|
||
} else {
|
||
tokens.push(next);
|
||
}
|
||
}
|
||
|
||
tokens
|
||
}
|
||
|
||
fn next(&mut self) -> Token {
|
||
let eof = build(TokenKind::EOF, "", (self.index, self.index));
|
||
let mut c = if let Some(c) = self.peek() {
|
||
*c
|
||
} else {
|
||
return eof;
|
||
};
|
||
|
||
while c == ' ' || c == '\t' || c == '\r' || c == '\n' {
|
||
if let None = self.advance() {
|
||
return eof;
|
||
}
|
||
|
||
c = if let Some(c) = self.peek() {
|
||
*c
|
||
} else {
|
||
return eof;
|
||
}
|
||
}
|
||
|
||
if c.is_digit(10) {
|
||
return self.next_number_literal();
|
||
}
|
||
|
||
if is_valid_identifier(Some(&c)) {
|
||
return self.next_identifier();
|
||
}
|
||
|
||
let span = (self.index, self.index + 1);
|
||
let token = match c {
|
||
'+' => build(TokenKind::Plus, "", span),
|
||
'-' => build(TokenKind::Minus, "", span),
|
||
'*' | '×' => build(TokenKind::Star, "", span),
|
||
'/' | '÷' => build(TokenKind::Slash, "", span),
|
||
'^' => build(TokenKind::Power, "", span),
|
||
'|' => build(TokenKind::Pipe, "", span),
|
||
'⌈' => build(TokenKind::OpenCeil, "", span),
|
||
'⌉' => build(TokenKind::ClosedCeil, "", span),
|
||
'⌊' => build(TokenKind::OpenFloor, "", span),
|
||
'⌋' => build(TokenKind::ClosedFloor, "", span),
|
||
'(' => build(TokenKind::OpenParenthesis, "", span),
|
||
')' => build(TokenKind::ClosedParenthesis, "", span),
|
||
'[' => build(TokenKind::OpenBracket, "", span),
|
||
']' => build(TokenKind::ClosedBracket, "", span),
|
||
'{' => build(TokenKind::OpenBrace, "", span),
|
||
'}' => build(TokenKind::ClosedBrace, "", span),
|
||
'!' => build(TokenKind::Exclamation, "", span),
|
||
'=' => build(TokenKind::Equals, "", span),
|
||
'>' => build(TokenKind::GreaterThan, "", span),
|
||
'<' => build(TokenKind::LessThan, "", span),
|
||
',' => build(TokenKind::Comma, "", span),
|
||
';' => build(TokenKind::Semicolon, "", span),
|
||
'%' => build(TokenKind::Percent, "", span),
|
||
'\'' => build(TokenKind::Tick, "", span),
|
||
'≠' => build(TokenKind::NotEquals, "", span),
|
||
'≥' => build(TokenKind::GreaterOrEquals, "", span),
|
||
'≤' => build(TokenKind::LessOrEquals, "", span),
|
||
// Some of the special symbols will be lexed here,
|
||
// so that they don't merge with other symbols.
|
||
'π' => build(TokenKind::Identifier, "pi", span),
|
||
'√' => build(TokenKind::Identifier, "sqrt", span),
|
||
'τ' => build(TokenKind::Identifier, "tau", span),
|
||
'ϕ' => build(TokenKind::Identifier, "phi", span),
|
||
'Γ' => build(TokenKind::Identifier, "gamma", span),
|
||
'∏' => build(TokenKind::Identifier, "prod", span),
|
||
_ => build(TokenKind::Unknown, "", span),
|
||
};
|
||
|
||
self.advance();
|
||
|
||
// Handle tokens with two characters
|
||
match (token.kind, self.peek()) {
|
||
(TokenKind::Star, Some('*')) => {
|
||
self.advance();
|
||
return build(TokenKind::Power, "", span);
|
||
}
|
||
(TokenKind::Exclamation, Some('=')) => {
|
||
self.advance();
|
||
return build(TokenKind::NotEquals, "", span);
|
||
}
|
||
(TokenKind::GreaterThan, Some('=')) => {
|
||
self.advance();
|
||
return build(TokenKind::GreaterOrEquals, "", span);
|
||
}
|
||
(TokenKind::LessThan, Some('=')) => {
|
||
self.advance();
|
||
return build(TokenKind::LessOrEquals, "", span);
|
||
}
|
||
_ => (),
|
||
}
|
||
|
||
token
|
||
}
|
||
|
||
fn next_number_literal(&mut self) -> Token {
|
||
let mut start = self.index;
|
||
let mut end = start;
|
||
let mut value = String::new();
|
||
let mut leading_zero = self.peek().unwrap_or(&'\0') == &'0';
|
||
let mut base = 10u32;
|
||
|
||
loop {
|
||
let c = if let Some(c) = self.peek() {
|
||
*c
|
||
} else {
|
||
break;
|
||
};
|
||
|
||
// If at the second character and
|
||
// the first character is a zero,
|
||
// allow a letter
|
||
if end - start == 1 && leading_zero {
|
||
base = match c {
|
||
'b' => 2,
|
||
'o' => 8,
|
||
'x' => 16,
|
||
_ => 10,
|
||
};
|
||
|
||
// Don't include eg. 0x in the value
|
||
start += 2;
|
||
end += 1;
|
||
self.advance();
|
||
value.clear();
|
||
leading_zero = false;
|
||
continue;
|
||
}
|
||
|
||
if !c.is_digit(base) && c != '.' && c != '_' && !c.is_whitespace()
|
||
|| c == '\n'
|
||
|| c == '\r'
|
||
{
|
||
break;
|
||
}
|
||
|
||
end += 1;
|
||
value.push(c);
|
||
self.advance();
|
||
}
|
||
|
||
// Subscript unicode symbols after the literal, eg. 11₂
|
||
let mut base_str = String::new();
|
||
while crate::text_utils::is_subscript(self.peek().unwrap_or(&'\0')) {
|
||
base_str.push(*self.peek().unwrap());
|
||
self.advance();
|
||
}
|
||
|
||
if base_str != "" {
|
||
base = crate::text_utils::subscript_to_digits(base_str.chars())
|
||
.parse::<u32>()
|
||
.unwrap_or(10);
|
||
}
|
||
|
||
if base != 10 {
|
||
value.push_str(&format!("_{}", base));
|
||
}
|
||
|
||
build(TokenKind::Literal, &value, (start, end))
|
||
}
|
||
|
||
fn next_identifier(&mut self) -> Token {
|
||
let start = self.index;
|
||
let mut end = start;
|
||
let mut value = String::new();
|
||
|
||
while is_valid_identifier(self.peek()) {
|
||
let c = *self.peek().unwrap();
|
||
|
||
// If the current character is an underscore, allow a number next.
|
||
// This is to allow the notation like the following: x_1
|
||
if c == '_' {
|
||
self.advance();
|
||
let num = self.next().value;
|
||
value.push('_');
|
||
value.push_str(&num.trim_end()); // Trim, since the number_literal function allows whitespace, which identifiers should not contain.
|
||
break;
|
||
}
|
||
|
||
// Only allow identifiers with a special character to have *one* character. No more.
|
||
// Break the loop if it isn't the first run and the current character is a special character.
|
||
if end - start > 0
|
||
&& !(c.is_ascii_alphabetic()
|
||
|| c == '\''
|
||
|| c == '_'
|
||
|| is_superscript(&c)
|
||
|| is_subscript(&c))
|
||
{
|
||
break;
|
||
}
|
||
|
||
end += 1;
|
||
value.push(c);
|
||
self.advance();
|
||
}
|
||
|
||
let kind = match value.as_ref() {
|
||
"unit" => TokenKind::UnitKeyword,
|
||
"to" => TokenKind::ToKeyword,
|
||
"if" => TokenKind::IfKeyword,
|
||
"otherwise" => TokenKind::OtherwiseKeyword,
|
||
_ => TokenKind::Identifier,
|
||
};
|
||
|
||
let value = match value.as_ref() {
|
||
"Σ" | "∑" => String::from("sum"),
|
||
"∏" => String::from("prod"),
|
||
"∫" | "integral" => String::from("integrate"),
|
||
"sin⁻¹" => String::from("asin"),
|
||
"cos⁻¹" => String::from("acos"),
|
||
"tan⁻¹" => String::from("atan"),
|
||
"cot⁻¹" => String::from("acot"),
|
||
"cosec⁻¹" => String::from("acosec"),
|
||
"sec⁻¹" => String::from("asec"),
|
||
"sinh⁻¹" => String::from("asinh"),
|
||
"cosh⁻¹" => String::from("acosh"),
|
||
"tanh⁻¹" => String::from("atanh"),
|
||
"coth⁻¹" => String::from("acoth"),
|
||
"cosech⁻¹" => String::from("acosech"),
|
||
"sech⁻¹" => String::from("asech"),
|
||
"∛" => String::from("cbrt"),
|
||
"°" => String::from("deg"),
|
||
_ => value, // things like log₂ are handled in the parser
|
||
};
|
||
|
||
build(kind, &value, (start, end))
|
||
}
|
||
|
||
fn peek(&mut self) -> Option<&char> {
|
||
self.chars.peek()
|
||
}
|
||
|
||
fn advance(&mut self) -> Option<char> {
|
||
self.index += 1;
|
||
self.chars.next()
|
||
}
|
||
}
|
||
|
||
fn build(kind: TokenKind, value: &str, span: (usize, usize)) -> Token {
|
||
Token {
|
||
kind,
|
||
value: value.to_string(),
|
||
span,
|
||
}
|
||
}
|
||
|
||
fn is_valid_identifier(c: Option<&char>) -> bool {
|
||
if let Some(c) = c {
|
||
match c {
|
||
'+' | '-' | '/' | '*' | '%' | '^' | '!' | '(' | ')' | '=' | '.' | ',' | ';' | '|'
|
||
| '⌊' | '⌋' | '⌈' | '⌉' | '[' | ']' | '{' | '}' | 'π' | '√' | 'τ' | 'ϕ' | 'Γ' | '<'
|
||
| '>' | '≠' | '≥' | '≤' | '×' | '÷' => false,
|
||
_ => !c.is_digit(10) || is_superscript(c) || is_subscript(c),
|
||
}
|
||
} else {
|
||
false
|
||
}
|
||
}
|
||
|
||
#[cfg(test)]
mod tests {
    use super::*;
    use test_case::test_case;
    use wasm_bindgen_test::*;
    wasm_bindgen_test::wasm_bindgen_test_configure!(run_in_browser);

    /// Asserts that `tokens` has exactly the kinds in `expected`, in the same order.
    ///
    /// The whole sequences are compared, so a lexer that returns too few tokens
    /// fails the assertion as well as one that returns wrong or extra tokens.
    fn match_tokens(tokens: Vec<Token>, expected: Vec<TokenKind>) {
        let actual: Vec<TokenKind> = tokens.iter().map(|token| token.kind).collect();

        assert_eq!(actual, expected);
    }

    #[test]
    #[wasm_bindgen_test]
    fn test_token_kinds() {
        let tokens = Lexer::lex("+-*/%^()|=!,");
        let expected = vec![
            TokenKind::Plus,
            TokenKind::Minus,
            TokenKind::Star,
            TokenKind::Slash,
            TokenKind::Percent,
            TokenKind::Power,
            TokenKind::OpenParenthesis,
            TokenKind::ClosedParenthesis,
            TokenKind::Pipe,
            TokenKind::Equals,
            TokenKind::Exclamation,
            TokenKind::Comma,
            TokenKind::EOF,
        ];

        match_tokens(tokens, expected);
    }

    #[test]
    #[wasm_bindgen_test]
    fn test_brackets() {
        let tokens = Lexer::lex("[1 < 2]");
        let expected = vec![
            TokenKind::OpenBracket,
            TokenKind::Literal,
            TokenKind::LessThan,
            TokenKind::Literal,
            TokenKind::ClosedBracket,
            TokenKind::EOF,
        ];

        match_tokens(tokens, expected);
    }

    #[test]
    #[wasm_bindgen_test]
    fn test_empty() {
        // test_case macro doesn't seem to work with spaces.
        let test_cases = vec![" ", " ", "test ", " test "];

        // Compiled once instead of once per test case.
        let only_whitespace = regex::Regex::new(r"^\s*$").unwrap();

        for input in test_cases {
            let tokens = Lexer::lex(input);

            if only_whitespace.is_match(input) {
                let expected = vec![TokenKind::EOF];
                match_tokens(tokens, expected);
            } else {
                let expected = vec![TokenKind::Identifier, TokenKind::EOF];
                match_tokens(tokens, expected);
            }
        }
    }

    #[test_case("1")]
    #[test_case("24")]
    #[test_case("56.4")]
    fn test_number_literal(input: &str) {
        let tokens = Lexer::lex(input);
        let expected = vec![TokenKind::Literal, TokenKind::EOF];

        assert_eq!(&tokens[0].value, input);
        match_tokens(tokens, expected);
    }

    #[test_case("x")]
    #[test_case("xy")]
    fn test_identifier(input: &str) {
        let tokens = Lexer::lex(input);
        let expected = vec![TokenKind::Identifier, TokenKind::EOF];

        assert_eq!(&tokens[0].value, input);
        match_tokens(tokens, expected);
    }

    #[test]
    fn test_function_call() {
        let tokens = Lexer::lex("f(x)");
        let expected = vec![
            TokenKind::Identifier,
            TokenKind::OpenParenthesis,
            TokenKind::Identifier,
            TokenKind::ClosedParenthesis,
            TokenKind::EOF,
        ];

        match_tokens(tokens, expected);
    }
}
|