kalker/kalk/src/lexer.rs
2021-12-30 12:44:48 +01:00

451 lines
13 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

use crate::text_utils::{is_subscript, is_superscript};
use std::iter::Peekable;
use std::str;
use std::str::Chars;
#[derive(Clone, Debug, PartialEq, Copy)]
pub enum TokenKind {
Unknown,
Literal,
Identifier,
Plus,
Minus,
Star,
Slash,
Power,
Exclamation,
Percent,
Tick,
GreaterThan,
LessThan,
Equals,
NotEquals,
GreaterOrEquals,
LessOrEquals,
UnitKeyword,
ToKeyword,
IfKeyword,
OtherwiseKeyword,
Pipe,
OpenCeil,
ClosedCeil,
OpenFloor,
ClosedFloor,
OpenParenthesis,
ClosedParenthesis,
OpenBracket,
ClosedBracket,
OpenBrace,
ClosedBrace,
Comma,
Semicolon,
EOF,
}
#[derive(Clone, Debug, PartialEq)]
pub struct Token {
pub kind: TokenKind,
pub value: String,
pub span: (usize, usize),
}
pub struct Lexer<'a> {
chars: Peekable<Chars<'a>>,
index: usize,
}
impl<'a> Lexer<'a> {
pub fn lex(source: &str) -> Vec<Token> {
let mut lexer = Lexer {
chars: source.chars().peekable(),
index: 0,
};
let mut tokens = Vec::new();
loop {
let next = lexer.next();
if let TokenKind::EOF = next.kind {
tokens.push(next);
break;
} else {
tokens.push(next);
}
}
tokens
}
fn next(&mut self) -> Token {
let eof = build(TokenKind::EOF, "", (self.index, self.index));
let mut c = if let Some(c) = self.peek() {
*c
} else {
return eof;
};
while c == ' ' || c == '\t' || c == '\r' || c == '\n' {
if let None = self.advance() {
return eof;
}
c = if let Some(c) = self.peek() {
*c
} else {
return eof;
}
}
if c.is_digit(10) {
return self.next_number_literal();
}
if is_valid_identifier(Some(&c)) {
return self.next_identifier();
}
let span = (self.index, self.index + 1);
let token = match c {
'+' => build(TokenKind::Plus, "", span),
'-' => build(TokenKind::Minus, "", span),
'*' | '×' => build(TokenKind::Star, "", span),
'/' | '÷' => build(TokenKind::Slash, "", span),
'^' => build(TokenKind::Power, "", span),
'|' => build(TokenKind::Pipe, "", span),
'⌈' => build(TokenKind::OpenCeil, "", span),
'⌉' => build(TokenKind::ClosedCeil, "", span),
'⌊' => build(TokenKind::OpenFloor, "", span),
'⌋' => build(TokenKind::ClosedFloor, "", span),
'(' => build(TokenKind::OpenParenthesis, "", span),
')' => build(TokenKind::ClosedParenthesis, "", span),
'[' => build(TokenKind::OpenBracket, "", span),
']' => build(TokenKind::ClosedBracket, "", span),
'{' => build(TokenKind::OpenBrace, "", span),
'}' => build(TokenKind::ClosedBrace, "", span),
'!' => build(TokenKind::Exclamation, "", span),
'=' => build(TokenKind::Equals, "", span),
'>' => build(TokenKind::GreaterThan, "", span),
'<' => build(TokenKind::LessThan, "", span),
',' => build(TokenKind::Comma, "", span),
';' => build(TokenKind::Semicolon, "", span),
'%' => build(TokenKind::Percent, "", span),
'\'' => build(TokenKind::Tick, "", span),
'≠' => build(TokenKind::NotEquals, "", span),
'≥' => build(TokenKind::GreaterOrEquals, "", span),
'≤' => build(TokenKind::LessOrEquals, "", span),
// Some of the special symbols will be lexed here,
// so that they don't merge with other symbols.
'π' => build(TokenKind::Identifier, "pi", span),
'√' => build(TokenKind::Identifier, "sqrt", span),
'τ' => build(TokenKind::Identifier, "tau", span),
'ϕ' => build(TokenKind::Identifier, "phi", span),
'Γ' => build(TokenKind::Identifier, "gamma", span),
'∏' => build(TokenKind::Identifier, "prod", span),
_ => build(TokenKind::Unknown, "", span),
};
self.advance();
// Handle tokens with two characters
match (token.kind, self.peek()) {
(TokenKind::Star, Some('*')) => {
self.advance();
return build(TokenKind::Power, "", span);
}
(TokenKind::Exclamation, Some('=')) => {
self.advance();
return build(TokenKind::NotEquals, "", span);
}
(TokenKind::GreaterThan, Some('=')) => {
self.advance();
return build(TokenKind::GreaterOrEquals, "", span);
}
(TokenKind::LessThan, Some('=')) => {
self.advance();
return build(TokenKind::LessOrEquals, "", span);
}
_ => (),
}
token
}
fn next_number_literal(&mut self) -> Token {
let mut start = self.index;
let mut end = start;
let mut value = String::new();
let mut leading_zero = self.peek().unwrap_or(&'\0') == &'0';
let mut base = 10u32;
loop {
let c = if let Some(c) = self.peek() {
*c
} else {
break;
};
// If at the second character and
// the first character is a zero,
// allow a letter
if end - start == 1 && leading_zero {
base = match c {
'b' => 2,
'o' => 8,
'x' => 16,
_ => 10,
};
// Don't include eg. 0x in the value
if base != 10 {
start += 2;
end += 1;
self.advance();
value.clear();
leading_zero = false;
continue;
}
}
if !c.is_digit(base) && c != '.' && c != '_' && !c.is_whitespace()
|| c == '\n'
|| c == '\r'
{
break;
}
end += 1;
value.push(c);
self.advance();
}
// Subscript unicode symbols after the literal, eg. 11₂
let mut base_str = String::new();
while crate::text_utils::is_subscript(self.peek().unwrap_or(&'\0')) {
base_str.push(*self.peek().unwrap());
self.advance();
}
if base_str != "" {
base = crate::text_utils::subscript_to_digits(base_str.chars())
.parse::<u32>()
.unwrap_or(10);
}
if base != 10 {
value.push_str(&format!("_{}", base));
}
build(TokenKind::Literal, &value, (start, end))
}
fn next_identifier(&mut self) -> Token {
let start = self.index;
let mut end = start;
let mut value = String::new();
while is_valid_identifier(self.peek()) {
let c = *self.peek().unwrap();
// If the current character is an underscore, allow a number next.
// This is to allow the notation like the following: x_1
if c == '_' {
self.advance();
let num = self.next().value;
value.push('_');
value.push_str(&num.trim_end()); // Trim, since the number_literal function allows whitespace, which identifiers should not contain.
break;
}
// Only allow identifiers with a special character to have *one* character. No more.
// Break the loop if it isn't the first run and the current character is a special character.
if end - start > 0
&& !(c.is_ascii_alphabetic()
|| c == '\''
|| c == '_'
|| is_superscript(&c)
|| is_subscript(&c))
{
break;
}
end += 1;
value.push(c);
self.advance();
}
let kind = match value.as_ref() {
"unit" => TokenKind::UnitKeyword,
"to" => TokenKind::ToKeyword,
"if" => TokenKind::IfKeyword,
"otherwise" => TokenKind::OtherwiseKeyword,
_ => TokenKind::Identifier,
};
let value = match value.as_ref() {
"Σ" | "" => String::from("sum"),
"" => String::from("prod"),
"" | "integral" => String::from("integrate"),
"sin⁻¹" => String::from("asin"),
"cos⁻¹" => String::from("acos"),
"tan⁻¹" => String::from("atan"),
"cot⁻¹" => String::from("acot"),
"cosec⁻¹" => String::from("acosec"),
"sec⁻¹" => String::from("asec"),
"sinh⁻¹" => String::from("asinh"),
"cosh⁻¹" => String::from("acosh"),
"tanh⁻¹" => String::from("atanh"),
"coth⁻¹" => String::from("acoth"),
"cosech⁻¹" => String::from("acosech"),
"sech⁻¹" => String::from("asech"),
"" => String::from("cbrt"),
"°" => String::from("deg"),
_ => value, // things like log₂ are handled in the parser
};
build(kind, &value, (start, end))
}
fn peek(&mut self) -> Option<&char> {
self.chars.peek()
}
fn advance(&mut self) -> Option<char> {
self.index += 1;
self.chars.next()
}
}
fn build(kind: TokenKind, value: &str, span: (usize, usize)) -> Token {
Token {
kind,
value: value.to_string(),
span,
}
}
fn is_valid_identifier(c: Option<&char>) -> bool {
if let Some(c) = c {
match c {
'+' | '-' | '/' | '*' | '%' | '^' | '!' | '(' | ')' | '=' | '.' | ',' | ';' | '|'
| '⌊' | '⌋' | '⌈' | '⌉' | '[' | ']' | '{' | '}' | 'π' | '√' | 'τ' | 'ϕ' | 'Γ' | '<'
| '>' | '≠' | '≥' | '≤' | '×' | '÷' => false,
_ => !c.is_digit(10) || is_superscript(c) || is_subscript(c),
}
} else {
false
}
}
#[cfg(test)]
mod tests {
use super::*;
use test_case::test_case;
use wasm_bindgen_test::*;
wasm_bindgen_test::wasm_bindgen_test_configure!(run_in_browser);
fn match_tokens(tokens: Vec<Token>, expected: Vec<TokenKind>) {
let mut expected_iter = expected.iter();
for token in tokens {
assert_eq!(token.kind, *expected_iter.next().unwrap());
}
}
#[test]
#[wasm_bindgen_test]
fn test_token_kinds() {
let tokens = Lexer::lex("+-*/%^()|=!,");
let expected = vec![
TokenKind::Plus,
TokenKind::Minus,
TokenKind::Star,
TokenKind::Slash,
TokenKind::Percent,
TokenKind::Power,
TokenKind::OpenParenthesis,
TokenKind::ClosedParenthesis,
TokenKind::Pipe,
TokenKind::Equals,
TokenKind::Exclamation,
TokenKind::Comma,
TokenKind::EOF,
];
match_tokens(tokens, expected);
}
#[test]
#[wasm_bindgen_test]
fn test_brackets() {
let tokens = Lexer::lex("[1 < 2]");
let expected = vec![
TokenKind::OpenBracket,
TokenKind::Literal,
TokenKind::LessThan,
TokenKind::Literal,
TokenKind::ClosedBracket,
TokenKind::EOF,
];
match_tokens(tokens, expected);
}
#[test]
#[wasm_bindgen_test]
fn test_empty() {
// test_case macro doesn't seem to work with spaces.
let test_cases = vec![" ", " ", "test ", " test "];
for input in test_cases {
let tokens = Lexer::lex(input);
if regex::Regex::new(r"^\s*$").unwrap().is_match(input) {
let expected = vec![TokenKind::EOF];
match_tokens(tokens, expected);
} else {
let expected = vec![TokenKind::Identifier, TokenKind::EOF];
match_tokens(tokens, expected);
}
}
}
#[test_case("1")]
#[test_case("24")]
#[test_case("56.4")]
fn test_number_literal(input: &str) {
let tokens = Lexer::lex(input);
let expected = vec![TokenKind::Literal, TokenKind::EOF];
assert_eq!(&tokens[0].value, input);
match_tokens(tokens, expected);
}
#[test_case("x")]
#[test_case("xy")]
fn test_identifier(input: &str) {
let tokens = Lexer::lex(input);
let expected = vec![TokenKind::Identifier, TokenKind::EOF];
assert_eq!(&tokens[0].value, input);
match_tokens(tokens, expected);
}
#[test]
fn test_function_call() {
let tokens = Lexer::lex("f(x)");
let expected = vec![
TokenKind::Identifier,
TokenKind::OpenParenthesis,
TokenKind::Identifier,
TokenKind::ClosedParenthesis,
TokenKind::EOF,
];
match_tokens(tokens, expected);
}
}