use crate::text_utils::{is_subscript, is_superscript}; use std::iter::Peekable; use std::str; use std::str::Chars; #[derive(Clone, Debug, PartialEq, Copy)] pub enum TokenKind { Unknown, Literal, Identifier, Plus, Minus, Star, Slash, Power, Exclamation, Percent, Tick, GreaterThan, LessThan, Equals, NotEquals, GreaterOrEquals, LessOrEquals, UnitKeyword, ToKeyword, IfKeyword, OtherwiseKeyword, Pipe, OpenCeil, ClosedCeil, OpenFloor, ClosedFloor, OpenParenthesis, ClosedParenthesis, OpenBracket, ClosedBracket, OpenBrace, ClosedBrace, Comma, Semicolon, EOF, } #[derive(Clone, Debug, PartialEq)] pub struct Token { pub kind: TokenKind, pub value: String, pub span: (usize, usize), } pub struct Lexer<'a> { chars: Peekable>, index: usize, } impl<'a> Lexer<'a> { pub fn lex(source: &str) -> Vec { let mut lexer = Lexer { chars: source.chars().peekable(), index: 0, }; let mut tokens = Vec::new(); loop { let next = lexer.next(); if let TokenKind::EOF = next.kind { tokens.push(next); break; } else { tokens.push(next); } } tokens } fn next(&mut self) -> Token { let eof = build(TokenKind::EOF, "", (self.index, self.index)); let mut c = if let Some(c) = self.peek() { *c } else { return eof; }; while c == ' ' || c == '\t' || c == '\r' || c == '\n' { if let None = self.advance() { return eof; } c = if let Some(c) = self.peek() { *c } else { return eof; } } if c.is_digit(10) { return self.next_number_literal(); } if is_valid_identifier(Some(&c)) { return self.next_identifier(); } let span = (self.index, self.index + 1); let token = match c { '+' => build(TokenKind::Plus, "", span), '-' => build(TokenKind::Minus, "", span), '*' | '×' => build(TokenKind::Star, "", span), '/' | '÷' => build(TokenKind::Slash, "", span), '^' => build(TokenKind::Power, "", span), '|' => build(TokenKind::Pipe, "", span), '⌈' => build(TokenKind::OpenCeil, "", span), '⌉' => build(TokenKind::ClosedCeil, "", span), '⌊' => build(TokenKind::OpenFloor, "", span), '⌋' => build(TokenKind::ClosedFloor, "", span), '(' => build(TokenKind::OpenParenthesis, "", span), ')' => build(TokenKind::ClosedParenthesis, "", span), '[' => build(TokenKind::OpenBracket, "", span), ']' => build(TokenKind::ClosedBracket, "", span), '{' => build(TokenKind::OpenBrace, "", span), '}' => build(TokenKind::ClosedBrace, "", span), '!' => build(TokenKind::Exclamation, "", span), '=' => build(TokenKind::Equals, "", span), '>' => build(TokenKind::GreaterThan, "", span), '<' => build(TokenKind::LessThan, "", span), ',' => build(TokenKind::Comma, "", span), ';' => build(TokenKind::Semicolon, "", span), '%' => build(TokenKind::Percent, "", span), '\'' => build(TokenKind::Tick, "", span), '≠' => build(TokenKind::NotEquals, "", span), '≥' => build(TokenKind::GreaterOrEquals, "", span), '≤' => build(TokenKind::LessOrEquals, "", span), // Some of the special symbols will be lexed here, // so that they don't merge with other symbols. 'π' => build(TokenKind::Identifier, "pi", span), '√' => build(TokenKind::Identifier, "sqrt", span), 'τ' => build(TokenKind::Identifier, "tau", span), 'ϕ' => build(TokenKind::Identifier, "phi", span), 'Γ' => build(TokenKind::Identifier, "gamma", span), '∏' => build(TokenKind::Identifier, "prod", span), _ => build(TokenKind::Unknown, "", span), }; self.advance(); // Handle tokens with two characters match (token.kind, self.peek()) { (TokenKind::Star, Some('*')) => { self.advance(); return build(TokenKind::Power, "", span); } (TokenKind::Exclamation, Some('=')) => { self.advance(); return build(TokenKind::NotEquals, "", span); } (TokenKind::GreaterThan, Some('=')) => { self.advance(); return build(TokenKind::GreaterOrEquals, "", span); } (TokenKind::LessThan, Some('=')) => { self.advance(); return build(TokenKind::LessOrEquals, "", span); } _ => (), } token } fn next_number_literal(&mut self) -> Token { let mut start = self.index; let mut end = start; let mut value = String::new(); let mut leading_zero = self.peek().unwrap_or(&'\0') == &'0'; let mut base = 10u32; loop { let c = if let Some(c) = self.peek() { *c } else { break; }; // If at the second character and // the first character is a zero, // allow a letter if end - start == 1 && leading_zero { base = match c { 'b' => 2, 'o' => 8, 'x' => 16, _ => 10, }; // Don't include eg. 0x in the value if base != 10 { start += 2; end += 1; self.advance(); value.clear(); leading_zero = false; continue; } } if !c.is_digit(base) && c != '.' && c != '_' && !c.is_whitespace() || c == '\n' || c == '\r' { break; } end += 1; value.push(c); self.advance(); } // Subscript unicode symbols after the literal, eg. 11₂ let mut base_str = String::new(); while crate::text_utils::is_subscript(self.peek().unwrap_or(&'\0')) { base_str.push(*self.peek().unwrap()); self.advance(); } if base_str != "" { base = crate::text_utils::subscript_to_digits(base_str.chars()) .parse::() .unwrap_or(10); } if base != 10 { value.push_str(&format!("_{}", base)); } build(TokenKind::Literal, &value, (start, end)) } fn next_identifier(&mut self) -> Token { let start = self.index; let mut end = start; let mut value = String::new(); while is_valid_identifier(self.peek()) { let c = *self.peek().unwrap(); // If the current character is an underscore, allow a number next. // This is to allow the notation like the following: x_1 if c == '_' { self.advance(); let num = self.next().value; value.push('_'); value.push_str(&num.trim_end()); // Trim, since the number_literal function allows whitespace, which identifiers should not contain. break; } // Only allow identifiers with a special character to have *one* character. No more. // Break the loop if it isn't the first run and the current character is a special character. if end - start > 0 && !(c.is_ascii_alphabetic() || c == '\'' || c == '_' || is_superscript(&c) || is_subscript(&c)) { break; } end += 1; value.push(c); self.advance(); } let kind = match value.as_ref() { "unit" => TokenKind::UnitKeyword, "to" => TokenKind::ToKeyword, "if" => TokenKind::IfKeyword, "otherwise" => TokenKind::OtherwiseKeyword, _ => TokenKind::Identifier, }; let value = match value.as_ref() { "Σ" | "∑" => String::from("sum"), "∏" => String::from("prod"), "∫" | "integral" => String::from("integrate"), "sin⁻¹" => String::from("asin"), "cos⁻¹" => String::from("acos"), "tan⁻¹" => String::from("atan"), "cot⁻¹" => String::from("acot"), "cosec⁻¹" => String::from("acosec"), "sec⁻¹" => String::from("asec"), "sinh⁻¹" => String::from("asinh"), "cosh⁻¹" => String::from("acosh"), "tanh⁻¹" => String::from("atanh"), "coth⁻¹" => String::from("acoth"), "cosech⁻¹" => String::from("acosech"), "sech⁻¹" => String::from("asech"), "∛" => String::from("cbrt"), "°" => String::from("deg"), _ => value, // things like log₂ are handled in the parser }; build(kind, &value, (start, end)) } fn peek(&mut self) -> Option<&char> { self.chars.peek() } fn advance(&mut self) -> Option { self.index += 1; self.chars.next() } } fn build(kind: TokenKind, value: &str, span: (usize, usize)) -> Token { Token { kind, value: value.to_string(), span, } } fn is_valid_identifier(c: Option<&char>) -> bool { if let Some(c) = c { match c { '+' | '-' | '/' | '*' | '%' | '^' | '!' | '(' | ')' | '=' | '.' | ',' | ';' | '|' | '⌊' | '⌋' | '⌈' | '⌉' | '[' | ']' | '{' | '}' | 'π' | '√' | 'τ' | 'ϕ' | 'Γ' | '<' | '>' | '≠' | '≥' | '≤' | '×' | '÷' => false, _ => !c.is_digit(10) || is_superscript(c) || is_subscript(c), } } else { false } } #[cfg(test)] mod tests { use super::*; use test_case::test_case; use wasm_bindgen_test::*; wasm_bindgen_test::wasm_bindgen_test_configure!(run_in_browser); fn match_tokens(tokens: Vec, expected: Vec) { let mut expected_iter = expected.iter(); for token in tokens { assert_eq!(token.kind, *expected_iter.next().unwrap()); } } #[test] #[wasm_bindgen_test] fn test_token_kinds() { let tokens = Lexer::lex("+-*/%^()|=!,"); let expected = vec![ TokenKind::Plus, TokenKind::Minus, TokenKind::Star, TokenKind::Slash, TokenKind::Percent, TokenKind::Power, TokenKind::OpenParenthesis, TokenKind::ClosedParenthesis, TokenKind::Pipe, TokenKind::Equals, TokenKind::Exclamation, TokenKind::Comma, TokenKind::EOF, ]; match_tokens(tokens, expected); } #[test] #[wasm_bindgen_test] fn test_brackets() { let tokens = Lexer::lex("[1 < 2]"); let expected = vec![ TokenKind::OpenBracket, TokenKind::Literal, TokenKind::LessThan, TokenKind::Literal, TokenKind::ClosedBracket, TokenKind::EOF, ]; match_tokens(tokens, expected); } #[test] #[wasm_bindgen_test] fn test_empty() { // test_case macro doesn't seem to work with spaces. let test_cases = vec![" ", " ", "test ", " test "]; for input in test_cases { let tokens = Lexer::lex(input); if regex::Regex::new(r"^\s*$").unwrap().is_match(input) { let expected = vec![TokenKind::EOF]; match_tokens(tokens, expected); } else { let expected = vec![TokenKind::Identifier, TokenKind::EOF]; match_tokens(tokens, expected); } } } #[test_case("1")] #[test_case("24")] #[test_case("56.4")] fn test_number_literal(input: &str) { let tokens = Lexer::lex(input); let expected = vec![TokenKind::Literal, TokenKind::EOF]; assert_eq!(&tokens[0].value, input); match_tokens(tokens, expected); } #[test_case("x")] #[test_case("xy")] fn test_identifier(input: &str) { let tokens = Lexer::lex(input); let expected = vec![TokenKind::Identifier, TokenKind::EOF]; assert_eq!(&tokens[0].value, input); match_tokens(tokens, expected); } #[test] fn test_function_call() { let tokens = Lexer::lex("f(x)"); let expected = vec![ TokenKind::Identifier, TokenKind::OpenParenthesis, TokenKind::Identifier, TokenKind::ClosedParenthesis, TokenKind::EOF, ]; match_tokens(tokens, expected); } }