// kalker/kalk/src/lexer.rs

use crate::text_utils::{is_subscript, is_superscript};
use std::iter::Peekable;
use std::str;
use std::str::Chars;

/// The category of a lexed [`Token`].
///
/// `Eq` and `Hash` are derived in addition to `PartialEq` so that kinds can be used as keys in
/// hash-based collections.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum TokenKind {
    /// A character the lexer has no rule for.
    Unknown,
    /// A number literal.
    Literal,
    /// A name, including special symbols that the lexer rewrites to names (e.g. `π` to `pi`).
    Identifier,

    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*` or `×`
    Star,
    /// `/` or `÷`
    Slash,
    /// `^` or `**`
    Power,
    /// `!`
    Exclamation,
    /// `%`
    Percent,
    /// `'`
    Tick,
    /// `>`
    GreaterThan,
    /// `<`
    LessThan,
    /// `=`
    Equals,
    /// `≠` or `!=`
    NotEquals,
    /// `≥` or `>=`
    GreaterOrEquals,
    /// `≤` or `<=`
    LessOrEquals,

    /// The word `unit`.
    UnitKeyword,
    /// The word `to`.
    ToKeyword,
    /// The word `if`.
    IfKeyword,
    /// The word `otherwise`.
    OtherwiseKeyword,

    /// `|`
    Pipe,
    /// `⌈`
    OpenCeil,
    /// `⌉`
    ClosedCeil,
    /// `⌊`
    OpenFloor,
    /// `⌋`
    ClosedFloor,
    /// `(`
    OpenParenthesis,
    /// `)`
    ClosedParenthesis,
    /// `[`
    OpenBracket,
    /// `]`
    ClosedBracket,
    /// `{`
    OpenBrace,
    /// `}`
    ClosedBrace,
    /// `,`
    Comma,
    /// `;`
    Semicolon,

    /// End of input.
    EOF,
}

/// A single lexical unit produced by the lexer.
#[derive(Clone, Debug, PartialEq)]
pub struct Token {
    /// What sort of token this is.
    pub kind: TokenKind,
    /// Text carried by the token: the literal text or the (possibly rewritten) name.
    /// Operator and punctuation tokens are built with an empty string.
    pub value: String,
    /// `(start, end)` position of the token in the source. The lexer advances its position by
    /// one per `char`, so these are character offsets, not byte offsets.
    pub span: (usize, usize),
}

/// Turns a source string into a sequence of [`Token`]s.
pub struct Lexer<'a> {
    // Remaining input, with one character of lookahead.
    chars: Peekable<Chars<'a>>,
    // Position of the next unread character; bumped by one for each `advance` call.
    index: usize,
}

impl<'a> Lexer<'a> {
    /// Splits `source` into tokens.
    ///
    /// The returned vector always ends with exactly one [`TokenKind::EOF`] token, even when
    /// `source` is empty or contains only whitespace.
    pub fn lex(source: &str) -> Vec<Token> {
        let mut lexer = Lexer {
            chars: source.chars().peekable(),
            index: 0,
        };
        let mut tokens = Vec::new();

        loop {
            let token = lexer.next();
            let reached_end = token.kind == TokenKind::EOF;
            tokens.push(token);

            if reached_end {
                break;
            }
        }

        tokens
    }

    /// Lexes and returns the next token, or an `EOF` token once the input is exhausted.
    fn next(&mut self) -> Token {
        // Built before whitespace is skipped, so the EOF span points at the position where
        // this call started rather than past any trailing whitespace.
        let eof = build(TokenKind::EOF, "", (self.index, self.index));

        // Skip the whitespace separating tokens.
        let c = loop {
            match self.peek().copied() {
                Some(' ') | Some('\t') | Some('\r') | Some('\n') => {
                    self.advance();
                }
                Some(c) => break c,
                None => return eof,
            }
        };

        if c.is_digit(10) {
            return self.next_number_literal();
        }

        if is_valid_identifier(Some(&c)) {
            return self.next_identifier();
        }

        let start = self.index;
        let span = (start, start + 1);
        let token = match c {
            '+' => build(TokenKind::Plus, "", span),
            '-' => build(TokenKind::Minus, "", span),
            '*' | '×' => build(TokenKind::Star, "", span),
            '/' | '÷' => build(TokenKind::Slash, "", span),
            '^' => build(TokenKind::Power, "", span),
            '|' => build(TokenKind::Pipe, "", span),
            '⌈' => build(TokenKind::OpenCeil, "", span),
            '⌉' => build(TokenKind::ClosedCeil, "", span),
            '⌊' => build(TokenKind::OpenFloor, "", span),
            '⌋' => build(TokenKind::ClosedFloor, "", span),
            '(' => build(TokenKind::OpenParenthesis, "", span),
            ')' => build(TokenKind::ClosedParenthesis, "", span),
            '[' => build(TokenKind::OpenBracket, "", span),
            ']' => build(TokenKind::ClosedBracket, "", span),
            '{' => build(TokenKind::OpenBrace, "", span),
            '}' => build(TokenKind::ClosedBrace, "", span),
            '!' => build(TokenKind::Exclamation, "", span),
            '=' => build(TokenKind::Equals, "", span),
            '>' => build(TokenKind::GreaterThan, "", span),
            '<' => build(TokenKind::LessThan, "", span),
            ',' => build(TokenKind::Comma, "", span),
            ';' => build(TokenKind::Semicolon, "", span),
            '%' => build(TokenKind::Percent, "", span),
            '\'' => build(TokenKind::Tick, "", span),
            '≠' => build(TokenKind::NotEquals, "", span),
            '≥' => build(TokenKind::GreaterOrEquals, "", span),
            '≤' => build(TokenKind::LessOrEquals, "", span),
            // Some of the special symbols will be lexed here,
            // so that they don't merge with other symbols.
            'π' => build(TokenKind::Identifier, "pi", span),
            '√' => build(TokenKind::Identifier, "sqrt", span),
            'τ' => build(TokenKind::Identifier, "tau", span),
            'ϕ' => build(TokenKind::Identifier, "phi", span),
            'Γ' => build(TokenKind::Identifier, "gamma", span),
            '∏' => build(TokenKind::Identifier, "prod", span),
            _ => build(TokenKind::Unknown, "", span),
        };

        self.advance();

        // Handle tokens with two characters. The merged token covers both of them, so its
        // span is two characters wide instead of reusing the single-character span.
        let merged_kind = match (token.kind, self.peek().copied()) {
            (TokenKind::Star, Some('*')) => Some(TokenKind::Power),
            (TokenKind::Exclamation, Some('=')) => Some(TokenKind::NotEquals),
            (TokenKind::GreaterThan, Some('=')) => Some(TokenKind::GreaterOrEquals),
            (TokenKind::LessThan, Some('=')) => Some(TokenKind::LessOrEquals),
            _ => None,
        };

        match merged_kind {
            Some(kind) => {
                self.advance();
                build(kind, "", (start, start + 2))
            }
            None => token,
        }
    }

    /// Lexes a number literal starting at the current character.
    ///
    /// * Digits of the active base, `.`, `_` and whitespace other than `\n`/`\r` are accepted
    ///   and kept in the token value.
    /// * A `0b`, `0o` or `0x` prefix selects base 2, 8 or 16; the prefix is left out of both
    ///   the value and the span.
    /// * Subscript characters directly after the literal (eg. `11₂`) override the base. If
    ///   they cannot be parsed as a number, base 10 is used.
    /// * For any base other than 10, `_<base>` is appended to the value.
    fn next_number_literal(&mut self) -> Token {
        let mut start = self.index;
        let mut end = start;
        let mut value = String::new();
        let mut leading_zero = self.peek() == Some(&'0');
        let mut base = 10u32;

        while let Some(&c) = self.peek() {
            // If at the second character and
            // the first character is a zero,
            // allow a letter
            if leading_zero && end - start == 1 {
                base = match c {
                    'b' => 2,
                    'o' => 8,
                    'x' => 16,
                    _ => 10,
                };

                // Don't include eg. 0x in the value
                if base != 10 {
                    start += 2;
                    end += 1;
                    self.advance();
                    value.clear();
                    leading_zero = false;
                    continue;
                }
            }

            let is_line_break = c == '\n' || c == '\r';
            let is_literal_char = c.is_digit(base) || c == '.' || c == '_' || c.is_whitespace();
            if is_line_break || !is_literal_char {
                break;
            }

            end += 1;
            value.push(c);
            self.advance();
        }

        // Subscript unicode symbols after the literal, eg. 11₂
        let mut base_str = String::new();
        while let Some(&c) = self.peek() {
            if !is_subscript(&c) {
                break;
            }

            base_str.push(c);
            // The subscript is part of the literal's source text, so the span covers it too.
            end += 1;
            self.advance();
        }

        if !base_str.is_empty() {
            base = crate::text_utils::subscript_to_digits(base_str.chars())
                .parse::<u32>()
                .unwrap_or(10);
        }

        if base != 10 {
            value.push_str(&format!("_{}", base));
        }

        build(TokenKind::Literal, &value, (start, end))
    }

    /// Lexes an identifier or keyword starting at the current character.
    ///
    /// After the first character, only ASCII letters, `'`, superscripts and subscripts extend
    /// the name. An underscore ends the name: the token that follows it is lexed and its value
    /// (with trailing whitespace removed) is appended after the underscore, eg. `x_1`.
    /// Some names and symbols are rewritten to canonical function names, eg. `∑` to `sum`.
    fn next_identifier(&mut self) -> Token {
        let start = self.index;
        let mut end = start;
        let mut value = String::new();

        while let Some(&c) = self.peek() {
            if !is_valid_identifier(Some(&c)) {
                break;
            }

            // If the current character is an underscore, lex the next token and attach it.
            // This is to allow the notation like the following: x_1
            if c == '_' {
                self.advance();
                let suffix = self.next().value;
                // Trim, since the number_literal function allows whitespace,
                // which identifiers should not contain.
                let trimmed = suffix.trim_end();
                value.push('_');
                value.push_str(trimmed);

                // Extend the span over the underscore and the suffix, leaving out the
                // trailing whitespace that was just trimmed from the value.
                let trailing_whitespace = suffix.chars().count() - trimmed.chars().count();
                end = self.index.saturating_sub(trailing_whitespace).max(start);
                break;
            }

            // Only allow identifiers with a special character to have *one* character. No more.
            // Break the loop if it isn't the first run and the current character is a special
            // character. (Underscores never get here; they are handled above.)
            let extends_name =
                c.is_ascii_alphabetic() || c == '\'' || is_superscript(&c) || is_subscript(&c);
            if end > start && !extends_name {
                break;
            }

            end += 1;
            value.push(c);
            self.advance();
        }

        let kind = match value.as_str() {
            "unit" => TokenKind::UnitKeyword,
            "to" => TokenKind::ToKeyword,
            "if" => TokenKind::IfKeyword,
            "otherwise" => TokenKind::OtherwiseKeyword,
            _ => TokenKind::Identifier,
        };

        let canonical = match value.as_str() {
            "Σ" | "∑" => "sum",
            "∏" => "prod",
            "∫" | "integral" => "integrate",
            "sin⁻¹" => "asin",
            "cos⁻¹" => "acos",
            "tan⁻¹" => "atan",
            "cot⁻¹" => "acot",
            "cosec⁻¹" => "acosec",
            "sec⁻¹" => "asec",
            "sinh⁻¹" => "asinh",
            "cosh⁻¹" => "acosh",
            "tanh⁻¹" => "atanh",
            "coth⁻¹" => "acoth",
            "cosech⁻¹" => "acosech",
            "sech⁻¹" => "asech",
            "∛" => "cbrt",
            "°" => "deg",
            other => other, // things like log₂ are handled in the parser
        };

        build(kind, canonical, (start, end))
    }

    /// Returns the next character without consuming it.
    fn peek(&mut self) -> Option<&char> {
        self.chars.peek()
    }

    /// Consumes and returns the next character.
    ///
    /// `index` only moves when a character was actually consumed, so it can never run past
    /// the end of the input.
    fn advance(&mut self) -> Option<char> {
        let consumed = self.chars.next();
        if consumed.is_some() {
            self.index += 1;
        }

        consumed
    }
}

/// Assembles a [`Token`] from its parts, copying `value` into an owned `String`.
fn build(kind: TokenKind, value: &str, span: (usize, usize)) -> Token {
    let value = String::from(value);

    Token { kind, value, span }
}

/// Reports whether `c` may appear in an identifier.
///
/// `None` and the reserved operator, bracket and special-symbol characters are rejected.
/// Any other character is accepted unless it is a decimal digit; a decimal digit is accepted
/// only when `is_superscript` or `is_subscript` reports true for it.
fn is_valid_identifier(c: Option<&char>) -> bool {
    // Characters that are lexed as their own tokens and therefore never belong to a name.
    const RESERVED: [char; 34] = [
        '+', '-', '/', '*', '%', '^', '!', '(', ')', '=', '.', ',', ';', '|', '⌊', '⌋', '⌈', '⌉',
        '[', ']', '{', '}', 'π', '√', 'τ', 'ϕ', 'Γ', '<', '>', '≠', '≥', '≤', '×', '÷',
    ];

    let candidate = match c {
        Some(candidate) => candidate,
        None => return false,
    };

    if RESERVED.contains(candidate) {
        return false;
    }

    !candidate.is_digit(10) || is_superscript(candidate) || is_subscript(candidate)
}

#[cfg(test)]
mod tests {
    use super::*;
    use test_case::test_case;
    use wasm_bindgen_test::*;
    wasm_bindgen_test::wasm_bindgen_test_configure!(run_in_browser);

    /// Asserts that `tokens` consists of exactly the kinds in `expected`, in order.
    ///
    /// The whole sequences are compared, so missing or extra tokens fail the assertion
    /// (with both sequences printed) instead of being ignored or panicking on `unwrap`.
    fn match_tokens(tokens: Vec<Token>, expected: Vec<TokenKind>) {
        let actual: Vec<TokenKind> = tokens.iter().map(|token| token.kind).collect();

        assert_eq!(actual, expected);
    }

    #[test]
    #[wasm_bindgen_test]
    fn test_token_kinds() {
        let tokens = Lexer::lex("+-*/%^()|=!,");
        let expected = vec![
            TokenKind::Plus,
            TokenKind::Minus,
            TokenKind::Star,
            TokenKind::Slash,
            TokenKind::Percent,
            TokenKind::Power,
            TokenKind::OpenParenthesis,
            TokenKind::ClosedParenthesis,
            TokenKind::Pipe,
            TokenKind::Equals,
            TokenKind::Exclamation,
            TokenKind::Comma,
            TokenKind::EOF,
        ];

        match_tokens(tokens, expected);
    }

    #[test]
    #[wasm_bindgen_test]
    fn test_brackets() {
        let tokens = Lexer::lex("[1 < 2]");
        let expected = vec![
            TokenKind::OpenBracket,
            TokenKind::Literal,
            TokenKind::LessThan,
            TokenKind::Literal,
            TokenKind::ClosedBracket,
            TokenKind::EOF,
        ];

        match_tokens(tokens, expected);
    }

    #[test]
    #[wasm_bindgen_test]
    fn test_empty() {
        // test_case macro doesn't seem to work with spaces.
        let test_cases = vec![" ", "     ", "test ", " test     "];

        for input in test_cases {
            let tokens = Lexer::lex(input);

            // Whitespace-only input lexes to just EOF; the other inputs contain one name.
            let expected = if input.chars().all(char::is_whitespace) {
                vec![TokenKind::EOF]
            } else {
                vec![TokenKind::Identifier, TokenKind::EOF]
            };

            match_tokens(tokens, expected);
        }
    }

    #[test_case("1")]
    #[test_case("24")]
    #[test_case("56.4")]
    fn test_number_literal(input: &str) {
        let tokens = Lexer::lex(input);
        let expected = vec![TokenKind::Literal, TokenKind::EOF];

        assert_eq!(&tokens[0].value, input);
        match_tokens(tokens, expected);
    }

    #[test_case("x")]
    #[test_case("xy")]
    fn test_identifier(input: &str) {
        let tokens = Lexer::lex(input);
        let expected = vec![TokenKind::Identifier, TokenKind::EOF];

        assert_eq!(&tokens[0].value, input);
        match_tokens(tokens, expected);
    }

    #[test]
    fn test_function_call() {
        let tokens = Lexer::lex("f(x)");
        let expected = vec![
            TokenKind::Identifier,
            TokenKind::OpenParenthesis,
            TokenKind::Identifier,
            TokenKind::ClosedParenthesis,
            TokenKind::EOF,
        ];

        match_tokens(tokens, expected);
    }
}
-												Special symbols for *, /, arc functions, and subscript

											
										
										
											2021-12-29 18:32:11 +01:00
+								use crate::text_utils::{is_subscript, is_superscript};
-												Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support.

											
										
										
											2020-06-08 21:51:45 +02:00
+								use std::iter::Peekable;
-												Initial commit

											
										
										
											2020-05-28 23:46:35 +02:00
+								use std::str;
-												Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support.

											
										
										
											2020-06-08 21:51:45 +02:00
+								use std::str::Chars;
-												Initial commit

											
										
										
											2020-05-28 23:46:35 +02:00
-												Added 'Copy' to TokenKind, and added a missing line related to the sum function

											
										
										
											2020-12-13 22:09:49 +01:00
+								#[derive(Clone, Debug, PartialEq, Copy)]
-												Initial commit

											
										
										
											2020-05-28 23:46:35 +02:00
+								pub enum TokenKind {
 								    Unknown,
 								    Literal,
 								    Identifier,
-												Prepared for supporting multiple function arguments. The types now support several arguments, and the only thing left is to parse this.

											
										
										
											2020-05-29 00:27:08 +02:00
-												Initial commit

											
										
										
											2020-05-28 23:46:35 +02:00
+								    Plus,
 								    Minus,
 								    Star,
 								    Slash,
 								    Power,
-												Implemented factorial.

											
										
										
											2020-05-30 20:28:13 +02:00
+								    Exclamation,
-												added percentage unit and modulo

											
										
										
											2020-12-09 22:18:00 +01:00
+								    Percent,
-												Basics of derivation

Derivation implemented for function calls (only). Eg. f'(2). It is not
yet possible to do something like f''(2), but this should be implemented
in the future. It should also be possible to derive normal expressions,
but this is not yet possible.

											
										
										
											2021-05-17 20:36:53 +02:00
+								    Tick,
-												Comparison operators and Iverson brackets

											
										
										
											2021-05-31 13:46:06 +02:00
+								    GreaterThan,
 								    LessThan,
 								    Equals,
 								    NotEquals,
 								    GreaterOrEquals,
 								    LessOrEquals,
-												Prepared for supporting multiple function arguments. The types now support several arguments, and the only thing left is to parse this.

											
										
										
											2020-05-29 00:27:08 +02:00
-												Added the `unit` statement (very basic and experimental).

											
										
										
											2020-06-13 16:19:32 +02:00
+								    UnitKeyword,
-												Integrated the angle unit system with then new dynamic unit system.

											
										
										
											2020-06-15 19:10:55 +02:00
+								    ToKeyword,
-												Implemented piecewise

											
										
										
											2021-05-31 18:55:37 +02:00
+								    IfKeyword,
 								    OtherwiseKeyword,
-												Prepared for supporting multiple function arguments. The types now support several arguments, and the only thing left is to parse this.

											
										
										
											2020-05-29 00:27:08 +02:00
-												Initial commit

											
										
										
											2020-05-28 23:46:35 +02:00
+								    Pipe,
-												Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support.

											
										
										
											2020-06-08 21:51:45 +02:00
+								    OpenCeil,
 								    ClosedCeil,
 								    OpenFloor,
 								    ClosedFloor,
-												Initial commit

											
										
										
											2020-05-28 23:46:35 +02:00
+								    OpenParenthesis,
 								    ClosedParenthesis,
-												Comparison operators and Iverson brackets

											
										
										
											2021-05-31 13:46:06 +02:00
+								    OpenBracket,
 								    ClosedBracket,
-												Implemented piecewise

											
										
										
											2021-05-31 18:55:37 +02:00
+								    OpenBrace,
 								    ClosedBrace,
-												Prepared for supporting multiple function arguments. The types now support several arguments, and the only thing left is to parse this.

											
										
										
											2020-05-29 00:27:08 +02:00
+								    Comma,
-												Added semicolon support to combine several statements in one line.

											
										
										
											2020-06-14 22:03:22 +02:00
+								    Semicolon,
-												Prepared for supporting multiple function arguments. The types now support several arguments, and the only thing left is to parse this.

											
										
										
											2020-05-29 00:27:08 +02:00
-												Initial commit

											
										
										
											2020-05-28 23:46:35 +02:00
+								    EOF,
 								}
-												Set up foundation for parser unit testing and added test_var() and test_precedence() tests.

											
										
										
											2020-06-05 13:36:11 +02:00
+								#[derive(Clone, Debug, PartialEq)]
-												Initial commit

											
										
										
											2020-05-28 23:46:35 +02:00
+								pub struct Token {
 								    pub kind: TokenKind,
 								    pub value: String,
-												Added position data to tokens.

											
										
										
											2020-06-06 20:15:32 +02:00
+								    pub span: (usize, usize),
-												Initial commit

											
										
										
											2020-05-28 23:46:35 +02:00
+								}
 								pub struct Lexer<'a> {
-												Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support.

											
										
										
											2020-06-08 21:51:45 +02:00
+								    chars: Peekable<Chars<'a>>,
-												Initial commit

											
										
										
											2020-05-28 23:46:35 +02:00
+								    index: usize,
 								}
 								impl<'a> Lexer<'a> {
 								    pub fn lex(source: &str) -> Vec<Token> {
 								        let mut lexer = Lexer {
-												Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support.

											
										
										
											2020-06-08 21:51:45 +02:00
+								            chars: source.chars().peekable(),
-												Initial commit

											
										
										
											2020-05-28 23:46:35 +02:00
+								            index: 0,
 								        };
 								        let mut tokens = Vec::new();
-												Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support.

											
										
										
											2020-06-08 21:51:45 +02:00
+								        loop {
 								            let next = lexer.next();
-												Initial commit

											
										
										
											2020-05-28 23:46:35 +02:00
-												Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support.

											
										
										
											2020-06-08 21:51:45 +02:00
+								            if let TokenKind::EOF = next.kind {
 								                tokens.push(next);
 								                break;
 								            } else {
 								                tokens.push(next);
 								            }
-												Initial commit

											
										
										
											2020-05-28 23:46:35 +02:00
+								        }
 								        tokens
 								    }
 								    fn next(&mut self) -> Token {
-												fixed panic on trailing spaces

											
										
										
											2020-12-09 09:36:49 +01:00
+								        let eof = build(TokenKind::EOF, "", (self.index, self.index));
-												Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support.

											
										
										
											2020-06-08 21:51:45 +02:00
+								        let mut c = if let Some(c) = self.peek() {
 								            *c
 								        } else {
-												fixed panic on trailing spaces

											
										
										
											2020-12-09 09:36:49 +01:00
+								            return eof;
-												Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support.

											
										
										
											2020-06-08 21:51:45 +02:00
+								        };
-												Initial commit

											
										
										
											2020-05-28 23:46:35 +02:00
 								        while c == ' ' || c == '\t' || c == '\r' || c == '\n' {
-												Fixed lexer not ignoring whitespace properly.

											
										
										
											2020-06-09 10:34:39 +02:00
+								            if let None = self.advance() {
-												fixed panic on trailing spaces

											
										
										
											2020-12-09 09:36:49 +01:00
+								                return eof;
-												Initial commit

											
										
										
											2020-05-28 23:46:35 +02:00
+								            }
-												Fixed lexer not ignoring whitespace properly.

											
										
										
											2020-06-09 10:34:39 +02:00
-												fixed panic on trailing spaces

											
										
										
											2020-12-09 09:36:49 +01:00
+								            c = if let Some(c) = self.peek() {
 								                *c
 								            } else {
 								                return eof;
 								            }
-												Initial commit

											
										
										
											2020-05-28 23:46:35 +02:00
+								        }
-												Removed `self.advance()` code duplication in lexer.

											
										
										
											2020-06-04 20:09:43 +02:00
+								        if c.is_digit(10) {
 								            return self.next_number_literal();
 								        }
-												Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support.

											
										
										
											2020-06-08 21:51:45 +02:00
+								        if is_valid_identifier(Some(&c)) {
-												Removed `self.advance()` code duplication in lexer.

											
										
										
											2020-06-04 20:09:43 +02:00
+								            return self.next_identifier();
 								        }
-												Added position data to tokens.

											
										
										
											2020-06-06 20:15:32 +02:00
+								        let span = (self.index, self.index + 1);
-												Initial commit

											
										
										
											2020-05-28 23:46:35 +02:00
+								        let token = match c {
-												Added position data to tokens.

											
										
										
											2020-06-06 20:15:32 +02:00
+								            '+' => build(TokenKind::Plus, "", span),
 								            '-' => build(TokenKind::Minus, "", span),
-												Special symbols for *, /, arc functions, and subscript

											
										
										
											2021-12-29 18:32:11 +01:00
+								            '*' | '×' => build(TokenKind::Star, "", span),
 								            '/' | '÷' => build(TokenKind::Slash, "", span),
-												Added position data to tokens.

											
										
										
											2020-06-06 20:15:32 +02:00
+								            '^' => build(TokenKind::Power, "", span),
-												Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support.

											
										
										
											2020-06-08 21:51:45 +02:00
+								            '|' => build(TokenKind::Pipe, "", span),
 								            '⌈' => build(TokenKind::OpenCeil, "", span),
 								            '⌉' => build(TokenKind::ClosedCeil, "", span),
 								            '⌊' => build(TokenKind::OpenFloor, "", span),
 								            '⌋' => build(TokenKind::ClosedFloor, "", span),
-												Added position data to tokens.

											
										
										
											2020-06-06 20:15:32 +02:00
+								            '(' => build(TokenKind::OpenParenthesis, "", span),
 								            ')' => build(TokenKind::ClosedParenthesis, "", span),
-												Comparison operators and Iverson brackets

											
										
										
											2021-05-31 13:46:06 +02:00
+								            '[' => build(TokenKind::OpenBracket, "", span),
 								            ']' => build(TokenKind::ClosedBracket, "", span),
-												Implemented piecewise

											
										
										
											2021-05-31 18:55:37 +02:00
+								            '{' => build(TokenKind::OpenBrace, "", span),
 								            '}' => build(TokenKind::ClosedBrace, "", span),
-												Added position data to tokens.

											
										
										
											2020-06-06 20:15:32 +02:00
+								            '!' => build(TokenKind::Exclamation, "", span),
-												Comparison operators and Iverson brackets

											
										
										
											2021-05-31 13:46:06 +02:00
+								            '=' => build(TokenKind::Equals, "", span),
 								            '>' => build(TokenKind::GreaterThan, "", span),
 								            '<' => build(TokenKind::LessThan, "", span),
-												Added position data to tokens.

											
										
										
											2020-06-06 20:15:32 +02:00
+								            ',' => build(TokenKind::Comma, "", span),
-												Added semicolon support to combine several statements in one line.

											
										
										
											2020-06-14 22:03:22 +02:00
+								            ';' => build(TokenKind::Semicolon, "", span),
-												added percentage unit and modulo

											
										
										
											2020-12-09 22:18:00 +01:00
+								            '%' => build(TokenKind::Percent, "", span),
-												Basics of derivation

Derivation implemented for function calls (only). Eg. f'(2). It is not
yet possible to do something like f''(2), but this should be implemented
in the future. It should also be possible to derive normal expressions,
but this is not yet possible.

											
										
										
											2021-05-17 20:36:53 +02:00
+								            '\'' => build(TokenKind::Tick, "", span),
-												Comparison operators and Iverson brackets

											
										
										
											2021-05-31 13:46:06 +02:00
+								            '≠' => build(TokenKind::NotEquals, "", span),
 								            '≥' => build(TokenKind::GreaterOrEquals, "", span),
 								            '≤' => build(TokenKind::LessOrEquals, "", span),
-												Lex special symbols as one token

											
										
										
											2021-05-18 17:49:31 +02:00
+								            // Some of the special symbols will be lexed here,
 								            // so that they don't merge with other symbols.
-												v1.0.1

											
										
										
											2021-10-01 09:21:11 +02:00
+								            'π' => build(TokenKind::Identifier, "pi", span),
 								            '√' => build(TokenKind::Identifier, "sqrt", span),
 								            'τ' => build(TokenKind::Identifier, "tau", span),
 								            'ϕ' => build(TokenKind::Identifier, "phi", span),
 								            'Γ' => build(TokenKind::Identifier, "gamma", span),
 								            '∏' => build(TokenKind::Identifier, "prod", span),
-												Added position data to tokens.

											
										
										
											2020-06-06 20:15:32 +02:00
+								            _ => build(TokenKind::Unknown, "", span),
-												Initial commit

											
										
										
											2020-05-28 23:46:35 +02:00
+								        };
-												Removed `self.advance()` code duplication in lexer.

											
										
										
											2020-06-04 20:09:43 +02:00
+								        self.advance();
-												Comparison operators and Iverson brackets

											
										
										
											2021-05-31 13:46:06 +02:00
+								        // Handle tokens with two characters
 								        match (token.kind, self.peek()) {
 								            (TokenKind::Star, Some('*')) => {
-												Lex '**' as power sign

											
										
										
											2021-05-17 18:14:48 +02:00
+								                self.advance();
 								                return build(TokenKind::Power, "", span);
 								            }
-												Comparison operators and Iverson brackets

											
										
										
											2021-05-31 13:46:06 +02:00
+								            (TokenKind::Exclamation, Some('=')) => {
 								                self.advance();
 								                return build(TokenKind::NotEquals, "", span);
 								            }
 								            (TokenKind::GreaterThan, Some('=')) => {
 								                self.advance();
 								                return build(TokenKind::GreaterOrEquals, "", span);
 								            }
 								            (TokenKind::LessThan, Some('=')) => {
 								                self.advance();
 								                return build(TokenKind::LessOrEquals, "", span);
 								            }
 								            _ => (),
-												Lex '**' as power sign

											
										
										
											2021-05-17 18:14:48 +02:00
+								        }
-												Initial commit

											
										
										
											2020-05-28 23:46:35 +02:00
+								        token
 								    }
 								    fn next_number_literal(&mut self) -> Token {
-												Number bases

											
										
										
											2021-12-30 02:05:04 +01:00
+								        let mut start = self.index;
-												Initial commit

											
										
										
											2020-05-28 23:46:35 +02:00
+								        let mut end = start;
-												Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support.

											
										
										
											2020-06-08 21:51:45 +02:00
+								        let mut value = String::new();
-												Number bases

											
										
										
											2021-12-30 02:05:04 +01:00
+								        let mut leading_zero = self.peek().unwrap_or(&'\0') == &'0';
 								        let mut base = 10u32;
-												Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support.

											
										
										
											2020-06-08 21:51:45 +02:00
 								        loop {
 								            let c = if let Some(c) = self.peek() {
 								                *c
 								            } else {
 								                break;
 								            };
-												Number bases

											
										
										
											2021-12-30 02:05:04 +01:00
+								            // If at the second character and
 								            // the first character is a zero,
 								            // allow a letter
 								            if end - start == 1 && leading_zero {
 								                base = match c {
 								                    'b' => 2,
 								                    'o' => 8,
 								                    'x' => 16,
 								                    _ => 10,
 								                };
 								                // Don't include eg. 0x in the value
-												Fixed '0' being lexed as a base literal

											
										
										
											2021-12-30 12:44:48 +01:00
+								                if base != 10 {
 								                    start += 2;
 								                    end += 1;
 								                    self.advance();
 								                    value.clear();
 								                    leading_zero = false;
 								                    continue;
 								                }
-												Number bases

											
										
										
											2021-12-30 02:05:04 +01:00
+								            }
 								            if !c.is_digit(base) && c != '.' && c != '_' && !c.is_whitespace()
 								                || c == '\n'
 								                || c == '\r'
 								            {
-												Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support.

											
										
										
											2020-06-08 21:51:45 +02:00
+								                break;
 								            }
-												Initial commit

											
										
										
											2020-05-28 23:46:35 +02:00
 								            end += 1;
-												Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support.

											
										
										
											2020-06-08 21:51:45 +02:00
+								            value.push(c);
-												Initial commit

											
										
										
											2020-05-28 23:46:35 +02:00
+								            self.advance();
 								        }
-												Number bases

											
										
										
											2021-12-30 02:05:04 +01:00
+								        // Subscript unicode symbols after the literal, eg. 11₂
 								        let mut base_str = String::new();
 								        while crate::text_utils::is_subscript(self.peek().unwrap_or(&'\0')) {
 								            base_str.push(*self.peek().unwrap());
 								            self.advance();
 								        }
 								        if base_str != "" {
 								            base = crate::text_utils::subscript_to_digits(base_str.chars())
 								                .parse::<u32>()
 								                .unwrap_or(10);
 								        }
 								        if base != 10 {
 								            value.push_str(&format!("_{}", base));
 								        }
-												Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support.

											
										
										
											2020-06-08 21:51:45 +02:00
+								        build(TokenKind::Literal, &value, (start, end))
-												Initial commit

											
										
										
											2020-05-28 23:46:35 +02:00
+								    }
 								    fn next_identifier(&mut self) -> Token {
 								        let start = self.index;
 								        let mut end = start;
-												Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support.

											
										
										
											2020-06-08 21:51:45 +02:00
+								        let mut value = String::new();
-												Initial commit

											
										
										
											2020-05-28 23:46:35 +02:00
-												Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support.

											
										
										
											2020-06-08 21:51:45 +02:00
+								        while is_valid_identifier(self.peek()) {
 								            let c = *self.peek().unwrap();
-												Somewhat fixed UTF-8 support in the lexer.

											
										
										
											2020-06-07 19:48:52 +02:00
-												allow letters after underscore in variables, eg. x_a

											
										
										
											2020-12-09 10:47:46 +01:00
+								            // If the current character is an underscore, allow a number next.
-												Made the `x_1` notation possible.

											
										
										
											2020-06-09 14:59:29 +02:00
+								            // This is to allow the notation like the following: x_1
 								            if c == '_' {
 								                self.advance();
-												allow letters after underscore in variables, eg. x_a

											
										
										
											2020-12-09 10:47:46 +01:00
+								                let num = self.next().value;
-												Made the `x_1` notation possible.

											
										
										
											2020-06-09 14:59:29 +02:00
+								                value.push('_');
 								                value.push_str(&num.trim_end()); // Trim, since the number_literal function allows whitespace, which identifiers should not contain.
 								                break;
 								            }
-												Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support.

											
										
										
											2020-06-08 21:51:45 +02:00
+								            // Only allow identifiers with a special character to have *one* character. No more.
 								            // Break the loop if it isn't the first run and the current character is a special character.
-												Special symbols for *, /, arc functions, and subscript

											
										
										
											2021-12-29 18:32:11 +01:00
+								            if end - start > 0
 								                && !(c.is_ascii_alphabetic()
 								                    || c == '\''
 								                    || c == '_'
 								                    || is_superscript(&c)
 								                    || is_subscript(&c))
 								            {
-												Somewhat fixed UTF-8 support in the lexer.

											
										
										
											2020-06-07 19:48:52 +02:00
+								                break;
 								            }
-												Initial commit

											
										
										
											2020-05-28 23:46:35 +02:00
+								            end += 1;
-												Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support.

											
										
										
											2020-06-08 21:51:45 +02:00
+								            value.push(c);
-												Initial commit

											
										
										
											2020-05-28 23:46:35 +02:00
+								            self.advance();
 								        }
-												Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support.

											
										
										
											2020-06-08 21:51:45 +02:00
+								        let kind = match value.as_ref() {
-												Added the `unit` statement (very basic and experimental).

											
										
										
											2020-06-13 16:19:32 +02:00
+								            "unit" => TokenKind::UnitKeyword,
-												Integrated the angle unit system with then new dynamic unit system.

											
										
										
											2020-06-15 19:10:55 +02:00
+								            "to" => TokenKind::ToKeyword,
-												Implemented piecewise

											
										
										
											2021-05-31 18:55:37 +02:00
+								            "if" => TokenKind::IfKeyword,
 								            "otherwise" => TokenKind::OtherwiseKeyword,
-												Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support.

											
										
										
											2020-06-08 21:51:45 +02:00
+								            _ => TokenKind::Identifier,
 								        };
-												Initial commit

											
										
										
											2020-05-28 23:46:35 +02:00
-												v1.0.1

											
										
										
											2021-10-01 09:21:11 +02:00
+								        let value = match value.as_ref() {
 								            "Σ" | "∑" => String::from("sum"),
-												Fix undefined function: '∏'

Without this fix `prod(1,10,n)` works fine, but `∏(1,10,n)`
returns `Undefined function: '∏'.`

											
										
										
											2021-10-03 14:44:40 +02:00
+								            "∏" => String::from("prod"),
-												Special symbols for *, /, arc functions, and subscript

											
										
										
											2021-12-29 18:32:11 +01:00
+								            "∫" | "integral" => String::from("integrate"),
 								            "sin⁻¹" => String::from("asin"),
 								            "cos⁻¹" => String::from("acos"),
 								            "tan⁻¹" => String::from("atan"),
 								            "cot⁻¹" => String::from("acot"),
 								            "cosec⁻¹" => String::from("acosec"),
 								            "sec⁻¹" => String::from("asec"),
 								            "sinh⁻¹" => String::from("asinh"),
 								            "cosh⁻¹" => String::from("acosh"),
 								            "tanh⁻¹" => String::from("atanh"),
 								            "coth⁻¹" => String::from("acoth"),
 								            "cosech⁻¹" => String::from("acosech"),
 								            "sech⁻¹" => String::from("asech"),
 								            "∛" => String::from("cbrt"),
-												v1.0.1

											
										
										
											2021-10-01 09:21:11 +02:00
+								            "°" => String::from("deg"),
-												Special symbols for *, /, arc functions, and subscript

											
										
										
											2021-12-29 18:32:11 +01:00
+								            _ => value, // things like log₂ are handled in the parser
-												v1.0.1

											
										
										
											2021-10-01 09:21:11 +02:00
+								        };
-												Made lexer lex '°' as 'deg'.

											
										
										
											2020-06-18 18:06:17 +02:00
-												Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support.

											
										
										
											2020-06-08 21:51:45 +02:00
+								        build(kind, &value, (start, end))
-												Initial commit

											
										
										
											2020-05-28 23:46:35 +02:00
+								    }
-												Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support.

											
										
										
											2020-06-08 21:51:45 +02:00
+								    fn peek(&mut self) -> Option<&char> {
 								        self.chars.peek()
-												Somewhat fixed UTF-8 support in the lexer.

											
										
										
											2020-06-07 19:48:52 +02:00
+								    }
-												Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support.

											
										
										
											2020-06-08 21:51:45 +02:00
+								    fn advance(&mut self) -> Option<char> {
-												Cleaned up redundant code and replaced compare_enums() with PartialEq.

											
										
										
											2020-06-05 13:47:39 +02:00
+								        self.index += 1;
-												Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support.

											
										
										
											2020-06-08 21:51:45 +02:00
+								        self.chars.next()
-												Initial commit

											
										
										
											2020-05-28 23:46:35 +02:00
+								    }
 								}
-												Added position data to tokens.

											
										
										
											2020-06-06 20:15:32 +02:00
+								fn build(kind: TokenKind, value: &str, span: (usize, usize)) -> Token {
-												Replaced f64 with rug::Float.

											
										
										
											2020-06-04 14:46:45 +02:00
+								    Token {
 								        kind,
 								        value: value.to_string(),
-												Added position data to tokens.

											
										
										
											2020-06-06 20:15:32 +02:00
+								        span,
-												Replaced f64 with rug::Float.

											
										
										
											2020-06-04 14:46:45 +02:00
+								    }
 								}
-												Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support.

											
										
										
											2020-06-08 21:51:45 +02:00
+								fn is_valid_identifier(c: Option<&char>) -> bool {
 								    if let Some(c) = c {
-												Removed regex dependency from kalk crate and made 'test-case' a dev-dependency

											
										
										
											2021-01-03 00:07:30 +01:00
+								        match c {
 								            '+' | '-' | '/' | '*' | '%' | '^' | '!' | '(' | ')' | '=' | '.' | ',' | ';' | '|'
-												Implemented piecewise

											
										
										
											2021-05-31 18:55:37 +02:00
+								            | '⌊' | '⌋' | '⌈' | '⌉' | '[' | ']' | '{' | '}' | 'π' | '√' | 'τ' | 'ϕ' | 'Γ' | '<'
-												Special symbols for *, /, arc functions, and subscript

											
										
										
											2021-12-29 18:32:11 +01:00
+								            | '>' | '≠' | '≥' | '≤' | '×' | '÷' => false,
 								            _ => !c.is_digit(10) || is_superscript(c) || is_subscript(c),
-												Removed regex dependency from kalk crate and made 'test-case' a dev-dependency

											
										
										
											2021-01-03 00:07:30 +01:00
+								        }
-												Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support.

											
										
										
											2020-06-08 21:51:45 +02:00
+								    } else {
 								        false
 								    }
-												Initial commit

											
										
										
											2020-05-28 23:46:35 +02:00
+								}
-												Added unit tests for lexer.

											
										
										
											2020-06-04 21:53:45 +02:00
 								#[cfg(test)]
 								mod tests {
 								    use super::*;
 								    use test_case::test_case;
-												WebAssembly foundation

											
										
										
											2020-12-30 22:50:39 +01:00
+								    use wasm_bindgen_test::*;
 								    wasm_bindgen_test::wasm_bindgen_test_configure!(run_in_browser);
-												Added unit tests for lexer.

											
										
										
											2020-06-04 21:53:45 +02:00
 								    fn match_tokens(tokens: Vec<Token>, expected: Vec<TokenKind>) {
 								        let mut expected_iter = expected.iter();
 								        for token in tokens {
-												Cleaned up redundant code and replaced compare_enums() with PartialEq.

											
										
										
											2020-06-05 13:47:39 +02:00
+								            assert_eq!(token.kind, *expected_iter.next().unwrap());
-												Added unit tests for lexer.

											
										
										
											2020-06-04 21:53:45 +02:00
+								        }
 								    }
 								    #[test]
-												WebAssembly foundation

											
										
										
											2020-12-30 22:50:39 +01:00
+								    #[wasm_bindgen_test]
-												Added unit tests for lexer.

											
										
										
											2020-06-04 21:53:45 +02:00
+								    fn test_token_kinds() {
-												added percentage unit and modulo

											
										
										
											2020-12-09 22:18:00 +01:00
+								        let tokens = Lexer::lex("+-*/%^()|=!,");
-												Added unit tests for lexer.

											
										
										
											2020-06-04 21:53:45 +02:00
+								        let expected = vec![
 								            TokenKind::Plus,
 								            TokenKind::Minus,
 								            TokenKind::Star,
 								            TokenKind::Slash,
-												added percentage unit and modulo

											
										
										
											2020-12-09 22:18:00 +01:00
+								            TokenKind::Percent,
-												Added unit tests for lexer.

											
										
										
											2020-06-04 21:53:45 +02:00
+								            TokenKind::Power,
 								            TokenKind::OpenParenthesis,
 								            TokenKind::ClosedParenthesis,
 								            TokenKind::Pipe,
 								            TokenKind::Equals,
 								            TokenKind::Exclamation,
 								            TokenKind::Comma,
 								            TokenKind::EOF,
 								        ];
 								        match_tokens(tokens, expected);
 								    }
-												Comparison operators and Iverson brackets

											
										
										
											2021-05-31 13:46:06 +02:00
+								    #[test]
 								    #[wasm_bindgen_test]
 								    fn test_brackets() {
 								        let tokens = Lexer::lex("[1 < 2]");
 								        let expected = vec![
 								            TokenKind::OpenBracket,
 								            TokenKind::Literal,
 								            TokenKind::LessThan,
 								            TokenKind::Literal,
 								            TokenKind::ClosedBracket,
 								            TokenKind::EOF,
 								        ];
 								        match_tokens(tokens, expected);
 								    }
-												fixed panic on trailing spaces

											
										
										
											2020-12-09 09:36:49 +01:00
+								    #[test]
-												WebAssembly foundation

											
										
										
											2020-12-30 22:50:39 +01:00
+								    #[wasm_bindgen_test]
-												fixed panic on trailing spaces

											
										
										
											2020-12-09 09:36:49 +01:00
+								    fn test_empty() {
 								        // test_case macro doesn't seem to work with spaces.
 								        let test_cases = vec![" ", "     ", "test ", " test     "];
 								        for input in test_cases {
 								            let tokens = Lexer::lex(input);
 								            if regex::Regex::new(r"^\s*$").unwrap().is_match(input) {
 								                let expected = vec![TokenKind::EOF];
 								                match_tokens(tokens, expected);
 								            } else {
 								                let expected = vec![TokenKind::Identifier, TokenKind::EOF];
 								                match_tokens(tokens, expected);
 								            }
 								        }
 								    }
-												Added unit tests for lexer.

											
										
										
											2020-06-04 21:53:45 +02:00
+								    #[test_case("1")]
 								    #[test_case("24")]
 								    #[test_case("56.4")]
 								    fn test_number_literal(input: &str) {
 								        let tokens = Lexer::lex(input);
 								        let expected = vec![TokenKind::Literal, TokenKind::EOF];
 								        assert_eq!(&tokens[0].value, input);
 								        match_tokens(tokens, expected);
 								    }
 								    #[test_case("x")]
 								    #[test_case("xy")]
 								    fn test_identifier(input: &str) {
 								        let tokens = Lexer::lex(input);
 								        let expected = vec![TokenKind::Identifier, TokenKind::EOF];
 								        assert_eq!(&tokens[0].value, input);
 								        match_tokens(tokens, expected);
 								    }
 								    #[test]
 								    fn test_function_call() {
 								        let tokens = Lexer::lex("f(x)");
 								        let expected = vec![
 								            TokenKind::Identifier,
 								            TokenKind::OpenParenthesis,
 								            TokenKind::Identifier,
 								            TokenKind::ClosedParenthesis,
 								            TokenKind::EOF,
 								        ];
 								        match_tokens(tokens, expected);
 								    }
 								}