kalker/kalk/src/lexer.rs

use std::iter::Peekable;
use std::str;
use std::str::Chars;

/// The category of a [`Token`] produced by the [`Lexer`].
#[derive(Clone, Debug, PartialEq)]
pub enum TokenKind {
    /// A character that is neither part of a number or identifier nor one of
    /// the recognised punctuation characters.
    Unknown,
    /// A number literal; the token's `value` holds the text that was read.
    Literal,
    /// A name that is not one of the reserved keywords; the token's `value`
    /// holds the name.
    Identifier,

    /// The `+` character.
    Plus,
    /// The `-` character.
    Minus,
    /// The `*` character.
    Star,
    /// The `/` character.
    Slash,
    /// The `^` character.
    Power,
    /// The `=` character.
    Equals,
    /// The `!` character.
    Exclamation,

    /// The word `unit`.
    UnitKeyword,
    /// The word `to`.
    ToKeyword,

    /// The `|` character.
    Pipe,
    /// The `⌈` character.
    OpenCeil,
    /// The `⌉` character.
    ClosedCeil,
    /// The `⌊` character.
    OpenFloor,
    /// The `⌋` character.
    ClosedFloor,
    /// The `(` character.
    OpenParenthesis,
    /// The `)` character.
    ClosedParenthesis,
    /// The `,` character.
    Comma,
    /// The `;` character.
    Semicolon,

    /// End of input. The lexer always emits this as the final token.
    EOF,
}

/// A single lexical unit of the input.
#[derive(Clone, Debug, PartialEq)]
pub struct Token {
    /// What sort of token this is.
    pub kind: TokenKind,
    /// The text of the token. The lexer fills this in for literals and
    /// identifiers (including keywords) and leaves it empty for every other
    /// kind. A literal's text may contain whitespace, because the lexer
    /// accepts whitespace while reading a number.
    pub value: String,
    /// `(start, end)` position of the token in the input, counted in
    /// characters (not bytes) as the lexer consumes them; `end` is exclusive.
    pub span: (usize, usize),
}

/// Turns a source string into a list of [`Token`]s; see [`Lexer::lex`].
pub struct Lexer<'a> {
    /// The remaining input, one `char` at a time, with one character of
    /// look-ahead.
    chars: Peekable<Chars<'a>>,
    /// Number of characters consumed so far; used to build token spans.
    index: usize,
}

impl<'a> Lexer<'a> {
    pub fn lex(source: &str) -> Vec<Token> {
        let mut lexer = Lexer {
            chars: source.chars().peekable(),
            index: 0,
        };
        let mut tokens = Vec::new();

        loop {
            let next = lexer.next();

            if let TokenKind::EOF = next.kind {
                tokens.push(next);
                break;
            } else {
                tokens.push(next);
            }
        }

        tokens
    }

    fn next(&mut self) -> Token {
        let mut c = if let Some(c) = self.peek() {
            *c
        } else {
            return build(TokenKind::EOF, "", (self.index, self.index));
        };

        while c == ' ' || c == '\t' || c == '\r' || c == '\n' {
            if let None = self.advance() {
                return build(TokenKind::EOF, "", (self.index, self.index));
            }

            c = *self.peek().unwrap();
        }

        if c.is_digit(10) {
            return self.next_number_literal();
        }

        if is_valid_identifier(Some(&c)) {
            return self.next_identifier();
        }

        let span = (self.index, self.index + 1);
        let token = match c {
            '+' => build(TokenKind::Plus, "", span),
            '-' => build(TokenKind::Minus, "", span),
            '*' => build(TokenKind::Star, "", span),
            '/' => build(TokenKind::Slash, "", span),
            '^' => build(TokenKind::Power, "", span),
            '|' => build(TokenKind::Pipe, "", span),
            '⌈' => build(TokenKind::OpenCeil, "", span),
            '⌉' => build(TokenKind::ClosedCeil, "", span),
            '⌊' => build(TokenKind::OpenFloor, "", span),
            '⌋' => build(TokenKind::ClosedFloor, "", span),
            '(' => build(TokenKind::OpenParenthesis, "", span),
            ')' => build(TokenKind::ClosedParenthesis, "", span),
            '=' => build(TokenKind::Equals, "", span),
            '!' => build(TokenKind::Exclamation, "", span),
            ',' => build(TokenKind::Comma, "", span),
            ';' => build(TokenKind::Semicolon, "", span),
            _ => build(TokenKind::Unknown, "", span),
        };

        self.advance();

        token
    }

    fn next_number_literal(&mut self) -> Token {
        let start = self.index;
        let mut end = start;
        let mut value = String::new();

        loop {
            let c = if let Some(c) = self.peek() {
                *c
            } else {
                break;
            };

            if !c.is_digit(10) && c != '.' && !c.is_whitespace() {
                break;
            }

            end += 1;
            value.push(c);
            self.advance();
        }

        build(TokenKind::Literal, &value, (start, end))
    }

    fn next_identifier(&mut self) -> Token {
        let start = self.index;
        let mut end = start;
        let letter_reg = regex::Regex::new(r"[A-z'_]").unwrap();
        let mut value = String::new();

        while is_valid_identifier(self.peek()) {
            let c = *self.peek().unwrap();

            // If the current character is an underscore, expect a number next.
            // This is to allow the notation like the following: x_1
            if c == '_' {
                self.advance();
                let num = self.next_number_literal().value;
                value.push('_');
                value.push_str(&num.trim_end()); // Trim, since the number_literal function allows whitespace, which identifiers should not contain.
                break;
            }

            // Only allow identifiers with a special character to have *one* character. No more.
            // Break the loop if it isn't the first run and the current character is a special character.
            if end - start > 0 && !letter_reg.is_match(&c.to_string()) {
                break;
            }

            end += 1;
            value.push(c);
            self.advance();
        }

        let kind = match value.as_ref() {
            "unit" => TokenKind::UnitKeyword,
            "to" => TokenKind::ToKeyword,
            _ => TokenKind::Identifier,
        };

        build(kind, &value, (start, end))
    }

    fn peek(&mut self) -> Option<&char> {
        self.chars.peek()
    }

    fn advance(&mut self) -> Option<char> {
        self.index += 1;
        self.chars.next()
    }
}

/// Assembles a [`Token`] from its parts, copying `value` into an owned string.
fn build(kind: TokenKind, value: &str, span: (usize, usize)) -> Token {
    let value = String::from(value);

    Token { kind, value, span }
}

/// Returns `true` if `c` is a character that may appear in an identifier.
///
/// A character qualifies unless it is whitespace, an ASCII digit, or one of
/// the characters the lexer treats as punctuation. `None` (end of input) is
/// never a valid identifier character.
///
/// The check is a plain character match rather than a regular expression, so
/// nothing has to be compiled each time a character is classified.
fn is_valid_identifier(c: Option<&char>) -> bool {
    let c = match c {
        Some(&c) => c,
        None => return false,
    };

    let is_reserved = match c {
        '0'..='9' => true,
        // The contiguous ASCII run `+`, `,`, `-`, `.`, `/`.
        '+'..='/' => true,
        '*' | '^' | '!' | '(' | ')' | '=' | ';' | '|' => true,
        '⌊' | '⌋' | '⌈' | '⌉' => true,
        // Any Unicode whitespace, which also covers `\n` and `\r`.
        _ => c.is_whitespace(),
    };

    !is_reserved
}

#[cfg(test)]
mod tests {
    use super::*;
    use test_case::test_case;

    /// Asserts that `tokens` consists of exactly the kinds in `expected`, in
    /// the same order. Comparing the whole sequence also catches the lexer
    /// returning fewer or more tokens than expected.
    fn match_tokens(tokens: Vec<Token>, expected: Vec<TokenKind>) {
        let actual: Vec<TokenKind> = tokens.into_iter().map(|token| token.kind).collect();

        assert_eq!(actual, expected);
    }

    #[test]
    fn test_token_kinds() {
        let tokens = Lexer::lex("+-*/^()|=!,");
        let expected = vec![
            TokenKind::Plus,
            TokenKind::Minus,
            TokenKind::Star,
            TokenKind::Slash,
            TokenKind::Power,
            TokenKind::OpenParenthesis,
            TokenKind::ClosedParenthesis,
            TokenKind::Pipe,
            TokenKind::Equals,
            TokenKind::Exclamation,
            TokenKind::Comma,
            TokenKind::EOF,
        ];

        match_tokens(tokens, expected);
    }

    #[test]
    fn test_bracket_and_semicolon_kinds() {
        let tokens = Lexer::lex("⌈⌉⌊⌋;");
        let expected = vec![
            TokenKind::OpenCeil,
            TokenKind::ClosedCeil,
            TokenKind::OpenFloor,
            TokenKind::ClosedFloor,
            TokenKind::Semicolon,
            TokenKind::EOF,
        ];

        match_tokens(tokens, expected);
    }

    #[test]
    fn test_keywords() {
        match_tokens(
            Lexer::lex("unit"),
            vec![TokenKind::UnitKeyword, TokenKind::EOF],
        );
        match_tokens(Lexer::lex("to"), vec![TokenKind::ToKeyword, TokenKind::EOF]);
    }

    #[test_case("1")]
    #[test_case("24")]
    #[test_case("56.4")]
    fn test_number_literal(input: &str) {
        let tokens = Lexer::lex(input);
        let expected = vec![TokenKind::Literal, TokenKind::EOF];

        assert_eq!(&tokens[0].value, input);
        match_tokens(tokens, expected);
    }

    #[test_case("x")]
    #[test_case("xy")]
    fn test_identifier(input: &str) {
        let tokens = Lexer::lex(input);
        let expected = vec![TokenKind::Identifier, TokenKind::EOF];

        assert_eq!(&tokens[0].value, input);
        match_tokens(tokens, expected);
    }

    #[test]
    fn test_function_call() {
        let tokens = Lexer::lex("f(x)");
        let expected = vec![
            TokenKind::Identifier,
            TokenKind::OpenParenthesis,
            TokenKind::Identifier,
            TokenKind::ClosedParenthesis,
            TokenKind::EOF,
        ];

        match_tokens(tokens, expected);
    }
}
Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support. 2020-06-08 21:51:45 +02:00			`use std::iter::Peekable;`
Initial commit 2020-05-28 23:46:35 +02:00			`use std::str;`
Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support. 2020-06-08 21:51:45 +02:00			`use std::str::Chars;`
Initial commit 2020-05-28 23:46:35 +02:00
Set up foundation for parser unit testing and added test_var() and test_precedence() tests. 2020-06-05 13:36:11 +02:00			`#[derive(Clone, Debug, PartialEq)]`
Initial commit 2020-05-28 23:46:35 +02:00			`pub enum TokenKind {`
			`Unknown,`
			`Literal,`
			`Identifier,`
Prepared for supporting multiple function arguments. The types now support several arguments, and the only thing left is to parse this. 2020-05-29 00:27:08 +02:00
Initial commit 2020-05-28 23:46:35 +02:00			`Plus,`
			`Minus,`
			`Star,`
			`Slash,`
			`Power,`
			`Equals,`
Implemented factorial. 2020-05-30 20:28:13 +02:00			`Exclamation,`
Prepared for supporting multiple function arguments. The types now support several arguments, and the only thing left is to parse this. 2020-05-29 00:27:08 +02:00
Added the `unit` statement (very basic and experimental). 2020-06-13 16:19:32 +02:00			`UnitKeyword,`
Integrated the angle unit system with then new dynamic unit system. 2020-06-15 19:10:55 +02:00			`ToKeyword,`
Prepared for supporting multiple function arguments. The types now support several arguments, and the only thing left is to parse this. 2020-05-29 00:27:08 +02:00
Initial commit 2020-05-28 23:46:35 +02:00			`Pipe,`
Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support. 2020-06-08 21:51:45 +02:00			`OpenCeil,`
			`ClosedCeil,`
			`OpenFloor,`
			`ClosedFloor,`
Initial commit 2020-05-28 23:46:35 +02:00			`OpenParenthesis,`
			`ClosedParenthesis,`
Prepared for supporting multiple function arguments. The types now support several arguments, and the only thing left is to parse this. 2020-05-29 00:27:08 +02:00			`Comma,`
Added semicolon support to combine several statements in one line. 2020-06-14 22:03:22 +02:00			`Semicolon,`
Prepared for supporting multiple function arguments. The types now support several arguments, and the only thing left is to parse this. 2020-05-29 00:27:08 +02:00
Initial commit 2020-05-28 23:46:35 +02:00			`EOF,`
			`}`

Set up foundation for parser unit testing and added test_var() and test_precedence() tests. 2020-06-05 13:36:11 +02:00			`#[derive(Clone, Debug, PartialEq)]`
Initial commit 2020-05-28 23:46:35 +02:00			`pub struct Token {`
			`pub kind: TokenKind,`
			`pub value: String,`
Added position data to tokens. 2020-06-06 20:15:32 +02:00			`pub span: (usize, usize),`
Initial commit 2020-05-28 23:46:35 +02:00			`}`

			`pub struct Lexer<'a> {`
Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support. 2020-06-08 21:51:45 +02:00			`chars: Peekable<Chars<'a>>,`
Initial commit 2020-05-28 23:46:35 +02:00			`index: usize,`
			`}`

			`impl<'a> Lexer<'a> {`
			`pub fn lex(source: &str) -> Vec<Token> {`
			`let mut lexer = Lexer {`
Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support. 2020-06-08 21:51:45 +02:00			`chars: source.chars().peekable(),`
Initial commit 2020-05-28 23:46:35 +02:00			`index: 0,`
			`};`
			`let mut tokens = Vec::new();`

Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support. 2020-06-08 21:51:45 +02:00			`loop {`
			`let next = lexer.next();`
Initial commit 2020-05-28 23:46:35 +02:00
Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support. 2020-06-08 21:51:45 +02:00			`if let TokenKind::EOF = next.kind {`
			`tokens.push(next);`
			`break;`
			`} else {`
			`tokens.push(next);`
			`}`
Initial commit 2020-05-28 23:46:35 +02:00			`}`

			`tokens`
			`}`

			`fn next(&mut self) -> Token {`
Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support. 2020-06-08 21:51:45 +02:00			`let mut c = if let Some(c) = self.peek() {`
			`*c`
			`} else {`
			`return build(TokenKind::EOF, "", (self.index, self.index));`
			`};`
Initial commit 2020-05-28 23:46:35 +02:00
			`while c == ' ' \|\| c == '\t' \|\| c == '\r' \|\| c == '\n' {`
Fixed lexer not ignoring whitespace properly. 2020-06-09 10:34:39 +02:00			`if let None = self.advance() {`
Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support. 2020-06-08 21:51:45 +02:00			`return build(TokenKind::EOF, "", (self.index, self.index));`
Initial commit 2020-05-28 23:46:35 +02:00			`}`
Fixed lexer not ignoring whitespace properly. 2020-06-09 10:34:39 +02:00
			`c = *self.peek().unwrap();`
Initial commit 2020-05-28 23:46:35 +02:00			`}`

Removed `self.advance()` code duplication in lexer. 2020-06-04 20:09:43 +02:00			`if c.is_digit(10) {`
			`return self.next_number_literal();`
			`}`

Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support. 2020-06-08 21:51:45 +02:00			`if is_valid_identifier(Some(&c)) {`
Removed `self.advance()` code duplication in lexer. 2020-06-04 20:09:43 +02:00			`return self.next_identifier();`
			`}`

Added position data to tokens. 2020-06-06 20:15:32 +02:00			`let span = (self.index, self.index + 1);`
Initial commit 2020-05-28 23:46:35 +02:00			`let token = match c {`
Added position data to tokens. 2020-06-06 20:15:32 +02:00			`'+' => build(TokenKind::Plus, "", span),`
			`'-' => build(TokenKind::Minus, "", span),`
			`'*' => build(TokenKind::Star, "", span),`
			`'/' => build(TokenKind::Slash, "", span),`
			`'^' => build(TokenKind::Power, "", span),`
Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support. 2020-06-08 21:51:45 +02:00			`'\|' => build(TokenKind::Pipe, "", span),`
			`'⌈' => build(TokenKind::OpenCeil, "", span),`
			`'⌉' => build(TokenKind::ClosedCeil, "", span),`
			`'⌊' => build(TokenKind::OpenFloor, "", span),`
			`'⌋' => build(TokenKind::ClosedFloor, "", span),`
Added position data to tokens. 2020-06-06 20:15:32 +02:00			`'(' => build(TokenKind::OpenParenthesis, "", span),`
			`')' => build(TokenKind::ClosedParenthesis, "", span),`
			`'=' => build(TokenKind::Equals, "", span),`
			`'!' => build(TokenKind::Exclamation, "", span),`
			`',' => build(TokenKind::Comma, "", span),`
Added semicolon support to combine several statements in one line. 2020-06-14 22:03:22 +02:00			`';' => build(TokenKind::Semicolon, "", span),`
Added position data to tokens. 2020-06-06 20:15:32 +02:00			`_ => build(TokenKind::Unknown, "", span),`
Initial commit 2020-05-28 23:46:35 +02:00			`};`

Removed `self.advance()` code duplication in lexer. 2020-06-04 20:09:43 +02:00			`self.advance();`

Initial commit 2020-05-28 23:46:35 +02:00			`token`
			`}`

			`fn next_number_literal(&mut self) -> Token {`
			`let start = self.index;`
			`let mut end = start;`
Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support. 2020-06-08 21:51:45 +02:00			`let mut value = String::new();`

			`loop {`
			`let c = if let Some(c) = self.peek() {`
			`*c`
			`} else {`
			`break;`
			`};`

			`if !c.is_digit(10) && c != '.' && !c.is_whitespace() {`
			`break;`
			`}`
Initial commit 2020-05-28 23:46:35 +02:00
			`end += 1;`
Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support. 2020-06-08 21:51:45 +02:00			`value.push(c);`
Initial commit 2020-05-28 23:46:35 +02:00			`self.advance();`
			`}`

Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support. 2020-06-08 21:51:45 +02:00			`build(TokenKind::Literal, &value, (start, end))`
Initial commit 2020-05-28 23:46:35 +02:00			`}`

			`fn next_identifier(&mut self) -> Token {`
			`let start = self.index;`
			`let mut end = start;`
Made the `x_1` notation possible. 2020-06-09 14:59:29 +02:00			`let letter_reg = regex::Regex::new(r"[A-z'_]").unwrap();`
Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support. 2020-06-08 21:51:45 +02:00			`let mut value = String::new();`
Initial commit 2020-05-28 23:46:35 +02:00
Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support. 2020-06-08 21:51:45 +02:00			`while is_valid_identifier(self.peek()) {`
			`let c = *self.peek().unwrap();`
Somewhat fixed UTF-8 support in the lexer. 2020-06-07 19:48:52 +02:00
Made the `x_1` notation possible. 2020-06-09 14:59:29 +02:00			`// If the current character is an underscore, expect a number next.`
			`// This is to allow the notation like the following: x_1`
			`if c == '_' {`
			`self.advance();`
			`let num = self.next_number_literal().value;`
			`value.push('_');`
			`value.push_str(&num.trim_end()); // Trim, since the number_literal function allows whitespace, which identifiers should not contain.`
			`break;`
			`}`

Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support. 2020-06-08 21:51:45 +02:00			`// Only allow identifiers with a special character to have one character. No more.`
			`// Break the loop if it isn't the first run and the current character is a special character.`
			`if end - start > 0 && !letter_reg.is_match(&c.to_string()) {`
Somewhat fixed UTF-8 support in the lexer. 2020-06-07 19:48:52 +02:00			`break;`
			`}`

Initial commit 2020-05-28 23:46:35 +02:00			`end += 1;`
Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support. 2020-06-08 21:51:45 +02:00			`value.push(c);`
Initial commit 2020-05-28 23:46:35 +02:00			`self.advance();`
			`}`

Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support. 2020-06-08 21:51:45 +02:00			`let kind = match value.as_ref() {`
Added the `unit` statement (very basic and experimental). 2020-06-13 16:19:32 +02:00			`"unit" => TokenKind::UnitKeyword,`
Integrated the angle unit system with then new dynamic unit system. 2020-06-15 19:10:55 +02:00			`"to" => TokenKind::ToKeyword,`
Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support. 2020-06-08 21:51:45 +02:00			`_ => TokenKind::Identifier,`
			`};`
Initial commit 2020-05-28 23:46:35 +02:00
Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support. 2020-06-08 21:51:45 +02:00			`build(kind, &value, (start, end))`
Initial commit 2020-05-28 23:46:35 +02:00			`}`

Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support. 2020-06-08 21:51:45 +02:00			`fn peek(&mut self) -> Option<&char> {`
			`self.chars.peek()`
Somewhat fixed UTF-8 support in the lexer. 2020-06-07 19:48:52 +02:00			`}`

Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support. 2020-06-08 21:51:45 +02:00			`fn advance(&mut self) -> Option<char> {`
Cleaned up redundant code and replaced compare_enums() with PartialEq. 2020-06-05 13:47:39 +02:00			`self.index += 1;`
Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support. 2020-06-08 21:51:45 +02:00			`self.chars.next()`
Initial commit 2020-05-28 23:46:35 +02:00			`}`
			`}`

Added position data to tokens. 2020-06-06 20:15:32 +02:00			`fn build(kind: TokenKind, value: &str, span: (usize, usize)) -> Token {`
Replaced f64 with rug::Float. 2020-06-04 14:46:45 +02:00			`Token {`
			`kind,`
			`value: value.to_string(),`
Added position data to tokens. 2020-06-06 20:15:32 +02:00			`span,`
Replaced f64 with rug::Float. 2020-06-04 14:46:45 +02:00			`}`
			`}`

Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support. 2020-06-08 21:51:45 +02:00			`fn is_valid_identifier(c: Option<&char>) -> bool {`
			`if let Some(c) = c {`
Added semicolon support to combine several statements in one line. 2020-06-14 22:03:22 +02:00			`regex::Regex::new(r"[^\s\n\r0-9\+-/\*\^!\(\)=\.,;\|⌊⌋⌈⌉]")`
Changed lexer source to a peekable iterator, resulting in (hopefully) proper UTF-8 support. 2020-06-08 21:51:45 +02:00			`.unwrap()`
			`.is_match(&c.to_string())`
			`} else {`
			`false`
			`}`
Initial commit 2020-05-28 23:46:35 +02:00			`}`
Added unit tests for lexer. 2020-06-04 21:53:45 +02:00
			`#[cfg(test)]`
			`mod tests {`
			`use super::*;`
			`use test_case::test_case;`

			`fn match_tokens(tokens: Vec<Token>, expected: Vec<TokenKind>) {`
			`let mut expected_iter = expected.iter();`

			`for token in tokens {`
Cleaned up redundant code and replaced compare_enums() with PartialEq. 2020-06-05 13:47:39 +02:00			`assert_eq!(token.kind, *expected_iter.next().unwrap());`
Added unit tests for lexer. 2020-06-04 21:53:45 +02:00			`}`
			`}`

			`#[test]`
			`fn test_token_kinds() {`
			`let tokens = Lexer::lex("+-*/^()\|=!,");`
			`let expected = vec![`
			`TokenKind::Plus,`
			`TokenKind::Minus,`
			`TokenKind::Star,`
			`TokenKind::Slash,`
			`TokenKind::Power,`
			`TokenKind::OpenParenthesis,`
			`TokenKind::ClosedParenthesis,`
			`TokenKind::Pipe,`
			`TokenKind::Equals,`
			`TokenKind::Exclamation,`
			`TokenKind::Comma,`
			`TokenKind::EOF,`
			`];`

			`match_tokens(tokens, expected);`
			`}`

			`#[test_case("1")]`
			`#[test_case("24")]`
			`#[test_case("56.4")]`
			`fn test_number_literal(input: &str) {`
			`let tokens = Lexer::lex(input);`
			`let expected = vec![TokenKind::Literal, TokenKind::EOF];`

			`assert_eq!(&tokens[0].value, input);`
			`match_tokens(tokens, expected);`
			`}`

			`#[test_case("x")]`
			`#[test_case("xy")]`
			`fn test_identifier(input: &str) {`
			`let tokens = Lexer::lex(input);`
			`let expected = vec![TokenKind::Identifier, TokenKind::EOF];`

			`assert_eq!(&tokens[0].value, input);`
			`match_tokens(tokens, expected);`
			`}`

			`#[test]`
			`fn test_function_call() {`
			`let tokens = Lexer::lex("f(x)");`
			`let expected = vec![`
			`TokenKind::Identifier,`
			`TokenKind::OpenParenthesis,`
			`TokenKind::Identifier,`
			`TokenKind::ClosedParenthesis,`
			`TokenKind::EOF,`
			`];`

			`match_tokens(tokens, expected);`
			`}`
			`}`