use std::str;

/// The kinds of tokens produced by the lexer.
#[derive(Clone, Debug)]
pub enum TokenKind {
    Unknown,
    Literal,
    Identifier,

    // Operators
    Plus,
    Minus,
    Star,
    Slash,
    Power,
    Equals,
    Exclamation,

    // Angle units
    Deg,
    Rad,

    // Delimiters
    Pipe,
    OpenParenthesis,
    ClosedParenthesis,
    Comma,

    EOF,
}

#[derive(Clone, Debug)]
pub struct Token {
    pub kind: TokenKind,
    pub value: String,
}

/// Walks through the source string byte by byte, keeping track of the
/// current position.
pub struct Lexer<'a> {
    source: &'a [u8],
    index: usize,
}

impl<'a> Lexer<'a> {
    /// Tokenizes the entire source string. The returned vector always
    /// ends with an EOF token.
    pub fn lex(source: &str) -> Vec<Token> {
        let mut lexer = Lexer {
            source: source.as_bytes(),
            index: 0,
        };
        let mut tokens = Vec::new();

        while !lexer.is_at_end() {
            tokens.push(lexer.next());
        }

        // If there isn't already an EOF token, add one. Checking `last()`
        // with a pattern (instead of unwrapping) also covers empty input.
        if !matches!(tokens.last(), Some(Token { kind: TokenKind::EOF, .. })) {
            tokens.push(build(TokenKind::EOF, ""));
        }

        tokens
    }

    /// Scans past any whitespace and returns the next token.
    fn next(&mut self) -> Token {
        let mut c = self.peek();

        while c == ' ' || c == '\t' || c == '\r' || c == '\n' {
            self.advance();

            if self.is_at_end() {
                return build(TokenKind::EOF, "");
            } else {
                c = self.peek();
            }
        }

        if c.is_digit(10) {
            return self.next_number_literal();
        }

        if c.is_alphabetic() {
            return self.next_identifier();
        }

        let token = match c {
            '+' => build(TokenKind::Plus, ""),
            '-' => build(TokenKind::Minus, ""),
            '*' => build(TokenKind::Star, ""),
            '/' => build(TokenKind::Slash, ""),
            '^' => build(TokenKind::Power, ""),
            '(' => build(TokenKind::OpenParenthesis, ""),
            ')' => build(TokenKind::ClosedParenthesis, ""),
            '|' => build(TokenKind::Pipe, ""),
            '=' => build(TokenKind::Equals, ""),
            '!' => build(TokenKind::Exclamation, ""),
            ',' => build(TokenKind::Comma, ""),
            _ => build(TokenKind::Unknown, ""),
        };

        self.advance();

        token
    }

    /// Consumes a run of digits and decimal points, producing a Literal token.
    fn next_number_literal(&mut self) -> Token {
        let start = self.index;
        let mut end = start;

        while !self.is_at_end() && (self.peek().is_digit(10) || self.peek() == '.') {
            end += 1;
            self.advance();
        }

        if let Ok(value) = str::from_utf8(&self.source[start..end]) {
            build(TokenKind::Literal, value)
        } else {
            build(TokenKind::Unknown, "")
        }
    }

    /// Consumes identifier characters, special-casing the unit keywords
    /// "deg" and "rad".
    fn next_identifier(&mut self) -> Token {
        let start = self.index;
        let mut end = start;

        while !self.is_at_end() && is_valid_identifier(self.peek()) {
            end += 1;
            self.advance();
        }

        if let Ok(value) = str::from_utf8(&self.source[start..end]) {
            let kind = match value {
                "deg" | "°" => TokenKind::Deg,
                "rad" => TokenKind::Rad,
                _ => TokenKind::Identifier,
            };

            build(kind, value)
        } else {
            build(TokenKind::Unknown, "")
        }
    }

    /// Returns the current byte as a char. Note that this reads one byte
    /// at a time, so a multi-byte UTF-8 character such as '°' is not seen
    /// as a single char here.
    fn peek(&self) -> char {
        self.source[self.index].into()
    }

    fn advance(&mut self) {
        self.index += 1;
    }

    fn is_at_end(&self) -> bool {
        self.index >= self.source.len()
    }
}

fn build(kind: TokenKind, value: &str) -> Token {
    Token {
        kind,
        value: value.to_string(),
    }
}

fn is_valid_identifier(c: char) -> bool {
    c.is_alphabetic() || c == '°' || c == '√' || c == '\'' || c == '¨' || c == 'Σ'
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::ast::compare_enums;
    use test_case::test_case;

    /// Asserts that each token's kind matches the expected kind, in order.
    fn match_tokens(tokens: Vec<Token>, expected: Vec<TokenKind>) {
        let mut expected_iter = expected.iter();

        for token in tokens {
            assert!(compare_enums(&token.kind, expected_iter.next().unwrap()));
        }
    }

    #[test]
    fn test_token_kinds() {
        let tokens = Lexer::lex("+-*/^()|=!,");
        let expected = vec![
            TokenKind::Plus,
            TokenKind::Minus,
            TokenKind::Star,
            TokenKind::Slash,
            TokenKind::Power,
            TokenKind::OpenParenthesis,
            TokenKind::ClosedParenthesis,
            TokenKind::Pipe,
            TokenKind::Equals,
            TokenKind::Exclamation,
            TokenKind::Comma,
            TokenKind::EOF,
        ];

        match_tokens(tokens, expected);
    }
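
    // Not in the original suite: a sketch of the empty-input edge case.
    // With the guarded EOF push in `lex`, an empty string should yield
    // exactly one EOF token rather than panicking on an empty vector.
    #[test]
    fn test_empty_input() {
        let tokens = Lexer::lex("");

        match_tokens(tokens, vec![TokenKind::EOF]);
    }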

    #[test_case("1")]
    #[test_case("24")]
    #[test_case("56.4")]
    fn test_number_literal(input: &str) {
        let tokens = Lexer::lex(input);
        let expected = vec![TokenKind::Literal, TokenKind::EOF];

        assert_eq!(&tokens[0].value, input);
        match_tokens(tokens, expected);
    }
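
    // Not in the original suite: a sketch checking that whitespace between
    // tokens is skipped, and that only one EOF token is appended even when
    // the input ends with whitespace.
    #[test]
    fn test_whitespace_is_skipped() {
        let tokens = Lexer::lex(" 1 + 2 ");
        let expected = vec![
            TokenKind::Literal,
            TokenKind::Plus,
            TokenKind::Literal,
            TokenKind::EOF,
        ];

        match_tokens(tokens, expected);
    }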

    #[test_case("x")]
    #[test_case("xy")]
    fn test_identifier(input: &str) {
        let tokens = Lexer::lex(input);
        let expected = vec![TokenKind::Identifier, TokenKind::EOF];

        assert_eq!(&tokens[0].value, input);
        match_tokens(tokens, expected);
    }
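
    // Not in the original suite: a sketch exercising the unit keywords,
    // the only identifier spellings that `next_identifier` special-cases.
    // Assumes the `test_case` crate accepts enum expressions as arguments.
    #[test_case("deg", TokenKind::Deg)]
    #[test_case("rad", TokenKind::Rad)]
    fn test_unit_keyword(input: &str, kind: TokenKind) {
        let tokens = Lexer::lex(input);

        match_tokens(tokens, vec![kind, TokenKind::EOF]);
    }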

    #[test]
    fn test_function_call() {
        let tokens = Lexer::lex("f(x)");
        let expected = vec![
            TokenKind::Identifier,
            TokenKind::OpenParenthesis,
            TokenKind::Identifier,
            TokenKind::ClosedParenthesis,
            TokenKind::EOF,
        ];

        match_tokens(tokens, expected);
    }
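
    // Not in the original suite: a sketch testing `is_valid_identifier`
    // directly. Several of these characters are multi-byte in UTF-8, which
    // the byte-based `peek` does not reassemble, so the predicate is
    // checked on its own rather than through the lexer.
    #[test]
    fn test_is_valid_identifier_chars() {
        for c in &['x', '°', '√', '\'', '¨', 'Σ'] {
            assert!(is_valid_identifier(*c));
        }

        assert!(!is_valid_identifier('+'));
    }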
}