Changed the lexer source from a raw byte slice to a peekable character iterator (`Peekable<Chars>`), giving proper UTF-8 support: indexing bytes directly broke on multi-byte characters such as `°`, `⌈`, and `⌉`, whereas iterating over `char`s handles them correctly.

This commit is contained in:
PaddiM8 2020-06-08 21:51:45 +02:00
parent 42524eefa3
commit 165df6d3f3
2 changed files with 81 additions and 65 deletions

View File

@ -1,4 +1,6 @@
use std::iter::Peekable;
use std::str; use std::str;
use std::str::Chars;
#[derive(Clone, Debug, PartialEq)] #[derive(Clone, Debug, PartialEq)]
pub enum TokenKind { pub enum TokenKind {
@ -18,6 +20,10 @@ pub enum TokenKind {
Rad, Rad,
Pipe, Pipe,
OpenCeil,
ClosedCeil,
OpenFloor,
ClosedFloor,
OpenParenthesis, OpenParenthesis,
ClosedParenthesis, ClosedParenthesis,
Comma, Comma,
@ -33,41 +39,44 @@ pub struct Token {
} }
pub struct Lexer<'a> { pub struct Lexer<'a> {
source: &'a [u8], chars: Peekable<Chars<'a>>,
index: usize, index: usize,
} }
impl<'a> Lexer<'a> { impl<'a> Lexer<'a> {
pub fn lex(source: &str) -> Vec<Token> { pub fn lex(source: &str) -> Vec<Token> {
let mut lexer = Lexer { let mut lexer = Lexer {
source: source.as_bytes(), chars: source.chars().peekable(),
index: 0, index: 0,
}; };
let mut tokens = Vec::new(); let mut tokens = Vec::new();
while !lexer.is_at_end() { loop {
tokens.push(lexer.next()); let next = lexer.next();
}
// If there isn't already an EOF token, add it. if let TokenKind::EOF = next.kind {
if let TokenKind::EOF = tokens.last().unwrap().kind { tokens.push(next);
break;
} else { } else {
tokens.push(build(TokenKind::EOF, "", (source.len(), source.len()))); tokens.push(next);
}
} }
tokens tokens
} }
fn next(&mut self) -> Token { fn next(&mut self) -> Token {
let mut c = self.peek(); let mut c = if let Some(c) = self.peek() {
*c
} else {
return build(TokenKind::EOF, "", (self.index, self.index));
};
while c == ' ' || c == '\t' || c == '\r' || c == '\n' { while c == ' ' || c == '\t' || c == '\r' || c == '\n' {
self.advance(); if let Some(next_c) = self.advance() {
c = next_c;
if self.is_at_end() {
return build(TokenKind::EOF, "", (self.index, self.index));
} else { } else {
c = self.peek(); return build(TokenKind::EOF, "", (self.index, self.index));
} }
} }
@ -75,7 +84,7 @@ impl<'a> Lexer<'a> {
return self.next_number_literal(); return self.next_number_literal();
} }
if is_valid_identifier(c) { if is_valid_identifier(Some(&c)) {
return self.next_identifier(); return self.next_identifier();
} }
@ -86,9 +95,13 @@ impl<'a> Lexer<'a> {
'*' => build(TokenKind::Star, "", span), '*' => build(TokenKind::Star, "", span),
'/' => build(TokenKind::Slash, "", span), '/' => build(TokenKind::Slash, "", span),
'^' => build(TokenKind::Power, "", span), '^' => build(TokenKind::Power, "", span),
'|' => build(TokenKind::Pipe, "", span),
'⌈' => build(TokenKind::OpenCeil, "", span),
'⌉' => build(TokenKind::ClosedCeil, "", span),
'⌊' => build(TokenKind::OpenFloor, "", span),
'⌋' => build(TokenKind::ClosedFloor, "", span),
'(' => build(TokenKind::OpenParenthesis, "", span), '(' => build(TokenKind::OpenParenthesis, "", span),
')' => build(TokenKind::ClosedParenthesis, "", span), ')' => build(TokenKind::ClosedParenthesis, "", span),
'|' => build(TokenKind::Pipe, "", span),
'=' => build(TokenKind::Equals, "", span), '=' => build(TokenKind::Equals, "", span),
'!' => build(TokenKind::Exclamation, "", span), '!' => build(TokenKind::Exclamation, "", span),
',' => build(TokenKind::Comma, "", span), ',' => build(TokenKind::Comma, "", span),
@ -103,70 +116,63 @@ impl<'a> Lexer<'a> {
fn next_number_literal(&mut self) -> Token { fn next_number_literal(&mut self) -> Token {
let start = self.index; let start = self.index;
let mut end = start; let mut end = start;
let mut value = String::new();
loop {
let c = if let Some(c) = self.peek() {
*c
} else {
break;
};
if !c.is_digit(10) && c != '.' && !c.is_whitespace() {
break;
}
while !self.is_at_end()
&& (self.peek().is_digit(10) || self.peek() == '.' || self.peek().is_whitespace())
{
end += 1; end += 1;
value.push(c);
self.advance(); self.advance();
} }
if let Ok(value) = str::from_utf8(&self.source[start..end]) { build(TokenKind::Literal, &value, (start, end))
build(TokenKind::Literal, value, (start, end))
} else {
build(TokenKind::Unknown, "", (self.index, self.index))
}
} }
fn next_identifier(&mut self) -> Token { fn next_identifier(&mut self) -> Token {
let start = self.index; let start = self.index;
let mut end = start; let mut end = start;
let letter_reg = regex::Regex::new(r"[A-z']").unwrap(); let letter_reg = regex::Regex::new(r"[A-z']").unwrap();
let mut value = String::new();
while !self.is_at_end() && is_valid_identifier(self.peek()) { while is_valid_identifier(self.peek()) {
let c = self.peek(); let c = *self.peek().unwrap();
// Separate special characters from normal characters // Only allow identifiers with a special character to have *one* character. No more.
// in order to allow eg. x√64 // Break the loop if it isn't the first run and the current character is a special character.
if end - start > 0 // If this isn't the first run if end - start > 0 && !letter_reg.is_match(&c.to_string()) {
&& letter_reg.is_match(&(self.previous() as char).to_string()) // and the previous char was a normal one
&& !letter_reg.is_match(&c.to_string())
// and this one is a special character (why did rustfmt put this on a new line??)
{
break; break;
} }
end += 1; end += 1;
value.push(c);
self.advance(); self.advance();
} }
if let Ok(value) = str::from_utf8(&self.source[start..end]) { let kind = match value.as_ref() {
let kind = match value {
"deg" | "°" => TokenKind::Deg, "deg" | "°" => TokenKind::Deg,
"rad" => TokenKind::Rad, "rad" => TokenKind::Rad,
_ => TokenKind::Identifier, _ => TokenKind::Identifier,
}; };
build(kind, value, (start, end)) build(kind, &value, (start, end))
} else {
build(TokenKind::Unknown, "", (self.index, self.index))
}
} }
fn peek(&self) -> char { fn peek(&mut self) -> Option<&char> {
self.source[self.index].into() self.chars.peek()
} }
fn previous(&self) -> char { fn advance(&mut self) -> Option<char> {
self.source[self.index - 1].into()
}
fn advance(&mut self) {
self.index += 1; self.index += 1;
} self.chars.next()
fn is_at_end(&self) -> bool {
self.index >= self.source.len()
} }
} }
@ -178,10 +184,14 @@ fn build(kind: TokenKind, value: &str, span: (usize, usize)) -> Token {
} }
} }
fn is_valid_identifier(c: char) -> bool { fn is_valid_identifier(c: Option<&char>) -> bool {
regex::Regex::new(r"[^\s\n\r0-9\+-/\*\^!\(\)=\.,|]") if let Some(c) = c {
regex::Regex::new(r"[^\s\n\r0-9\+-/\*\^!\(\)=\.,|⌊⌋⌈⌉]")
.unwrap() .unwrap()
.is_match(&c.to_string()) .is_match(&c.to_string())
} else {
false
}
} }
#[cfg(test)] #[cfg(test)]

View File

@ -208,7 +208,7 @@ fn parse_factorial(context: &mut Context) -> Result<Expr, CalcError> {
fn parse_primary(context: &mut Context) -> Result<Expr, CalcError> { fn parse_primary(context: &mut Context) -> Result<Expr, CalcError> {
let expr = match peek(context).kind { let expr = match peek(context).kind {
TokenKind::OpenParenthesis => parse_group(context)?, TokenKind::OpenParenthesis => parse_group(context)?,
TokenKind::Pipe => parse_abs(context)?, TokenKind::Pipe | TokenKind::OpenCeil | TokenKind::OpenFloor => parse_group_fn(context)?,
TokenKind::Identifier => parse_identifier(context)?, TokenKind::Identifier => parse_identifier(context)?,
_ => Expr::Literal(advance(context).value.clone()), _ => Expr::Literal(advance(context).value.clone()),
}; };
@ -228,12 +228,18 @@ fn parse_group(context: &mut Context) -> Result<Expr, CalcError> {
Ok(group_expr) Ok(group_expr)
} }
fn parse_abs(context: &mut Context) -> Result<Expr, CalcError> { fn parse_group_fn(context: &mut Context) -> Result<Expr, CalcError> {
advance(context); let name = match &advance(context).kind {
let group_expr = Expr::Group(Box::new(parse_expr(context)?)); TokenKind::Pipe => "abs",
consume(context, TokenKind::Pipe)?; TokenKind::OpenCeil => "ceil",
TokenKind::OpenFloor => "floor",
_ => panic!("Unexpected parsing error."),
};
Ok(Expr::FnCall(String::from("abs"), vec![group_expr])) let expr = parse_expr(context)?;
advance(context);
Ok(Expr::FnCall(name.to_string(), vec![expr]))
} }
fn parse_identifier(context: &mut Context) -> Result<Expr, CalcError> { fn parse_identifier(context: &mut Context) -> Result<Expr, CalcError> {