use crate::errors::ShellError;
use derive_new::new;
use log::trace;
use logos_derive::Logos;
use std::ops::Range;

/// Tokens recognized at the top level of a pipeline expression.
///
/// The lexer is a small state machine (tracked by `LexerState` /
/// `LexerStateName`): several variants carry a `callback` attribute that
/// switches the lexer into a sub-state (unit suffix after a number,
/// variable name after `$`, path members after a variable).
#[derive(Debug, Clone, Copy, Eq, PartialEq, Logos)]
#[extras = "LexerState"]
crate enum TopToken {
    #[error]
    Error,

    #[end]
    END,

    // Integer literal; switches to the AfterNum state so a size unit
    // (`10GB`) can directly follow.
    #[regex = "-?[0-9]+"]
    #[callback = "after_num"]
    Num,

    // Single-quoted string; allows escaped quotes inside.
    #[regex = r#"'([^']|\\')*'"#]
    SQString,

    // Double-quoted string; allows escaped quotes inside.
    #[regex = r#""([^"]|\\")*""#]
    DQString,

    // `$` introduces a variable; switches to the Var state.
    #[token = "$"]
    #[callback = "start_variable"]
    Dollar,

    // Bare word (command names, flags' values, filenames, ...).
    // Excludes leading digits/quotes/`$`/`-` so the other rules win.
    #[regex = r#"[^\s0-9"'$\-][^\s"']*"#]
    #[callback = "end_bare_variable"]
    Bare,

    #[token = "|"]
    Pipe,

    #[token = "."]
    Dot,

    #[token = "{"]
    OpenBrace,

    #[token = "}"]
    CloseBrace,

    #[token = "("]
    OpenParen,

    #[token = ")"]
    CloseParen,

    #[token = ">"]
    OpGt,

    #[token = "<"]
    OpLt,

    #[token = ">="]
    OpGte,

    #[token = "<="]
    OpLte,

    #[token = "=="]
    OpEq,

    #[token = "!="]
    OpNeq,

    #[token = "--"]
    DashDash,

    #[token = "-"]
    Dash,

    #[regex = r"\s+"]
    Whitespace,
}

impl TopToken {
    /// Maps a raw lexer token to the public `Token` type.
    ///
    /// Returns `None` at end of input (used with `?` to terminate the
    /// token iterator). Panics on `Error`: callers must handle the error
    /// variant before calling this.
    fn to_token(&self) -> Option<Token> {
        use TopToken::*;

        let result = match self {
            END => return None,
            Num => Token::Num,
            SQString => Token::SQString,
            DQString => Token::DQString,
            Dollar => Token::Dollar,
            Bare => Token::Bare,
            Pipe => Token::Pipe,
            // A bare `.` outside a variable path is an ordinary bare word
            // (e.g. `git add .`), not a path separator.
            Dot => Token::Bare,
            OpenBrace => Token::OpenBrace,
            CloseBrace => Token::CloseBrace,
            OpenParen => Token::OpenParen,
            CloseParen => Token::CloseParen,
            OpGt => Token::OpGt,
            OpLt => Token::OpLt,
            OpGte => Token::OpGte,
            OpLte => Token::OpLte,
            OpEq => Token::OpEq,
            OpNeq => Token::OpNeq,
            DashDash => Token::DashDash,
            Dash => Token::Dash,
            Whitespace => Token::Whitespace,
            Error => unreachable!("Don't call to_token with the error variant"),
        };

        Some(result)
    }
}

/// Callback: after a number, switch to the AfterNum state to accept a
/// unit suffix.
fn after_num(lex: &mut logos::Lexer<TopToken, &str>) {
    trace!("after_num EXTRAS={:?}", lex.extras);
    lex.extras.current = LexerStateName::AfterNum;
}

/// Callback: `$` starts a variable name; switch to the Var state.
fn start_variable(lex: &mut logos::Lexer<TopToken, &str>) {
    trace!("start_variable EXTRAS={:?}", lex.extras);
    lex.extras.current = LexerStateName::Var;
}

/// Callback: a bare word is also how a variable path can end; switch to
/// the AfterVariableToken state.
fn end_bare_variable(lex: &mut logos::Lexer<TopToken, &str>) {
    trace!("end_variable EXTRAS={:?}", lex.extras);
    lex.extras.current = LexerStateName::AfterVariableToken;
}
#[derive(Logos, Debug, Clone, Copy, Eq, PartialEq)] #[extras = "LexerState"] crate enum AfterNum { #[error] Error, #[end] END, #[regex = "(B|KB|MB|GB|TB|PB)"] #[callback = "end_unit"] Unit, #[regex = r"\s"] #[callback = "end_number"] Whitespace, } impl AfterNum { fn to_token(&self) -> Option { use AfterNum::*; let result = match self { END => return None, Unit => Token::Unit, Whitespace => Token::Whitespace, Error => unreachable!("Don't call to_token with the error variant"), }; Some(result) } } fn end_unit(lex: &mut logos::Lexer) { trace!("end_unit EXTRAS={:?}", lex.extras); lex.extras.current = LexerStateName::Top; } fn end_number(lex: &mut logos::Lexer) { trace!("end_unit EXTRAS={:?}", lex.extras); lex.extras.current = LexerStateName::Top; } #[derive(Logos, Debug, Clone, Copy, Eq, PartialEq)] #[extras = "LexerState"] crate enum VariableToken { #[error] Error, #[end] END, #[regex = r"[A-Za-z][A-Za-z0-9\-?!]*"] #[callback = "end_variable"] Variable, } impl VariableToken { fn to_token(&self) -> Option { use VariableToken::*; let result = match self { END => return None, Variable => Token::Variable, Error => unreachable!("Don't call to_token with the error variant"), }; Some(result) } } fn end_variable(lex: &mut logos::Lexer) { trace!("end_variable EXTRAS={:?}", lex.extras); lex.extras.current = LexerStateName::AfterVariableToken; } #[derive(Logos, Debug, Clone, Copy, Eq, PartialEq)] #[extras = "LexerState"] crate enum AfterVariableToken { #[error] Error, #[end] END, #[token = "."] #[callback = "start_member"] Dot, #[regex = r"\s"] #[callback = "terminate_variable"] Whitespace, } impl AfterVariableToken { fn to_token(&self) -> Option { use AfterVariableToken::*; let result = match self { END => return None, Dot => Token::PathDot, Whitespace => Token::Whitespace, Error => unreachable!("Don't call to_token with the error variant"), }; Some(result) } } fn start_member(lex: &mut logos::Lexer) { trace!("start_variable EXTRAS={:?}", lex.extras); lex.extras.current = 
LexerStateName::AfterMemberDot; } fn terminate_variable(lex: &mut logos::Lexer) { trace!("terminate_variable EXTRAS={:?}", lex.extras); lex.extras.current = LexerStateName::Top; } #[derive(Logos, Debug, Clone, Copy, Eq, PartialEq)] #[extras = "LexerState"] crate enum AfterMemberDot { #[error] Error, #[end] END, #[regex = r"[A-Za-z][A-Za-z0-9\-?!]*"] #[callback = "finish_member"] Member, #[regex = r#"'([^']|\\')*'"#] SQString, #[regex = r#""([^"]|\\")*""#] DQString, #[regex = r"\s"] Whitespace, } impl AfterMemberDot { fn to_token(&self) -> Option { use AfterMemberDot::*; let result = match self { END => return None, Member => Token::Member, SQString => Token::SQMember, DQString => Token::DQMember, Whitespace => Token::Whitespace, Error => unreachable!("Don't call to_token with the error variant"), }; Some(result) } } fn finish_member(lex: &mut logos::Lexer) { trace!("finish_member EXTRAS={:?}", lex.extras); lex.extras.current = LexerStateName::AfterVariableToken; } #[derive(Debug, Clone, Copy)] crate enum LexerStateName { Top, Var, AfterMemberDot, AfterNum, AfterVariableToken, } impl Default for LexerStateName { fn default() -> LexerStateName { LexerStateName::Top } } #[derive(Debug, Clone, Default)] crate struct LexerState { current: LexerStateName, } impl logos::Extras for LexerState { fn on_advance(&mut self) {} fn on_whitespace(&mut self, _byte: u8) {} } #[derive(Debug, Clone, Copy, PartialEq, Eq, Ord, PartialOrd)] pub struct Span { crate start: usize, crate end: usize, // source: &'source str, } impl From<(usize, usize)> for Span { fn from(input: (usize, usize)) -> Span { Span { start: input.0, end: input.1, } } } impl From<&std::ops::Range> for Span { fn from(input: &std::ops::Range) -> Span { Span { start: input.start, end: input.end, } } } impl Span { fn new(range: &Range) -> Span { Span { start: range.start, end: range.end, // source, } } } impl language_reporting::ReportingSpan for Span { fn with_start(&self, start: usize) -> Self { Span { start, end: 
self.end, } } fn with_end(&self, end: usize) -> Self { Span { start: self.start, end, } } fn start(&self) -> usize { self.start } fn end(&self) -> usize { self.end } } #[derive(new, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)] pub struct Spanned { crate span: Span, crate item: T, } impl std::ops::Deref for Spanned { type Target = T; fn deref(&self) -> &T { &self.item } } impl Spanned { crate fn from_item(item: T, span: Span) -> Spanned { Spanned { span, item } } } #[derive(new, Debug, Clone, Eq, PartialEq)] pub struct SpannedToken<'source> { crate span: Span, crate slice: &'source str, crate token: Token, } impl SpannedToken<'source> { crate fn to_spanned_string(&self) -> Spanned { Spanned::from_item(self.slice.to_string(), self.span) } crate fn to_string(&self) -> String { self.slice.to_string() } crate fn as_slice(&self) -> &str { self.slice } } #[derive(Debug, Clone, Eq, PartialEq)] pub enum Token { Variable, PathDot, Member, SQMember, DQMember, Num, SQString, DQString, Unit, Dollar, Bare, Pipe, OpenBrace, CloseBrace, OpenParen, CloseParen, OpGt, OpLt, OpGte, OpLte, OpEq, OpNeq, Dash, DashDash, Whitespace, } // #[derive(Debug, Clone, Eq, PartialEq)] // crate enum Token<'source> { // Top(SpannedToken<'source, TopToken>), // Var(SpannedToken<'source, VariableToken>), // Dot(SpannedToken<'source, &'source str>), // Member(SpannedToken<'source, &'source str>), // Whitespace(SpannedToken<'source, &'source str>), // } #[derive(Clone)] crate struct Lexer<'source> { lexer: logos::Lexer, first: bool, whitespace: bool, // state: LexerState } impl Lexer<'source> { crate fn new(source: &str, whitespace: bool) -> Lexer<'_> { Lexer { first: true, lexer: logos::Logos::lexer(source), whitespace // state: LexerState::default(), } } } impl Iterator for Lexer<'source> { type Item = Result<(usize, SpannedToken<'source>, usize), ShellError>; // type Item = Result, ShellError>; fn next(&mut self) -> Option { if self.first { self.first = false; match self.lexer.token { 
TopToken::Error => { return Some(Err(lex_error(&self.lexer.range(), self.lexer.source))) } TopToken::Whitespace if !self.whitespace => return self.next(), other => { return spanned(other.to_token()?, self.lexer.slice(), &self.lexer.range()) } } } else { trace!("STATE={:?}", self.lexer.extras); match self.lexer.extras.current { LexerStateName::Top => { let (lexer, range, slice, token) = advance::(self.lexer.clone()); self.lexer = lexer; match token { TopToken::Error => return Some(Err(lex_error(&range, self.lexer.source))), TopToken::Whitespace if !self.whitespace => return self.next(), other => return spanned(other.to_token()?, slice, &range), } } LexerStateName::AfterNum => { let (lexer, range, slice, token) = advance::(self.lexer.clone()); self.lexer = lexer; match token { AfterNum::Error => return Some(Err(lex_error(&range, self.lexer.source))), AfterNum::Whitespace if !self.whitespace => self.next(), other => return spanned(other.to_token()?, slice, &range), } } LexerStateName::AfterMemberDot => { let (lexer, range, slice, token) = advance::(self.lexer.clone()); self.lexer = lexer; match token { AfterMemberDot::Error => { return Some(Err(lex_error(&range, self.lexer.source))) } AfterMemberDot::Whitespace if !self.whitespace => self.next(), other => return spanned(other.to_token()?, slice, &range), } } LexerStateName::AfterVariableToken => { let (lexer, range, slice, token) = advance::(self.lexer.clone()); self.lexer = lexer; match token { AfterVariableToken::Error => { return Some(Err(lex_error(&range, self.lexer.source))) } AfterVariableToken::Whitespace if !self.whitespace => self.next(), other => return spanned(other.to_token()?, slice, &range), } } LexerStateName::Var => { let (lexer, range, slice, token) = advance::(self.lexer.clone()); self.lexer = lexer; match token { VariableToken::Error => { return Some(Err(lex_error(&range, self.lexer.source))) } other => return spanned(other.to_token()?, slice, &range), } } } } } } fn lex_error(range: &Range, 
_source: &str) -> ShellError { use language_reporting::*; ShellError::diagnostic( Diagnostic::new(Severity::Error, "Lex error") .with_label(Label::new_primary(Span::new(range))), ) } fn spanned<'source>( token: Token, slice: &'source str, range: &Range, ) -> Option, usize), ShellError>> { let token = SpannedToken::new(Span::new(range), slice, token); Some(Ok((range.start, token, range.end))) } fn advance( lexer: logos::Lexer, ) -> ( logos::Lexer, Range, &'source str, T, ) where T: logos::Logos + logos::source::WithSource<&'source str> + Copy, { let lexer = lexer.advance_as::(); let token = &lexer.token; let range = lexer.range(); let slice = lexer.slice(); (lexer.clone().morph::(), range, slice, *token) } #[cfg(test)] mod tests { use super::*; use pretty_assertions::assert_eq; fn assert_lex(source: &str, tokens: &[TestToken<'_>]) { let lex = Lexer::new(source, false); let mut current = 0; let expected_tokens: Vec = tokens .iter() .filter_map(|token_desc| { trace!("{:?}", token_desc); let len = token_desc.source.len(); let range = current..(current + len); let token = token_desc.to_token(&range); current = current + len; if let SpannedToken { token: Token::Whitespace, .. 
} = token { None } else { Some(token) } }) .collect(); let actual_tokens: Result, _> = lex.map(|result| result.map(|(_, i, _)| i)).collect(); let actual_tokens = actual_tokens.unwrap(); assert_eq!(actual_tokens, expected_tokens); } #[derive(Debug)] enum TokenDesc { Ws, Member, PathDot, Top(TopToken), Var(VariableToken), } #[derive(Debug, new)] struct TestToken<'source> { desc: TokenDesc, source: &'source str, } impl TestToken<'source> { fn to_token(&self, range: &std::ops::Range) -> SpannedToken<'source> { match self.desc { TokenDesc::Top(tok) => { SpannedToken::new(Span::new(range), self.source, tok.to_token().unwrap()) } TokenDesc::Var(tok) => { SpannedToken::new(Span::new(range), self.source, tok.to_token().unwrap()) } TokenDesc::Member => { SpannedToken::new(Span::new(range), self.source, Token::Member) } TokenDesc::Ws => { SpannedToken::new(Span::new(range), self.source, Token::Whitespace) } TokenDesc::PathDot => { SpannedToken::new(Span::new(range), self.source, Token::PathDot) } } } } macro_rules! chomp_tokens { { rest = { SP $($rest:tt)* }, accum = [ $($accum:tt)* ] } => { chomp_tokens! { rest = { $($rest)* }, accum = [ $($accum)* { SP } ] } }; { rest = { ws($expr:expr) $($rest:tt)* }, accum = [ $($accum:tt)* ] } => { chomp_tokens! { rest = { $($rest)* }, accum = [ $($accum)* { ws($expr) } ] } }; { rest = { $id:ident ( $expr:expr ) $($rest:tt)* }, accum = [ $($accum:tt)* ] } => { chomp_tokens! { rest = { $($rest)* }, accum = [ $($accum)* { tok(stringify!($id), $expr) } ] } }; { rest = { $token:tt $($rest:tt)* }, accum = [ $($accum:tt)* ] } => { chomp_tokens! { rest = { $($rest)* }, accum = [ $($accum)* { tk($token) } ] } }; { rest = { }, accum = [ $({ $($tokens:tt)* })* ] } => { &[ $($($tokens)*),* ] } } macro_rules! tokens { ($($tokens:tt)*) => { chomp_tokens! 
{ rest = { $($tokens)* }, accum = [] } }; } #[test] fn test_tokenize_number() { assert_lex("123", tokens![Num("123")]); // assert_lex("123", &[tok("Num", "123")]); assert_lex( "123 456 789", tokens![Num("123") SP Num("456") SP Num("789")], ); assert_lex("-123", tokens![Num("-123")]); assert_lex( "123 -456 789", tokens![ Num("123") ws(" ") Num("-456") ws(" ") Num("789") ], ) } #[test] fn test_tokenize_variable() { assert_lex("$var", tokens![ "$" Var("var")]); } #[test] fn test_tokenize_string() { assert_lex( r#" "hello world" "#, tokens![ SP DQString(r#""hello world""#) SP ], ); assert_lex( r#" 'hello world' "#, tokens![ SP SQString(r#"'hello world'"#) SP ], ); } #[test] fn test_tokenize_path() { assert_lex("$var.bar", tokens![ "$" Var("var") "???." Member("bar") ]); assert_lex("$it.bar", tokens![ "$" Var("it") "???." Member("bar") ]); assert_lex( "$var. bar", tokens![ "$" Var("var") "???." SP Member("bar") ], ); assert_lex("$it. bar", tokens![ "$" Var("it") "???." SP Member("bar") ]); } #[test] fn test_tokenize_operator() { assert_lex( "$it.cpu > 10", tokens![ "$" Var("it") "???." Member("cpu") SP ">" SP Num("10") ], ); assert_lex( "$it.cpu < 10", tokens![ "$" Var("it") "???." Member("cpu") SP "<" SP Num("10") ], ); assert_lex( "$it.cpu >= 10", tokens![ "$" Var("it") "???." Member("cpu") SP ">=" SP Num("10") ], ); assert_lex( "$it.cpu <= 10", tokens![ "$" Var("it") "???." Member("cpu") SP "<=" SP Num("10") ], ); assert_lex( "$it.cpu == 10", tokens![ "$" Var("it") "???." Member("cpu") SP "==" SP Num("10") ], ); assert_lex( "$it.cpu != 10", tokens![ "$" Var("it") "???." Member("cpu") SP "!=" SP Num("10") ], ); } #[test] fn test_tokenize_smoke() { assert_lex( "ls | where cpu > 10", tokens![ Bare("ls") SP "|" SP Bare("where") SP Bare("cpu") SP ">" SP Num("10") ], ); assert_lex( "ls | where { $it.cpu > 10 }", tokens![ Bare("ls") SP "|" SP Bare("where") SP "{" SP "$" Var("it") "???." 
Member("cpu") SP ">" SP Num("10") SP "}" ], ); assert_lex( "open input2.json | from-json | select glossary", tokens![ Bare("open") SP Bare("input2.json") SP "|" SP Bare("from-json") SP "|" SP Bare("select") SP Bare("glossary") ], ); assert_lex( "git add . -v", tokens![ Bare("git") SP Bare("add") SP Bare(".") SP "-" Bare("v") ], ) } fn tok(name: &str, value: &'source str) -> TestToken<'source> { match name { "Num" => TestToken::new(TokenDesc::Top(TopToken::Num), value), "Var" => TestToken::new(TokenDesc::Var(VariableToken::Variable), value), "Member" => TestToken::new(TokenDesc::Member, value), "Bare" => TestToken::new(TokenDesc::Top(TopToken::Bare), value), "DQString" => TestToken::new(TokenDesc::Top(TopToken::DQString), value), "SQString" => TestToken::new(TokenDesc::Top(TopToken::SQString), value), other => panic!("Unexpected token name in test: {}", other), } } fn tk(name: &'source str) -> TestToken<'source> { let token = match name { "???." => return TestToken::new(TokenDesc::PathDot, "."), "." => TopToken::Dot, "--" => TopToken::DashDash, "-" => TopToken::Dash, "$" => TopToken::Dollar, "|" => TopToken::Pipe, "{" => TopToken::OpenBrace, "}" => TopToken::CloseBrace, ">" => TopToken::OpGt, "<" => TopToken::OpLt, ">=" => TopToken::OpGte, "<=" => TopToken::OpLte, "==" => TopToken::OpEq, "!=" => TopToken::OpNeq, other => panic!("Unexpected token name in test: {}", other), }; TestToken::new(TokenDesc::Top(token), name) } const SP: TestToken<'static> = TestToken { desc: TokenDesc::Ws, source: " ", }; fn ws(string: &'static str) -> TestToken<'source> { TestToken::new(TokenDesc::Ws, string) } }