Big token refactoring

- Extract keywords, literals and combo tokens into separate sub-enums
- Add a macro for quickly generating all tokens including the sub-enum
  tokens. This also takes fewer characters to write
This commit is contained in:
Kai-Philipp Nosper 2022-02-08 18:56:17 +01:00
parent c723b1c2cb
commit 726dd62794
3 changed files with 410 additions and 228 deletions

View File

@ -1,8 +1,8 @@
use crate::token::Token;
use anyhow::Result;
use std::{iter::Peekable, str::Chars};
use thiserror::Error;
use crate::{token::Token, T};
#[derive(Debug, Error)]
pub enum LexErr {
#[error("Failed to parse '{0}' as i64")]
@ -52,62 +52,62 @@ impl<'a> Lexer<'a> {
// Double character tokens
'>' if matches!(self.peek(), '>') => {
self.next();
tokens.push(Token::Shr);
tokens.push(T![>>]);
}
'<' if matches!(self.peek(), '<') => {
self.next();
tokens.push(Token::Shl);
tokens.push(T![<<]);
}
'=' if matches!(self.peek(), '=') => {
self.next();
tokens.push(Token::EquEqu);
tokens.push(T![==]);
}
'!' if matches!(self.peek(), '=') => {
self.next();
tokens.push(Token::NotEqu);
tokens.push(T![!=]);
}
'<' if matches!(self.peek(), '=') => {
self.next();
tokens.push(Token::LAngleEqu);
tokens.push(T![<=]);
}
'>' if matches!(self.peek(), '=') => {
self.next();
tokens.push(Token::RAngleEqu);
tokens.push(T![>=]);
}
'<' if matches!(self.peek(), '-') => {
self.next();
tokens.push(Token::LArrow);
tokens.push(T![<-]);
}
'&' if matches!(self.peek(), '&') => {
self.next();
tokens.push(Token::LAnd);
tokens.push(T![&&]);
}
'|' if matches!(self.peek(), '|') => {
self.next();
tokens.push(Token::LOr);
tokens.push(T![||]);
}
// Single character tokens
';' => tokens.push(Token::Semicolon),
'+' => tokens.push(Token::Add),
'-' => tokens.push(Token::Sub),
'*' => tokens.push(Token::Mul),
'/' => tokens.push(Token::Div),
'%' => tokens.push(Token::Mod),
'|' => tokens.push(Token::BOr),
'&' => tokens.push(Token::BAnd),
'^' => tokens.push(Token::BXor),
'(' => tokens.push(Token::LParen),
')' => tokens.push(Token::RParen),
'~' => tokens.push(Token::Tilde),
'<' => tokens.push(Token::LAngle),
'>' => tokens.push(Token::RAngle),
'=' => tokens.push(Token::Equ),
'{' => tokens.push(Token::LBraces),
'}' => tokens.push(Token::RBraces),
'!' => tokens.push(Token::LNot),
'[' => tokens.push(Token::LBracket),
']' => tokens.push(Token::RBracket),
';' => tokens.push(T![;]),
'+' => tokens.push(T![+]),
'-' => tokens.push(T![-]),
'*' => tokens.push(T![*]),
'/' => tokens.push(T![/]),
'%' => tokens.push(T![%]),
'|' => tokens.push(T![|]),
'&' => tokens.push(T![&]),
'^' => tokens.push(T![^]),
'(' => tokens.push(T!['(']),
')' => tokens.push(T![')']),
'~' => tokens.push(T![~]),
'<' => tokens.push(T![<]),
'>' => tokens.push(T![>]),
'=' => tokens.push(T![=]),
'{' => tokens.push(T!['{']),
'}' => tokens.push(T!['}']),
'!' => tokens.push(T![!]),
'[' => tokens.push(T!['[']),
']' => tokens.push(T![']']),
// Special tokens with variable length
@ -151,7 +151,7 @@ impl<'a> Lexer<'a> {
// Try to convert the string representation of the value to i64
let i64val = sval.parse().map_err(|_| LexErr::NumericParse(sval))?;
Ok(Token::I64(i64val))
Ok(T![i64(i64val)])
}
/// Lex characters as a string until encountering an unescaped closing doublequoute char '"'
@ -185,7 +185,7 @@ impl<'a> Lexer<'a> {
// Consume closing "
self.next();
Ok(Token::String(text))
Ok(T![str(text)])
}
/// Lex characters from the text as an identifier. This includes the first character passed in
@ -206,13 +206,13 @@ impl<'a> Lexer<'a> {
// Check for pre-defined keywords
let token = match ident.as_str() {
"loop" => Token::Loop,
"print" => Token::Print,
"if" => Token::If,
"else" => Token::Else,
"loop" => T![loop],
"print" => T![print],
"if" => T![if],
"else" => T![else],
// If it doesn't match a keyword, it is a normal identifier
_ => Token::Ident(ident),
_ => T![ident(ident)],
};
Ok(token)
@ -231,31 +231,31 @@ impl<'a> Lexer<'a> {
#[cfg(test)]
mod tests {
use super::{lex, Token};
use crate::{lexer::lex, T};
#[test]
fn test_lexer() {
let code = "33 +5*2 + 4456467*2334+3 % - / << ^ | & >>";
let expected = vec![
Token::I64(33),
Token::Add,
Token::I64(5),
Token::Mul,
Token::I64(2),
Token::Add,
Token::I64(4456467),
Token::Mul,
Token::I64(2334),
Token::Add,
Token::I64(3),
Token::Mod,
Token::Sub,
Token::Div,
Token::Shl,
Token::BXor,
Token::BOr,
Token::BAnd,
Token::Shr,
T![i64(33)],
T![+],
T![i64(5)],
T![*],
T![i64(2)],
T![+],
T![i64(4456467)],
T![*],
T![i64(2334)],
T![+],
T![i64(3)],
T![%],
T![-],
T![/],
T![<<],
T![^],
T![|],
T![&],
T![>>],
];
let actual = lex(code).unwrap();

View File

@ -1,8 +1,11 @@
use std::iter::Peekable;
use crate::ast::*;
use crate::stringstore::{Sid, StringStore};
use crate::token::Token;
use crate::{
ast::{Ast, BinOpType, BlockScope, Expression, If, Loop, Statement, UnOpType},
stringstore::{Sid, StringStore},
token::Token,
T,
};
/// Parse the given tokens into an abstract syntax tree
pub fn parse<T: Iterator<Item = Token>, A: IntoIterator<IntoIter = T>>(tokens: A) -> Ast {
@ -45,15 +48,15 @@ impl<T: Iterator<Item = Token>> Parser<T> {
loop {
match self.peek() {
Token::Semicolon => {
T![;] => {
self.next();
}
Token::EoF | Token::RBraces => break,
T![EoF] | T!['}'] => break,
Token::LBraces => {
T!['{'] => {
self.next();
prog.push(Statement::Block(self.parse_scoped_block()));
if !matches!(self.next(), Token::RBraces) {
if !matches!(self.next(), T!['}']) {
panic!("Error parsing block: Expectected closing braces '}}'");
}
}
@ -71,22 +74,22 @@ impl<T: Iterator<Item = Token>> Parser<T> {
/// Parse a single statement from the tokens.
fn parse_stmt(&mut self) -> Statement {
match self.peek() {
Token::Loop => Statement::Loop(self.parse_loop()),
T![loop] => Statement::Loop(self.parse_loop()),
Token::Print => {
T![print] => {
self.next();
let expr = self.parse_expr();
// After a statement, there must be a semicolon
if !matches!(self.next(), Token::Semicolon) {
if !matches!(self.next(), T![;]) {
panic!("Expected semicolon after statement");
}
Statement::Print(expr)
}
Token::If => Statement::If(self.parse_if()),
T![if] => Statement::If(self.parse_if()),
// If it is not a loop, try to lex as an expression
_ => {
@ -106,7 +109,7 @@ impl<T: Iterator<Item = Token>> Parser<T> {
let stmt = Statement::Expr(expr);
// After a statement, there must be a semicolon
if !matches!(self.next(), Token::Semicolon) {
if !matches!(self.next(), T![;]) {
panic!("Expected semicolon after statement");
}
@ -117,34 +120,34 @@ impl<T: Iterator<Item = Token>> Parser<T> {
/// Parse an if statement from the tokens
fn parse_if(&mut self) -> If {
if !matches!(self.next(), Token::If) {
if !matches!(self.next(), T![if]) {
panic!("Error lexing if: Expected if token");
}
let condition = self.parse_expr();
if !matches!(self.next(), Token::LBraces) {
if !matches!(self.next(), T!['{']) {
panic!("Error lexing if: Expected '{{'")
}
let body_true = self.parse_scoped_block();
if !matches!(self.next(), Token::RBraces) {
if !matches!(self.next(), T!['}']) {
panic!("Error lexing if: Expected '}}'")
}
let mut body_false = BlockScope::default();
if matches!(self.peek(), Token::Else) {
if matches!(self.peek(), T![else]) {
self.next();
if !matches!(self.next(), Token::LBraces) {
if !matches!(self.next(), T!['{']) {
panic!("Error lexing if: Expected '{{'")
}
body_false = self.parse_scoped_block();
if !matches!(self.next(), Token::RBraces) {
if !matches!(self.next(), T!['}']) {
panic!("Error lexing if: Expected '}}'")
}
}
@ -158,7 +161,7 @@ impl<T: Iterator<Item = Token>> Parser<T> {
/// Parse a loop statement from the tokens
fn parse_loop(&mut self) -> Loop {
if !matches!(self.next(), Token::Loop) {
if !matches!(self.next(), T![loop]) {
panic!("Error lexing loop: Expected loop token");
}
@ -168,14 +171,14 @@ impl<T: Iterator<Item = Token>> Parser<T> {
let body;
match self.next() {
Token::LBraces => {
T!['{'] => {
body = self.parse_scoped_block();
}
Token::Semicolon => {
T![;] => {
advancement = Some(self.parse_expr());
if !matches!(self.next(), Token::LBraces) {
if !matches!(self.next(), T!['{']) {
panic!("Error lexing loop: Expected '{{'")
}
@ -185,7 +188,7 @@ impl<T: Iterator<Item = Token>> Parser<T> {
_ => panic!("Error lexing loop: Expected ';' or '{{'"),
}
if !matches!(self.next(), Token::RBraces) {
if !matches!(self.next(), T!['}']) {
panic!("Error lexing loop: Expected '}}'")
}
@ -234,22 +237,22 @@ impl<T: Iterator<Item = Token>> Parser<T> {
fn parse_primary(&mut self) -> Expression {
match self.next() {
// Literal i64
Token::I64(val) => Expression::I64(val),
T![i64(val)] => Expression::I64(val),
// Literal String
Token::String(text) => Expression::String(self.stringstore.intern_or_lookup(&text)),
T![str(text)] => Expression::String(self.stringstore.intern_or_lookup(&text)),
Token::LBracket => {
T!['['] => {
let size = self.parse_expr();
if !matches!(self.next(), Token::RBracket) {
if !matches!(self.next(), T![']']) {
panic!("Error parsing array literal: Expected closing bracket")
}
Expression::ArrayLiteral(size.into())
}
Token::Ident(name) if matches!(self.peek(), Token::LBracket) => {
T![ident(name)] if matches!(self.peek(), T!['[']) => {
let sid = self.stringstore.intern_or_lookup(&name);
let stackpos = self
.varstack
@ -263,14 +266,14 @@ impl<T: Iterator<Item = Token>> Parser<T> {
let index = self.parse_expr();
if !matches!(self.next(), Token::RBracket) {
if !matches!(self.next(), T![']']) {
panic!("Error parsing array access: Expected closing bracket")
}
Expression::ArrayAccess(sid, stackpos, index.into())
}
Token::Ident(name) => {
T![ident(name)] => {
let sid = self.stringstore.intern_or_lookup(&name);
let stackpos = self
.varstack
@ -283,11 +286,11 @@ impl<T: Iterator<Item = Token>> Parser<T> {
}
// Parentheses grouping
Token::LParen => {
T!['('] => {
let inner_expr = self.parse_expr();
// Verify that there is a closing parenthesis
if !matches!(self.next(), Token::RParen) {
if !matches!(self.next(), T![')']) {
panic!("Error parsing primary expr: Exepected closing parenthesis ')'");
}
@ -295,19 +298,19 @@ impl<T: Iterator<Item = Token>> Parser<T> {
}
// Unary negation
Token::Sub => {
T![-] => {
let operand = self.parse_primary();
Expression::UnOp(UnOpType::Negate, operand.into())
}
// Unary bitwise not (bitflip)
Token::Tilde => {
T![~] => {
let operand = self.parse_primary();
Expression::UnOp(UnOpType::BNot, operand.into())
}
// Unary logical not
Token::LNot => {
T![!] => {
let operand = self.parse_primary();
Expression::UnOp(UnOpType::LNot, operand.into())
}
@ -318,33 +321,36 @@ impl<T: Iterator<Item = Token>> Parser<T> {
/// Get the next Token without removing it
fn peek(&mut self) -> &Token {
self.tokens.peek().unwrap_or(&Token::EoF)
self.tokens.peek().unwrap_or(&T![EoF])
}
/// Advance to next Token and return the removed Token
fn next(&mut self) -> Token {
self.tokens.next().unwrap_or(Token::EoF)
self.tokens.next().unwrap_or(T![EoF])
}
}
#[cfg(test)]
mod tests {
use super::{parse, BinOpType, Expression};
use crate::{parser::Statement, token::Token};
use crate::{
ast::{BinOpType, Expression, Statement},
parser::parse,
T,
};
#[test]
fn test_parser() {
// Expression: 1 + 2 * 3 + 4
// With precedence: (1 + (2 * 3)) + 4
// Expression: 1 + 2 * 3 - 4
// With precedence: (1 + (2 * 3)) - 4
let tokens = [
Token::I64(1),
Token::Add,
Token::I64(2),
Token::Mul,
Token::I64(3),
Token::Sub,
Token::I64(4),
Token::Semicolon,
T![i64(1)],
T![+],
T![i64(2)],
T![*],
T![i64(3)],
T![-],
T![i64(4)],
T![;],
];
let expected = Statement::Expr(Expression::BinOp(

View File

@ -1,152 +1,328 @@
use crate::ast::BinOpType;
use crate::{ast::BinOpType, T};
/// Language keywords
#[derive(Debug, PartialEq, Eq)]
pub enum Keyword {
/// Loop keyword ("loop")
Loop,
/// Print keyword ("print")
Print,
/// If keyword ("if")
If,
/// Else keyword ("else")
Else,
}
/// Literal values
#[derive(Debug, PartialEq, Eq)]
pub enum Literal {
/// Integer literal (64-bit)
I64(i64),
/// String literal
String(String),
}
/// Combined tokens that consist of a combination of characters
#[derive(Debug, PartialEq, Eq)]
pub enum Combo {
/// Equal Equal ("==")
Equal2,
/// Exclamation mark Equal ("!=")
ExclamationMarkEqual,
/// Ampersand Ampersand ("&&")
Ampersand2,
/// Pipe Pipe ("||")
Pipe2,
/// LessThan LessThan ("<<")
LessThan2,
/// GreaterThan GreaterThan (">>")
GreaterThan2,
/// LessThan Equal ("<=")
LessThanEqual,
/// GreaterThan Equal (">=")
GreaterThanEqual,
/// LessThan Minus ("<-")
LessThanMinus,
}
#[derive(Debug, PartialEq, Eq)]
pub enum Token {
/// Integer literal (64-bit)
I64(i64),
/// Literal value token
Literal(Literal),
/// String literal
String(String),
/// Keyword token
Keyword(Keyword),
/// Identifier (name for variables, functions, ...)
Ident(String),
/// Loop keyword (loop)
Loop,
/// Combined tokens consisting of multiple characters
Combo(Combo),
/// Print keyword (print)
Print,
/// Equal Sign ("=")
Equal,
/// If keyword (if)
If,
/// Else keyword (else)
Else,
/// Left Bracket ('[')
LBracket,
/// Right Bracket (']')
RBracket,
/// Left Parenthesis ('(')
LParen,
/// Right Parenthesis (')')
RParen,
/// Left curly braces ({)
LBraces,
/// Right curly braces (})
RBraces,
/// Plus (+)
Add,
/// Minus (-)
Sub,
/// Asterisk (*)
Mul,
/// Slash (/)
Div,
/// Percent (%)
Mod,
/// Equal Equal (==)
EquEqu,
/// Exclamationmark Equal (!=)
NotEqu,
/// Pipe (|)
BOr,
/// Ampersand (&)
BAnd,
/// Circumflex (^)
BXor,
/// Logical AND (&&)
LAnd,
/// Logical OR (||)
LOr,
/// Shift Left (<<)
Shl,
/// Shift Right (>>)
Shr,
/// Tilde (~)
Tilde,
/// Logical not (!)
LNot,
/// Left angle bracket (<)
LAngle,
/// Right angle bracket (>)
RAngle,
/// Left angle bracket Equal (<=)
LAngleEqu,
/// Left angle bracket Equal (>=)
RAngleEqu,
/// Left arrow (<-)
LArrow,
/// Equal Sign (=)
Equ,
/// Semicolon (;)
/// Semicolon (";")
Semicolon,
/// End of file
EoF,
/// Left Bracket ("[")
LBracket,
/// Right Bracket ("]")
RBracket,
/// Left Parenthesis ("(")
LParen,
/// Right Parenthesis (")")
RParen,
/// Left curly braces ("{")
LBraces,
/// Right curly braces ("}")
RBraces,
/// Plus ("+")
Plus,
/// Minus ("-")
Minus,
/// Asterisk ("*")
Asterisk,
/// Slash ("/")
Slash,
/// Percent ("%")
Percent,
/// Pipe ("|")
Pipe,
/// Tilde ("~")
Tilde,
/// Logical not ("!")
Exclamationmark,
/// Left angle bracket ("<")
LessThan,
/// Right angle bracket (">")
GreaterThan,
/// Ampersand ("&")
Ampersand,
/// Circumflex ("^")
Circumflex,
}
impl Token {
/// If the Token can be used as a binary operation type, get the matching BinOpType. Otherwise
/// return None.
pub fn try_to_binop(&self) -> Option<BinOpType> {
Some(match self {
Token::Add => BinOpType::Add,
Token::Sub => BinOpType::Sub,
T![+] => BinOpType::Add,
T![-] => BinOpType::Sub,
Token::Mul => BinOpType::Mul,
Token::Div => BinOpType::Div,
Token::Mod => BinOpType::Mod,
T![*] => BinOpType::Mul,
T![/] => BinOpType::Div,
T![%] => BinOpType::Mod,
Token::BAnd => BinOpType::BAnd,
Token::BOr => BinOpType::BOr,
Token::BXor => BinOpType::BXor,
T![&] => BinOpType::BAnd,
T![|] => BinOpType::BOr,
T![^] => BinOpType::BXor,
Token::LAnd => BinOpType::LAnd,
Token::LOr => BinOpType::LOr,
T![&&] => BinOpType::LAnd,
T![||] => BinOpType::LOr,
Token::Shl => BinOpType::Shl,
Token::Shr => BinOpType::Shr,
T![<<] => BinOpType::Shl,
T![>>] => BinOpType::Shr,
Token::EquEqu => BinOpType::EquEqu,
Token::NotEqu => BinOpType::NotEqu,
T![==] => BinOpType::EquEqu,
T![!=] => BinOpType::NotEqu,
Token::LAngle => BinOpType::Less,
Token::LAngleEqu => BinOpType::LessEqu,
T![<] => BinOpType::Less,
T![<=] => BinOpType::LessEqu,
Token::RAngle => BinOpType::Greater,
Token::RAngleEqu => BinOpType::GreaterEqu,
T![>] => BinOpType::Greater,
T![>=] => BinOpType::GreaterEqu,
Token::LArrow => BinOpType::Declare,
Token::Equ => BinOpType::Assign,
T![<-] => BinOpType::Declare,
T![=] => BinOpType::Assign,
_ => return None,
})
}
}
/// Macro to quickly create a token of the specified kind
///
/// Maps the written-out form of a token (e.g. `T![<<]`, `T![loop]`,
/// `T![i64(42)]`) to the corresponding `Token` value, including the
/// wrapping into the `Keyword`, `Literal` and `Combo` sub-enums.
///
/// All paths use `$crate::` so expansions resolve correctly from any
/// module — and from other crates, which plain `crate::` paths would
/// break under `#[macro_export]`.
#[macro_export]
macro_rules! T {
    // Keywords
    [loop] => {
        $crate::token::Token::Keyword($crate::token::Keyword::Loop)
    };
    [print] => {
        $crate::token::Token::Keyword($crate::token::Keyword::Print)
    };
    [if] => {
        $crate::token::Token::Keyword($crate::token::Keyword::If)
    };
    [else] => {
        $crate::token::Token::Keyword($crate::token::Keyword::Else)
    };
    // Literals — `expr` (rather than `tt`) accepts any expression,
    // e.g. `T![i64(a + b)]`, while remaining compatible with the
    // single-token forms like `T![i64(42)]`.
    [i64($val:expr)] => {
        $crate::token::Token::Literal($crate::token::Literal::I64($val))
    };
    [str($val:expr)] => {
        $crate::token::Token::Literal($crate::token::Literal::String($val))
    };
    // Ident
    [ident($val:expr)] => {
        $crate::token::Token::Ident($val)
    };
    // Combo tokens
    [==] => {
        $crate::token::Token::Combo($crate::token::Combo::Equal2)
    };
    [!=] => {
        $crate::token::Token::Combo($crate::token::Combo::ExclamationMarkEqual)
    };
    [&&] => {
        $crate::token::Token::Combo($crate::token::Combo::Ampersand2)
    };
    [||] => {
        $crate::token::Token::Combo($crate::token::Combo::Pipe2)
    };
    [<<] => {
        $crate::token::Token::Combo($crate::token::Combo::LessThan2)
    };
    [>>] => {
        $crate::token::Token::Combo($crate::token::Combo::GreaterThan2)
    };
    [<=] => {
        $crate::token::Token::Combo($crate::token::Combo::LessThanEqual)
    };
    [>=] => {
        $crate::token::Token::Combo($crate::token::Combo::GreaterThanEqual)
    };
    [<-] => {
        $crate::token::Token::Combo($crate::token::Combo::LessThanMinus)
    };
    // Normal Tokens
    [=] => {
        $crate::token::Token::Equal
    };
    [;] => {
        $crate::token::Token::Semicolon
    };
    [EoF] => {
        $crate::token::Token::EoF
    };
    ['['] => {
        $crate::token::Token::LBracket
    };
    [']'] => {
        $crate::token::Token::RBracket
    };
    ['('] => {
        $crate::token::Token::LParen
    };
    [')'] => {
        $crate::token::Token::RParen
    };
    ['{'] => {
        $crate::token::Token::LBraces
    };
    ['}'] => {
        $crate::token::Token::RBraces
    };
    [+] => {
        $crate::token::Token::Plus
    };
    [-] => {
        $crate::token::Token::Minus
    };
    [*] => {
        $crate::token::Token::Asterisk
    };
    [/] => {
        $crate::token::Token::Slash
    };
    [%] => {
        $crate::token::Token::Percent
    };
    [|] => {
        $crate::token::Token::Pipe
    };
    [~] => {
        $crate::token::Token::Tilde
    };
    [!] => {
        $crate::token::Token::Exclamationmark
    };
    [<] => {
        $crate::token::Token::LessThan
    };
    [>] => {
        $crate::token::Token::GreaterThan
    };
    [&] => {
        $crate::token::Token::Ampersand
    };
    [^] => {
        $crate::token::Token::Circumflex
    };
}