From 726dd6279474367e6d9639dfd0958a4e4cbf1767 Mon Sep 17 00:00:00 2001 From: Kai-Philipp Nosper Date: Tue, 8 Feb 2022 18:56:17 +0100 Subject: [PATCH] Big token refactoring - Extract keywords, literals and combo tokens into separate sub-enums - Add a macro for quickly generating all tokens including the sub-enum tokens. This also takes less chars to write --- src/lexer.rs | 116 +++++++------- src/parser.rs | 106 +++++++------ src/token.rs | 416 +++++++++++++++++++++++++++++++++++--------------- 3 files changed, 410 insertions(+), 228 deletions(-) diff --git a/src/lexer.rs b/src/lexer.rs index cc2bf41..a95219a 100644 --- a/src/lexer.rs +++ b/src/lexer.rs @@ -1,8 +1,8 @@ -use crate::token::Token; -use anyhow::Result; use std::{iter::Peekable, str::Chars}; use thiserror::Error; +use crate::{token::Token, T}; + #[derive(Debug, Error)] pub enum LexErr { #[error("Failed to parse '{0}' as i64")] @@ -52,62 +52,62 @@ impl<'a> Lexer<'a> { // Double character tokens '>' if matches!(self.peek(), '>') => { self.next(); - tokens.push(Token::Shr); + tokens.push(T![>>]); } '<' if matches!(self.peek(), '<') => { self.next(); - tokens.push(Token::Shl); + tokens.push(T![<<]); } '=' if matches!(self.peek(), '=') => { self.next(); - tokens.push(Token::EquEqu); + tokens.push(T![==]); } '!' 
if matches!(self.peek(), '=') => { self.next(); - tokens.push(Token::NotEqu); + tokens.push(T![!=]); } '<' if matches!(self.peek(), '=') => { self.next(); - tokens.push(Token::LAngleEqu); + tokens.push(T![<=]); } '>' if matches!(self.peek(), '=') => { self.next(); - tokens.push(Token::RAngleEqu); + tokens.push(T![>=]); } '<' if matches!(self.peek(), '-') => { self.next(); - tokens.push(Token::LArrow); + tokens.push(T![<-]); } '&' if matches!(self.peek(), '&') => { self.next(); - tokens.push(Token::LAnd); + tokens.push(T![&&]); } '|' if matches!(self.peek(), '|') => { self.next(); - tokens.push(Token::LOr); + tokens.push(T![||]); } // Single character tokens - ';' => tokens.push(Token::Semicolon), - '+' => tokens.push(Token::Add), - '-' => tokens.push(Token::Sub), - '*' => tokens.push(Token::Mul), - '/' => tokens.push(Token::Div), - '%' => tokens.push(Token::Mod), - '|' => tokens.push(Token::BOr), - '&' => tokens.push(Token::BAnd), - '^' => tokens.push(Token::BXor), - '(' => tokens.push(Token::LParen), - ')' => tokens.push(Token::RParen), - '~' => tokens.push(Token::Tilde), - '<' => tokens.push(Token::LAngle), - '>' => tokens.push(Token::RAngle), - '=' => tokens.push(Token::Equ), - '{' => tokens.push(Token::LBraces), - '}' => tokens.push(Token::RBraces), - '!' => tokens.push(Token::LNot), - '[' => tokens.push(Token::LBracket), - ']' => tokens.push(Token::RBracket), + ';' => tokens.push(T![;]), + '+' => tokens.push(T![+]), + '-' => tokens.push(T![-]), + '*' => tokens.push(T![*]), + '/' => tokens.push(T![/]), + '%' => tokens.push(T![%]), + '|' => tokens.push(T![|]), + '&' => tokens.push(T![&]), + '^' => tokens.push(T![^]), + '(' => tokens.push(T!['(']), + ')' => tokens.push(T![')']), + '~' => tokens.push(T![~]), + '<' => tokens.push(T![<]), + '>' => tokens.push(T![>]), + '=' => tokens.push(T![=]), + '{' => tokens.push(T!['{']), + '}' => tokens.push(T!['}']), + '!' 
=> tokens.push(T![!]), + '[' => tokens.push(T!['[']), + ']' => tokens.push(T![']']), // Special tokens with variable length @@ -151,7 +151,7 @@ impl<'a> Lexer<'a> { // Try to convert the string representation of the value to i64 let i64val = sval.parse().map_err(|_| LexErr::NumericParse(sval))?; - Ok(Token::I64(i64val)) + Ok(T![i64(i64val)]) } /// Lex characters as a string until encountering an unescaped closing doublequoute char '"' @@ -185,7 +185,7 @@ impl<'a> Lexer<'a> { // Consume closing " self.next(); - Ok(Token::String(text)) + Ok(T![str(text)]) } /// Lex characters from the text as an identifier. This includes the first character passed in @@ -206,13 +206,13 @@ impl<'a> Lexer<'a> { // Check for pre-defined keywords let token = match ident.as_str() { - "loop" => Token::Loop, - "print" => Token::Print, - "if" => Token::If, - "else" => Token::Else, + "loop" => T![loop], + "print" => T![print], + "if" => T![if], + "else" => T![else], // If it doesn't match a keyword, it is a normal identifier - _ => Token::Ident(ident), + _ => T![ident(ident)], }; Ok(token) @@ -231,31 +231,31 @@ impl<'a> Lexer<'a> { #[cfg(test)] mod tests { - use super::{lex, Token}; + use crate::{lexer::lex, T}; #[test] fn test_lexer() { let code = "33 +5*2 + 4456467*2334+3 % - / << ^ | & >>"; let expected = vec![ - Token::I64(33), - Token::Add, - Token::I64(5), - Token::Mul, - Token::I64(2), - Token::Add, - Token::I64(4456467), - Token::Mul, - Token::I64(2334), - Token::Add, - Token::I64(3), - Token::Mod, - Token::Sub, - Token::Div, - Token::Shl, - Token::BXor, - Token::BOr, - Token::BAnd, - Token::Shr, + T![i64(33)], + T![+], + T![i64(5)], + T![*], + T![i64(2)], + T![+], + T![i64(4456467)], + T![*], + T![i64(2334)], + T![+], + T![i64(3)], + T![%], + T![-], + T![/], + T![<<], + T![^], + T![|], + T![&], + T![>>], ]; let actual = lex(code).unwrap(); diff --git a/src/parser.rs b/src/parser.rs index 92da8ff..195c2f6 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1,8 +1,11 @@ use 
std::iter::Peekable; -use crate::ast::*; -use crate::stringstore::{Sid, StringStore}; -use crate::token::Token; +use crate::{ + ast::{Ast, BinOpType, BlockScope, Expression, If, Loop, Statement, UnOpType}, + stringstore::{Sid, StringStore}, + token::Token, + T, +}; /// Parse the given tokens into an abstract syntax tree pub fn parse, A: IntoIterator>(tokens: A) -> Ast { @@ -45,15 +48,15 @@ impl> Parser { loop { match self.peek() { - Token::Semicolon => { + T![;] => { self.next(); } - Token::EoF | Token::RBraces => break, + T![EoF] | T!['}'] => break, - Token::LBraces => { + T!['{'] => { self.next(); prog.push(Statement::Block(self.parse_scoped_block())); - if !matches!(self.next(), Token::RBraces) { + if !matches!(self.next(), T!['}']) { panic!("Error parsing block: Expectected closing braces '}}'"); } } @@ -71,22 +74,22 @@ impl> Parser { /// Parse a single statement from the tokens. fn parse_stmt(&mut self) -> Statement { match self.peek() { - Token::Loop => Statement::Loop(self.parse_loop()), + T![loop] => Statement::Loop(self.parse_loop()), - Token::Print => { + T![print] => { self.next(); let expr = self.parse_expr(); // After a statement, there must be a semicolon - if !matches!(self.next(), Token::Semicolon) { + if !matches!(self.next(), T![;]) { panic!("Expected semicolon after statement"); } Statement::Print(expr) } - Token::If => Statement::If(self.parse_if()), + T![if] => Statement::If(self.parse_if()), // If it is not a loop, try to lex as an expression _ => { @@ -106,7 +109,7 @@ impl> Parser { let stmt = Statement::Expr(expr); // After a statement, there must be a semicolon - if !matches!(self.next(), Token::Semicolon) { + if !matches!(self.next(), T![;]) { panic!("Expected semicolon after statement"); } @@ -117,34 +120,34 @@ impl> Parser { /// Parse an if statement from the tokens fn parse_if(&mut self) -> If { - if !matches!(self.next(), Token::If) { + if !matches!(self.next(), T![if]) { panic!("Error lexing if: Expected if token"); } let condition = 
self.parse_expr(); - if !matches!(self.next(), Token::LBraces) { + if !matches!(self.next(), T!['{']) { panic!("Error lexing if: Expected '{{'") } let body_true = self.parse_scoped_block(); - if !matches!(self.next(), Token::RBraces) { + if !matches!(self.next(), T!['}']) { panic!("Error lexing if: Expected '}}'") } let mut body_false = BlockScope::default(); - if matches!(self.peek(), Token::Else) { + if matches!(self.peek(), T![else]) { self.next(); - if !matches!(self.next(), Token::LBraces) { + if !matches!(self.next(), T!['{']) { panic!("Error lexing if: Expected '{{'") } body_false = self.parse_scoped_block(); - if !matches!(self.next(), Token::RBraces) { + if !matches!(self.next(), T!['}']) { panic!("Error lexing if: Expected '}}'") } } @@ -158,7 +161,7 @@ impl> Parser { /// Parse a loop statement from the tokens fn parse_loop(&mut self) -> Loop { - if !matches!(self.next(), Token::Loop) { + if !matches!(self.next(), T![loop]) { panic!("Error lexing loop: Expected loop token"); } @@ -168,14 +171,14 @@ impl> Parser { let body; match self.next() { - Token::LBraces => { + T!['{'] => { body = self.parse_scoped_block(); } - Token::Semicolon => { + T![;] => { advancement = Some(self.parse_expr()); - if !matches!(self.next(), Token::LBraces) { + if !matches!(self.next(), T!['{']) { panic!("Error lexing loop: Expected '{{'") } @@ -185,7 +188,7 @@ impl> Parser { _ => panic!("Error lexing loop: Expected ';' or '{{'"), } - if !matches!(self.next(), Token::RBraces) { + if !matches!(self.next(), T!['}']) { panic!("Error lexing loop: Expected '}}'") } @@ -234,22 +237,22 @@ impl> Parser { fn parse_primary(&mut self) -> Expression { match self.next() { // Literal i64 - Token::I64(val) => Expression::I64(val), + T![i64(val)] => Expression::I64(val), // Literal String - Token::String(text) => Expression::String(self.stringstore.intern_or_lookup(&text)), + T![str(text)] => Expression::String(self.stringstore.intern_or_lookup(&text)), - Token::LBracket => { + T!['['] => { let 
size = self.parse_expr(); - if !matches!(self.next(), Token::RBracket) { + if !matches!(self.next(), T![']']) { panic!("Error parsing array literal: Expected closing bracket") } Expression::ArrayLiteral(size.into()) } - Token::Ident(name) if matches!(self.peek(), Token::LBracket) => { + T![ident(name)] if matches!(self.peek(), T!['[']) => { let sid = self.stringstore.intern_or_lookup(&name); let stackpos = self .varstack @@ -258,19 +261,19 @@ impl> Parser { .position(|it| *it == sid) .map(|it| self.varstack.len() - it - 1) .unwrap_or(usize::MAX); - + self.next(); let index = self.parse_expr(); - if !matches!(self.next(), Token::RBracket) { + if !matches!(self.next(), T![']']) { panic!("Error parsing array access: Expected closing bracket") } Expression::ArrayAccess(sid, stackpos, index.into()) } - Token::Ident(name) => { + T![ident(name)] => { let sid = self.stringstore.intern_or_lookup(&name); let stackpos = self .varstack @@ -283,11 +286,11 @@ impl> Parser { } // Parentheses grouping - Token::LParen => { + T!['('] => { let inner_expr = self.parse_expr(); // Verify that there is a closing parenthesis - if !matches!(self.next(), Token::RParen) { + if !matches!(self.next(), T![')']) { panic!("Error parsing primary expr: Exepected closing parenthesis ')'"); } @@ -295,19 +298,19 @@ impl> Parser { } // Unary negation - Token::Sub => { + T![-] => { let operand = self.parse_primary(); Expression::UnOp(UnOpType::Negate, operand.into()) } // Unary bitwise not (bitflip) - Token::Tilde => { + T![~] => { let operand = self.parse_primary(); Expression::UnOp(UnOpType::BNot, operand.into()) } // Unary logical not - Token::LNot => { + T![!] 
=> { let operand = self.parse_primary(); Expression::UnOp(UnOpType::LNot, operand.into()) } @@ -318,33 +321,36 @@ impl> Parser { /// Get the next Token without removing it fn peek(&mut self) -> &Token { - self.tokens.peek().unwrap_or(&Token::EoF) + self.tokens.peek().unwrap_or(&T![EoF]) } /// Advance to next Token and return the removed Token fn next(&mut self) -> Token { - self.tokens.next().unwrap_or(Token::EoF) + self.tokens.next().unwrap_or(T![EoF]) } } #[cfg(test)] mod tests { - use super::{parse, BinOpType, Expression}; - use crate::{parser::Statement, token::Token}; + use crate::{ + ast::{BinOpType, Expression, Statement}, + parser::parse, + T, + }; #[test] fn test_parser() { - // Expression: 1 + 2 * 3 + 4 - // With precedence: (1 + (2 * 3)) + 4 + // Expression: 1 + 2 * 3 - 4 + // With precedence: (1 + (2 * 3)) - 4 let tokens = [ - Token::I64(1), - Token::Add, - Token::I64(2), - Token::Mul, - Token::I64(3), - Token::Sub, - Token::I64(4), - Token::Semicolon, + T![i64(1)], + T![+], + T![i64(2)], + T![*], + T![i64(3)], + T![-], + T![i64(4)], + T![;], ]; let expected = Statement::Expr(Expression::BinOp( diff --git a/src/token.rs b/src/token.rs index 5ca659b..3fc1c93 100644 --- a/src/token.rs +++ b/src/token.rs @@ -1,152 +1,328 @@ -use crate::ast::BinOpType; +use crate::{ast::BinOpType, T}; + +/// Language keywords +#[derive(Debug, PartialEq, Eq)] +pub enum Keyword { + /// Loop keyword ("loop") + Loop, + /// Print keyword ("print") + Print, + /// If keyword ("if") + If, + /// Else keyword ("else") + Else, +} + +/// Literal values +#[derive(Debug, PartialEq, Eq)] +pub enum Literal { + /// Integer literal (64-bit) + I64(i64), + /// String literal + String(String), +} + +/// Combined tokens that consist of a combination of characters +#[derive(Debug, PartialEq, Eq)] +pub enum Combo { + /// Equal Equal ("==") + Equal2, + + /// Exclamation mark Equal ("!=") + ExclamationMarkEqual, + + /// Ampersand Ampersand ("&&") + Ampersand2, + + /// Pipe Pipe ("||") + Pipe2, + + 
/// LessThan LessThan ("<<") + LessThan2, + + /// GreaterThan GreaterThan (">>") + GreaterThan2, + + /// LessThan Equal ("<=") + LessThanEqual, + + /// GreaterThan Equal (">=") + GreaterThanEqual, + + /// LessThan Minus ("<-") + LessThanMinus, +} #[derive(Debug, PartialEq, Eq)] pub enum Token { - /// Integer literal (64-bit) - I64(i64), + /// Literal value token + Literal(Literal), - /// String literal - String(String), + /// Keyword token + Keyword(Keyword), /// Identifier (name for variables, functions, ...) Ident(String), - /// Loop keyword (loop) - Loop, + /// Combined tokens consisting of multiple characters + Combo(Combo), - /// Print keyword (print) - Print, + /// Equal Sign ("=") + Equal, - /// If keyword (if) - If, - - /// Else keyword (else) - Else, - - /// Left Bracket ('[') - LBracket, - - /// Right Bracket (']') - RBracket, - - /// Left Parenthesis ('(') - LParen, - - /// Right Parenthesis (')') - RParen, - - /// Left curly braces ({) - LBraces, - - /// Right curly braces (}) - RBraces, - - /// Plus (+) - Add, - - /// Minus (-) - Sub, - - /// Asterisk (*) - Mul, - - /// Slash (/) - Div, - - /// Percent (%) - Mod, - - /// Equal Equal (==) - EquEqu, - - /// Exclamationmark Equal (!=) - NotEqu, - - /// Pipe (|) - BOr, - - /// Ampersand (&) - BAnd, - - /// Circumflex (^) - BXor, - - /// Logical AND (&&) - LAnd, - - /// Logical OR (||) - LOr, - - /// Shift Left (<<) - Shl, - - /// Shift Right (>>) - Shr, - - /// Tilde (~) - Tilde, - - /// Logical not (!) 
- LNot, - - /// Left angle bracket (<) - LAngle, - - /// Right angle bracket (>) - RAngle, - - /// Left angle bracket Equal (<=) - LAngleEqu, - - /// Left angle bracket Equal (>=) - RAngleEqu, - - /// Left arrow (<-) - LArrow, - - /// Equal Sign (=) - Equ, - - /// Semicolon (;) + /// Semicolon (";") Semicolon, /// End of file EoF, + + /// Left Bracket ("[") + LBracket, + + /// Right Bracket ("]") + RBracket, + + /// Left Parenthesis ("(") + LParen, + + /// Right Parenthesis (")") + RParen, + + /// Left curly braces ("{") + LBraces, + + /// Right curly braces ("}") + RBraces, + + /// Plus ("+") + Plus, + + /// Minus ("-") + Minus, + + /// Asterisk ("*") + Asterisk, + + /// Slash ("/") + Slash, + + /// Percent ("%") + Percent, + + /// Pipe ("|") + Pipe, + + /// Tilde ("~") + Tilde, + + /// Logical not ("!") + Exclamationmark, + + /// Left angle bracket ("<") + LessThan, + + /// Right angle bracket (">") + GreaterThan, + + /// Ampersand ("&") + Ampersand, + + /// Circumflex ("^") + Circumflex, } impl Token { + /// If the Token can be used as a binary operation type, get the matching BinOpType. Otherwise + /// return None.
pub fn try_to_binop(&self) -> Option<BinOpType> { Some(match self { - Token::Add => BinOpType::Add, - Token::Sub => BinOpType::Sub, + T![+] => BinOpType::Add, + T![-] => BinOpType::Sub, - Token::Mul => BinOpType::Mul, - Token::Div => BinOpType::Div, - Token::Mod => BinOpType::Mod, + T![*] => BinOpType::Mul, + T![/] => BinOpType::Div, + T![%] => BinOpType::Mod, - Token::BAnd => BinOpType::BAnd, - Token::BOr => BinOpType::BOr, - Token::BXor => BinOpType::BXor, + T![&] => BinOpType::BAnd, + T![|] => BinOpType::BOr, + T![^] => BinOpType::BXor, - Token::LAnd => BinOpType::LAnd, - Token::LOr => BinOpType::LOr, + T![&&] => BinOpType::LAnd, + T![||] => BinOpType::LOr, - Token::Shl => BinOpType::Shl, - Token::Shr => BinOpType::Shr, + T![<<] => BinOpType::Shl, + T![>>] => BinOpType::Shr, - Token::EquEqu => BinOpType::EquEqu, - Token::NotEqu => BinOpType::NotEqu, + T![==] => BinOpType::EquEqu, + T![!=] => BinOpType::NotEqu, - Token::LAngle => BinOpType::Less, - Token::LAngleEqu => BinOpType::LessEqu, + T![<] => BinOpType::Less, + T![<=] => BinOpType::LessEqu, - Token::RAngle => BinOpType::Greater, - Token::RAngleEqu => BinOpType::GreaterEqu, + T![>] => BinOpType::Greater, + T![>=] => BinOpType::GreaterEqu, - Token::LArrow => BinOpType::Declare, - Token::Equ => BinOpType::Assign, + T![<-] => BinOpType::Declare, + T![=] => BinOpType::Assign, _ => return None, }) } } + +/// Macro to quickly create a token of the specified kind +#[macro_export] +macro_rules!
T { + // Keywords + [loop] => { + crate::token::Token::Keyword(crate::token::Keyword::Loop) + }; + + [print] => { + crate::token::Token::Keyword(crate::token::Keyword::Print) + }; + + [if] => { + crate::token::Token::Keyword(crate::token::Keyword::If) + }; + + [else] => { + crate::token::Token::Keyword(crate::token::Keyword::Else) + }; + + // Literals + [i64($val:tt)] => { + crate::token::Token::Literal(crate::token::Literal::I64($val)) + }; + + [str($val:tt)] => { + crate::token::Token::Literal(crate::token::Literal::String($val)) + }; + + // Ident + [ident($val:tt)] => { + crate::token::Token::Ident($val) + }; + + // Combo Tokens + [==] => { + crate::token::Token::Combo(crate::token::Combo::Equal2) + }; + + [!=] => { + crate::token::Token::Combo(crate::token::Combo::ExclamationMarkEqual) + }; + + [&&] => { + crate::token::Token::Combo(crate::token::Combo::Ampersand2) + }; + + [||] => { + crate::token::Token::Combo(crate::token::Combo::Pipe2) + }; + + [<<] => { + crate::token::Token::Combo(crate::token::Combo::LessThan2) + }; + + [>>] => { + crate::token::Token::Combo(crate::token::Combo::GreaterThan2) + }; + + [<=] => { + crate::token::Token::Combo(crate::token::Combo::LessThanEqual) + }; + + [>=] => { + crate::token::Token::Combo(crate::token::Combo::GreaterThanEqual) + }; + + [<-] => { + crate::token::Token::Combo(crate::token::Combo::LessThanMinus) + }; + + // Normal Tokens + [=] => { + crate::token::Token::Equal + }; + + [;] => { + crate::token::Token::Semicolon + }; + + [EoF] => { + crate::token::Token::EoF + }; + + ['['] => { + crate::token::Token::LBracket + }; + + [']'] => { + crate::token::Token::RBracket + }; + + ['('] => { + crate::token::Token::LParen + }; + + [')'] => { + crate::token::Token::RParen + }; + + ['{'] => { + crate::token::Token::LBraces + }; + + ['}'] => { + crate::token::Token::RBraces + }; + + [+] => { + crate::token::Token::Plus + }; + + [-] => { + crate::token::Token::Minus + }; + + [*] => { +
crate::token::Token::Asterisk + }; + + [/] => { + crate::token::Token::Slash + }; + + [%] => { + crate::token::Token::Percent + }; + + [|] => { + crate::token::Token::Pipe + }; + + [~] => { + crate::token::Token::Tilde + }; + + [!] => { + crate::token::Token::Exclamationmark + }; + + [<] => { + crate::token::Token::LessThan + }; + + [>] => { + crate::token::Token::GreaterThan + }; + + [&] => { + crate::token::Token::Ampersand + }; + + [^] => { + crate::token::Token::Circumflex + }; +}