Refactor lexer

This commit is contained in:
Daniel M 2022-01-29 14:55:22 +01:00
parent e62121c75b
commit 9e3a642810
4 changed files with 301 additions and 258 deletions

View File

@ -1,107 +1,11 @@
use std::{iter::Peekable, str::Chars}; use std::{iter::Peekable, str::Chars};
use crate::parser::BinOpType; use crate::token::{Keyword, Literal, Token};
#[derive(Debug, PartialEq, Eq)] /// Lex the provided code into a Token Buffer
pub enum Token { pub fn lex(code: &str) -> Vec<Token> {
/// Integer literal (64-bit) let mut lexer = Lexer::new(code);
I64(i64), lexer.lex()
/// String literal ("Some string")
Str(String),
/// Left parenthesis ('(')
LParen,
/// Right parentheses (')')
RParen,
/// Left brace ({)
LBrace,
/// Right brace (})
RBrace,
/// Identifier (variable / function / ... name)
Ident(String),
/// Dollar sign ($)
Dollar,
/// Double Dollar sign ($$)
DoubleDollar,
/// Let identifier (let)
Let,
/// While (while)
While,
/// For (for)
For,
/// If (if)
If,
/// Else (else)
Else,
/// Assignment (single equal) (=)
Assign,
/// Plus (+)
Add,
/// Minus (-)
Sub,
/// Asterisk (*)
Mul,
/// Slash (/)
Div,
/// Percent (%)
Mod,
/// Pipe (|)
BOr,
/// Ampersand (&)
BAnd,
/// Circumflex (^)
BXor,
/// Shift Left (<<)
Shl,
/// Shift Right (>>)
Shr,
/// Equal sign (==)
Equ,
/// Not Equal sign (!=)
Neq,
/// Greater than (>)
Gt,
/// Greater or equal (>=)
Ge,
/// Less than (<)
Lt,
/// Less or equal (<=)
Le,
/// Semicolon (;)
Semicolon,
/// End of file
EoF,
} }
struct Lexer<'a> { struct Lexer<'a> {
@ -114,67 +18,59 @@ impl<'a> Lexer<'a> {
Self { code } Self { code }
} }
/// Advance to next character and return the removed char. If there is no next char, '\0'
/// is returned.
fn next(&mut self) -> char {
self.code.next().unwrap_or('\0')
}
/// Get the next character without removing it. If there is no next char, '\0' is returned.
fn peek(&mut self) -> char {
self.code.peek().copied().unwrap_or('\0')
}
fn lex(&mut self) -> Vec<Token> { fn lex(&mut self) -> Vec<Token> {
let mut tokens = Vec::new(); let mut tokens = Vec::new();
while let Some(ch) = self.next() { loop {
match ch { match self.next() {
// End of text
'\0' => break,
// Skip whitespace // Skip whitespace
' ' | '\r' | '\n' | '\t' => (), ' ' | '\r' | '\n' | '\t' => (),
// Lex numbers // Handle tokens that span two characters
'0'..='9' => { '>' if matches!(self.peek(), '>') => {
let mut sval = String::from(ch);
// Do as long as a next char exists and it is a numeric char
while let Some(ch) = self.peek() {
// The next char is verified to be Some, so unwrap is safe
match ch {
// Underscore is a separator, so remove it but don't add to number
'_' => {
self.next().unwrap();
}
'0'..='9' => {
sval.push(self.next().unwrap());
}
// Next char is not a number, so stop and finish the number token
_ => break,
}
}
// TODO: We only added numeric chars to the string, but the conversion could still fail
tokens.push(Token::I64(sval.parse().unwrap()));
}
'>' if matches!(self.peek(), Some('>')) => {
self.next(); self.next();
tokens.push(Token::Shr); tokens.push(Token::Shr);
} }
'<' if matches!(self.peek(), Some('<')) => { '<' if matches!(self.peek(), '<') => {
self.next(); self.next();
tokens.push(Token::Shl); tokens.push(Token::Shl);
} }
'=' if matches!(self.peek(), Some('=')) => { '=' if matches!(self.peek(), '=') => {
self.next(); self.next();
tokens.push(Token::Equ); tokens.push(Token::Equ);
} }
'!' if matches!(self.peek(), Some('=')) => { '!' if matches!(self.peek(), '=') => {
self.next(); self.next();
tokens.push(Token::Neq); tokens.push(Token::Neq);
} }
'<' if matches!(self.peek(), Some('=')) => { '<' if matches!(self.peek(), '=') => {
self.next(); self.next();
tokens.push(Token::Le); tokens.push(Token::Le);
} }
'>' if matches!(self.peek(), Some('=')) => { '>' if matches!(self.peek(), '=') => {
self.next(); self.next();
tokens.push(Token::Ge); tokens.push(Token::Ge);
} }
'$' if matches!(self.peek(), Some('$')) => { '$' if matches!(self.peek(), '$') => {
self.next(); self.next();
tokens.push(Token::DoubleDollar); tokens.push(Token::DoubleDollar);
} }
// Handle tokens that span one character
'+' => tokens.push(Token::Add), '+' => tokens.push(Token::Add),
'-' => tokens.push(Token::Sub), '-' => tokens.push(Token::Sub),
'*' => tokens.push(Token::Mul), '*' => tokens.push(Token::Mul),
@ -193,145 +89,139 @@ impl<'a> Lexer<'a> {
'}' => tokens.push(Token::RBrace), '}' => tokens.push(Token::RBrace),
'$' => tokens.push(Token::Dollar), '$' => tokens.push(Token::Dollar),
'"' => { // Handle special multicharacter tokens
let mut text = String::new();
let mut escape = false; // Lex numbers
ch @ '0'..='9' => tokens.push(self.lex_number(ch)),
// Do as long as a next char exists and it is not '"' // Lex strings
loop { '"' => tokens.push(self.lex_string()),
if escape {
escape = false;
match self.next() { // Lex identifiers
Some('\\') => text.push('\\'), ch @ ('a'..='z' | 'A'..='Z' | '_') => tokens.push(self.lex_ident(ch)),
Some('n') => text.push('\n'),
Some('r') => text.push('\r'),
Some('t') => text.push('\t'),
ch => panic!("Invalid string escape: '{:?}'", ch),
}
} else { // Any other character is unexpected
match self.peek() { ch => panic!("Lexer encountered unexpected char: '{}'", ch),
Some('"') => {
self.next();
break;
}
Some('\\') => {
self.next();
escape = true;
}
None => panic!("String is never terminated (missing '\"')"),
_ => text.push(self.next().unwrap()),
}
}
}
tokens.push(Token::Str(text));
}
'a'..='z' | 'A'..='Z' | '_' => {
let mut ident = String::from(ch);
// Do as long as a next char exists and it is a valid ident char
while let Some('a'..='z' | 'A'..='Z' | '_' | '0'..='9') = self.peek() {
// The next char is verified to be Some, so unwrap is safe
ident.push(self.next().unwrap());
}
match ident.as_str() {
"true" => tokens.push(Token::I64(1)),
"false" => tokens.push(Token::I64(0)),
"let" => tokens.push(Token::Let),
"while" => tokens.push(Token::While),
"if" => tokens.push(Token::If),
"else" => tokens.push(Token::Else),
"for" => tokens.push(Token::For),
_ => tokens.push(Token::Ident(ident)),
}
}
//TODO: Don't panic, keep calm
_ => panic!("Lexer encountered unexpected char: '{}'", ch),
} }
} }
tokens tokens
} }
/// Advance to next character and return the removed char fn lex_number(&mut self, first_char: char) -> Token {
fn next(&mut self) -> Option<char> { let mut sval = String::from(first_char);
self.code.next()
// Do as long as a next char exists and it is a numeric char
loop {
// The next char is verified to be Some, so unwrap is safe
match self.peek() {
// Underscore is a separator, so remove it but don't add to number
'_' => {
self.next();
}
'0'..='9' => {
sval.push(self.next());
}
// Next char is not a number, so stop and finish the number token
_ => break,
}
}
// TODO: We only added numeric chars to the string, but the conversion could still fail
Token::Literal(Literal::I64(sval.parse().unwrap()))
} }
/// Get the next character without removing it /// Lex an identifier from the character stream. The first char has to have been consumed
fn peek(&mut self) -> Option<char> { /// from the stream already and is passed as an argument instead.
self.code.peek().copied() fn lex_ident(&mut self, first_char: char) -> Token {
let mut ident = String::from(first_char);
// Do as long as a next char exists and it is a valid ident char
while let 'a'..='z' | 'A'..='Z' | '_' | '0'..='9' = self.peek() {
// The next char is verified to be Some, so unwrap is safe
ident.push(self.next());
}
// Check if the identifier is a keyword
match ident.as_str() {
"true" => Token::Literal(Literal::I64(1)),
"false" => Token::Literal(Literal::I64(0)),
"let" => Token::Keyword(Keyword::Let),
"while" => Token::Keyword(Keyword::While),
"if" => Token::Keyword(Keyword::If),
"else" => Token::Keyword(Keyword::Else),
"for" => Token::Keyword(Keyword::For),
_ => Token::Ident(ident),
}
} }
}
/// Lex the provided code into a Token Buffer /// Lex a string token from the character stream. This requires the initial quote '"' to be
/// /// consumed before.
/// TODO: Don't panic and implement error handling using Result fn lex_string(&mut self) -> Token {
pub fn lex(code: &str) -> Vec<Token> { let mut text = String::new();
let mut lexer = Lexer::new(code);
lexer.lex()
}
impl Token { let mut escape = false;
pub fn try_to_binop(&self) -> Option<BinOpType> {
Some(match self {
Token::Add => BinOpType::Add,
Token::Sub => BinOpType::Sub,
Token::Mul => BinOpType::Mul, // Do as long as a next char exists and it is not '"'
Token::Div => BinOpType::Div, loop {
Token::Mod => BinOpType::Mod, if escape {
escape = false;
Token::BAnd => BinOpType::BAnd, // Escape characters
Token::BOr => BinOpType::BOr, match self.next() {
Token::BXor => BinOpType::BXor, '\\' => text.push('\\'),
'n' => text.push('\n'),
'r' => text.push('\r'),
't' => text.push('\t'),
ch => panic!("Invalid string escape: '{:?}'", ch),
}
} else {
match self.peek() {
// Doublequote '"' ends the string lexing
'"' => {
self.next();
break;
}
// Backslash '\' escapes the next character
'\\' => {
self.next();
escape = true;
}
Token::Shl => BinOpType::Shl, // Reached end of text but didn't encounter closing doublequote '"'
Token::Shr => BinOpType::Shr, '\0' => panic!("String is never terminated (missing '\"')"),
Token::Equ => BinOpType::Equ, _ => text.push(self.next()),
Token::Neq => BinOpType::Neq, }
}
}
Token::Gt => BinOpType::Gt, Token::Literal(Literal::Str(text))
Token::Ge => BinOpType::Ge,
Token::Lt => BinOpType::Lt,
Token::Le => BinOpType::Le,
Token::Assign => BinOpType::Assign,
_ => return None,
})
} }
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::token::Literal;
use super::{lex, Token}; use super::{lex, Token};
#[test] #[test]
fn test_lexer() { fn test_lexer() {
let code = "33 +5*2 + 4456467*2334+3 % - / << ^ | & >>"; let code = "33 +5*2 + 4456467*2334+3 % - / << ^ | & >>";
let expected = vec![ let expected = vec![
Token::I64(33), Token::Literal(Literal::I64(33)),
Token::Add, Token::Add,
Token::I64(5), Token::Literal(Literal::I64(5)),
Token::Mul, Token::Mul,
Token::I64(2), Token::Literal(Literal::I64(2)),
Token::Add, Token::Add,
Token::I64(4456467), Token::Literal(Literal::I64(4456467)),
Token::Mul, Token::Mul,
Token::I64(2334), Token::Literal(Literal::I64(2334)),
Token::Add, Token::Add,
Token::I64(3), Token::Literal(Literal::I64(3)),
Token::Mod, Token::Mod,
Token::Sub, Token::Sub,
Token::Div, Token::Div,

View File

@ -1,3 +1,4 @@
pub mod lexer; pub mod lexer;
pub mod parser; pub mod parser;
pub mod interpreter; pub mod interpreter;
pub mod token;

View File

@ -1,6 +1,6 @@
use std::{iter::Peekable, rc::Rc}; use std::{iter::Peekable, rc::Rc};
use crate::lexer::Token; use crate::token::{Keyword, Literal, Token};
/// Types for binary operators /// Types for binary operators
#[derive(Debug, PartialEq, Eq, Clone)] #[derive(Debug, PartialEq, Eq, Clone)]
@ -142,10 +142,15 @@ impl<T: Iterator<Item = Token>> Parser<T> {
} }
Token::EoF => break, Token::EoF => break,
Token::RBrace => break, Token::RBrace => break,
Token::Let => self.parse_let_stmt(),
Token::While => self.parse_while(), Token::Keyword(keyword) => match keyword {
Token::If => self.parse_if(), Keyword::Let => self.parse_let_stmt(),
Token::For => self.parse_for(), Keyword::While => self.parse_while(),
Keyword::If => self.parse_if(),
Keyword::For => self.parse_for(),
Keyword::Else => panic!("Unexpected else keyword"),
},
Token::Dollar => { Token::Dollar => {
self.next(); self.next();
Stmt::Print(self.parse_expr()) Stmt::Print(self.parse_expr())
@ -165,13 +170,13 @@ impl<T: Iterator<Item = Token>> Parser<T> {
} }
fn parse_for(&mut self) -> Stmt { fn parse_for(&mut self) -> Stmt {
if !matches!(self.next(), Token::For) { if !matches!(self.next(), Token::Keyword(Keyword::For)) {
panic!("Error parsing for: Expected for token"); panic!("Error parsing for: Expected for token");
} }
let init = match self.parse_let_stmt() { let init = match self.parse_let_stmt() {
Stmt::Let(name, rhs) => (name, rhs), Stmt::Let(name, rhs) => (name, rhs),
_ => unreachable!() _ => unreachable!(),
}; };
if !matches!(self.next(), Token::Semicolon) { if !matches!(self.next(), Token::Semicolon) {
@ -200,7 +205,7 @@ impl<T: Iterator<Item = Token>> Parser<T> {
} }
fn parse_if(&mut self) -> Stmt { fn parse_if(&mut self) -> Stmt {
if !matches!(self.next(), Token::If) { if !matches!(self.next(), Token::Keyword(Keyword::If)) {
panic!("Error parsing if: Expected if token"); panic!("Error parsing if: Expected if token");
} }
@ -218,7 +223,7 @@ impl<T: Iterator<Item = Token>> Parser<T> {
let mut body_else = Ast { prog: Vec::new() }; let mut body_else = Ast { prog: Vec::new() };
if matches!(self.peek(), Token::Else) { if matches!(self.peek(), Token::Keyword(Keyword::Else)) {
self.next(); self.next();
if !matches!(self.next(), Token::LBrace) { if !matches!(self.next(), Token::LBrace) {
@ -236,7 +241,7 @@ impl<T: Iterator<Item = Token>> Parser<T> {
} }
fn parse_while(&mut self) -> Stmt { fn parse_while(&mut self) -> Stmt {
if !matches!(self.next(), Token::While) { if !matches!(self.next(), Token::Keyword(Keyword::While)) {
panic!("Error parsing while: Expected while token"); panic!("Error parsing while: Expected while token");
} }
@ -256,7 +261,7 @@ impl<T: Iterator<Item = Token>> Parser<T> {
} }
fn parse_let_stmt(&mut self) -> Stmt { fn parse_let_stmt(&mut self) -> Stmt {
if !matches!(self.next(), Token::Let) { if !matches!(self.next(), Token::Keyword(Keyword::Let)) {
panic!("Error parsing let: Expected let token"); panic!("Error parsing let: Expected let token");
} }
@ -310,9 +315,9 @@ impl<T: Iterator<Item = Token>> Parser<T> {
/// Parse a primary expression (for now only number) /// Parse a primary expression (for now only number)
fn parse_primary(&mut self) -> Expr { fn parse_primary(&mut self) -> Expr {
match self.next() { match self.next() {
Token::I64(val) => Expr::I64(val), Token::Literal(Literal::I64(val)) => Expr::I64(val),
Token::Str(text) => Expr::Str(text.into()), Token::Literal(Literal::Str(text)) => Expr::Str(text.into()),
Token::Ident(name) => Expr::Ident(name), Token::Ident(name) => Expr::Ident(name),
@ -377,8 +382,8 @@ impl BinOpType {
mod tests { mod tests {
use super::{parse, BinOpType, Expr}; use super::{parse, BinOpType, Expr};
use crate::{ use crate::{
lexer::Token,
parser::{Ast, Stmt}, parser::{Ast, Stmt},
token::{Literal, Token},
}; };
#[test] #[test]
@ -386,13 +391,13 @@ mod tests {
// Expression: 1 + 2 * 3 + 4 // Expression: 1 + 2 * 3 + 4
// With precedence: (1 + (2 * 3)) + 4 // With precedence: (1 + (2 * 3)) + 4
let tokens = [ let tokens = [
Token::I64(1), Token::Literal(Literal::I64(1)),
Token::Add, Token::Add,
Token::I64(2), Token::Literal(Literal::I64(2)),
Token::Mul, Token::Mul,
Token::I64(3), Token::Literal(Literal::I64(3)),
Token::Sub, Token::Sub,
Token::I64(4), Token::Literal(Literal::I64(4)),
]; ];
let expected = Expr::BinOp( let expected = Expr::BinOp(

147
src/token.rs Normal file
View File

@ -0,0 +1,147 @@
use crate::parser::BinOpType;
/// Literal values that appear verbatim in the source code.
///
/// Produced by the lexer for numeric and string constants and wrapped
/// in [`Token::Literal`].
#[derive(Debug, PartialEq, Eq)]
pub enum Literal {
    /// Integer literal (64-bit)
    I64(i64),
    /// String literal ("Some string")
    Str(String),
}
/// Reserved identifiers with special meaning in the language.
///
/// The lexer first reads a full identifier and then checks it against
/// these keywords; matches are emitted as [`Token::Keyword`] instead of
/// [`Token::Ident`].
#[derive(Debug, PartialEq, Eq)]
pub enum Keyword {
    /// Let identifier (let)
    Let,
    /// While (while)
    While,
    /// For (for)
    For,
    /// If (if)
    If,
    /// Else (else)
    Else,
}
/// A single lexical token produced by the lexer.
///
/// Covers literals, identifiers, keywords, punctuation, and every
/// operator the parser understands; the end of input is represented
/// explicitly by [`Token::EoF`].
#[derive(Debug, PartialEq, Eq)]
pub enum Token {
    /// Literal values
    Literal(Literal),
    /// Identifier (variable / function / ... name)
    Ident(String),
    /// Specific identifiers that have a special meaning as keywords
    Keyword(Keyword),
    /// Left parenthesis ('(')
    LParen,
    /// Right parentheses (')')
    RParen,
    /// Left brace ({)
    LBrace,
    /// Right brace (})
    RBrace,
    /// Dollar sign ($)
    Dollar,
    /// Double Dollar sign ($$)
    DoubleDollar,
    /// Assignment (single equal) (=)
    Assign,
    /// Plus (+)
    Add,
    /// Minus (-)
    Sub,
    /// Asterisk (*)
    Mul,
    /// Slash (/)
    Div,
    /// Percent (%)
    Mod,
    /// Pipe (|)
    BOr,
    /// Ampersand (&)
    BAnd,
    /// Circumflex (^)
    BXor,
    /// Shift Left (<<)
    Shl,
    /// Shift Right (>>)
    Shr,
    /// Equal sign (==)
    Equ,
    /// Not Equal sign (!=)
    Neq,
    /// Greater than (>)
    Gt,
    /// Greater or equal (>=)
    Ge,
    /// Less than (<)
    Lt,
    /// Less or equal (<=)
    Le,
    /// Semicolon (;)
    Semicolon,
    /// End of file
    EoF,
}
impl Token {
    /// Try to interpret this token as a binary operator.
    ///
    /// Returns the matching [`BinOpType`] for arithmetic, bitwise,
    /// shift, comparison, and assignment tokens, or `None` for any
    /// token that is not a binary operator.
    pub fn try_to_binop(&self) -> Option<BinOpType> {
        let binop = match self {
            // Arithmetic operators
            Token::Add => BinOpType::Add,
            Token::Sub => BinOpType::Sub,
            Token::Mul => BinOpType::Mul,
            Token::Div => BinOpType::Div,
            Token::Mod => BinOpType::Mod,

            // Bitwise and shift operators
            Token::BAnd => BinOpType::BAnd,
            Token::BOr => BinOpType::BOr,
            Token::BXor => BinOpType::BXor,
            Token::Shl => BinOpType::Shl,
            Token::Shr => BinOpType::Shr,

            // Comparison operators
            Token::Equ => BinOpType::Equ,
            Token::Neq => BinOpType::Neq,
            Token::Gt => BinOpType::Gt,
            Token::Ge => BinOpType::Ge,
            Token::Lt => BinOpType::Lt,
            Token::Le => BinOpType::Le,

            // Assignment
            Token::Assign => BinOpType::Assign,

            // Everything else is not a binary operator
            _ => return None,
        };
        Some(binop)
    }
}