diff --git a/src/lexer.rs b/src/lexer.rs index 5566748..459b404 100644 --- a/src/lexer.rs +++ b/src/lexer.rs @@ -1,107 +1,11 @@ use std::{iter::Peekable, str::Chars}; -use crate::parser::BinOpType; +use crate::token::{Keyword, Literal, Token}; -#[derive(Debug, PartialEq, Eq)] -pub enum Token { - /// Integer literal (64-bit) - I64(i64), - - /// String literal ("Some string") - Str(String), - - /// Left parenthesis ('(') - LParen, - - /// Right parentheses (')') - RParen, - - /// Left brace ({) - LBrace, - - /// Right brace (}) - RBrace, - - /// Identifier (variable / function / ... name) - Ident(String), - - /// Dollar sign ($) - Dollar, - - /// Double Dollar sign ($$) - DoubleDollar, - - /// Let identifier (let) - Let, - - /// While (while) - While, - - /// For (for) - For, - - /// If (if) - If, - - /// Else (else) - Else, - - /// Assignment (single equal) (=) - Assign, - - /// Plus (+) - Add, - - /// Minus (-) - Sub, - - /// Asterisk (*) - Mul, - - /// Slash (/) - Div, - - /// Percent (%) - Mod, - - /// Pipe (|) - BOr, - - /// Ampersand (&) - BAnd, - - /// Circumflex (^) - BXor, - - /// Shift Left (<<) - Shl, - - /// Shift Right (>>) - Shr, - - /// Equal sign (==) - Equ, - - /// Not Equal sign (!=) - Neq, - - /// Greater than (>) - Gt, - - /// Greater or equal (>=) - Ge, - - /// Less than (<) - Lt, - - /// Less or equal (<=) - Le, - - /// Semicolon (;) - Semicolon, - - /// End of file - EoF, +/// Lex the provided code into a Token Buffer +pub fn lex(code: &str) -> Vec { + let mut lexer = Lexer::new(code); + lexer.lex() } struct Lexer<'a> { @@ -114,67 +18,59 @@ impl<'a> Lexer<'a> { Self { code } } + /// Advance to next character and return the removed char. If there is no next char, '\0' + /// is returned. + fn next(&mut self) -> char { + self.code.next().unwrap_or('\0') + } + + /// Get the next character without removing it. If there is no next char, '\0' is returned. + fn peek(&mut self) -> char { + self.code.peek().copied().unwrap_or('\0') + } + fn lex(&mut self) -> Vec { let mut tokens = Vec::new(); - while let Some(ch) = self.next() { - match ch { + loop { + match self.next() { + // End of text + '\0' => break, + // Skip whitespace ' ' | '\r' | '\n' | '\t' => (), - // Lex numbers - '0'..='9' => { - let mut sval = String::from(ch); - - // Do as long as a next char exists and it is a numeric char - while let Some(ch) = self.peek() { - // The next char is verified to be Some, so unwrap is safe - match ch { - // Underscore is a separator, so remove it but don't add to number - '_' => { - self.next().unwrap(); - } - '0'..='9' => { - sval.push(self.next().unwrap()); - } - // Next char is not a number, so stop and finish the number token - _ => break, - } - } - - // TODO: We only added numeric chars to the string, but the conversion could still fail - tokens.push(Token::I64(sval.parse().unwrap())); - } - - '>' if matches!(self.peek(), Some('>')) => { + // Handle tokens that span two characters + '>' if matches!(self.peek(), '>') => { self.next(); tokens.push(Token::Shr); } - '<' if matches!(self.peek(), Some('<')) => { + '<' if matches!(self.peek(), '<') => { self.next(); tokens.push(Token::Shl); } - '=' if matches!(self.peek(), Some('=')) => { + '=' if matches!(self.peek(), '=') => { self.next(); tokens.push(Token::Equ); } - '!' if matches!(self.peek(), Some('=')) => { + '!' if matches!(self.peek(), '=') => { self.next(); tokens.push(Token::Neq); } - '<' if matches!(self.peek(), Some('=')) => { + '<' if matches!(self.peek(), '=') => { self.next(); tokens.push(Token::Le); } - '>' if matches!(self.peek(), Some('=')) => { + '>' if matches!(self.peek(), '=') => { self.next(); tokens.push(Token::Ge); } - '$' if matches!(self.peek(), Some('$')) => { + '$' if matches!(self.peek(), '$') => { self.next(); tokens.push(Token::DoubleDollar); } + // Handle tokens that span one character '+' => tokens.push(Token::Add), '-' => tokens.push(Token::Sub), '*' => tokens.push(Token::Mul), @@ -193,145 +89,139 @@ impl<'a> Lexer<'a> { '}' => tokens.push(Token::RBrace), '$' => tokens.push(Token::Dollar), - '"' => { - let mut text = String::new(); + // Handle special multicharacter tokens - let mut escape = false; + // Lex numbers + ch @ '0'..='9' => tokens.push(self.lex_number(ch)), - // Do as long as a next char exists and it is not '"' - loop { - if escape { - escape = false; + // Lex strings + '"' => tokens.push(self.lex_string()), - match self.next() { - Some('\\') => text.push('\\'), - Some('n') => text.push('\n'), - Some('r') => text.push('\r'), - Some('t') => text.push('\t'), - ch => panic!("Invalid string escape: '{:?}'", ch), - } + // Lex identifiers + ch @ ('a'..='z' | 'A'..='Z' | '_') => tokens.push(self.lex_ident(ch)), - } else { - match self.peek() { - Some('"') => { - self.next(); - break; - } - Some('\\') => { - self.next(); - escape = true; - } - None => panic!("String is never terminated (missing '\"')"), - - _ => text.push(self.next().unwrap()), - } - } - } - - - - tokens.push(Token::Str(text)); - } - - 'a'..='z' | 'A'..='Z' | '_' => { - let mut ident = String::from(ch); - - // Do as long as a next char exists and it is a valid ident char - while let Some('a'..='z' | 'A'..='Z' | '_' | '0'..='9') = self.peek() { - // The next char is verified to be Some, so unwrap is safe - ident.push(self.next().unwrap()); - } - - match ident.as_str() { - "true" => tokens.push(Token::I64(1)), - "false" => tokens.push(Token::I64(0)), - "let" => tokens.push(Token::Let), - "while" => tokens.push(Token::While), - "if" => tokens.push(Token::If), - "else" => tokens.push(Token::Else), - "for" => tokens.push(Token::For), - _ => tokens.push(Token::Ident(ident)), - } - } - - //TODO: Don't panic, keep calm - _ => panic!("Lexer encountered unexpected char: '{}'", ch), + // Any other character is unexpected + ch => panic!("Lexer encountered unexpected char: '{}'", ch), } } tokens } - /// Advance to next character and return the removed char - fn next(&mut self) -> Option { - self.code.next() + fn lex_number(&mut self, first_char: char) -> Token { + let mut sval = String::from(first_char); + + // Do as long as a next char exists and it is a numeric char + loop { + // The next char is verified to be Some, so unwrap is safe + match self.peek() { + // Underscore is a separator, so remove it but don't add to number + '_' => { + self.next(); + } + '0'..='9' => { + sval.push(self.next()); + } + // Next char is not a number, so stop and finish the number token + _ => break, + } + } + + // TODO: We only added numeric chars to the string, but the conversion could still fail + Token::Literal(Literal::I64(sval.parse().unwrap())) } - /// Get the next character without removing it - fn peek(&mut self) -> Option { - self.code.peek().copied() + /// Lex an identifier from the character stream. The first char has to have been consumed + /// from the stream already and is passed as an argument instead. + fn lex_ident(&mut self, first_char: char) -> Token { + let mut ident = String::from(first_char); + + // Do as long as a next char exists and it is a valid ident char + while let 'a'..='z' | 'A'..='Z' | '_' | '0'..='9' = self.peek() { + // The next char is verified to be Some, so unwrap is safe + ident.push(self.next()); + } + + // Check if the identifier is a keyword + match ident.as_str() { + "true" => Token::Literal(Literal::I64(1)), + "false" => Token::Literal(Literal::I64(0)), + "let" => Token::Keyword(Keyword::Let), + "while" => Token::Keyword(Keyword::While), + "if" => Token::Keyword(Keyword::If), + "else" => Token::Keyword(Keyword::Else), + "for" => Token::Keyword(Keyword::For), + + _ => Token::Ident(ident), + } } -} -/// Lex the provided code into a Token Buffer -/// -/// TODO: Don't panic and implement error handling using Result -pub fn lex(code: &str) -> Vec { - let mut lexer = Lexer::new(code); - lexer.lex() -} + /// Lex a string token from the character stream. This requires the initial quote '"' to be + /// consumed before. + fn lex_string(&mut self) -> Token { + let mut text = String::new(); -impl Token { - pub fn try_to_binop(&self) -> Option { - Some(match self { - Token::Add => BinOpType::Add, - Token::Sub => BinOpType::Sub, + let mut escape = false; - Token::Mul => BinOpType::Mul, - Token::Div => BinOpType::Div, - Token::Mod => BinOpType::Mod, + // Do as long as a next char exists and it is not '"' + loop { + if escape { + escape = false; - Token::BAnd => BinOpType::BAnd, - Token::BOr => BinOpType::BOr, - Token::BXor => BinOpType::BXor, + // Escape characters + match self.next() { + '\\' => text.push('\\'), + 'n' => text.push('\n'), + 'r' => text.push('\r'), + 't' => text.push('\t'), + ch => panic!("Invalid string escape: '{:?}'", ch), + } + } else { + match self.peek() { + // Doublequote '"' ends the string lexing + '"' => { + self.next(); + break; + } + // Backslash '\' escapes the next character + '\\' => { + self.next(); + escape = true; + } - Token::Shl => BinOpType::Shl, - Token::Shr => BinOpType::Shr, + // Reached end of text but didn't encounter closing doublequote '"' + '\0' => panic!("String is never terminated (missing '\"')"), - Token::Equ => BinOpType::Equ, - Token::Neq => BinOpType::Neq, + _ => text.push(self.next()), + } + } + } - Token::Gt => BinOpType::Gt, - Token::Ge => BinOpType::Ge, - Token::Lt => BinOpType::Lt, - Token::Le => BinOpType::Le, - - Token::Assign => BinOpType::Assign, - - _ => return None, - }) + Token::Literal(Literal::Str(text)) } } #[cfg(test)] mod tests { + use crate::token::Literal; + use super::{lex, Token}; #[test] fn test_lexer() { let code = "33 +5*2 + 4456467*2334+3 % - / << ^ | & >>"; let expected = vec![ - Token::I64(33), + Token::Literal(Literal::I64(33)), Token::Add, - Token::I64(5), + Token::Literal(Literal::I64(5)), Token::Mul, - Token::I64(2), + Token::Literal(Literal::I64(2)), Token::Add, - Token::I64(4456467), + Token::Literal(Literal::I64(4456467)), Token::Mul, - Token::I64(2334), + Token::Literal(Literal::I64(2334)), Token::Add, - Token::I64(3), + Token::Literal(Literal::I64(3)), Token::Mod, Token::Sub, Token::Div, diff --git a/src/lib.rs b/src/lib.rs index 74c1228..02e6110 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,4 @@ pub mod lexer; pub mod parser; pub mod interpreter; +pub mod token; diff --git a/src/parser.rs b/src/parser.rs index 0746aed..5918b98 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1,6 +1,6 @@ use std::{iter::Peekable, rc::Rc}; -use crate::lexer::Token; +use crate::token::{Keyword, Literal, Token}; /// Types for binary operators #[derive(Debug, PartialEq, Eq, Clone)] @@ -142,10 +142,15 @@ impl> Parser { } Token::EoF => break, Token::RBrace => break, - Token::Let => self.parse_let_stmt(), - Token::While => self.parse_while(), - Token::If => self.parse_if(), - Token::For => self.parse_for(), + + Token::Keyword(keyword) => match keyword { + Keyword::Let => self.parse_let_stmt(), + Keyword::While => self.parse_while(), + Keyword::If => self.parse_if(), + Keyword::For => self.parse_for(), + Keyword::Else => panic!("Unexpected else keyword"), + }, + Token::Dollar => { self.next(); Stmt::Print(self.parse_expr()) @@ -165,13 +170,13 @@ impl> Parser { } fn parse_for(&mut self) -> Stmt { - if !matches!(self.next(), Token::For) { + if !matches!(self.next(), Token::Keyword(Keyword::For)) { panic!("Error parsing for: Expected for token"); } let init = match self.parse_let_stmt() { Stmt::Let(name, rhs) => (name, rhs), - _ => unreachable!() + _ => unreachable!(), }; if !matches!(self.next(), Token::Semicolon) { @@ -179,7 +184,7 @@ impl> Parser { } let condition = self.parse_expr(); - + if !matches!(self.next(), Token::Semicolon) { panic!("Error parsing for: Expected semicolon token"); } @@ -200,7 +205,7 @@ impl> Parser { } fn parse_if(&mut self) -> Stmt { - if !matches!(self.next(), Token::If) { + if !matches!(self.next(), Token::Keyword(Keyword::If)) { panic!("Error parsing if: Expected if token"); } @@ -218,15 +223,15 @@ impl> Parser { let mut body_else = Ast { prog: Vec::new() }; - if matches!(self.peek(), Token::Else) { + if matches!(self.peek(), Token::Keyword(Keyword::Else)) { self.next(); if !matches!(self.next(), Token::LBrace) { panic!("Error parsing else: Expected '{{' token"); } - + body_else = self.parse(); - + if !matches!(self.next(), Token::RBrace) { panic!("Error parsing else: Expected '}}' token"); } @@ -236,7 +241,7 @@ impl> Parser { } fn parse_while(&mut self) -> Stmt { - if !matches!(self.next(), Token::While) { + if !matches!(self.next(), Token::Keyword(Keyword::While)) { panic!("Error parsing while: Expected while token"); } @@ -256,7 +261,7 @@ impl> Parser { } fn parse_let_stmt(&mut self) -> Stmt { - if !matches!(self.next(), Token::Let) { + if !matches!(self.next(), Token::Keyword(Keyword::Let)) { panic!("Error parsing let: Expected let token"); } @@ -310,9 +315,9 @@ impl> Parser { /// Parse a primary expression (for now only number) fn parse_primary(&mut self) -> Expr { match self.next() { - Token::I64(val) => Expr::I64(val), + Token::Literal(Literal::I64(val)) => Expr::I64(val), - Token::Str(text) => Expr::Str(text.into()), + Token::Literal(Literal::Str(text)) => Expr::Str(text.into()), Token::Ident(name) => Expr::Ident(name), @@ -377,8 +382,8 @@ impl BinOpType { mod tests { use super::{parse, BinOpType, Expr}; use crate::{ - lexer::Token, parser::{Ast, Stmt}, + token::{Literal, Token}, }; #[test] @@ -386,13 +391,13 @@ mod tests { // Expression: 1 + 2 * 3 + 4 // With precedence: (1 + (2 * 3)) + 4 let tokens = [ - Token::I64(1), + Token::Literal(Literal::I64(1)), Token::Add, - Token::I64(2), + Token::Literal(Literal::I64(2)), Token::Mul, - Token::I64(3), + Token::Literal(Literal::I64(3)), Token::Sub, - Token::I64(4), + Token::Literal(Literal::I64(4)), ]; let expected = Expr::BinOp( diff --git a/src/token.rs b/src/token.rs new file mode 100644 index 0000000..06efeb3 --- /dev/null +++ b/src/token.rs @@ -0,0 +1,147 @@ +use crate::parser::BinOpType; + +#[derive(Debug, PartialEq, Eq)] +pub enum Literal { + /// Integer literal (64-bit) + I64(i64), + + /// String literal ("Some string") + Str(String), +} + +#[derive(Debug, PartialEq, Eq)] +pub enum Keyword { + /// Let identifier (let) + Let, + + /// While (while) + While, + + /// For (for) + For, + + /// If (if) + If, + + /// Else (else) + Else, +} + +#[derive(Debug, PartialEq, Eq)] +pub enum Token { + /// Literal values + Literal(Literal), + + /// Identifier (variable / function / ... name) + Ident(String), + + /// Specific identifiers that have a special meaning as keywords + Keyword(Keyword), + + /// Left parenthesis ('(') + LParen, + + /// Right parentheses (')') + RParen, + + /// Left brace ({) + LBrace, + + /// Right brace (}) + RBrace, + + /// Dollar sign ($) + Dollar, + + /// Double Dollar sign ($$) + DoubleDollar, + + /// Assignment (single equal) (=) + Assign, + + /// Plus (+) + Add, + + /// Minus (-) + Sub, + + /// Asterisk (*) + Mul, + + /// Slash (/) + Div, + + /// Percent (%) + Mod, + + /// Pipe (|) + BOr, + + /// Ampersand (&) + BAnd, + + /// Circumflex (^) + BXor, + + /// Shift Left (<<) + Shl, + + /// Shift Right (>>) + Shr, + + /// Equal sign (==) + Equ, + + /// Not Equal sign (!=) + Neq, + + /// Greater than (>) + Gt, + + /// Greater or equal (>=) + Ge, + + /// Less than (<) + Lt, + + /// Less or equal (<=) + Le, + + /// Semicolon (;) + Semicolon, + + /// End of file + EoF, +} + +impl Token { + pub fn try_to_binop(&self) -> Option { + Some(match self { + Token::Add => BinOpType::Add, + Token::Sub => BinOpType::Sub, + + Token::Mul => BinOpType::Mul, + Token::Div => BinOpType::Div, + Token::Mod => BinOpType::Mod, + + Token::BAnd => BinOpType::BAnd, + Token::BOr => BinOpType::BOr, + Token::BXor => BinOpType::BXor, + + Token::Shl => BinOpType::Shl, + Token::Shr => BinOpType::Shr, + + Token::Equ => BinOpType::Equ, + Token::Neq => BinOpType::Neq, + + Token::Gt => BinOpType::Gt, + Token::Ge => BinOpType::Ge, + Token::Lt => BinOpType::Lt, + Token::Le => BinOpType::Le, + + Token::Assign => BinOpType::Assign, + + _ => return None, + }) + } +}