Refactor lexer

This commit is contained in:
Daniel M 2022-01-29 14:55:22 +01:00
parent e62121c75b
commit 9e3a642810
4 changed files with 301 additions and 258 deletions

View File

@ -1,107 +1,11 @@
use std::{iter::Peekable, str::Chars}; use std::{iter::Peekable, str::Chars};
use crate::parser::BinOpType; use crate::token::{Keyword, Literal, Token};
#[derive(Debug, PartialEq, Eq)] /// Lex the provided code into a Token Buffer
pub enum Token { pub fn lex(code: &str) -> Vec<Token> {
/// Integer literal (64-bit) let mut lexer = Lexer::new(code);
I64(i64), lexer.lex()
/// String literal ("Some string")
Str(String),
/// Left parenthesis ('(')
LParen,
/// Right parentheses (')')
RParen,
/// Left brace ({)
LBrace,
/// Right brace (})
RBrace,
/// Identifier (variable / function / ... name)
Ident(String),
/// Dollar sign ($)
Dollar,
/// Double Dollar sign ($$)
DoubleDollar,
/// Let identifier (let)
Let,
/// While (while)
While,
/// For (for)
For,
/// If (if)
If,
/// Else (else)
Else,
/// Assignment (single equal) (=)
Assign,
/// Plus (+)
Add,
/// Minus (-)
Sub,
/// Asterisk (*)
Mul,
/// Slash (/)
Div,
/// Percent (%)
Mod,
/// Pipe (|)
BOr,
/// Ampersand (&)
BAnd,
/// Circumflex (^)
BXor,
/// Shift Left (<<)
Shl,
/// Shift Right (>>)
Shr,
/// Equal sign (==)
Equ,
/// Not Equal sign (!=)
Neq,
/// Greater than (>)
Gt,
/// Greater or equal (>=)
Ge,
/// Less than (<)
Lt,
/// Less or equal (<=)
Le,
/// Semicolon (;)
Semicolon,
/// End of file
EoF,
} }
struct Lexer<'a> { struct Lexer<'a> {
@ -114,67 +18,59 @@ impl<'a> Lexer<'a> {
Self { code } Self { code }
} }
/// Advance to next character and return the removed char. If there is no next char, '\0'
/// is returned.
fn next(&mut self) -> char {
self.code.next().unwrap_or('\0')
}
/// Get the next character without removing it. If there is no next char, '\0' is returned.
fn peek(&mut self) -> char {
self.code.peek().copied().unwrap_or('\0')
}
fn lex(&mut self) -> Vec<Token> { fn lex(&mut self) -> Vec<Token> {
let mut tokens = Vec::new(); let mut tokens = Vec::new();
while let Some(ch) = self.next() { loop {
match ch { match self.next() {
// End of text
'\0' => break,
// Skip whitespace // Skip whitespace
' ' | '\r' | '\n' | '\t' => (), ' ' | '\r' | '\n' | '\t' => (),
// Lex numbers // Handle tokens that span two characters
'0'..='9' => { '>' if matches!(self.peek(), '>') => {
let mut sval = String::from(ch);
// Do as long as a next char exists and it is a numeric char
while let Some(ch) = self.peek() {
// The next char is verified to be Some, so unwrap is safe
match ch {
// Underscore is a separator, so remove it but don't add to number
'_' => {
self.next().unwrap();
}
'0'..='9' => {
sval.push(self.next().unwrap());
}
// Next char is not a number, so stop and finish the number token
_ => break,
}
}
// TODO: We only added numeric chars to the string, but the conversion could still fail
tokens.push(Token::I64(sval.parse().unwrap()));
}
'>' if matches!(self.peek(), Some('>')) => {
self.next(); self.next();
tokens.push(Token::Shr); tokens.push(Token::Shr);
} }
'<' if matches!(self.peek(), Some('<')) => { '<' if matches!(self.peek(), '<') => {
self.next(); self.next();
tokens.push(Token::Shl); tokens.push(Token::Shl);
} }
'=' if matches!(self.peek(), Some('=')) => { '=' if matches!(self.peek(), '=') => {
self.next(); self.next();
tokens.push(Token::Equ); tokens.push(Token::Equ);
} }
'!' if matches!(self.peek(), Some('=')) => { '!' if matches!(self.peek(), '=') => {
self.next(); self.next();
tokens.push(Token::Neq); tokens.push(Token::Neq);
} }
'<' if matches!(self.peek(), Some('=')) => { '<' if matches!(self.peek(), '=') => {
self.next(); self.next();
tokens.push(Token::Le); tokens.push(Token::Le);
} }
'>' if matches!(self.peek(), Some('=')) => { '>' if matches!(self.peek(), '=') => {
self.next(); self.next();
tokens.push(Token::Ge); tokens.push(Token::Ge);
} }
'$' if matches!(self.peek(), Some('$')) => { '$' if matches!(self.peek(), '$') => {
self.next(); self.next();
tokens.push(Token::DoubleDollar); tokens.push(Token::DoubleDollar);
} }
// Handle tokens that span one character
'+' => tokens.push(Token::Add), '+' => tokens.push(Token::Add),
'-' => tokens.push(Token::Sub), '-' => tokens.push(Token::Sub),
'*' => tokens.push(Token::Mul), '*' => tokens.push(Token::Mul),
@ -193,145 +89,139 @@ impl<'a> Lexer<'a> {
'}' => tokens.push(Token::RBrace), '}' => tokens.push(Token::RBrace),
'$' => tokens.push(Token::Dollar), '$' => tokens.push(Token::Dollar),
'"' => { // Handle special multicharacter tokens
let mut text = String::new();
let mut escape = false; // Lex numbers
ch @ '0'..='9' => tokens.push(self.lex_number(ch)),
// Do as long as a next char exists and it is not '"' // Lex strings
loop { '"' => tokens.push(self.lex_string()),
if escape {
escape = false;
match self.next() { // Lex identifiers
Some('\\') => text.push('\\'), ch @ ('a'..='z' | 'A'..='Z' | '_') => tokens.push(self.lex_ident(ch)),
Some('n') => text.push('\n'),
Some('r') => text.push('\r'),
Some('t') => text.push('\t'),
ch => panic!("Invalid string escape: '{:?}'", ch),
}
} else { // Any other character is unexpected
match self.peek() { ch => panic!("Lexer encountered unexpected char: '{}'", ch),
Some('"') => {
self.next();
break;
}
Some('\\') => {
self.next();
escape = true;
}
None => panic!("String is never terminated (missing '\"')"),
_ => text.push(self.next().unwrap()),
}
}
}
tokens.push(Token::Str(text));
}
'a'..='z' | 'A'..='Z' | '_' => {
let mut ident = String::from(ch);
// Do as long as a next char exists and it is a valid ident char
while let Some('a'..='z' | 'A'..='Z' | '_' | '0'..='9') = self.peek() {
// The next char is verified to be Some, so unwrap is safe
ident.push(self.next().unwrap());
}
match ident.as_str() {
"true" => tokens.push(Token::I64(1)),
"false" => tokens.push(Token::I64(0)),
"let" => tokens.push(Token::Let),
"while" => tokens.push(Token::While),
"if" => tokens.push(Token::If),
"else" => tokens.push(Token::Else),
"for" => tokens.push(Token::For),
_ => tokens.push(Token::Ident(ident)),
}
}
//TODO: Don't panic, keep calm
_ => panic!("Lexer encountered unexpected char: '{}'", ch),
} }
} }
tokens tokens
} }
/// Advance to next character and return the removed char fn lex_number(&mut self, first_char: char) -> Token {
fn next(&mut self) -> Option<char> { let mut sval = String::from(first_char);
self.code.next()
// Do as long as a next char exists and it is a numeric char
loop {
// The next char is verified to be Some, so unwrap is safe
match self.peek() {
// Underscore is a separator, so remove it but don't add to number
'_' => {
self.next();
}
'0'..='9' => {
sval.push(self.next());
}
// Next char is not a number, so stop and finish the number token
_ => break,
}
}
// TODO: We only added numeric chars to the string, but the conversion could still fail
Token::Literal(Literal::I64(sval.parse().unwrap()))
} }
/// Get the next character without removing it /// Lex an identifier from the character stream. The first char has to have been consumed
fn peek(&mut self) -> Option<char> { /// from the stream already and is passed as an argument instead.
self.code.peek().copied() fn lex_ident(&mut self, first_char: char) -> Token {
let mut ident = String::from(first_char);
// Do as long as a next char exists and it is a valid ident char
while let 'a'..='z' | 'A'..='Z' | '_' | '0'..='9' = self.peek() {
// The next char is verified to be Some, so unwrap is safe
ident.push(self.next());
}
// Check if the identifier is a keyword
match ident.as_str() {
"true" => Token::Literal(Literal::I64(1)),
"false" => Token::Literal(Literal::I64(0)),
"let" => Token::Keyword(Keyword::Let),
"while" => Token::Keyword(Keyword::While),
"if" => Token::Keyword(Keyword::If),
"else" => Token::Keyword(Keyword::Else),
"for" => Token::Keyword(Keyword::For),
_ => Token::Ident(ident),
}
} }
}
/// Lex the provided code into a Token Buffer /// Lex a string token from the character stream. This requires the initial quote '"' to be
/// /// consumed before.
/// TODO: Don't panic and implement error handling using Result fn lex_string(&mut self) -> Token {
pub fn lex(code: &str) -> Vec<Token> { let mut text = String::new();
let mut lexer = Lexer::new(code);
lexer.lex()
}
impl Token { let mut escape = false;
pub fn try_to_binop(&self) -> Option<BinOpType> {
Some(match self {
Token::Add => BinOpType::Add,
Token::Sub => BinOpType::Sub,
Token::Mul => BinOpType::Mul, // Do as long as a next char exists and it is not '"'
Token::Div => BinOpType::Div, loop {
Token::Mod => BinOpType::Mod, if escape {
escape = false;
Token::BAnd => BinOpType::BAnd, // Escape characters
Token::BOr => BinOpType::BOr, match self.next() {
Token::BXor => BinOpType::BXor, '\\' => text.push('\\'),
'n' => text.push('\n'),
'r' => text.push('\r'),
't' => text.push('\t'),
ch => panic!("Invalid string escape: '{:?}'", ch),
}
} else {
match self.peek() {
// Doublequote '"' ends the string lexing
'"' => {
self.next();
break;
}
// Backslash '\' escapes the next character
'\\' => {
self.next();
escape = true;
}
Token::Shl => BinOpType::Shl, // Reached end of text but didn't encounter closing doublequote '"'
Token::Shr => BinOpType::Shr, '\0' => panic!("String is never terminated (missing '\"')"),
Token::Equ => BinOpType::Equ, _ => text.push(self.next()),
Token::Neq => BinOpType::Neq, }
}
}
Token::Gt => BinOpType::Gt, Token::Literal(Literal::Str(text))
Token::Ge => BinOpType::Ge,
Token::Lt => BinOpType::Lt,
Token::Le => BinOpType::Le,
Token::Assign => BinOpType::Assign,
_ => return None,
})
} }
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::token::Literal;
use super::{lex, Token}; use super::{lex, Token};
#[test] #[test]
fn test_lexer() { fn test_lexer() {
let code = "33 +5*2 + 4456467*2334+3 % - / << ^ | & >>"; let code = "33 +5*2 + 4456467*2334+3 % - / << ^ | & >>";
let expected = vec![ let expected = vec![
Token::I64(33), Token::Literal(Literal::I64(33)),
Token::Add, Token::Add,
Token::I64(5), Token::Literal(Literal::I64(5)),
Token::Mul, Token::Mul,
Token::I64(2), Token::Literal(Literal::I64(2)),
Token::Add, Token::Add,
Token::I64(4456467), Token::Literal(Literal::I64(4456467)),
Token::Mul, Token::Mul,
Token::I64(2334), Token::Literal(Literal::I64(2334)),
Token::Add, Token::Add,
Token::I64(3), Token::Literal(Literal::I64(3)),
Token::Mod, Token::Mod,
Token::Sub, Token::Sub,
Token::Div, Token::Div,

View File

@ -1,3 +1,4 @@
pub mod lexer; pub mod lexer;
pub mod parser; pub mod parser;
pub mod interpreter; pub mod interpreter;
pub mod token;

View File

@ -1,6 +1,6 @@
use std::{iter::Peekable, rc::Rc}; use std::{iter::Peekable, rc::Rc};
use crate::lexer::Token; use crate::token::{Keyword, Literal, Token};
/// Types for binary operators /// Types for binary operators
#[derive(Debug, PartialEq, Eq, Clone)] #[derive(Debug, PartialEq, Eq, Clone)]
@ -142,10 +142,15 @@ impl<T: Iterator<Item = Token>> Parser<T> {
} }
Token::EoF => break, Token::EoF => break,
Token::RBrace => break, Token::RBrace => break,
Token::Let => self.parse_let_stmt(),
Token::While => self.parse_while(), Token::Keyword(keyword) => match keyword {
Token::If => self.parse_if(), Keyword::Let => self.parse_let_stmt(),
Token::For => self.parse_for(), Keyword::While => self.parse_while(),
Keyword::If => self.parse_if(),
Keyword::For => self.parse_for(),
Keyword::Else => panic!("Unexpected else keyword"),
},
Token::Dollar => { Token::Dollar => {
self.next(); self.next();
Stmt::Print(self.parse_expr()) Stmt::Print(self.parse_expr())
@ -165,13 +170,13 @@ impl<T: Iterator<Item = Token>> Parser<T> {
} }
fn parse_for(&mut self) -> Stmt { fn parse_for(&mut self) -> Stmt {
if !matches!(self.next(), Token::For) { if !matches!(self.next(), Token::Keyword(Keyword::For)) {
panic!("Error parsing for: Expected for token"); panic!("Error parsing for: Expected for token");
} }
let init = match self.parse_let_stmt() { let init = match self.parse_let_stmt() {
Stmt::Let(name, rhs) => (name, rhs), Stmt::Let(name, rhs) => (name, rhs),
_ => unreachable!() _ => unreachable!(),
}; };
if !matches!(self.next(), Token::Semicolon) { if !matches!(self.next(), Token::Semicolon) {
@ -200,7 +205,7 @@ impl<T: Iterator<Item = Token>> Parser<T> {
} }
fn parse_if(&mut self) -> Stmt { fn parse_if(&mut self) -> Stmt {
if !matches!(self.next(), Token::If) { if !matches!(self.next(), Token::Keyword(Keyword::If)) {
panic!("Error parsing if: Expected if token"); panic!("Error parsing if: Expected if token");
} }
@ -218,7 +223,7 @@ impl<T: Iterator<Item = Token>> Parser<T> {
let mut body_else = Ast { prog: Vec::new() }; let mut body_else = Ast { prog: Vec::new() };
if matches!(self.peek(), Token::Else) { if matches!(self.peek(), Token::Keyword(Keyword::Else)) {
self.next(); self.next();
if !matches!(self.next(), Token::LBrace) { if !matches!(self.next(), Token::LBrace) {
@ -236,7 +241,7 @@ impl<T: Iterator<Item = Token>> Parser<T> {
} }
fn parse_while(&mut self) -> Stmt { fn parse_while(&mut self) -> Stmt {
if !matches!(self.next(), Token::While) { if !matches!(self.next(), Token::Keyword(Keyword::While)) {
panic!("Error parsing while: Expected while token"); panic!("Error parsing while: Expected while token");
} }
@ -256,7 +261,7 @@ impl<T: Iterator<Item = Token>> Parser<T> {
} }
fn parse_let_stmt(&mut self) -> Stmt { fn parse_let_stmt(&mut self) -> Stmt {
if !matches!(self.next(), Token::Let) { if !matches!(self.next(), Token::Keyword(Keyword::Let)) {
panic!("Error parsing let: Expected let token"); panic!("Error parsing let: Expected let token");
} }
@ -310,9 +315,9 @@ impl<T: Iterator<Item = Token>> Parser<T> {
/// Parse a primary expression (for now only number) /// Parse a primary expression (for now only number)
fn parse_primary(&mut self) -> Expr { fn parse_primary(&mut self) -> Expr {
match self.next() { match self.next() {
Token::I64(val) => Expr::I64(val), Token::Literal(Literal::I64(val)) => Expr::I64(val),
Token::Str(text) => Expr::Str(text.into()), Token::Literal(Literal::Str(text)) => Expr::Str(text.into()),
Token::Ident(name) => Expr::Ident(name), Token::Ident(name) => Expr::Ident(name),
@ -377,8 +382,8 @@ impl BinOpType {
mod tests { mod tests {
use super::{parse, BinOpType, Expr}; use super::{parse, BinOpType, Expr};
use crate::{ use crate::{
lexer::Token,
parser::{Ast, Stmt}, parser::{Ast, Stmt},
token::{Literal, Token},
}; };
#[test] #[test]
@ -386,13 +391,13 @@ mod tests {
// Expression: 1 + 2 * 3 + 4 // Expression: 1 + 2 * 3 + 4
// With precedence: (1 + (2 * 3)) + 4 // With precedence: (1 + (2 * 3)) + 4
let tokens = [ let tokens = [
Token::I64(1), Token::Literal(Literal::I64(1)),
Token::Add, Token::Add,
Token::I64(2), Token::Literal(Literal::I64(2)),
Token::Mul, Token::Mul,
Token::I64(3), Token::Literal(Literal::I64(3)),
Token::Sub, Token::Sub,
Token::I64(4), Token::Literal(Literal::I64(4)),
]; ];
let expected = Expr::BinOp( let expected = Expr::BinOp(

147
src/token.rs Normal file
View File

@ -0,0 +1,147 @@
use crate::parser::BinOpType;
/// Literal values that appear verbatim in the source code.
///
/// Produced by the lexer for numeric and string constants and wrapped
/// in [`Token::Literal`].
#[derive(Debug, PartialEq, Eq)]
pub enum Literal {
    /// Integer literal (64-bit)
    I64(i64),
    /// String literal ("Some string")
    Str(String),
}
/// Reserved identifiers with special meaning in the language.
///
/// The lexer first reads a full identifier and then checks it against
/// these keywords; matches are emitted as [`Token::Keyword`] instead of
/// [`Token::Ident`].
#[derive(Debug, PartialEq, Eq)]
pub enum Keyword {
    /// Let identifier (let)
    Let,
    /// While (while)
    While,
    /// For (for)
    For,
    /// If (if)
    If,
    /// Else (else)
    Else,
}
/// A single lexical token produced by the lexer.
///
/// Covers literals, identifiers, keywords, punctuation, and every
/// operator the parser understands; the end of input is represented
/// explicitly by [`Token::EoF`].
#[derive(Debug, PartialEq, Eq)]
pub enum Token {
    /// Literal values
    Literal(Literal),
    /// Identifier (variable / function / ... name)
    Ident(String),
    /// Specific identifiers that have a special meaning as keywords
    Keyword(Keyword),
    /// Left parenthesis ('(')
    LParen,
    /// Right parentheses (')')
    RParen,
    /// Left brace ({)
    LBrace,
    /// Right brace (})
    RBrace,
    /// Dollar sign ($)
    Dollar,
    /// Double Dollar sign ($$)
    DoubleDollar,
    /// Assignment (single equal) (=)
    Assign,
    /// Plus (+)
    Add,
    /// Minus (-)
    Sub,
    /// Asterisk (*)
    Mul,
    /// Slash (/)
    Div,
    /// Percent (%)
    Mod,
    /// Pipe (|)
    BOr,
    /// Ampersand (&)
    BAnd,
    /// Circumflex (^)
    BXor,
    /// Shift Left (<<)
    Shl,
    /// Shift Right (>>)
    Shr,
    /// Equal sign (==)
    Equ,
    /// Not Equal sign (!=)
    Neq,
    /// Greater than (>)
    Gt,
    /// Greater or equal (>=)
    Ge,
    /// Less than (<)
    Lt,
    /// Less or equal (<=)
    Le,
    /// Semicolon (;)
    Semicolon,
    /// End of file
    EoF,
}
impl Token {
    /// Try to interpret this token as a binary operator.
    ///
    /// Returns the matching [`BinOpType`] for arithmetic, bitwise,
    /// shift, comparison, and assignment tokens, or `None` for any
    /// token that is not a binary operator.
    pub fn try_to_binop(&self) -> Option<BinOpType> {
        let binop = match self {
            // Arithmetic operators
            Token::Add => BinOpType::Add,
            Token::Sub => BinOpType::Sub,
            Token::Mul => BinOpType::Mul,
            Token::Div => BinOpType::Div,
            Token::Mod => BinOpType::Mod,

            // Bitwise and shift operators
            Token::BAnd => BinOpType::BAnd,
            Token::BOr => BinOpType::BOr,
            Token::BXor => BinOpType::BXor,
            Token::Shl => BinOpType::Shl,
            Token::Shr => BinOpType::Shr,

            // Comparison operators
            Token::Equ => BinOpType::Equ,
            Token::Neq => BinOpType::Neq,
            Token::Gt => BinOpType::Gt,
            Token::Ge => BinOpType::Ge,
            Token::Lt => BinOpType::Lt,
            Token::Le => BinOpType::Le,

            // Assignment
            Token::Assign => BinOpType::Assign,

            // Everything else is not a binary operator
            _ => return None,
        };
        Some(binop)
    }
}