Refactor lexer

This commit is contained in:
Daniel M 2022-01-29 14:55:22 +01:00
parent e62121c75b
commit 9e3a642810
4 changed files with 301 additions and 258 deletions

View File

@ -1,107 +1,11 @@
use std::{iter::Peekable, str::Chars};
use crate::parser::BinOpType;
use crate::token::{Keyword, Literal, Token};
#[derive(Debug, PartialEq, Eq)]
pub enum Token {
/// Integer literal (64-bit)
I64(i64),
/// String literal ("Some string")
Str(String),
/// Left parenthesis ('(')
LParen,
/// Right parentheses (')')
RParen,
/// Left brace ({)
LBrace,
/// Right brace (})
RBrace,
/// Identifier (variable / function / ... name)
Ident(String),
/// Dollar sign ($)
Dollar,
/// Double Dollar sign ($$)
DoubleDollar,
/// Let identifier (let)
Let,
/// While (while)
While,
/// For (for)
For,
/// If (if)
If,
/// Else (else)
Else,
/// Assignment (single equal) (=)
Assign,
/// Plus (+)
Add,
/// Minus (-)
Sub,
/// Asterisk (*)
Mul,
/// Slash (/)
Div,
/// Percent (%)
Mod,
/// Pipe (|)
BOr,
/// Ampersand (&)
BAnd,
/// Circumflex (^)
BXor,
/// Shift Left (<<)
Shl,
/// Shift Right (>>)
Shr,
/// Equal sign (==)
Equ,
/// Not Equal sign (!=)
Neq,
/// Greater than (>)
Gt,
/// Greater or equal (>=)
Ge,
/// Less than (<)
Lt,
/// Less or equal (<=)
Le,
/// Semicolon (;)
Semicolon,
/// End of file
EoF,
/// Lex the provided code into a Token Buffer
pub fn lex(code: &str) -> Vec<Token> {
let mut lexer = Lexer::new(code);
lexer.lex()
}
struct Lexer<'a> {
@ -114,67 +18,59 @@ impl<'a> Lexer<'a> {
Self { code }
}
/// Advance to next character and return the removed char. If there is no next char, '\0'
/// is returned.
fn next(&mut self) -> char {
self.code.next().unwrap_or('\0')
}
/// Get the next character without removing it. If there is no next char, '\0' is returned.
fn peek(&mut self) -> char {
self.code.peek().copied().unwrap_or('\0')
}
fn lex(&mut self) -> Vec<Token> {
let mut tokens = Vec::new();
while let Some(ch) = self.next() {
match ch {
loop {
match self.next() {
// End of text
'\0' => break,
// Skip whitespace
' ' | '\r' | '\n' | '\t' => (),
// Lex numbers
'0'..='9' => {
let mut sval = String::from(ch);
// Do as long as a next char exists and it is a numeric char
while let Some(ch) = self.peek() {
// The next char is verified to be Some, so unwrap is safe
match ch {
// Underscore is a separator, so remove it but don't add to number
'_' => {
self.next().unwrap();
}
'0'..='9' => {
sval.push(self.next().unwrap());
}
// Next char is not a number, so stop and finish the number token
_ => break,
}
}
// TODO: We only added numeric chars to the string, but the conversion could still fail
tokens.push(Token::I64(sval.parse().unwrap()));
}
'>' if matches!(self.peek(), Some('>')) => {
// Handle tokens that span two characters
'>' if matches!(self.peek(), '>') => {
self.next();
tokens.push(Token::Shr);
}
'<' if matches!(self.peek(), Some('<')) => {
'<' if matches!(self.peek(), '<') => {
self.next();
tokens.push(Token::Shl);
}
'=' if matches!(self.peek(), Some('=')) => {
'=' if matches!(self.peek(), '=') => {
self.next();
tokens.push(Token::Equ);
}
'!' if matches!(self.peek(), Some('=')) => {
'!' if matches!(self.peek(), '=') => {
self.next();
tokens.push(Token::Neq);
}
'<' if matches!(self.peek(), Some('=')) => {
'<' if matches!(self.peek(), '=') => {
self.next();
tokens.push(Token::Le);
}
'>' if matches!(self.peek(), Some('=')) => {
'>' if matches!(self.peek(), '=') => {
self.next();
tokens.push(Token::Ge);
}
'$' if matches!(self.peek(), Some('$')) => {
'$' if matches!(self.peek(), '$') => {
self.next();
tokens.push(Token::DoubleDollar);
}
// Handle tokens that span one character
'+' => tokens.push(Token::Add),
'-' => tokens.push(Token::Sub),
'*' => tokens.push(Token::Mul),
@ -193,145 +89,139 @@ impl<'a> Lexer<'a> {
'}' => tokens.push(Token::RBrace),
'$' => tokens.push(Token::Dollar),
'"' => {
let mut text = String::new();
// Handle special multicharacter tokens
let mut escape = false;
// Lex numbers
ch @ '0'..='9' => tokens.push(self.lex_number(ch)),
// Do as long as a next char exists and it is not '"'
loop {
if escape {
escape = false;
// Lex strings
'"' => tokens.push(self.lex_string()),
match self.next() {
Some('\\') => text.push('\\'),
Some('n') => text.push('\n'),
Some('r') => text.push('\r'),
Some('t') => text.push('\t'),
ch => panic!("Invalid string escape: '{:?}'", ch),
}
// Lex identifiers
ch @ ('a'..='z' | 'A'..='Z' | '_') => tokens.push(self.lex_ident(ch)),
} else {
match self.peek() {
Some('"') => {
self.next();
break;
}
Some('\\') => {
self.next();
escape = true;
}
None => panic!("String is never terminated (missing '\"')"),
_ => text.push(self.next().unwrap()),
}
}
}
tokens.push(Token::Str(text));
}
'a'..='z' | 'A'..='Z' | '_' => {
let mut ident = String::from(ch);
// Do as long as a next char exists and it is a valid ident char
while let Some('a'..='z' | 'A'..='Z' | '_' | '0'..='9') = self.peek() {
// The next char is verified to be Some, so unwrap is safe
ident.push(self.next().unwrap());
}
match ident.as_str() {
"true" => tokens.push(Token::I64(1)),
"false" => tokens.push(Token::I64(0)),
"let" => tokens.push(Token::Let),
"while" => tokens.push(Token::While),
"if" => tokens.push(Token::If),
"else" => tokens.push(Token::Else),
"for" => tokens.push(Token::For),
_ => tokens.push(Token::Ident(ident)),
}
}
//TODO: Don't panic, keep calm
_ => panic!("Lexer encountered unexpected char: '{}'", ch),
// Any other character is unexpected
ch => panic!("Lexer encountered unexpected char: '{}'", ch),
}
}
tokens
}
/// Advance to next character and return the removed char
fn next(&mut self) -> Option<char> {
self.code.next()
/// Lex a number literal from the character stream. The first digit has to have been
/// consumed from the stream already and is passed as `first_char`.
///
/// Underscores inside the number act as visual separators and are discarded.
fn lex_number(&mut self, first_char: char) -> Token {
    let mut sval = String::from(first_char);
    // Consume characters as long as they belong to the number.
    // peek() yields '\0' once the input is exhausted, which ends the loop.
    loop {
        match self.peek() {
            // Underscore is a separator, so remove it but don't add to number
            '_' => {
                self.next();
            }
            '0'..='9' => {
                sval.push(self.next());
            }
            // Next char is not a number, so stop and finish the number token
            _ => break,
        }
    }
    // sval contains only ASCII digits, so parsing can only fail if the value
    // overflows i64. TODO: report that as an error instead of panicking.
    Token::Literal(Literal::I64(
        sval.parse().expect("integer literal does not fit in i64"),
    ))
}
/// Get the next character without removing it
fn peek(&mut self) -> Option<char> {
self.code.peek().copied()
/// Lex an identifier from the character stream. The first char has to have been consumed
/// from the stream already and is passed as an argument instead.
///
/// Identifiers that spell a keyword (or the boolean words `true`/`false`) are mapped to
/// their dedicated tokens instead of `Token::Ident`.
fn lex_ident(&mut self, first_char: char) -> Token {
    let mut ident = String::from(first_char);
    // Keep consuming while the next char is a valid (non-leading) identifier char.
    // peek() yields '\0' once the input is exhausted, which ends the loop.
    while let 'a'..='z' | 'A'..='Z' | '_' | '0'..='9' = self.peek() {
        ident.push(self.next());
    }
    // Check if the identifier is a keyword
    match ident.as_str() {
        // Booleans have no dedicated token; they lex to integer literals (true = 1, false = 0)
        "true" => Token::Literal(Literal::I64(1)),
        "false" => Token::Literal(Literal::I64(0)),
        "let" => Token::Keyword(Keyword::Let),
        "while" => Token::Keyword(Keyword::While),
        "if" => Token::Keyword(Keyword::If),
        "else" => Token::Keyword(Keyword::Else),
        "for" => Token::Keyword(Keyword::For),
        _ => Token::Ident(ident),
    }
}
}
/// Lex the provided code into a Token Buffer
///
/// TODO: Don't panic and implement error handling using Result
pub fn lex(code: &str) -> Vec<Token> {
    Lexer::new(code).lex()
}
/// Lex a string token from the character stream. This requires the initial quote '"' to be
/// consumed before.
fn lex_string(&mut self) -> Token {
let mut text = String::new();
impl Token {
pub fn try_to_binop(&self) -> Option<BinOpType> {
Some(match self {
Token::Add => BinOpType::Add,
Token::Sub => BinOpType::Sub,
let mut escape = false;
Token::Mul => BinOpType::Mul,
Token::Div => BinOpType::Div,
Token::Mod => BinOpType::Mod,
// Do as long as a next char exists and it is not '"'
loop {
if escape {
escape = false;
Token::BAnd => BinOpType::BAnd,
Token::BOr => BinOpType::BOr,
Token::BXor => BinOpType::BXor,
// Escape characters
match self.next() {
'\\' => text.push('\\'),
'n' => text.push('\n'),
'r' => text.push('\r'),
't' => text.push('\t'),
ch => panic!("Invalid string escape: '{:?}'", ch),
}
} else {
match self.peek() {
// Doublequote '"' ends the string lexing
'"' => {
self.next();
break;
}
// Backslash '\' escapes the next character
'\\' => {
self.next();
escape = true;
}
Token::Shl => BinOpType::Shl,
Token::Shr => BinOpType::Shr,
// Reached end of text but didn't encounter closing doublequote '"'
'\0' => panic!("String is never terminated (missing '\"')"),
Token::Equ => BinOpType::Equ,
Token::Neq => BinOpType::Neq,
_ => text.push(self.next()),
}
}
}
Token::Gt => BinOpType::Gt,
Token::Ge => BinOpType::Ge,
Token::Lt => BinOpType::Lt,
Token::Le => BinOpType::Le,
Token::Assign => BinOpType::Assign,
_ => return None,
})
Token::Literal(Literal::Str(text))
}
}
#[cfg(test)]
mod tests {
use crate::token::Literal;
use super::{lex, Token};
#[test]
fn test_lexer() {
let code = "33 +5*2 + 4456467*2334+3 % - / << ^ | & >>";
let expected = vec![
Token::I64(33),
Token::Literal(Literal::I64(33)),
Token::Add,
Token::I64(5),
Token::Literal(Literal::I64(5)),
Token::Mul,
Token::I64(2),
Token::Literal(Literal::I64(2)),
Token::Add,
Token::I64(4456467),
Token::Literal(Literal::I64(4456467)),
Token::Mul,
Token::I64(2334),
Token::Literal(Literal::I64(2334)),
Token::Add,
Token::I64(3),
Token::Literal(Literal::I64(3)),
Token::Mod,
Token::Sub,
Token::Div,

View File

@ -1,3 +1,4 @@
pub mod interpreter;
pub mod lexer;
pub mod parser;
pub mod token;

View File

@ -1,6 +1,6 @@
use std::{iter::Peekable, rc::Rc};
use crate::lexer::Token;
use crate::token::{Keyword, Literal, Token};
/// Types for binary operators
#[derive(Debug, PartialEq, Eq, Clone)]
@ -142,10 +142,15 @@ impl<T: Iterator<Item = Token>> Parser<T> {
}
Token::EoF => break,
Token::RBrace => break,
Token::Let => self.parse_let_stmt(),
Token::While => self.parse_while(),
Token::If => self.parse_if(),
Token::For => self.parse_for(),
Token::Keyword(keyword) => match keyword {
Keyword::Let => self.parse_let_stmt(),
Keyword::While => self.parse_while(),
Keyword::If => self.parse_if(),
Keyword::For => self.parse_for(),
Keyword::Else => panic!("Unexpected else keyword"),
},
Token::Dollar => {
self.next();
Stmt::Print(self.parse_expr())
@ -165,13 +170,13 @@ impl<T: Iterator<Item = Token>> Parser<T> {
}
fn parse_for(&mut self) -> Stmt {
if !matches!(self.next(), Token::For) {
if !matches!(self.next(), Token::Keyword(Keyword::For)) {
panic!("Error parsing for: Expected for token");
}
let init = match self.parse_let_stmt() {
Stmt::Let(name, rhs) => (name, rhs),
_ => unreachable!()
_ => unreachable!(),
};
if !matches!(self.next(), Token::Semicolon) {
@ -179,7 +184,7 @@ impl<T: Iterator<Item = Token>> Parser<T> {
}
let condition = self.parse_expr();
if !matches!(self.next(), Token::Semicolon) {
panic!("Error parsing for: Expected semicolon token");
}
@ -200,7 +205,7 @@ impl<T: Iterator<Item = Token>> Parser<T> {
}
fn parse_if(&mut self) -> Stmt {
if !matches!(self.next(), Token::If) {
if !matches!(self.next(), Token::Keyword(Keyword::If)) {
panic!("Error parsing if: Expected if token");
}
@ -218,15 +223,15 @@ impl<T: Iterator<Item = Token>> Parser<T> {
let mut body_else = Ast { prog: Vec::new() };
if matches!(self.peek(), Token::Else) {
if matches!(self.peek(), Token::Keyword(Keyword::Else)) {
self.next();
if !matches!(self.next(), Token::LBrace) {
panic!("Error parsing else: Expected '{{' token");
}
body_else = self.parse();
if !matches!(self.next(), Token::RBrace) {
panic!("Error parsing else: Expected '}}' token");
}
@ -236,7 +241,7 @@ impl<T: Iterator<Item = Token>> Parser<T> {
}
fn parse_while(&mut self) -> Stmt {
if !matches!(self.next(), Token::While) {
if !matches!(self.next(), Token::Keyword(Keyword::While)) {
panic!("Error parsing while: Expected while token");
}
@ -256,7 +261,7 @@ impl<T: Iterator<Item = Token>> Parser<T> {
}
fn parse_let_stmt(&mut self) -> Stmt {
if !matches!(self.next(), Token::Let) {
if !matches!(self.next(), Token::Keyword(Keyword::Let)) {
panic!("Error parsing let: Expected let token");
}
@ -310,9 +315,9 @@ impl<T: Iterator<Item = Token>> Parser<T> {
/// Parse a primary expression (for now only number)
fn parse_primary(&mut self) -> Expr {
match self.next() {
Token::I64(val) => Expr::I64(val),
Token::Literal(Literal::I64(val)) => Expr::I64(val),
Token::Str(text) => Expr::Str(text.into()),
Token::Literal(Literal::Str(text)) => Expr::Str(text.into()),
Token::Ident(name) => Expr::Ident(name),
@ -377,8 +382,8 @@ impl BinOpType {
mod tests {
use super::{parse, BinOpType, Expr};
use crate::{
lexer::Token,
parser::{Ast, Stmt},
token::{Literal, Token},
};
#[test]
@ -386,13 +391,13 @@ mod tests {
// Expression: 1 + 2 * 3 + 4
// With precedence: (1 + (2 * 3)) + 4
let tokens = [
Token::I64(1),
Token::Literal(Literal::I64(1)),
Token::Add,
Token::I64(2),
Token::Literal(Literal::I64(2)),
Token::Mul,
Token::I64(3),
Token::Literal(Literal::I64(3)),
Token::Sub,
Token::I64(4),
Token::Literal(Literal::I64(4)),
];
let expected = Expr::BinOp(

147
src/token.rs Normal file
View File

@ -0,0 +1,147 @@
use crate::parser::BinOpType;
/// Literal values that appear directly in the source code
#[derive(Debug, PartialEq, Eq, Clone)]
pub enum Literal {
    /// Integer literal (64-bit)
    I64(i64),
    /// String literal ("Some string")
    Str(String),
}
/// Specific identifiers that have a special meaning in the language
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub enum Keyword {
    /// Let identifier (let)
    Let,
    /// While (while)
    While,
    /// For (for)
    For,
    /// If (if)
    If,
    /// Else (else)
    Else,
}
/// A single lexical token produced by the lexer
#[derive(Debug, PartialEq, Eq)]
pub enum Token {
    /// Literal values
    Literal(Literal),
    /// Identifier (variable / function / ... name)
    Ident(String),
    /// Specific identifiers that have a special meaning as keywords
    Keyword(Keyword),
    /// Left parenthesis ('(')
    LParen,
    /// Right parenthesis (')')
    RParen,
    /// Left brace ({)
    LBrace,
    /// Right brace (})
    RBrace,
    /// Dollar sign ($)
    Dollar,
    /// Double Dollar sign ($$)
    DoubleDollar,
    /// Assignment (single equal) (=)
    Assign,
    /// Plus (+)
    Add,
    /// Minus (-)
    Sub,
    /// Asterisk (*)
    Mul,
    /// Slash (/)
    Div,
    /// Percent (%)
    Mod,
    /// Pipe (|)
    BOr,
    /// Ampersand (&)
    BAnd,
    /// Circumflex (^)
    BXor,
    /// Shift Left (<<)
    Shl,
    /// Shift Right (>>)
    Shr,
    /// Equal sign (==)
    Equ,
    /// Not Equal sign (!=)
    Neq,
    /// Greater than (>)
    Gt,
    /// Greater or equal (>=)
    Ge,
    /// Less than (<)
    Lt,
    /// Less or equal (<=)
    Le,
    /// Semicolon (;)
    Semicolon,
    /// End of file
    EoF,
}
impl Token {
    /// Try to interpret this token as a binary operator.
    ///
    /// Returns the corresponding `BinOpType` for operator tokens and `None` for
    /// every other token.
    pub fn try_to_binop(&self) -> Option<BinOpType> {
        let op = match self {
            Token::Add => BinOpType::Add,
            Token::Sub => BinOpType::Sub,
            Token::Mul => BinOpType::Mul,
            Token::Div => BinOpType::Div,
            Token::Mod => BinOpType::Mod,
            Token::BAnd => BinOpType::BAnd,
            Token::BOr => BinOpType::BOr,
            Token::BXor => BinOpType::BXor,
            Token::Shl => BinOpType::Shl,
            Token::Shr => BinOpType::Shr,
            Token::Equ => BinOpType::Equ,
            Token::Neq => BinOpType::Neq,
            Token::Gt => BinOpType::Gt,
            Token::Ge => BinOpType::Ge,
            Token::Lt => BinOpType::Lt,
            Token::Le => BinOpType::Le,
            Token::Assign => BinOpType::Assign,
            // Not an operator token
            _ => return None,
        };
        Some(op)
    }
}