use crate::token::Token;
use anyhow::Result;
use std::{iter::Peekable, str::Chars};
use thiserror::Error;

#[derive(Debug, Error)]
pub enum LexErr {
    #[error("Failed to parse '{0}' as i64")]
    NumericParse(String),

    #[error("Invalid escape character '\\{0}'")]
    InvalidStrEscape(char),

    #[error("Lexer encountered unexpected char: '{0}'")]
    UnexpectedChar(char),

    #[error("Missing closing string quote '\"'")]
    MissingClosingString,
}

/// Lex the provided code into a token buffer
pub fn lex(code: &str) -> Result<Vec<Token>, LexErr> {
    let mut lexer = Lexer::new(code);
    lexer.lex()
}

struct Lexer<'a> {
    /// The source code text as an iterator over its chars
    code: Peekable<Chars<'a>>,
}

impl<'a> Lexer<'a> {
    fn new(code: &'a str) -> Self {
        let code = code.chars().peekable();
        Self { code }
    }

    fn lex(&mut self) -> Result<Vec<Token>, LexErr> {
        let mut tokens = Vec::new();

        loop {
            match self.next() {
                // Stop lexing at EOF
                '\0' => break,

                // Skip whitespace
                ' ' | '\t' | '\n' | '\r' => (),

                // Line comment. Consume every char until the linefeed (next line)
                '/' if matches!(self.peek(), '/') => while !matches!(self.next(), '\n' | '\0') {},

                // Double character tokens
                '>' if matches!(self.peek(), '>') => {
                    self.next();
                    tokens.push(Token::Shr);
                }
                '<' if matches!(self.peek(), '<') => {
                    self.next();
                    tokens.push(Token::Shl);
                }
                '=' if matches!(self.peek(), '=') => {
                    self.next();
                    tokens.push(Token::EquEqu);
                }
                '!' if matches!(self.peek(), '=') => {
                    self.next();
                    tokens.push(Token::NotEqu);
                }
                '<' if matches!(self.peek(), '=') => {
                    self.next();
                    tokens.push(Token::LAngleEqu);
                }
                '>' if matches!(self.peek(), '=') => {
                    self.next();
                    tokens.push(Token::RAngleEqu);
                }
                '<' if matches!(self.peek(), '-') => {
                    self.next();
                    tokens.push(Token::LArrow);
                }
                '&' if matches!(self.peek(), '&') => {
                    self.next();
                    tokens.push(Token::LAnd);
                }
                '|' if matches!(self.peek(), '|') => {
                    self.next();
                    tokens.push(Token::LOr);
                }

                // Single character tokens
                ';' => tokens.push(Token::Semicolon),
                '+' => tokens.push(Token::Add),
                '-' => tokens.push(Token::Sub),
                '*' => tokens.push(Token::Mul),
                '/' => tokens.push(Token::Div),
                '%' => tokens.push(Token::Mod),
                '|' => tokens.push(Token::BOr),
                '&' => tokens.push(Token::BAnd),
                '^' => tokens.push(Token::BXor),
                '(' => tokens.push(Token::LParen),
                ')' => tokens.push(Token::RParen),
                '~' => tokens.push(Token::Tilde),
                '<' => tokens.push(Token::LAngle),
                '>' => tokens.push(Token::RAngle),
                '=' => tokens.push(Token::Equ),
                '{' => tokens.push(Token::LBraces),
                '}' => tokens.push(Token::RBraces),
                '!' => tokens.push(Token::LNot),

                // Special tokens with variable length

                // Lex multiple characters together as a number
                ch @ '0'..='9' => tokens.push(self.lex_number(ch)?),

                // Lex multiple characters together as a string
                '"' => tokens.push(self.lex_str()?),

                // Lex multiple characters together as an identifier
                ch @ ('a'..='z' | 'A'..='Z' | '_') => tokens.push(self.lex_identifier(ch)?),

                ch => Err(LexErr::UnexpectedChar(ch))?,
            }
        }

        Ok(tokens)
    }

    /// Lex multiple characters as a number until encountering a non-numeric char. This includes
    /// the first character
    fn lex_number(&mut self, first_char: char) -> Result<Token, LexErr> {
        // String representation of the integer value
        let mut sval = String::from(first_char);

        // Consume chars as long as they belong to the number
        loop {
            match self.peek() {
                // Underscore is a separator, so consume it but don't add it to the number
                '_' => {
                    self.next();
                }
                '0'..='9' => {
                    sval.push(self.next());
                }
                // Next char is not part of the number, so stop and finish the number token
                _ => break,
            }
        }

        // Try to convert the string representation of the value to i64
        let i64val = sval.parse().map_err(|_| LexErr::NumericParse(sval))?;
        Ok(Token::I64(i64val))
    }

    /// Lex characters as a string until encountering an unescaped closing double quote '"'
    fn lex_str(&mut self) -> Result<Token, LexErr> {
        // Opening " was consumed in the match

        let mut text = String::new();

        // Read all chars until encountering the closing "
        loop {
            match self.peek() {
                '"' => break,
                // If the end of file is reached while still waiting for '"', error out
                '\0' => Err(LexErr::MissingClosingString)?,
                _ => match self.next() {
                    // Backslash indicates an escaped character
                    '\\' => match self.next() {
                        'n' => text.push('\n'),
                        'r' => text.push('\r'),
                        't' => text.push('\t'),
                        '\\' => text.push('\\'),
                        '"' => text.push('"'),
                        ch => Err(LexErr::InvalidStrEscape(ch))?,
                    },
                    // All other characters are simply appended to the string
                    ch => text.push(ch),
                },
            }
        }

        // Consume closing "
        self.next();

        Ok(Token::String(text))
    }

    /// Lex characters from the text as an identifier. This includes the first character passed in
    fn lex_identifier(&mut self, first_char: char) -> Result<Token, LexErr> {
        let mut ident = String::from(first_char);

        // Consume chars as long as they are valid identifier chars
        loop {
            match self.peek() {
                // In the middle of an identifier, numbers are also allowed
                'a'..='z' | 'A'..='Z' | '0'..='9' | '_' => {
                    ident.push(self.next());
                }
                // Next char is not valid, so stop and finish the ident token
                _ => break,
            }
        }

        // Check for pre-defined keywords
        let token = match ident.as_str() {
            "loop" => Token::Loop,
            "print" => Token::Print,
            "if" => Token::If,
            "else" => Token::Else,

            // If it doesn't match a keyword, it is a normal identifier
            _ => Token::Ident(ident),
        };

        Ok(token)
    }

    /// Advance to the next character and return it, or '\0' once the end of input is reached
    fn next(&mut self) -> char {
        self.code.next().unwrap_or('\0')
    }

    /// Get the next character without removing it, or '\0' once the end of input is reached
    fn peek(&mut self) -> char {
        self.code.peek().copied().unwrap_or('\0')
    }
}

#[cfg(test)]
mod tests {
    use super::{lex, Token};

    #[test]
    fn test_lexer() {
        let code = "33 +5*2 + 4456467*2334+3 % - / << ^ | & >>";
        let expected = vec![
            Token::I64(33),
            Token::Add,
            Token::I64(5),
            Token::Mul,
            Token::I64(2),
            Token::Add,
            Token::I64(4456467),
            Token::Mul,
            Token::I64(2334),
            Token::Add,
            Token::I64(3),
            Token::Mod,
            Token::Sub,
            Token::Div,
            Token::Shl,
            Token::BXor,
            Token::BOr,
            Token::BAnd,
            Token::Shr,
        ];

        let actual = lex(code).unwrap();
        assert_eq!(expected, actual);
    }
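
    // Sketch tests added for illustration (not part of the original suite). They
    // exercise the double-character lookahead and line-comment skipping in `lex`
    // above; the expected Token variants are the ones referenced in the match arms.
    #[test]
    fn test_two_char_tokens() {
        let code = "== != <= >= <- && || < >";
        let expected = vec![
            Token::EquEqu,
            Token::NotEqu,
            Token::LAngleEqu,
            Token::RAngleEqu,
            Token::LArrow,
            Token::LAnd,
            Token::LOr,
            Token::LAngle,
            Token::RAngle,
        ];
        assert_eq!(expected, lex(code).unwrap());
    }

    #[test]
    fn test_line_comments_skipped() {
        // Everything from "//" to the end of the line is consumed without producing tokens
        let code = "1 + 2 // comment runs to end of line\n3";
        let expected = vec![Token::I64(1), Token::Add, Token::I64(2), Token::I64(3)];
        assert_eq!(expected, lex(code).unwrap());
    }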
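
    // Sketch tests added for illustration, covering lex_number's underscore
    // separators and lex_identifier's keyword handling as implemented above.
    #[test]
    fn test_number_underscore_separator() {
        // '_' is consumed as a digit separator and does not appear in the parsed value
        assert_eq!(vec![Token::I64(1_000_000)], lex("1_000_000").unwrap());
    }

    #[test]
    fn test_keywords_and_identifiers() {
        // Reserved words map to dedicated tokens, everything else becomes an Ident
        let code = "loop print if else my_var2";
        let expected = vec![
            Token::Loop,
            Token::Print,
            Token::If,
            Token::Else,
            Token::Ident(String::from("my_var2")),
        ];
        assert_eq!(expected, lex(code).unwrap());
    }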
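
    // Sketch tests added for illustration, covering lex_str's escape handling and
    // the LexErr variants raised on malformed input.
    #[test]
    fn test_string_escapes() {
        let code = r#""a\n\t\\\"b""#;
        let expected = vec![Token::String(String::from("a\n\t\\\"b"))];
        assert_eq!(expected, lex(code).unwrap());
    }

    #[test]
    fn test_lex_errors() {
        use super::LexErr;

        // Unknown characters, unterminated strings and bad escapes are reported as errors
        assert!(matches!(lex("@"), Err(LexErr::UnexpectedChar('@'))));
        assert!(matches!(lex("\"open"), Err(LexErr::MissingClosingString)));
        assert!(matches!(lex("\"\\q\""), Err(LexErr::InvalidStrEscape('q'))));
    }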
}