nek-lang/src/lexer.rs

248 lines
8.2 KiB
Rust

use std::{iter::Peekable, str::Chars};
use anyhow::Result;
use thiserror::Error;
use crate::token::Token;
/// Errors the lexer can report while turning source text into tokens.
#[derive(Debug, Error)]
pub enum LexErr {
    /// The collected digits of a numeric literal could not be parsed.
    /// Carries the digit string that failed to convert.
    #[error("Failed to parse '{0}' as i64")]
    NumericParse(String),

    /// A backslash inside a string literal was followed by a character that
    /// is not a recognised escape. Carries that character.
    #[error("Invalid escape character '\\{0}'")]
    InvalidStrEscape(char),

    /// A character was found that does not start any known token.
    #[error("Lexer encountered unexpected char: '{0}'")]
    UnexpectedChar(char),

    /// The input ended while a string literal was still open.
    #[error("Missing closing string quote '\"'")]
    MissingClosingString,
}
struct Lexer<'a> {
code: Peekable<Chars<'a>>,
}
impl<'a> Lexer<'a> {
fn new(code: &'a str) -> Self {
let code = code.chars().peekable();
Self { code }
}
fn lex(&mut self) -> Result<Vec<Token>, LexErr> {
let mut tokens = Vec::new();
loop {
match self.next() {
// Skip whitespace
' ' | '\t' | '\n' | '\r' => (),
// Stop lexing at EOF
'\0' => break,
'>' if matches!(self.peek(), '>') => {
self.next();
tokens.push(Token::Shr);
}
'<' if matches!(self.peek(), '<') => {
self.next();
tokens.push(Token::Shl);
}
'=' if matches!(self.peek(), '=') => {
self.next();
tokens.push(Token::EquEqu);
}
'!' if matches!(self.peek(), '=') => {
self.next();
tokens.push(Token::NotEqu);
}
'<' if matches!(self.peek(), '=') => {
self.next();
tokens.push(Token::LAngleEqu);
}
'>' if matches!(self.peek(), '=') => {
self.next();
tokens.push(Token::RAngleEqu);
}
'<' if matches!(self.peek(), '-') => {
self.next();
tokens.push(Token::LArrow);
}
'&' if matches!(self.peek(), '&') => {
self.next();
tokens.push(Token::LAnd);
}
'|' if matches!(self.peek(), '|') => {
self.next();
tokens.push(Token::LOr);
}
// Line comment. Consume every char until linefeed (next line)
'/' if matches!(self.peek(), '/') => while self.next() != '\n' {},
';' => tokens.push(Token::Semicolon),
'+' => tokens.push(Token::Add),
'-' => tokens.push(Token::Sub),
'*' => tokens.push(Token::Mul),
'/' => tokens.push(Token::Div),
'%' => tokens.push(Token::Mod),
'|' => tokens.push(Token::BOr),
'&' => tokens.push(Token::BAnd),
'^' => tokens.push(Token::BXor),
'(' => tokens.push(Token::LParen),
')' => tokens.push(Token::RParen),
'~' => tokens.push(Token::Tilde),
'<' => tokens.push(Token::LAngle),
'>' => tokens.push(Token::RAngle),
'=' => tokens.push(Token::Equ),
'{' => tokens.push(Token::LBraces),
'}' => tokens.push(Token::RBraces),
'!' => tokens.push(Token::LNot),
// Lex numbers
ch @ '0'..='9' => {
let mut sval = String::from(ch);
// Do as long as a next char exists and it is a numeric char
loop {
// The next char is verified to be Some, so unwrap is safe
match self.peek() {
// Underscore is a separator, so remove it but don't add to number
'_' => {
self.next();
}
'0'..='9' => {
sval.push(self.next());
}
// Next char is not a number, so stop and finish the number token
_ => break,
}
}
// TODO: We only added numeric chars to the string, but the conversion could still fail
let i64val = sval.parse().map_err(|_| LexErr::NumericParse(sval))?;
tokens.push(Token::I64(i64val));
}
// Lex a string
'"' => {
// Opening " was consumed in match
let mut text = String::new();
loop {
match self.peek() {
'"' => break,
'\0' => Err(LexErr::MissingClosingString)?,
_ => {
match self.next() {
'\\' => {
match self.next() {
'n' => text.push('\n'),
'r' => text.push('\r'),
't' => text.push('\t'),
'\\' => text.push('\\'),
'"' => text.push('"'),
ch => Err(LexErr::InvalidStrEscape(ch))?,
}
}
ch => text.push(ch),
}
}
}
}
// Consume closing "
self.next();
tokens.push(Token::String(text))
}
// Lex characters as identifier
ch @ ('a'..='z' | 'A'..='Z' | '_') => {
let mut ident = String::from(ch);
// Do as long as a next char exists and it is a valid char for an identifier
loop {
match self.peek() {
'a'..='z' | 'A'..='Z' | '0'..='9' | '_' => {
ident.push(self.next());
}
// Next char is not valid, so stop and finish the ident token
_ => break,
}
}
let token = match ident.as_str() {
"loop" => Token::Loop,
"print" => Token::Print,
"if" => Token::If,
"else" => Token::Else,
_ => Token::Ident(ident),
};
tokens.push(token);
}
//TODO: Don't panic, keep calm
ch => Err(LexErr::UnexpectedChar(ch))?,
}
}
Ok(tokens)
}
/// Advance to next character and return the removed char
fn next(&mut self) -> char {
self.code.next().unwrap_or('\0')
}
/// Get the next character without removing it
fn peek(&mut self) -> char {
self.code.peek().copied().unwrap_or('\0')
}
}
/// Tokenize `code` and return the resulting tokens in source order.
///
/// # Errors
///
/// Returns the `LexErr` reported by the lexer if the input cannot be
/// tokenized.
pub fn lex(code: &str) -> Result<Vec<Token>, LexErr> {
    Lexer::new(code).lex()
}
#[cfg(test)]
mod tests {
    use super::{lex, Token};

    /// Lexes an expression that mixes integer literals with arithmetic,
    /// bitwise and shift operators and checks the exact token sequence.
    #[test]
    fn test_lexer() {
        let source = "33 +5*2 + 4456467*2334+3 % - / << ^ | & >>";
        let actual = lex(source).unwrap();

        let expected = vec![
            // 33 +5*2 + 4456467*2334+3
            Token::I64(33),
            Token::Add,
            Token::I64(5),
            Token::Mul,
            Token::I64(2),
            Token::Add,
            Token::I64(4456467),
            Token::Mul,
            Token::I64(2334),
            Token::Add,
            Token::I64(3),
            // % - / << ^ | & >>
            Token::Mod,
            Token::Sub,
            Token::Div,
            Token::Shl,
            Token::BXor,
            Token::BOr,
            Token::BAnd,
            Token::Shr,
        ];

        assert_eq!(expected, actual);
    }
}