Refactor lexer match loop
parent 726dd62794
commit 926bdeb2dc

src/lexer.rs: 166 changed lines
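The refactor stores the character being lexed on the `Lexer` (new `current_char` and `tokens` fields) so the main loop can dispatch on a `(current_char, peeked_char)` tuple instead of per-arm `if matches!(self.peek(), ...)` guards, and the repeated push-and-advance blocks collapse into the new `push_tok`/`push_tok_consume` helpers. A minimal, self-contained sketch of that dispatch pattern, with a simplified `Tok` enum and `Sketch` struct as stand-ins for this repo's `Token` type and `T![...]` macro (not the crate's actual code):

use std::iter::Peekable;
use std::str::Chars;

// Simplified stand-in for the crate's Token type and T![...] macro.
#[derive(Debug, PartialEq)]
enum Tok { Shl, Shr, EqEq, Lt, Gt, Eq }

struct Sketch<'a> {
    code: Peekable<Chars<'a>>,
    tokens: Vec<Tok>,
}

impl<'a> Sketch<'a> {
    /// Consume and return the next char, or '\0' at end of input.
    fn next_ch(&mut self) -> char {
        self.code.next().unwrap_or('\0')
    }

    /// Look at the upcoming char without consuming it.
    fn peek_ch(&mut self) -> char {
        *self.code.peek().unwrap_or(&'\0')
    }

    /// Push `tok` and also consume the peeked char (two-char tokens).
    fn push_tok_consume(&mut self, tok: Tok) {
        self.next_ch();
        self.tokens.push(tok);
    }

    fn lex(mut self) -> Vec<Tok> {
        loop {
            // Two-char arms match both tuple slots; one-char arms ignore
            // the peeked char with `_`. Arm order matters: ('<', '<')
            // must come before the catch-all ('<', _).
            match (self.next_ch(), self.peek_ch()) {
                ('\0', _) => break,
                (' ' | '\t' | '\n' | '\r', _) => (),
                ('<', '<') => self.push_tok_consume(Tok::Shl),
                ('>', '>') => self.push_tok_consume(Tok::Shr),
                ('=', '=') => self.push_tok_consume(Tok::EqEq),
                ('<', _) => self.tokens.push(Tok::Lt),
                ('>', _) => self.tokens.push(Tok::Gt),
                ('=', _) => self.tokens.push(Tok::Eq),
                (ch, _) => panic!("unexpected char {ch:?}"),
            }
        }
        self.tokens
    }
}

fn main() {
    let lexer = Sketch { code: "<< = >>".chars().peekable(), tokens: Vec::new() };
    assert_eq!(lexer.lex(), vec![Tok::Shl, Tok::Eq, Tok::Shr]);
}

Since `lex` now takes `mut self` by value, the finished lexer can hand back its accumulated `self.tokens` directly without cloning.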
@@ -20,118 +20,100 @@ pub enum LexErr {
 
 /// Lex the provided code into a Token Buffer
 pub fn lex(code: &str) -> Result<Vec<Token>, LexErr> {
-    let mut lexer = Lexer::new(code);
+    let lexer = Lexer::new(code);
     lexer.lex()
 }
 
 struct Lexer<'a> {
     /// The source code text as an iterator over the chars
     code: Peekable<Chars<'a>>,
+    /// The lexed tokens
+    tokens: Vec<Token>,
+    /// The source code character that is currently being lexed
+    current_char: char,
 }
 
 impl<'a> Lexer<'a> {
     fn new(code: &'a str) -> Self {
         let code = code.chars().peekable();
-        Self { code }
+        let tokens = Vec::new();
+        let current_char = '\0';
+        Self {
+            code,
+            tokens,
+            current_char,
+        }
     }
 
-    fn lex(&mut self) -> Result<Vec<Token>, LexErr> {
-        let mut tokens = Vec::new();
+    fn lex(mut self) -> Result<Vec<Token>, LexErr> {
 
         loop {
-            match self.next() {
+            self.current_char = self.next();
+            match (self.current_char, self.peek()) {
                 // Stop lexing at EOF
-                '\0' => break,
+                ('\0', _) => break,
 
                 // Skip whitespace
-                ' ' | '\t' | '\n' | '\r' => (),
+                (' ' | '\t' | '\n' | '\r', _) => (),
 
                 // Line comment. Consume every char until linefeed (next line)
-                '/' if matches!(self.peek(), '/') => while !matches!(self.next(), '\n' | '\0') {},
+                ('/', '/') => while !matches!(self.next(), '\n' | '\0') {},
 
                 // Double character tokens
-                '>' if matches!(self.peek(), '>') => {
-                    self.next();
-                    tokens.push(T![>>]);
-                }
-                '<' if matches!(self.peek(), '<') => {
-                    self.next();
-                    tokens.push(T![<<]);
-                }
-                '=' if matches!(self.peek(), '=') => {
-                    self.next();
-                    tokens.push(T![==]);
-                }
-                '!' if matches!(self.peek(), '=') => {
-                    self.next();
-                    tokens.push(T![!=]);
-                }
-                '<' if matches!(self.peek(), '=') => {
-                    self.next();
-                    tokens.push(T![<=]);
-                }
-                '>' if matches!(self.peek(), '=') => {
-                    self.next();
-                    tokens.push(T![>=]);
-                }
-                '<' if matches!(self.peek(), '-') => {
-                    self.next();
-                    tokens.push(T![<-]);
-                }
-                '&' if matches!(self.peek(), '&') => {
-                    self.next();
-                    tokens.push(T![&&]);
-                }
-                '|' if matches!(self.peek(), '|') => {
-                    self.next();
-                    tokens.push(T![||]);
-                }
+                ('>', '>') => self.push_tok_consume(T![>>]),
+                ('<', '<') => self.push_tok_consume(T![<<]),
+                ('=', '=') => self.push_tok_consume(T![==]),
+                ('!', '=') => self.push_tok_consume(T![!=]),
+                ('<', '=') => self.push_tok_consume(T![<=]),
+                ('>', '=') => self.push_tok_consume(T![>=]),
+                ('<', '-') => self.push_tok_consume(T![<-]),
+                ('&', '&') => self.push_tok_consume(T![&&]),
+                ('|', '|') => self.push_tok_consume(T![||]),
 
                 // Single character tokens
-                ';' => tokens.push(T![;]),
-                '+' => tokens.push(T![+]),
-                '-' => tokens.push(T![-]),
-                '*' => tokens.push(T![*]),
-                '/' => tokens.push(T![/]),
-                '%' => tokens.push(T![%]),
-                '|' => tokens.push(T![|]),
-                '&' => tokens.push(T![&]),
-                '^' => tokens.push(T![^]),
-                '(' => tokens.push(T!['(']),
-                ')' => tokens.push(T![')']),
-                '~' => tokens.push(T![~]),
-                '<' => tokens.push(T![<]),
-                '>' => tokens.push(T![>]),
-                '=' => tokens.push(T![=]),
-                '{' => tokens.push(T!['{']),
-                '}' => tokens.push(T!['}']),
-                '!' => tokens.push(T![!]),
-                '[' => tokens.push(T!['[']),
-                ']' => tokens.push(T![']']),
+                (';', _) => self.push_tok(T![;]),
+                ('+', _) => self.push_tok(T![+]),
+                ('-', _) => self.push_tok(T![-]),
+                ('*', _) => self.push_tok(T![*]),
+                ('/', _) => self.push_tok(T![/]),
+                ('%', _) => self.push_tok(T![%]),
+                ('|', _) => self.push_tok(T![|]),
+                ('&', _) => self.push_tok(T![&]),
+                ('^', _) => self.push_tok(T![^]),
+                ('(', _) => self.push_tok(T!['(']),
+                (')', _) => self.push_tok(T![')']),
+                ('~', _) => self.push_tok(T![~]),
+                ('<', _) => self.push_tok(T![<]),
+                ('>', _) => self.push_tok(T![>]),
+                ('=', _) => self.push_tok(T![=]),
+                ('{', _) => self.push_tok(T!['{']),
+                ('}', _) => self.push_tok(T!['}']),
+                ('!', _) => self.push_tok(T![!]),
+                ('[', _) => self.push_tok(T!['[']),
+                (']', _) => self.push_tok(T![']']),
 
                 // Special tokens with variable length
 
                 // Lex multiple characters together as numbers
-                ch @ '0'..='9' => tokens.push(self.lex_number(ch)?),
+                ('0'..='9', _) => self.lex_number()?,
 
                 // Lex multiple characters together as a string
-                '"' => tokens.push(self.lex_str()?),
+                ('"', _) => self.lex_str()?,
 
                 // Lex multiple characters together as identifier
-                ch @ ('a'..='z' | 'A'..='Z' | '_') => tokens.push(self.lex_identifier(ch)?),
+                ('a'..='z' | 'A'..='Z' | '_', _) => self.lex_identifier()?,
 
-                ch => Err(LexErr::UnexpectedChar(ch))?,
+                (ch, _) => Err(LexErr::UnexpectedChar(ch))?,
             }
         }
 
-        Ok(tokens)
+        Ok(self.tokens)
     }
 
-    /// Lex multiple characters as a number until encountering a non-numeric char. This includes
-    /// the first character
-    fn lex_number(&mut self, first_char: char) -> Result<Token, LexErr> {
+    /// Lex multiple characters as a number until encountering a non-numeric char. The
+    /// successfully lexed i64 literal token is appended to the stored tokens.
+    fn lex_number(&mut self) -> Result<(), LexErr> {
         // String representation of the integer value
-        let mut sval = String::from(first_char);
+        let mut sval = String::from(self.current_char);
 
         // Do as long as a next char exists and it is a numeric char
        loop {
@@ -151,11 +133,15 @@ impl<'a> Lexer<'a> {
 
         // Try to convert the string representation of the value to i64
         let i64val = sval.parse().map_err(|_| LexErr::NumericParse(sval))?;
-        Ok(T![i64(i64val)])
+
+        self.push_tok(T![i64(i64val)]);
+
+        Ok(())
     }
 
-    /// Lex characters as a string until encountering an unescaped closing double quote char '"'
-    fn lex_str(&mut self) -> Result<Token, LexErr> {
+    /// Lex characters as a string until encountering an unescaped closing double quote char '"'.
+    /// The successfully lexed string literal token is appended to the stored tokens.
+    fn lex_str(&mut self) -> Result<(), LexErr> {
         // Opening " was consumed in match
 
         let mut text = String::new();
@@ -185,12 +171,15 @@ impl<'a> Lexer<'a> {
         // Consume closing "
         self.next();
 
-        Ok(T![str(text)])
+        self.push_tok(T![str(text)]);
+
+        Ok(())
     }
 
-    /// Lex characters from the text as an identifier. This includes the first character passed in
-    fn lex_identifier(&mut self, first_char: char) -> Result<Token, LexErr> {
-        let mut ident = String::from(first_char);
+    /// Lex characters from the text as an identifier. The successfully lexed ident or keyword
+    /// token is appended to the stored tokens.
+    fn lex_identifier(&mut self) -> Result<(), LexErr> {
+        let mut ident = String::from(self.current_char);
 
         // Do as long as a next char exists and it is a valid char for an identifier
         loop {
@@ -215,7 +204,20 @@ impl<'a> Lexer<'a> {
             _ => T![ident(ident)],
         };
 
-        Ok(token)
+        self.push_tok(token);
+
+        Ok(())
+    }
+
+    /// Push the given token into the stored tokens
+    fn push_tok(&mut self, token: Token) {
+        self.tokens.push(token);
+    }
+
+    /// Same as `push_tok` but also consumes the next char, removing it from the code iter
+    fn push_tok_consume(&mut self, token: Token) {
+        self.next();
+        self.tokens.push(token);
     }
 
     /// Advance to next character and return the removed char