From f6152670aa5c2b578eacf31378868c59e1506f3a Mon Sep 17 00:00:00 2001 From: Daniel M Date: Thu, 3 Feb 2022 17:25:55 +0100 Subject: [PATCH] Small refactor for lexer --- src/lexer.rs | 184 +++++++++++++++++++++++++++------------------------ 1 file changed, 98 insertions(+), 86 deletions(-) diff --git a/src/lexer.rs b/src/lexer.rs index 73ee243..4bca8e0 100644 --- a/src/lexer.rs +++ b/src/lexer.rs @@ -107,95 +107,16 @@ impl<'a> Lexer<'a> { '}' => tokens.push(Token::RBraces), '!' => tokens.push(Token::LNot), - // Lex numbers - ch @ '0'..='9' => { - // String representation of the integer value - let mut sval = String::from(ch); + // Special tokens with variable length - // Do as long as a next char exists and it is a numeric char - loop { - // The next char is verified to be Some, so unwrap is safe - match self.peek() { - // Underscore is a separator, so remove it but don't add to number - '_' => { - self.next(); - } - '0'..='9' => { - sval.push(self.next()); - } - // Next char is not a number, so stop and finish the number token - _ => break, - } - } + // Lex multiple characters together as numbers + ch @ '0'..='9' => tokens.push(self.lex_number(ch)?), - // Try to convert the string representation of the value to i64 - let i64val = sval.parse().map_err(|_| LexErr::NumericParse(sval))?; - tokens.push(Token::I64(i64val)); - } + // Lex multiple characters together as a string + '"' => tokens.push(self.lex_str()?), - // Lex a string - '"' => { - // Opening " was consumed in match - - let mut text = String::new(); - - // Read all chars until encountering the closing " - loop { - match self.peek() { - '"' => break, - // If the end of file is reached while still waiting for '"', error out - '\0' => Err(LexErr::MissingClosingString)?, - _ => match self.next() { - // Backshlash indicates an escaped character - '\\' => match self.next() { - 'n' => text.push('\n'), - 'r' => text.push('\r'), - 't' => text.push('\t'), - '\\' => text.push('\\'), - '"' => text.push('"'), - ch => Err(LexErr::InvalidStrEscape(ch))?, - }, - // All other characters are simply appended to the string - ch => text.push(ch), - }, - } - } - - // Consume closing " - self.next(); - - tokens.push(Token::String(text)) - } - - // Lex characters as identifier - ch @ ('a'..='z' | 'A'..='Z' | '_') => { - let mut ident = String::from(ch); - - // Do as long as a next char exists and it is a valid char for an identifier - loop { - match self.peek() { - // In the middle of an identifier numbers are also allowed - 'a'..='z' | 'A'..='Z' | '0'..='9' | '_' => { - ident.push(self.next()); - } - // Next char is not valid, so stop and finish the ident token - _ => break, - } - } - - // Check for pre-defined keywords - let token = match ident.as_str() { - "loop" => Token::Loop, - "print" => Token::Print, - "if" => Token::If, - "else" => Token::Else, - - // If it doesn't match a keyword, it is a normal identifier - _ => Token::Ident(ident), - }; - - tokens.push(token); - } + // Lex multiple characters together as identifier + ch @ ('a'..='z' | 'A'..='Z' | '_') => tokens.push(self.lex_identifier(ch)?), ch => Err(LexErr::UnexpectedChar(ch))?, } @@ -204,6 +125,97 @@ impl<'a> Lexer<'a> { Ok(tokens) } + /// Lex multiple characters as a number until encountering a non numeric digit. This includes + /// the first character + fn lex_number(&mut self, first_char: char) -> Result { + // String representation of the integer value + let mut sval = String::from(first_char); + + // Do as long as a next char exists and it is a numeric char + loop { + // The next char is verified to be Some, so unwrap is safe + match self.peek() { + // Underscore is a separator, so remove it but don't add to number + '_' => { + self.next(); + } + '0'..='9' => { + sval.push(self.next()); + } + // Next char is not a number, so stop and finish the number token + _ => break, + } + } + + // Try to convert the string representation of the value to i64 + let i64val = sval.parse().map_err(|_| LexErr::NumericParse(sval))?; + Ok(Token::I64(i64val)) + } + + /// Lex characters as a string until encountering an unescaped closing doublequoute char '"' + fn lex_str(&mut self) -> Result { + // Opening " was consumed in match + + let mut text = String::new(); + + // Read all chars until encountering the closing " + loop { + match self.peek() { + '"' => break, + // If the end of file is reached while still waiting for '"', error out + '\0' => Err(LexErr::MissingClosingString)?, + _ => match self.next() { + // Backshlash indicates an escaped character + '\\' => match self.next() { + 'n' => text.push('\n'), + 'r' => text.push('\r'), + 't' => text.push('\t'), + '\\' => text.push('\\'), + '"' => text.push('"'), + ch => Err(LexErr::InvalidStrEscape(ch))?, + }, + // All other characters are simply appended to the string + ch => text.push(ch), + }, + } + } + + // Consume closing " + self.next(); + + Ok(Token::String(text)) + } + + /// Lex characters from the text as an identifier. This includes the first character passed in + fn lex_identifier(&mut self, first_char: char) -> Result { + let mut ident = String::from(first_char); + + // Do as long as a next char exists and it is a valid char for an identifier + loop { + match self.peek() { + // In the middle of an identifier numbers are also allowed + 'a'..='z' | 'A'..='Z' | '0'..='9' | '_' => { + ident.push(self.next()); + } + // Next char is not valid, so stop and finish the ident token + _ => break, + } + } + + // Check for pre-defined keywords + let token = match ident.as_str() { + "loop" => Token::Loop, + "print" => Token::Print, + "if" => Token::If, + "else" => Token::Else, + + // If it doesn't match a keyword, it is a normal identifier + _ => Token::Ident(ident), + }; + + Ok(token) + } + /// Advance to next character and return the removed char fn next(&mut self) -> char { self.code.next().unwrap_or('\0')