From f2a00e65601f45ec1156375f45449e17b265fc80 Mon Sep 17 00:00:00 2001 From: Daniel M Date: Thu, 23 Dec 2021 16:48:49 +0100 Subject: [PATCH] Initial commit - Implemented basic lexer - No spans implemented yet - No real error handling yet --- .gitignore | 1 + Cargo.lock | 14 ++ Cargo.toml | 4 + plang2/Cargo.toml | 7 + plang2/src/main.rs | 23 +++ plang2_lib/Cargo.toml | 8 + plang2_lib/src/lexer.rs | 313 ++++++++++++++++++++++++++++++++++++++++ plang2_lib/src/lib.rs | 5 + plang2_lib/src/token.rs | 198 +++++++++++++++++++++++++ 9 files changed, 573 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 plang2/Cargo.toml create mode 100644 plang2/src/main.rs create mode 100644 plang2_lib/Cargo.toml create mode 100644 plang2_lib/src/lexer.rs create mode 100644 plang2_lib/src/lib.rs create mode 100644 plang2_lib/src/token.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/target diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..dc6ec11 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,14 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
version = 3

[[package]]
name = "plang2"
version = "0.1.0"
dependencies = [
 "plang2_lib",
]

[[package]]
name = "plang2_lib"
version = "0.1.0"

diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..674f3a3
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,4 @@
[workspace]
members = [
    "plang2_lib", "plang2"
]
\ No newline at end of file

diff --git a/plang2/Cargo.toml b/plang2/Cargo.toml
new file mode 100644
index 0000000..0fa7a18
--- /dev/null
+++ b/plang2/Cargo.toml
@@ -0,0 +1,7 @@
[package]
name = "plang2"
version = "0.1.0"
edition = "2021"

[dependencies]
plang2_lib = { path = "../plang2_lib" }
\ No newline at end of file

diff --git a/plang2/src/main.rs b/plang2/src/main.rs
new file mode 100644
index 0000000..9c4f981
--- /dev/null
+++ b/plang2/src/main.rs
@@ -0,0 +1,23 @@
#![allow(dead_code, unused)]
use plang2_lib::*;

/// Demo driver: lex a small hard-coded sample program and print the
/// resulting token stream via `TokenStream`'s `Display` impl.
fn main() {
    let code = r#"
    // This is the main function
    fn main() {
        let a = 5465;
        let b = 8;
        let c = a + b;

        print_int(c);
    }
    "#;

    let mut lexer = Lexer::new(code);

    // NOTE(review): unwrap is acceptable in this demo binary; a real driver
    // should report the LexErr to the user instead of panicking.
    let tokens = lexer.tokenize().unwrap();

    println!("Tokens: \n{}\n", tokens);
}

diff --git a/plang2_lib/Cargo.toml b/plang2_lib/Cargo.toml
new file mode 100644
index 0000000..37d38a0
--- /dev/null
+++ b/plang2_lib/Cargo.toml
@@ -0,0 +1,8 @@
[package]
name = "plang2_lib"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]

diff --git a/plang2_lib/src/lexer.rs b/plang2_lib/src/lexer.rs
new file mode 100644
index 0000000..b852d2c
--- /dev/null
+++ b/plang2_lib/src/lexer.rs
@@ -0,0 +1,313 @@
use std::{iter::Peekable, str::CharIndices};

use super::token::*;

/// The kinds of error the lexer can report.
#[derive(Debug)]
pub enum LexErrType {
    InvalidCharacter(char),
    InvalidEscapeChar(char),
    MissingQuoteEnd,
}

// TODO: Make real errors that contain the span (offending text section with filename + line)
#[derive(Debug)]
pub struct LexErr {
    etype:
LexErrType, +} + +type LexRes = Result; + +pub struct Lexer<'a> { + // code: &'a str, + code_iter: Peekable>, + curr_char: Option<(usize, char)>, +} + +impl<'a> Lexer<'a> { + pub fn new(code: &'a str) -> Self { + let mut code_iter = code.char_indices().peekable(); + let curr_char = code_iter.next(); + Self { + // code, + code_iter, + curr_char, + } + } + + pub fn tokenize(&mut self) -> LexRes { + let mut tokens = Vec::new(); + + loop { + let (_idx, ch) = match self.curr_char { + Some(it) => it, + None => break, + }; + + let (_idx_nxt, ch_nxt) = self + .peek() + .map(|(a, b)| (Some(a), Some(b))) + .unwrap_or_default(); + + match ch { + // Skip whitespace + ' ' | '\t' | '\n' | '\r' => (), + + // Lex tokens with 2 char length + '/' if matches!(ch_nxt, Some('/')) => self.advance_until_new_line(), + '=' if matches!(ch_nxt, Some('=')) => { + self.advance(); + tokens.push(Token::Op(Op::Eq)); + } + '!' if matches!(ch_nxt, Some('=')) => { + self.advance(); + tokens.push(Token::Op(Op::Neq)); + } + '>' if matches!(ch_nxt, Some('=')) => { + self.advance(); + tokens.push(Token::Op(Op::Ge)); + } + '<' if matches!(ch_nxt, Some('=')) => { + self.advance(); + tokens.push(Token::Op(Op::Le)); + } + '-' if matches!(ch_nxt, Some('>')) => { + self.advance(); + tokens.push(Token::Op(Op::Arrow)); + } + '&' if matches!(ch_nxt, Some('&')) => { + self.advance(); + tokens.push(Token::Op(Op::And)); + } + '|' if matches!(ch_nxt, Some('|')) => { + self.advance(); + tokens.push(Token::Op(Op::Or)); + } + + // Lex tokens with 1 char length + '+' => tokens.push(Token::Op(Op::Add)), + '-' => tokens.push(Token::Op(Op::Sub)), + '*' => tokens.push(Token::Op(Op::Mul)), + '/' => tokens.push(Token::Op(Op::Div)), + '%' => tokens.push(Token::Op(Op::Mod)), + '(' => tokens.push(Token::Open(Group::Paren)), + '[' => tokens.push(Token::Open(Group::Bracket)), + '{' => tokens.push(Token::Open(Group::Braces)), + ')' => tokens.push(Token::Close(Group::Paren)), + ']' => tokens.push(Token::Close(Group::Bracket)), + '}' 
=> tokens.push(Token::Close(Group::Braces)), + '=' => tokens.push(Token::Op(Op::Assign)), + '>' => tokens.push(Token::Op(Op::Gt)), + '<' => tokens.push(Token::Op(Op::Lt)), + ';' => tokens.push(Token::Semicolon), + ':' => tokens.push(Token::Colon), + ',' => tokens.push(Token::Comma), + '.' => tokens.push(Token::Dot), + '!' => tokens.push(Token::Op(Op::Not)), + '^' => tokens.push(Token::Op(Op::Xor)), + + // Lex Strings + '"' => tokens.push(self.read_string()?), + + // Lex numbers + '0'..='9' => tokens.push(self.read_num()?), + + // Lex identifiers / keywords + 'a'..='z' | 'A'..='Z' | '_' => tokens.push(self.read_ident_or_keyword()?), + + // Anything else is an error + _ => { + return Err(LexErr::new(LexErrType::InvalidCharacter(ch))) + } + } + + self.advance(); + } + + Ok(TokenStream::new(tokens)) + } + + fn peek(&mut self) -> Option<&(usize, char)> { + self.code_iter.peek() + } + + fn advance(&mut self) { + self.curr_char = self.code_iter.next(); + } + + fn advance_until_new_line(&mut self) { + while !matches!(self.curr_char, Some((_, '\n'))) { + self.advance(); + } + if matches!(self.curr_char, Some((_, '\r'))) { + self.advance(); + } + } + + fn read_num(&mut self) -> LexRes { + let mut snum = format!("{}", self.curr_char.unwrap().1); + + while let Some((_idx, ch)) = self.peek() { + match ch { + '0'..='9' => snum.push(*ch), + _ => break, + } + self.advance(); + } + + // Only verified numeric chars were added so this should not fail + // Actually it could easily fail if the number is too big + // TODO: So this should be checked and converted into a LexErr + Ok(Token::Literal(Literal::Int64(snum.parse().unwrap()))) + } + + fn read_string(&mut self) -> LexRes { + let mut text = String::new(); + + let mut escape = false; + loop { + let (_idx, ch) = match self.peek() { + Some(it) => *it, + None => return Err(LexErr::new(LexErrType::MissingQuoteEnd)), + }; + + if escape { + match ch { + '"' | '\\' => text.push(ch), + '\n' => text.push('\n'), + 'r' => text.push('\r'), + 
't' => text.push('\t'),
                    _ => return Err(LexErr::new(LexErrType::InvalidEscapeChar(ch))),
                }
                escape = false;
            } else {
                match ch {
                    '"' => break,
                    '\\' => escape = true,
                    _ => text.push(ch),
                }
            }

            self.advance();
        }
        // Consume the closing quote as well.
        self.advance();

        Ok(Token::Literal(Literal::String(text)))
    }

    /// Read an identifier, then decide whether it is actually a keyword or a
    /// boolean literal.
    fn read_ident_or_keyword(&mut self) -> LexRes<Token> {
        let mut ident = format!("{}", self.curr_char.unwrap().1);

        while let Some((_idx, ch)) = self.peek() {
            match ch {
                '0'..='9' | 'a'..='z' | 'A'..='Z' | '_' => ident.push(*ch),
                _ => break,
            }
            self.advance();
        }

        let token = match ident.as_str() {
            "let" => Token::Keyword(Keyword::Let),
            "if" => Token::Keyword(Keyword::If),
            "else" => Token::Keyword(Keyword::Else),
            "while" => Token::Keyword(Keyword::While),
            "loop" => Token::Keyword(Keyword::Loop),
            "fn" => Token::Keyword(Keyword::Fn),
            "return" => Token::Keyword(Keyword::Return),
            "void" => Token::Keyword(Keyword::Void),

            "true" => Token::Literal(Literal::Boolean(true)),
            "false" => Token::Literal(Literal::Boolean(false)),

            _ => Token::Ident(ident),
        };

        Ok(token)
    }
}

impl LexErr {
    pub fn new(etype: LexErrType) -> Self {
        Self { etype }
    }
}


#[cfg(test)]
mod test {
    use super::*;

    /// Try to lex a sequential string containing at least one of each tokens
    #[test]
    fn test_general() {
        let code = r#"
        // A comment
        + -
        * / %
        == != > < >= <=
        = ->
        && || ^ !
        ([{)]}
        4564 "a string" false true
        an_5ident6
        ; : , .
let if while loop else fn return void
        "#;

        let expected_tokens = vec![
            Token::Op(Op::Add),
            Token::Op(Op::Sub),

            Token::Op(Op::Mul),
            Token::Op(Op::Div),
            Token::Op(Op::Mod),

            Token::Op(Op::Eq),
            Token::Op(Op::Neq),
            Token::Op(Op::Gt),
            Token::Op(Op::Lt),
            Token::Op(Op::Ge),
            Token::Op(Op::Le),

            Token::Op(Op::Assign),
            Token::Op(Op::Arrow),

            Token::Op(Op::And),
            Token::Op(Op::Or),
            Token::Op(Op::Xor),
            Token::Op(Op::Not),

            Token::Open(Group::Paren),
            Token::Open(Group::Bracket),
            Token::Open(Group::Braces),
            Token::Close(Group::Paren),
            Token::Close(Group::Bracket),
            Token::Close(Group::Braces),

            Token::Literal(Literal::Int64(4564)),
            Token::Literal(Literal::String("a string".to_string())),
            Token::Literal(Literal::Boolean(false)),
            Token::Literal(Literal::Boolean(true)),

            Token::Ident("an_5ident6".to_string()),

            Token::Semicolon,
            Token::Colon,
            Token::Comma,
            Token::Dot,

            Token::Keyword(Keyword::Let),
            Token::Keyword(Keyword::If),
            Token::Keyword(Keyword::While),
            Token::Keyword(Keyword::Loop),
            Token::Keyword(Keyword::Else),
            Token::Keyword(Keyword::Fn),
            Token::Keyword(Keyword::Return),
            Token::Keyword(Keyword::Void),
        ];

        let mut lexer = Lexer::new(code);
        let tokens = lexer.tokenize().unwrap();

        assert_eq!(tokens.as_vec(), &expected_tokens);
    }
}

diff --git a/plang2_lib/src/lib.rs b/plang2_lib/src/lib.rs
new file mode 100644
index 0000000..7269279
--- /dev/null
+++ b/plang2_lib/src/lib.rs
@@ -0,0 +1,5 @@
pub mod token;
pub mod lexer;

pub use token::*;
pub use lexer::*;

diff --git a/plang2_lib/src/token.rs b/plang2_lib/src/token.rs
new file mode 100644
index 0000000..0e7a8c5
--- /dev/null
+++ b/plang2_lib/src/token.rs
@@ -0,0 +1,198 @@
use std::{fmt::Display, borrow::Cow};

/// Operators
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Op {
    // Addition
    Add,
    Sub,

    // Multiplications
    Mul,
    Div,
    Mod,

    // Assignment
    Assign,

    // Equality
    Eq,
    Neq,
Gt, + Lt, + Ge, + Le, + + // Bool + And, + Or, + Not, + Xor, + + Arrow, +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub enum Group { + Paren, + Bracket, + Braces, +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub enum Literal { + Boolean(bool), + Int64(i64), + String(String), +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub enum Keyword { + Let, + While, + Loop, + If, + Else, + Fn, + Return, + Void, +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub enum Token { + Literal(Literal), + Op(Op), + Open(Group), + Close(Group), + + Ident(String), + + Keyword(Keyword), + + Semicolon, + Colon, + Comma, + Dot, +} + +pub struct TokenStream { + tokens: Vec, + idx: usize, +} + +impl Display for Token { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + + let op: Cow<'static, str> = match self { + Token::Op(Op::Add) => "+".into(), + Token::Op(Op::Sub) => "-".into(), + + Token::Op(Op::Mul) => "*".into(), + Token::Op(Op::Div) => "/".into(), + Token::Op(Op::Mod) => "%".into(), + + Token::Op(Op::Eq) => "==".into(), + Token::Op(Op::Neq) => "!=".into(), + Token::Op(Op::Gt) => ">".into(), + Token::Op(Op::Lt) => "<".into(), + Token::Op(Op::Ge) => ">=".into(), + Token::Op(Op::Le) => "<=".into(), + + Token::Op(Op::Assign) => "=".into(), + Token::Op(Op::Arrow) => "->".into(), + + Token::Op(Op::And) => "&&".into(), + Token::Op(Op::Or) => "||".into(), + Token::Op(Op::Xor) => "^".into(), + Token::Op(Op::Not) => "!".into(), + + Token::Open(Group::Paren) => "(".into(), + Token::Open(Group::Bracket) => "[".into(), + Token::Open(Group::Braces) => "{".into(), + Token::Close(Group::Paren) => ")".into(), + Token::Close(Group::Bracket) => "]".into(), + Token::Close(Group::Braces) => "}".into(), + + Token::Literal(Literal::Int64(num)) => format!("Int64({})", num).into(), + Token::Literal(Literal::String(text)) => format!("String({})", text).into(), + Token::Literal(Literal::Boolean(val)) => format!("Boolean({})", val).into(), + + Token::Ident(ident) => 
format!("Ident({})", ident).into(),

            Token::Semicolon => ";".into(),
            Token::Colon => ":".into(),
            Token::Comma => ",".into(),
            Token::Dot => ".".into(),

            Token::Keyword(Keyword::Let) => "let".into(),
            Token::Keyword(Keyword::If) => "if".into(),
            Token::Keyword(Keyword::While) => "while".into(),
            Token::Keyword(Keyword::Loop) => "loop".into(),
            Token::Keyword(Keyword::Else) => "else".into(),
            Token::Keyword(Keyword::Fn) => "fn".into(),
            Token::Keyword(Keyword::Return) => "return".into(),
            Token::Keyword(Keyword::Void) => "void".into(),
        };

        write!(f, "{}", op)
    }
}

impl TokenStream {
    /// Wrap a token list with the cursor at the first token.
    /// (Restored the `Vec<Token>` parameter/return types that the markup
    /// mangling stripped down to bare `Vec`.)
    pub fn new(tokens: Vec<Token>) -> Self {
        Self { tokens, idx: 0 }
    }

    /// Borrow the underlying token list (used by the lexer tests).
    pub fn as_vec(&self) -> &Vec<Token> {
        &self.tokens
    }

    /// Token at the cursor, if any.
    pub fn curr(&self) -> Option<&Token> {
        self.tokens.get(self.idx)
    }

    /// Token just after the cursor, if any.
    pub fn peek(&self) -> Option<&Token> {
        self.tokens.get(self.idx + 1)
    }

    /// Move the cursor one token forward (may run past the end; `curr`
    /// then returns `None`).
    pub fn advance(&mut self) {
        self.idx += 1
    }
}

impl Display for TokenStream {
    /// Print the TokenStream with auto-formatting: four spaces of indent per
    /// open brace, a line break after `;`, `{` and `}`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut indent = 0_usize;
        let mut fresh_line = true;

        for tok in self.tokens.iter() {
            // Dedent before printing a closing brace so it lines up with its
            // opening statement.
            if matches!(tok, Token::Close(Group::Braces)) {
                indent = indent.saturating_sub(1);
                fresh_line = true;
            }

            if fresh_line {
                write!(f, "{}", " ".repeat(indent * 4))?;
                fresh_line = false;
            }

            write!(f, "{} ", tok)?;

            match tok {
                Token::Open(Group::Braces) => {
                    writeln!(f)?;
                    indent += 1;
                    fresh_line = true;
                }
                Token::Semicolon | Token::Close(Group::Braces) => {
                    writeln!(f)?;
                    fresh_line = true;
                }
                _ => (),
            }
        }

        Ok(())
    }
}