Initial commit

- Implemented basic lexer - No spans implemented yet - No real error handling yet
2021-12-23 16:48:49 +01:00 · 2021-12-23 16:48:49 +01:00 · f2a00e6560
commit f2a00e6560
9 changed files with 573 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1 @@
+/target
--- a/Cargo.lock
+++ b/Cargo.lock
@ -0,0 +1,14 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "plang2"
+version = "0.1.0"
+dependencies = [
+ "plang2_lib",
+]
+
+[[package]]
+name = "plang2_lib"
+version = "0.1.0"
--- a/Cargo.toml
+++ b/Cargo.toml
@ -0,0 +1,4 @@
+[workspace]
+members = [
+    "plang2_lib", "plang2"
+]
--- a/plang2/Cargo.toml
+++ b/plang2/Cargo.toml
@ -0,0 +1,7 @@
+[package]
+name = "plang2"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+plang2_lib = { path = "../plang2_lib" }
--- a/plang2/src/main.rs
+++ b/plang2/src/main.rs
@ -0,0 +1,23 @@
+#![allow(dead_code, unused)]
+use plang2_lib::*;
+
+/// Demo entry point: lexes a small hard-coded sample program and prints the
+/// resulting token stream. Panics if the sample fails to lex.
+fn main() {
+    // Sample program handed to the lexer.
+    let sample_src = r#"
+        // This is the main function
+        fn main() {
+            let a = 5465;
+            let b = 8;
+            let c = a + b;
+
+            print_int(c);
+        }
+    "#;
+
+    let token_stream = Lexer::new(sample_src).tokenize().unwrap();
+    println!("Tokens: \n{}\n", token_stream);
+}
--- a/plang2_lib/Cargo.toml
+++ b/plang2_lib/Cargo.toml
@ -0,0 +1,8 @@
+[package]
+name = "plang2_lib"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
--- a/plang2_lib/src/lexer.rs
+++ b/plang2_lib/src/lexer.rs
@ -0,0 +1,313 @@
+use std::{iter::Peekable, str::CharIndices};
+
+use super::token::*;
+
+/// The kinds of failure the lexer can report.
+#[derive(Debug)]
+pub enum LexErrType {
+    /// A character that does not start any known token.
+    InvalidCharacter(char),
+    /// A character following a backslash inside a string literal that is not a supported escape.
+    InvalidEscapeChar(char),
+    /// The input ended before the closing quote of a string literal was found.
+    MissingQuoteEnd,
+}
+
+// TODO: Make real errors that contain the span (offending text section with filename + line)
+/// Error produced by the lexer; for now it only records which kind of failure occurred.
+#[derive(Debug)]
+pub struct LexErr {
+    /// What went wrong.
+    etype: LexErrType,
+}
+
+/// Shorthand for results whose error type is [`LexErr`].
+type LexRes<T> = Result<T, LexErr>;
+
+/// Character-level lexer over a borrowed source string.
+pub struct Lexer<'a> {
+    // code: &'a str,
+    /// Remaining `(byte index, char)` pairs of the source; peekable for one character of lookahead.
+    code_iter: Peekable<CharIndices<'a>>,
+    /// The character currently being examined, or `None` once the input is exhausted.
+    curr_char: Option<(usize, char)>,
+}
+
+impl<'a> Lexer<'a> {
+    /// Creates a lexer over `code`, already positioned on its first character
+    /// (or at end of input when `code` is empty).
+    pub fn new(code: &'a str) -> Self {
+        let mut chars = code.char_indices().peekable();
+        let first = chars.next();
+        Lexer {
+            code_iter: chars,
+            curr_char: first,
+        }
+    }
+
+    /// Lexes the remaining input into a [`TokenStream`].
+    ///
+    /// Whitespace and `//` line comments are skipped. Two-character operators are
+    /// recognised with one character of lookahead before their one-character prefixes.
+    ///
+    /// # Errors
+    /// Returns `InvalidCharacter` for any character that starts no token (this includes a
+    /// lone `&` or `|`, which only exist as `&&` / `||`), and propagates errors from the
+    /// string, number and identifier readers.
+    ///
+    /// # Panics
+    /// `read_num` unwraps the integer parse, so a literal that does not fit the integer
+    /// type panics instead of returning an error.
+    pub fn tokenize(&mut self) -> LexRes<TokenStream> {
+        let mut tokens = Vec::new();
+
+        loop {
+            // Stop once the input is exhausted.
+            let (_idx, ch) = match self.curr_char {
+                Some(it) => it,
+                None => break,
+            };
+
+            // One character of lookahead; both parts are `None` at end of input.
+            let (_idx_nxt, ch_nxt) = self
+                .peek()
+                .map(|(a, b)| (Some(a), Some(b)))
+                .unwrap_or_default();
+
+            match ch {
+                // Skip whitespace
+                ' ' | '\t' | '\n' | '\r' => (),
+
+                // Lex tokens with 2 char length
+                // These guarded arms must stay above the 1 char arms for the same first character.
+                '/' if matches!(ch_nxt, Some('/')) => self.advance_until_new_line(),
+                '=' if matches!(ch_nxt, Some('=')) => {
+                    self.advance();
+                    tokens.push(Token::Op(Op::Eq));
+                }
+                '!' if matches!(ch_nxt, Some('=')) => {
+                    self.advance();
+                    tokens.push(Token::Op(Op::Neq));
+                }
+                '>' if matches!(ch_nxt, Some('=')) => {
+                    self.advance();
+                    tokens.push(Token::Op(Op::Ge));
+                }
+                '<' if matches!(ch_nxt, Some('=')) => {
+                    self.advance();
+                    tokens.push(Token::Op(Op::Le));
+                }
+                '-' if matches!(ch_nxt, Some('>')) => {
+                    self.advance();
+                    tokens.push(Token::Op(Op::Arrow));
+                }
+                '&' if matches!(ch_nxt, Some('&')) => {
+                    self.advance();
+                    tokens.push(Token::Op(Op::And));
+                }
+                '|' if matches!(ch_nxt, Some('|')) => {
+                    self.advance();
+                    tokens.push(Token::Op(Op::Or));
+                }
+
+                // Lex tokens with 1 char length
+                '+' => tokens.push(Token::Op(Op::Add)),
+                '-' => tokens.push(Token::Op(Op::Sub)),
+                '*' => tokens.push(Token::Op(Op::Mul)),
+                '/' => tokens.push(Token::Op(Op::Div)),
+                '%' => tokens.push(Token::Op(Op::Mod)),
+                '(' => tokens.push(Token::Open(Group::Paren)),
+                '[' => tokens.push(Token::Open(Group::Bracket)),
+                '{' => tokens.push(Token::Open(Group::Braces)),
+                ')' => tokens.push(Token::Close(Group::Paren)),
+                ']' => tokens.push(Token::Close(Group::Bracket)),
+                '}' => tokens.push(Token::Close(Group::Braces)),
+                '=' => tokens.push(Token::Op(Op::Assign)),
+                '>' => tokens.push(Token::Op(Op::Gt)),
+                '<' => tokens.push(Token::Op(Op::Lt)),
+                ';' => tokens.push(Token::Semicolon),
+                ':' => tokens.push(Token::Colon),
+                ',' => tokens.push(Token::Comma),
+                '.' => tokens.push(Token::Dot),
+                '!' => tokens.push(Token::Op(Op::Not)),
+                '^' => tokens.push(Token::Op(Op::Xor)),
+
+                // Lex Strings
+                '"' => tokens.push(self.read_string()?),
+
+                // Lex numbers
+                '0'..='9' => tokens.push(self.read_num()?),
+
+                // Lex identifiers / keywords
+                'a'..='z' | 'A'..='Z' | '_' => tokens.push(self.read_ident_or_keyword()?),
+
+                // Anything else is an error
+                _ => {
+                    return Err(LexErr::new(LexErrType::InvalidCharacter(ch)))
+                }
+            }
+
+            // Every arm leaves `curr_char` on the last character it consumed; step past it.
+            self.advance();
+        }
+
+        Ok(TokenStream::new(tokens))
+    }
+
+    /// Returns the character after the current one without consuming it,
+    /// or `None` when no further input remains.
+    fn peek(&mut self) -> Option<&(usize, char)> {
+        self.code_iter.peek()
+    }
+
+    /// Moves to the next character; `curr_char` becomes `None` at end of input.
+    fn advance(&mut self) {
+        self.curr_char = self.code_iter.next();
+    }
+
+    /// Skips the rest of the current line (used for `//` comments).
+    ///
+    /// Stops with `curr_char` on the terminating `'\n'`, or at end of input when the
+    /// line is the last one and has no trailing newline. The caller's regular
+    /// `advance()` then steps past the newline.
+    fn advance_until_new_line(&mut self) {
+        // Matching on `Some` is essential: looping on "not a newline" alone never
+        // terminates once the input is exhausted and `curr_char` stays `None`.
+        while let Some((_, ch)) = self.curr_char {
+            if ch == '\n' {
+                break;
+            }
+            self.advance();
+        }
+    }
+
+    /// Reads a run of ASCII digits starting at the current character and turns it
+    /// into an integer literal token. The lexer is left on the last digit.
+    ///
+    /// # Panics
+    /// Panics if there is no current character, or if the digits do not fit the
+    /// integer type.
+    fn read_num(&mut self) -> LexRes<Token> {
+        let (_, first) = self.curr_char.unwrap();
+        let mut digits = String::from(first);
+
+        loop {
+            let next = match self.peek() {
+                Some(&(_, c)) if c.is_ascii_digit() => c,
+                _ => break,
+            };
+            digits.push(next);
+            self.advance();
+        }
+
+        // Only digits were collected, so the parse can only fail on overflow.
+        // TODO: Report overflow as a LexErr instead of panicking
+        let value: i64 = digits.parse().unwrap();
+        Ok(Token::Literal(Literal::Int64(value)))
+    }
+
+    /// Reads a string literal whose opening `"` is the current character.
+    ///
+    /// Supported escape sequences are `\"`, `\\`, `\n`, `\r` and `\t`. On success the
+    /// lexer is left on the closing quote.
+    ///
+    /// # Errors
+    /// - `MissingQuoteEnd` if the input ends before the closing quote.
+    /// - `InvalidEscapeChar` if a backslash is followed by any other character.
+    fn read_string(&mut self) -> LexRes<Token> {
+        let mut text = String::new();
+
+        // True when the previous character was an unescaped backslash.
+        let mut escape = false;
+        loop {
+            let (_idx, ch) = match self.peek() {
+                Some(it) => *it,
+                None => return Err(LexErr::new(LexErrType::MissingQuoteEnd)),
+            };
+
+            if escape {
+                match ch {
+                    '"' | '\\' => text.push(ch),
+                    // The escape is spelled with the letter `n`, not a literal line feed.
+                    'n' => text.push('\n'),
+                    'r' => text.push('\r'),
+                    't' => text.push('\t'),
+                    _ => return Err(LexErr::new(LexErrType::InvalidEscapeChar(ch))),
+                }
+                escape = false;
+            } else {
+                match ch {
+                    '"' => break,
+                    '\\' => escape = true,
+                    _ => text.push(ch),
+                }
+            }
+
+            self.advance();
+        }
+        // Step onto the closing quote; the caller advances past it.
+        self.advance();
+
+        Ok(Token::Literal(Literal::String(text)))
+    }
+
+    /// Reads a word made of ASCII letters, digits and `_`, starting at the current
+    /// character, and classifies it as a keyword, a boolean literal or an identifier.
+    /// The lexer is left on the last character of the word.
+    ///
+    /// # Panics
+    /// Panics if there is no current character.
+    fn read_ident_or_keyword(&mut self) -> LexRes<Token> {
+        let (_, first) = self.curr_char.unwrap();
+        let mut word = String::from(first);
+
+        while let Some(&(_, c)) = self.peek() {
+            if !(c.is_ascii_alphanumeric() || c == '_') {
+                break;
+            }
+            word.push(c);
+            self.advance();
+        }
+
+        // Reserved words first; anything else is a boolean literal or a plain identifier.
+        let keyword = match word.as_str() {
+            "let" => Keyword::Let,
+            "if" => Keyword::If,
+            "else" => Keyword::Else,
+            "while" => Keyword::While,
+            "loop" => Keyword::Loop,
+            "fn" => Keyword::Fn,
+            "return" => Keyword::Return,
+            "void" => Keyword::Void,
+
+            "true" => return Ok(Token::Literal(Literal::Boolean(true))),
+            "false" => return Ok(Token::Literal(Literal::Boolean(false))),
+
+            _ => return Ok(Token::Ident(word)),
+        };
+
+        Ok(Token::Keyword(keyword))
+    }
+}
+
+impl LexErr {
+    /// Builds a lexer error of the given kind.
+    pub fn new(etype: LexErrType) -> Self {
+        LexErr { etype }
+    }
+}
+
+
+// Unit tests for the lexer.
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    /// Lexes one input containing at least one of every token kind and checks
+    /// the exact resulting token sequence.
+    #[test]
+    fn test_general() {
+        let code = r#"
+            // A comment
+            + -
+            * / %
+            == != > < >= <=
+            = ->
+            && || ^ !
+            ([{)]}
+            4564 "a string" false true
+            an_5ident6
+            ; : , .
+            let if while loop else fn return void
+        "#;
+
+        // Expected tokens, grouped line by line like the input above.
+        let expected_tokens = vec![
+            Token::Op(Op::Add),
+            Token::Op(Op::Sub),
+
+            Token::Op(Op::Mul),
+            Token::Op(Op::Div),
+            Token::Op(Op::Mod),
+            
+            Token::Op(Op::Eq),
+            Token::Op(Op::Neq),
+            Token::Op(Op::Gt),
+            Token::Op(Op::Lt),
+            Token::Op(Op::Ge),
+            Token::Op(Op::Le),
+
+            Token::Op(Op::Assign),
+            Token::Op(Op::Arrow),
+            
+            Token::Op(Op::And),
+            Token::Op(Op::Or),
+            Token::Op(Op::Xor),
+            Token::Op(Op::Not),
+
+            Token::Open(Group::Paren),
+            Token::Open(Group::Bracket),
+            Token::Open(Group::Braces),
+            Token::Close(Group::Paren),
+            Token::Close(Group::Bracket),
+            Token::Close(Group::Braces),
+
+            Token::Literal(Literal::Int64(4564)),
+            Token::Literal(Literal::String("a string".to_string())),
+            Token::Literal(Literal::Boolean(false)),
+            Token::Literal(Literal::Boolean(true)),
+
+            Token::Ident("an_5ident6".to_string()),
+
+            Token::Semicolon,
+            Token::Colon,
+            Token::Comma,
+            Token::Dot,
+
+            Token::Keyword(Keyword::Let),
+            Token::Keyword(Keyword::If),
+            Token::Keyword(Keyword::While),
+            Token::Keyword(Keyword::Loop),
+            Token::Keyword(Keyword::Else),
+            Token::Keyword(Keyword::Fn),
+            Token::Keyword(Keyword::Return),
+            Token::Keyword(Keyword::Void),
+        ];
+
+        let mut lexer = Lexer::new(code);
+        let tokens = lexer.tokenize().unwrap();
+
+        assert_eq!(tokens.as_vec(), &expected_tokens);
+    }
+}
--- a/plang2_lib/src/lib.rs
+++ b/plang2_lib/src/lib.rs
@ -0,0 +1,5 @@
+pub mod token;
+pub mod lexer;
+
+pub use token::*;
+pub use lexer::*;
--- a/plang2_lib/src/token.rs
+++ b/plang2_lib/src/token.rs
@ -0,0 +1,198 @@
+use std::{fmt::Display, borrow::Cow};
+
+/// Operators recognised by the lexer.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub enum Op {
+    // Additive
+    /// `+`
+    Add,
+    /// `-`
+    Sub,
+
+    // Multiplicative
+    /// `*`
+    Mul,
+    /// `/`
+    Div,
+    /// `%`
+    Mod,
+
+    // Assignment
+    /// `=`
+    Assign,
+
+    // Comparison
+    /// `==`
+    Eq,
+    /// `!=`
+    Neq,
+    /// `>`
+    Gt,
+    /// `<`
+    Lt,
+    /// `>=`
+    Ge,
+    /// `<=`
+    Le,
+
+    // Boolean
+    /// `&&`
+    And,
+    /// `||`
+    Or,
+    /// `!`
+    Not,
+    /// `^`
+    Xor,
+
+    /// `->`
+    Arrow,
+}
+
+/// Kind of grouping delimiter carried by `Token::Open` and `Token::Close`.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub enum Group {
+    /// `(` and `)`
+    Paren,
+    /// `[` and `]`
+    Bracket,
+    /// `{` and `}`
+    Braces,
+}
+
+/// Literal values that can appear directly in source code.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub enum Literal {
+    /// `true` or `false`
+    Boolean(bool),
+    /// An integer literal written as a run of decimal digits.
+    Int64(i64),
+    /// The contents of a double-quoted string, with escape sequences already resolved.
+    String(String),
+}
+
+/// Reserved words of the language; each variant is spelled in source as its lowercase name.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub enum Keyword {
+    /// `let`
+    Let,
+    /// `while`
+    While,
+    /// `loop`
+    Loop,
+    /// `if`
+    If,
+    /// `else`
+    Else,
+    /// `fn`
+    Fn,
+    /// `return`
+    Return,
+    /// `void`
+    Void,
+}
+
+/// A single lexical token.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub enum Token {
+    /// A boolean, integer or string literal.
+    Literal(Literal),
+    /// An operator.
+    Op(Op),
+    /// An opening delimiter of the given kind.
+    Open(Group),
+    /// A closing delimiter of the given kind.
+    Close(Group),
+
+    /// An identifier that is neither a keyword nor a boolean literal.
+    Ident(String),
+
+    /// A reserved word.
+    Keyword(Keyword),
+
+    /// `;`
+    Semicolon,
+    /// `:`
+    Colon,
+    /// `,`
+    Comma,
+    /// `.`
+    Dot,
+}
+
+/// A list of tokens together with a cursor into it.
+pub struct TokenStream {
+    /// All tokens, in source order.
+    tokens: Vec<Token>,
+    /// Index of the current token within `tokens`.
+    idx: usize,
+}
+
+impl Display for Token {
+    /// Writes operators, delimiters, punctuation and keywords as their source
+    /// spelling; literals and identifiers are wrapped in a tag such as `Int64(5)`.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // Fixed spellings are borrowed; only tagged values need an allocation.
+        let rendered: Cow<'static, str> = match self {
+            Token::Op(op) => Cow::Borrowed(match op {
+                Op::Add => "+",
+                Op::Sub => "-",
+                Op::Mul => "*",
+                Op::Div => "/",
+                Op::Mod => "%",
+                Op::Eq => "==",
+                Op::Neq => "!=",
+                Op::Gt => ">",
+                Op::Lt => "<",
+                Op::Ge => ">=",
+                Op::Le => "<=",
+                Op::Assign => "=",
+                Op::Arrow => "->",
+                Op::And => "&&",
+                Op::Or => "||",
+                Op::Xor => "^",
+                Op::Not => "!",
+            }),
+
+            Token::Open(group) => Cow::Borrowed(match group {
+                Group::Paren => "(",
+                Group::Bracket => "[",
+                Group::Braces => "{",
+            }),
+            Token::Close(group) => Cow::Borrowed(match group {
+                Group::Paren => ")",
+                Group::Bracket => "]",
+                Group::Braces => "}",
+            }),
+
+            Token::Literal(literal) => Cow::Owned(match literal {
+                Literal::Int64(num) => format!("Int64({})", num),
+                Literal::String(text) => format!("String({})", text),
+                Literal::Boolean(val) => format!("Boolean({})", val),
+            }),
+
+            Token::Ident(name) => Cow::Owned(format!("Ident({})", name)),
+
+            Token::Semicolon => Cow::Borrowed(";"),
+            Token::Colon => Cow::Borrowed(":"),
+            Token::Comma => Cow::Borrowed(","),
+            Token::Dot => Cow::Borrowed("."),
+
+            Token::Keyword(keyword) => Cow::Borrowed(match keyword {
+                Keyword::Let => "let",
+                Keyword::If => "if",
+                Keyword::While => "while",
+                Keyword::Loop => "loop",
+                Keyword::Else => "else",
+                Keyword::Fn => "fn",
+                Keyword::Return => "return",
+                Keyword::Void => "void",
+            }),
+        };
+
+        write!(f, "{}", rendered)
+    }
+}
+
+impl TokenStream {
+    /// Wraps `tokens` in a stream whose cursor starts at the first token.
+    pub fn new(tokens: Vec<Token>) -> Self {
+        TokenStream { idx: 0, tokens }
+    }
+
+    /// Borrows the full underlying token list, regardless of the cursor position.
+    pub fn as_vec(&self) -> &Vec<Token> {
+        let TokenStream { tokens, .. } = self;
+        tokens
+    }
+
+    /// Returns the token under the cursor, or `None` when the cursor is past the end.
+    pub fn curr(&self) -> Option<&Token> {
+        let cursor = self.idx;
+        self.tokens.get(cursor)
+    }
+
+    /// Returns the token right after the cursor, if any, without moving the cursor.
+    pub fn peek(&self) -> Option<&Token> {
+        let next = self.idx + 1;
+        self.tokens.get(next)
+    }
+
+    /// Moves the cursor forward by one token.
+    pub fn advance(&mut self) {
+        self.idx += 1;
+    }
+}
+
+impl Display for TokenStream {
+    /// Pretty-prints the tokens separated by spaces. A line break follows every
+    /// `{`, `}` and `;`, and lines are indented four spaces per open brace level.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let mut depth = 0_usize;
+        let mut at_line_start = true;
+
+        for token in &self.tokens {
+            let opens_block = matches!(token, Token::Open(Group::Braces));
+            let closes_block = matches!(token, Token::Close(Group::Braces));
+
+            // A closing brace is printed one level shallower than the block it ends.
+            if closes_block {
+                depth = depth.saturating_sub(1);
+            }
+            // A closing brace always gets indentation written in front of it.
+            if at_line_start || closes_block {
+                write!(f, "{}", " ".repeat(depth * 4))?;
+            }
+
+            write!(f, "{} ", token)?;
+
+            at_line_start = opens_block || closes_block || matches!(token, Token::Semicolon);
+            if at_line_start {
+                writeln!(f)?;
+            }
+            if opens_block {
+                depth += 1;
+            }
+        }
+
+        Ok(())
+    }
+}