Initial commit

- Implemented basic lexer
- No spans implemented yet
- No real error handling yet
commit f2a00e6560
Daniel M, 2021-12-23 16:48:49 +01:00
9 changed files with 573 additions and 0 deletions

.gitignore (vendored, new file, +1)

@@ -0,0 +1 @@
/target

Cargo.lock (generated, new file, +14)

@@ -0,0 +1,14 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3

[[package]]
name = "plang2"
version = "0.1.0"
dependencies = [
"plang2_lib",
]

[[package]]
name = "plang2_lib"
version = "0.1.0"

Cargo.toml (new file, +4)

@@ -0,0 +1,4 @@
[workspace]
members = [
"plang2_lib", "plang2"
]

plang2/Cargo.toml (new file, +7)

@@ -0,0 +1,7 @@
[package]
name = "plang2"
version = "0.1.0"
edition = "2021"

[dependencies]
plang2_lib = { path = "../plang2_lib" }

plang2/src/main.rs (new file, +23)

@@ -0,0 +1,23 @@
#![allow(dead_code, unused)]
use plang2_lib::*;

fn main() {
let code = r#"
// This is the main function
fn main() {
let a = 5465;
let b = 8;
let c = a + b;
print_int(c);
}
"#;
let mut lexer = Lexer::new(code);
let tokens = lexer.tokenize().unwrap();
println!("Tokens: \n{}\n", tokens);
}

plang2_lib/Cargo.toml (new file, +8)

@@ -0,0 +1,8 @@
[package]
name = "plang2_lib"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]

plang2_lib/src/lexer.rs (new file, +313)

@@ -0,0 +1,313 @@
use std::{iter::Peekable, str::CharIndices};
use super::token::*;
#[derive(Debug)]
pub enum LexErrType {
InvalidCharacter(char),
InvalidEscapeChar(char),
MissingQuoteEnd,
}
// TODO: Make real errors that contain the span (offending text section with filename + line)
#[derive(Debug)]
pub struct LexErr {
etype: LexErrType,
}
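// A sketch of what that could look like (hypothetical `Span` type, not part
// of this commit):
//
//     pub struct Span { file: String, line: usize, col: usize }
//     pub struct LexErr { etype: LexErrType, span: Span }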
type LexRes<T> = Result<T, LexErr>;
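/// A single-pass lexer over the source string, with one character of lookahead.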
pub struct Lexer<'a> {
// code: &'a str,
code_iter: Peekable<CharIndices<'a>>,
curr_char: Option<(usize, char)>,
}
impl<'a> Lexer<'a> {
pub fn new(code: &'a str) -> Self {
let mut code_iter = code.char_indices().peekable();
let curr_char = code_iter.next();
Self {
// code,
code_iter,
curr_char,
}
}
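    /// Consume the entire input and return the token stream, or the first
    /// lexing error encountered.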
pub fn tokenize(&mut self) -> LexRes<TokenStream> {
let mut tokens = Vec::new();
loop {
let (_idx, ch) = match self.curr_char {
Some(it) => it,
None => break,
};
            let ch_nxt = self.peek().map(|&(_idx, ch)| ch);
match ch {
// Skip whitespace
' ' | '\t' | '\n' | '\r' => (),
// Lex tokens with 2 char length
'/' if matches!(ch_nxt, Some('/')) => self.advance_until_new_line(),
'=' if matches!(ch_nxt, Some('=')) => {
self.advance();
tokens.push(Token::Op(Op::Eq));
}
'!' if matches!(ch_nxt, Some('=')) => {
self.advance();
tokens.push(Token::Op(Op::Neq));
}
'>' if matches!(ch_nxt, Some('=')) => {
self.advance();
tokens.push(Token::Op(Op::Ge));
}
'<' if matches!(ch_nxt, Some('=')) => {
self.advance();
tokens.push(Token::Op(Op::Le));
}
'-' if matches!(ch_nxt, Some('>')) => {
self.advance();
tokens.push(Token::Op(Op::Arrow));
}
'&' if matches!(ch_nxt, Some('&')) => {
self.advance();
tokens.push(Token::Op(Op::And));
}
'|' if matches!(ch_nxt, Some('|')) => {
self.advance();
tokens.push(Token::Op(Op::Or));
}
// Lex tokens with 1 char length
'+' => tokens.push(Token::Op(Op::Add)),
'-' => tokens.push(Token::Op(Op::Sub)),
'*' => tokens.push(Token::Op(Op::Mul)),
'/' => tokens.push(Token::Op(Op::Div)),
'%' => tokens.push(Token::Op(Op::Mod)),
'(' => tokens.push(Token::Open(Group::Paren)),
'[' => tokens.push(Token::Open(Group::Bracket)),
'{' => tokens.push(Token::Open(Group::Braces)),
')' => tokens.push(Token::Close(Group::Paren)),
']' => tokens.push(Token::Close(Group::Bracket)),
'}' => tokens.push(Token::Close(Group::Braces)),
'=' => tokens.push(Token::Op(Op::Assign)),
'>' => tokens.push(Token::Op(Op::Gt)),
'<' => tokens.push(Token::Op(Op::Lt)),
';' => tokens.push(Token::Semicolon),
':' => tokens.push(Token::Colon),
',' => tokens.push(Token::Comma),
'.' => tokens.push(Token::Dot),
'!' => tokens.push(Token::Op(Op::Not)),
'^' => tokens.push(Token::Op(Op::Xor)),
// Lex Strings
'"' => tokens.push(self.read_string()?),
// Lex numbers
'0'..='9' => tokens.push(self.read_num()?),
// Lex identifiers / keywords
'a'..='z' | 'A'..='Z' | '_' => tokens.push(self.read_ident_or_keyword()?),
// Anything else is an error
_ => {
return Err(LexErr::new(LexErrType::InvalidCharacter(ch)))
}
}
self.advance();
}
Ok(TokenStream::new(tokens))
}
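    /// Look at the next character without consuming it.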
fn peek(&mut self) -> Option<&(usize, char)> {
self.code_iter.peek()
}
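    /// Move the cursor forward by one character.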
fn advance(&mut self) {
self.curr_char = self.code_iter.next();
}
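    /// Skip the rest of a `//` line comment, stopping on the newline (or at
    /// end of input); the main loop's advance() then moves past it.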
fn advance_until_new_line(&mut self) {
        // Also stop at EOF, otherwise a comment on the last line of the file
        // would loop forever.
        while !matches!(self.curr_char, None | Some((_, '\n'))) {
            self.advance();
        }
}
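    /// Read a decimal integer literal starting at the current character.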
fn read_num(&mut self) -> LexRes<Token> {
let mut snum = format!("{}", self.curr_char.unwrap().1);
while let Some((_idx, ch)) = self.peek() {
match ch {
'0'..='9' => snum.push(*ch),
_ => break,
}
self.advance();
}
        // Only verified digits were pushed, but `parse` can still fail if the
        // number does not fit into an i64.
        // TODO: Check for that and convert it into a LexErr.
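        // A sketch of the checked version (assumes a hypothetical
        // `LexErrType::IntOutOfRange` variant that this commit does not define):
        //     snum.parse()
        //         .map(|n| Token::Literal(Literal::Int64(n)))
        //         .map_err(|_| LexErr::new(LexErrType::IntOutOfRange))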
Ok(Token::Literal(Literal::Int64(snum.parse().unwrap())))
}
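    /// Read a string literal; the supported escapes are \" \\ \n \r \t.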
fn read_string(&mut self) -> LexRes<Token> {
let mut text = String::new();
let mut escape = false;
loop {
let (_idx, ch) = match self.peek() {
Some(it) => *it,
None => return Err(LexErr::new(LexErrType::MissingQuoteEnd)),
};
if escape {
match ch {
'"' | '\\' => text.push(ch),
                    'n' => text.push('\n'),
'r' => text.push('\r'),
't' => text.push('\t'),
_ => return Err(LexErr::new(LexErrType::InvalidEscapeChar(ch))),
}
escape = false;
} else {
match ch {
'"' => break,
'\\' => escape = true,
_ => text.push(ch),
}
}
self.advance();
}
self.advance();
Ok(Token::Literal(Literal::String(text)))
}
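    /// Read an identifier, then decide whether it is a keyword, a boolean
    /// literal, or a plain identifier.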
fn read_ident_or_keyword(&mut self) -> LexRes<Token> {
let mut ident = format!("{}", self.curr_char.unwrap().1);
while let Some((_idx, ch)) = self.peek() {
match ch {
'0'..='9' | 'a'..='z' | 'A'..='Z' | '_' => ident.push(*ch),
_ => break,
}
self.advance();
}
let token = match ident.as_str() {
"let" => Token::Keyword(Keyword::Let),
"if" => Token::Keyword(Keyword::If),
"else" => Token::Keyword(Keyword::Else),
"while" => Token::Keyword(Keyword::While),
"loop" => Token::Keyword(Keyword::Loop),
"fn" => Token::Keyword(Keyword::Fn),
"return" => Token::Keyword(Keyword::Return),
"void" => Token::Keyword(Keyword::Void),
"true" => Token::Literal(Literal::Boolean(true)),
"false" => Token::Literal(Literal::Boolean(false)),
_ => Token::Ident(ident),
};
Ok(token)
}
}
impl LexErr {
pub fn new(etype: LexErrType) -> Self {
Self { etype }
}
}
#[cfg(test)]
mod test {
use super::*;
    /// Lex a string that contains at least one of every kind of token, in sequence
#[test]
fn test_general() {
let code = r#"
// A comment
+ -
* / %
== != > < >= <=
= ->
&& || ^ !
([{)]}
4564 "a string" false true
an_5ident6
; : , .
let if while loop else fn return void
"#;
let expected_tokens = vec![
Token::Op(Op::Add),
Token::Op(Op::Sub),
Token::Op(Op::Mul),
Token::Op(Op::Div),
Token::Op(Op::Mod),
Token::Op(Op::Eq),
Token::Op(Op::Neq),
Token::Op(Op::Gt),
Token::Op(Op::Lt),
Token::Op(Op::Ge),
Token::Op(Op::Le),
Token::Op(Op::Assign),
Token::Op(Op::Arrow),
Token::Op(Op::And),
Token::Op(Op::Or),
Token::Op(Op::Xor),
Token::Op(Op::Not),
Token::Open(Group::Paren),
Token::Open(Group::Bracket),
Token::Open(Group::Braces),
Token::Close(Group::Paren),
Token::Close(Group::Bracket),
Token::Close(Group::Braces),
Token::Literal(Literal::Int64(4564)),
Token::Literal(Literal::String("a string".to_string())),
Token::Literal(Literal::Boolean(false)),
Token::Literal(Literal::Boolean(true)),
Token::Ident("an_5ident6".to_string()),
Token::Semicolon,
Token::Colon,
Token::Comma,
Token::Dot,
Token::Keyword(Keyword::Let),
Token::Keyword(Keyword::If),
Token::Keyword(Keyword::While),
Token::Keyword(Keyword::Loop),
Token::Keyword(Keyword::Else),
Token::Keyword(Keyword::Fn),
Token::Keyword(Keyword::Return),
Token::Keyword(Keyword::Void),
];
let mut lexer = Lexer::new(code);
let tokens = lexer.tokenize().unwrap();
assert_eq!(tokens.as_vec(), &expected_tokens);
}
}

plang2_lib/src/lib.rs (new file, +5)

@@ -0,0 +1,5 @@
pub mod token;
pub mod lexer;
pub use token::*;
pub use lexer::*;

plang2_lib/src/token.rs (new file, +198)

@@ -0,0 +1,198 @@
use std::{fmt::Display, borrow::Cow};
/// Operators
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Op {
    // Additive
Add,
Sub,
    // Multiplicative
Mul,
Div,
Mod,
// Assignment
Assign,
    // Comparison
Eq,
Neq,
Gt,
Lt,
Ge,
Le,
    // Boolean
    And,
    Or,
    Not,
    Xor,
    // Misc
    Arrow,
}
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Group {
Paren,
Bracket,
Braces,
}
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Literal {
Boolean(bool),
Int64(i64),
String(String),
}
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Keyword {
Let,
While,
Loop,
If,
Else,
Fn,
Return,
Void,
}
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Token {
Literal(Literal),
Op(Op),
Open(Group),
Close(Group),
Ident(String),
Keyword(Keyword),
Semicolon,
Colon,
Comma,
Dot,
}
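/// A cursor over the tokens produced by the lexer.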
pub struct TokenStream {
tokens: Vec<Token>,
idx: usize,
}
impl Display for Token {
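    /// Render the token roughly as it appears in source; literals and
    /// identifiers are printed with their variant name for readability.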
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let text: Cow<'static, str> = match self {
Token::Op(Op::Add) => "+".into(),
Token::Op(Op::Sub) => "-".into(),
Token::Op(Op::Mul) => "*".into(),
Token::Op(Op::Div) => "/".into(),
Token::Op(Op::Mod) => "%".into(),
Token::Op(Op::Eq) => "==".into(),
Token::Op(Op::Neq) => "!=".into(),
Token::Op(Op::Gt) => ">".into(),
Token::Op(Op::Lt) => "<".into(),
Token::Op(Op::Ge) => ">=".into(),
Token::Op(Op::Le) => "<=".into(),
Token::Op(Op::Assign) => "=".into(),
Token::Op(Op::Arrow) => "->".into(),
Token::Op(Op::And) => "&&".into(),
Token::Op(Op::Or) => "||".into(),
Token::Op(Op::Xor) => "^".into(),
Token::Op(Op::Not) => "!".into(),
Token::Open(Group::Paren) => "(".into(),
Token::Open(Group::Bracket) => "[".into(),
Token::Open(Group::Braces) => "{".into(),
Token::Close(Group::Paren) => ")".into(),
Token::Close(Group::Bracket) => "]".into(),
Token::Close(Group::Braces) => "}".into(),
Token::Literal(Literal::Int64(num)) => format!("Int64({})", num).into(),
Token::Literal(Literal::String(text)) => format!("String({})", text).into(),
Token::Literal(Literal::Boolean(val)) => format!("Boolean({})", val).into(),
Token::Ident(ident) => format!("Ident({})", ident).into(),
Token::Semicolon => ";".into(),
Token::Colon => ":".into(),
Token::Comma => ",".into(),
Token::Dot => ".".into(),
Token::Keyword(Keyword::Let) => "let".into(),
Token::Keyword(Keyword::If) => "if".into(),
Token::Keyword(Keyword::While) => "while".into(),
Token::Keyword(Keyword::Loop) => "loop".into(),
Token::Keyword(Keyword::Else) => "else".into(),
Token::Keyword(Keyword::Fn) => "fn".into(),
Token::Keyword(Keyword::Return) => "return".into(),
Token::Keyword(Keyword::Void) => "void".into(),
};
        write!(f, "{}", text)
}
}
impl TokenStream {
pub fn new(tokens: Vec<Token>) -> Self {
Self { tokens, idx: 0 }
}
pub fn as_vec(&self) -> &Vec<Token> {
&self.tokens
}
pub fn curr(&self) -> Option<&Token> {
self.tokens.get(self.idx)
}
pub fn peek(&self) -> Option<&Token> {
self.tokens.get(self.idx + 1)
}
pub fn advance(&mut self) {
        self.idx += 1;
}
}
impl Display for TokenStream {
    /// Print the TokenStream with basic auto-formatting (indentation and line breaks)
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let mut indent = 0_usize;
let mut fresh_line = true;
for tok in self.tokens.iter() {
if matches!(tok, Token::Close(Group::Braces)) {
indent = indent.saturating_sub(1);
fresh_line = true;
}
if fresh_line {
write!(f, "{}", " ".repeat(indent * 4))?;
fresh_line = false;
}
write!(f, "{} ", tok)?;
match tok {
Token::Open(Group::Braces) => {
writeln!(f)?;
indent += 1;
fresh_line = true;
}
Token::Semicolon | Token::Close(Group::Braces) => {
writeln!(f)?;
fresh_line = true;
}
_ => ()
}
}
Ok(())
}
}