Initial commit

- Implemented basic lexer
- No spans implemented yet
- No real error handling yet
commit f2a00e6560
Daniel M, 2021-12-23 16:48:49 +01:00
9 changed files with 573 additions and 0 deletions

.gitignore (vendored, new file, +1)

@@ -0,0 +1 @@
/target

Cargo.lock (generated, new file, +14)

@@ -0,0 +1,14 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3

[[package]]
name = "plang2"
version = "0.1.0"
dependencies = [
"plang2_lib",
]

[[package]]
name = "plang2_lib"
version = "0.1.0"

Cargo.toml (new file, +4)

@@ -0,0 +1,4 @@
[workspace]
members = [
"plang2_lib", "plang2"
]

plang2/Cargo.toml (new file, +7)

@@ -0,0 +1,7 @@
[package]
name = "plang2"
version = "0.1.0"
edition = "2021"

[dependencies]
plang2_lib = { path = "../plang2_lib" }

plang2/src/main.rs (new file, +23)

@@ -0,0 +1,23 @@
#![allow(dead_code, unused)]
use plang2_lib::*;

fn main() {
let code = r#"
// This is the main function
fn main() {
let a = 5465;
let b = 8;
let c = a + b;
print_int(c);
}
"#;
let mut lexer = Lexer::new(code);
let tokens = lexer.tokenize().unwrap();
println!("Tokens: \n{}\n", tokens);
}

plang2_lib/Cargo.toml (new file, +8)

@@ -0,0 +1,8 @@
[package]
name = "plang2_lib"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]

plang2_lib/src/lexer.rs (new file, +313)

@@ -0,0 +1,313 @@
use std::{iter::Peekable, str::CharIndices};
use super::token::*;
#[derive(Debug)]
pub enum LexErrType {
InvalidCharacter(char),
InvalidEscapeChar(char),
MissingQuoteEnd,
}
// TODO: Make real errors that contain the span (offending text section with filename + line)
#[derive(Debug)]
pub struct LexErr {
etype: LexErrType,
}
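// A sketch of what that could look like (hypothetical `Span` type, not part
// of this commit):
//
//     pub struct Span { file: String, line: usize, col: usize }
//     pub struct LexErr { etype: LexErrType, span: Span }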
type LexRes<T> = Result<T, LexErr>;
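/// A single-pass lexer over the source string, with one character of lookahead.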
pub struct Lexer<'a> {
// code: &'a str,
code_iter: Peekable<CharIndices<'a>>,
curr_char: Option<(usize, char)>,
}
impl<'a> Lexer<'a> {
pub fn new(code: &'a str) -> Self {
let mut code_iter = code.char_indices().peekable();
let curr_char = code_iter.next();
Self {
// code,
code_iter,
curr_char,
}
}
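    /// Consume the entire input and return the token stream, or the first
    /// lexing error encountered.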
pub fn tokenize(&mut self) -> LexRes<TokenStream> {
let mut tokens = Vec::new();
loop {
let (_idx, ch) = match self.curr_char {
Some(it) => it,
None => break,
};
            let ch_nxt = self.peek().map(|&(_idx, ch)| ch);
match ch {
// Skip whitespace
' ' | '\t' | '\n' | '\r' => (),
// Lex tokens with 2 char length
'/' if matches!(ch_nxt, Some('/')) => self.advance_until_new_line(),
'=' if matches!(ch_nxt, Some('=')) => {
self.advance();
tokens.push(Token::Op(Op::Eq));
}
'!' if matches!(ch_nxt, Some('=')) => {
self.advance();
tokens.push(Token::Op(Op::Neq));
}
'>' if matches!(ch_nxt, Some('=')) => {
self.advance();
tokens.push(Token::Op(Op::Ge));
}
'<' if matches!(ch_nxt, Some('=')) => {
self.advance();
tokens.push(Token::Op(Op::Le));
}
'-' if matches!(ch_nxt, Some('>')) => {
self.advance();
tokens.push(Token::Op(Op::Arrow));
}
'&' if matches!(ch_nxt, Some('&')) => {
self.advance();
tokens.push(Token::Op(Op::And));
}
'|' if matches!(ch_nxt, Some('|')) => {
self.advance();
tokens.push(Token::Op(Op::Or));
}
// Lex tokens with 1 char length
'+' => tokens.push(Token::Op(Op::Add)),
'-' => tokens.push(Token::Op(Op::Sub)),
'*' => tokens.push(Token::Op(Op::Mul)),
'/' => tokens.push(Token::Op(Op::Div)),
'%' => tokens.push(Token::Op(Op::Mod)),
'(' => tokens.push(Token::Open(Group::Paren)),
'[' => tokens.push(Token::Open(Group::Bracket)),
'{' => tokens.push(Token::Open(Group::Braces)),
')' => tokens.push(Token::Close(Group::Paren)),
']' => tokens.push(Token::Close(Group::Bracket)),
'}' => tokens.push(Token::Close(Group::Braces)),
'=' => tokens.push(Token::Op(Op::Assign)),
'>' => tokens.push(Token::Op(Op::Gt)),
'<' => tokens.push(Token::Op(Op::Lt)),
';' => tokens.push(Token::Semicolon),
':' => tokens.push(Token::Colon),
',' => tokens.push(Token::Comma),
'.' => tokens.push(Token::Dot),
'!' => tokens.push(Token::Op(Op::Not)),
'^' => tokens.push(Token::Op(Op::Xor)),
// Lex Strings
'"' => tokens.push(self.read_string()?),
// Lex numbers
'0'..='9' => tokens.push(self.read_num()?),
// Lex identifiers / keywords
'a'..='z' | 'A'..='Z' | '_' => tokens.push(self.read_ident_or_keyword()?),
// Anything else is an error
_ => {
return Err(LexErr::new(LexErrType::InvalidCharacter(ch)))
}
}
self.advance();
}
Ok(TokenStream::new(tokens))
}
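    /// Look at the next character without consuming it.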
fn peek(&mut self) -> Option<&(usize, char)> {
self.code_iter.peek()
}
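    /// Move the cursor forward by one character.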
fn advance(&mut self) {
self.curr_char = self.code_iter.next();
}
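    /// Skip the rest of a `//` line comment, stopping on the newline (or at
    /// end of input); the main loop's advance() then moves past it.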
fn advance_until_new_line(&mut self) {
        // Also stop at EOF, otherwise a comment on the last line of the file
        // would loop forever.
        while !matches!(self.curr_char, None | Some((_, '\n'))) {
            self.advance();
        }
}
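    /// Read a decimal integer literal starting at the current character.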
fn read_num(&mut self) -> LexRes<Token> {
let mut snum = format!("{}", self.curr_char.unwrap().1);
while let Some((_idx, ch)) = self.peek() {
match ch {
'0'..='9' => snum.push(*ch),
_ => break,
}
self.advance();
}
        // Only verified digits were pushed, but `parse` can still fail if the
        // number does not fit into an i64.
        // TODO: Check for that and convert it into a LexErr.
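        // A sketch of the checked version (assumes a hypothetical
        // `LexErrType::IntOutOfRange` variant that this commit does not define):
        //     snum.parse()
        //         .map(|n| Token::Literal(Literal::Int64(n)))
        //         .map_err(|_| LexErr::new(LexErrType::IntOutOfRange))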
Ok(Token::Literal(Literal::Int64(snum.parse().unwrap())))
}
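    /// Read a string literal; the supported escapes are \" \\ \n \r \t.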
fn read_string(&mut self) -> LexRes<Token> {
let mut text = String::new();
let mut escape = false;
loop {
let (_idx, ch) = match self.peek() {
Some(it) => *it,
None => return Err(LexErr::new(LexErrType::MissingQuoteEnd)),
};
if escape {
match ch {
'"' | '\\' => text.push(ch),
                    'n' => text.push('\n'),
'r' => text.push('\r'),
't' => text.push('\t'),
_ => return Err(LexErr::new(LexErrType::InvalidEscapeChar(ch))),
}
escape = false;
} else {
match ch {
'"' => break,
'\\' => escape = true,
_ => text.push(ch),
}
}
self.advance();
}
self.advance();
Ok(Token::Literal(Literal::String(text)))
}
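    /// Read an identifier, then decide whether it is a keyword, a boolean
    /// literal, or a plain identifier.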
fn read_ident_or_keyword(&mut self) -> LexRes<Token> {
let mut ident = format!("{}", self.curr_char.unwrap().1);
while let Some((_idx, ch)) = self.peek() {
match ch {
'0'..='9' | 'a'..='z' | 'A'..='Z' | '_' => ident.push(*ch),
_ => break,
}
self.advance();
}
let token = match ident.as_str() {
"let" => Token::Keyword(Keyword::Let),
"if" => Token::Keyword(Keyword::If),
"else" => Token::Keyword(Keyword::Else),
"while" => Token::Keyword(Keyword::While),
"loop" => Token::Keyword(Keyword::Loop),
"fn" => Token::Keyword(Keyword::Fn),
"return" => Token::Keyword(Keyword::Return),
"void" => Token::Keyword(Keyword::Void),
"true" => Token::Literal(Literal::Boolean(true)),
"false" => Token::Literal(Literal::Boolean(false)),
_ => Token::Ident(ident),
};
Ok(token)
}
}
impl LexErr {
pub fn new(etype: LexErrType) -> Self {
Self { etype }
}
}
#[cfg(test)]
mod test {
use super::*;
    /// Lex a string that contains at least one of every kind of token, in sequence
#[test]
fn test_general() {
let code = r#"
// A comment
+ -
* / %
== != > < >= <=
= ->
&& || ^ !
([{)]}
4564 "a string" false true
an_5ident6
; : , .
let if while loop else fn return void
"#;
let expected_tokens = vec![
Token::Op(Op::Add),
Token::Op(Op::Sub),
Token::Op(Op::Mul),
Token::Op(Op::Div),
Token::Op(Op::Mod),
Token::Op(Op::Eq),
Token::Op(Op::Neq),
Token::Op(Op::Gt),
Token::Op(Op::Lt),
Token::Op(Op::Ge),
Token::Op(Op::Le),
Token::Op(Op::Assign),
Token::Op(Op::Arrow),
Token::Op(Op::And),
Token::Op(Op::Or),
Token::Op(Op::Xor),
Token::Op(Op::Not),
Token::Open(Group::Paren),
Token::Open(Group::Bracket),
Token::Open(Group::Braces),
Token::Close(Group::Paren),
Token::Close(Group::Bracket),
Token::Close(Group::Braces),
Token::Literal(Literal::Int64(4564)),
Token::Literal(Literal::String("a string".to_string())),
Token::Literal(Literal::Boolean(false)),
Token::Literal(Literal::Boolean(true)),
Token::Ident("an_5ident6".to_string()),
Token::Semicolon,
Token::Colon,
Token::Comma,
Token::Dot,
Token::Keyword(Keyword::Let),
Token::Keyword(Keyword::If),
Token::Keyword(Keyword::While),
Token::Keyword(Keyword::Loop),
Token::Keyword(Keyword::Else),
Token::Keyword(Keyword::Fn),
Token::Keyword(Keyword::Return),
Token::Keyword(Keyword::Void),
];
let mut lexer = Lexer::new(code);
let tokens = lexer.tokenize().unwrap();
assert_eq!(tokens.as_vec(), &expected_tokens);
}
}

plang2_lib/src/lib.rs (new file, +5)

@@ -0,0 +1,5 @@
pub mod token;
pub mod lexer;
pub use token::*;
pub use lexer::*;

plang2_lib/src/token.rs (new file, +198)

@@ -0,0 +1,198 @@
use std::{fmt::Display, borrow::Cow};
/// Operators
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Op {
    // Additive
Add,
Sub,
    // Multiplicative
Mul,
Div,
Mod,
// Assignment
Assign,
    // Comparison
Eq,
Neq,
Gt,
Lt,
Ge,
Le,
    // Boolean
    And,
    Or,
    Not,
    Xor,
    // Misc
    Arrow,
}
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Group {
Paren,
Bracket,
Braces,
}
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Literal {
Boolean(bool),
Int64(i64),
String(String),
}
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Keyword {
Let,
While,
Loop,
If,
Else,
Fn,
Return,
Void,
}
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Token {
Literal(Literal),
Op(Op),
Open(Group),
Close(Group),
Ident(String),
Keyword(Keyword),
Semicolon,
Colon,
Comma,
Dot,
}
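/// A cursor over the tokens produced by the lexer.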
pub struct TokenStream {
tokens: Vec<Token>,
idx: usize,
}
impl Display for Token {
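    /// Render the token roughly as it appears in source; literals and
    /// identifiers are printed with their variant name for readability.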
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let text: Cow<'static, str> = match self {
Token::Op(Op::Add) => "+".into(),
Token::Op(Op::Sub) => "-".into(),
Token::Op(Op::Mul) => "*".into(),
Token::Op(Op::Div) => "/".into(),
Token::Op(Op::Mod) => "%".into(),
Token::Op(Op::Eq) => "==".into(),
Token::Op(Op::Neq) => "!=".into(),
Token::Op(Op::Gt) => ">".into(),
Token::Op(Op::Lt) => "<".into(),
Token::Op(Op::Ge) => ">=".into(),
Token::Op(Op::Le) => "<=".into(),
Token::Op(Op::Assign) => "=".into(),
Token::Op(Op::Arrow) => "->".into(),
Token::Op(Op::And) => "&&".into(),
Token::Op(Op::Or) => "||".into(),
Token::Op(Op::Xor) => "^".into(),
Token::Op(Op::Not) => "!".into(),
Token::Open(Group::Paren) => "(".into(),
Token::Open(Group::Bracket) => "[".into(),
Token::Open(Group::Braces) => "{".into(),
Token::Close(Group::Paren) => ")".into(),
Token::Close(Group::Bracket) => "]".into(),
Token::Close(Group::Braces) => "}".into(),
Token::Literal(Literal::Int64(num)) => format!("Int64({})", num).into(),
Token::Literal(Literal::String(text)) => format!("String({})", text).into(),
Token::Literal(Literal::Boolean(val)) => format!("Boolean({})", val).into(),
Token::Ident(ident) => format!("Ident({})", ident).into(),
Token::Semicolon => ";".into(),
Token::Colon => ":".into(),
Token::Comma => ",".into(),
Token::Dot => ".".into(),
Token::Keyword(Keyword::Let) => "let".into(),
Token::Keyword(Keyword::If) => "if".into(),
Token::Keyword(Keyword::While) => "while".into(),
Token::Keyword(Keyword::Loop) => "loop".into(),
Token::Keyword(Keyword::Else) => "else".into(),
Token::Keyword(Keyword::Fn) => "fn".into(),
Token::Keyword(Keyword::Return) => "return".into(),
Token::Keyword(Keyword::Void) => "void".into(),
};
        write!(f, "{}", text)
}
}
impl TokenStream {
pub fn new(tokens: Vec<Token>) -> Self {
Self { tokens, idx: 0 }
}
pub fn as_vec(&self) -> &Vec<Token> {
&self.tokens
}
pub fn curr(&self) -> Option<&Token> {
self.tokens.get(self.idx)
}
pub fn peek(&self) -> Option<&Token> {
self.tokens.get(self.idx + 1)
}
pub fn advance(&mut self) {
        self.idx += 1;
}
}
impl Display for TokenStream {
    /// Print the TokenStream with basic auto-formatting (indentation and line breaks)
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let mut indent = 0_usize;
let mut fresh_line = true;
for tok in self.tokens.iter() {
if matches!(tok, Token::Close(Group::Braces)) {
indent = indent.saturating_sub(1);
fresh_line = true;
}
if fresh_line {
write!(f, "{}", " ".repeat(indent * 4))?;
fresh_line = false;
}
write!(f, "{} ", tok)?;
match tok {
Token::Open(Group::Braces) => {
writeln!(f)?;
indent += 1;
fresh_line = true;
}
Token::Semicolon | Token::Close(Group::Braces) => {
writeln!(f)?;
fresh_line = true;
}
_ => ()
}
}
Ok(())
}
}