From 70c9d073f976f9ccd6361ecb7e878a1d14bd2a31 Mon Sep 17 00:00:00 2001 From: Daniel M Date: Fri, 11 Feb 2022 18:34:46 +0100 Subject: [PATCH] Add a few more comments --- src/ast.rs | 86 +++++++++++++++++++++++------------ src/astoptimizer.rs | 5 +++ src/interpreter.rs | 107 +++++++++++++++++++++++++++++++++++++++----- src/lexer.rs | 42 ++++++++++++----- src/lib.rs | 7 +++ src/main.rs | 3 ++ src/parser.rs | 90 ++++++++++++++++++++++++++++++++----- src/stringstore.rs | 20 +++++++++ src/token.rs | 14 ++++-- 9 files changed, 310 insertions(+), 64 deletions(-) diff --git a/src/ast.rs b/src/ast.rs index 33bedc3..361bad8 100644 --- a/src/ast.rs +++ b/src/ast.rs @@ -1,80 +1,82 @@ use std::rc::Rc; -use crate::stringstore::{StringStore, Sid}; +use crate::stringstore::{Sid, StringStore}; -/// Types for binary operators +/// Types for binary operations #[derive(Debug, PartialEq, Eq, Clone)] pub enum BinOpType { - /// Addition + /// Addition ("+") Add, - /// Subtraction + /// Subtraction ("-") Sub, - /// Multiplication + /// Multiplication ("*") Mul, - /// Divide + /// Division ("/") Div, - /// Modulo + /// Modulo / Remainder ("%") Mod, - /// Compare Equal + /// Compare Equal ("==") EquEqu, - /// Compare Not Equal + /// Compare Not Equal ("!=") NotEqu, - /// Less than + /// Compare Less than ("<") Less, - /// Less than or Equal + /// Compare Less than or Equal ("<=") LessEqu, - /// Greater than + /// Compare Greater than (">") Greater, - /// Greater than or Equal + /// Compare Greater than or Equal (">=") GreaterEqu, - /// Bitwise OR (inclusive or) + /// Bitwise Or ("|") BOr, - /// Bitwise And + /// Bitwise And ("&") BAnd, - /// Bitwise Xor (exclusive or) + /// Bitwise Xor / Exclusive Or ("^") BXor, - /// Logical And + /// Logical And ("&&") LAnd, - /// Logical Or + /// Logical Or ("||") LOr, - /// Shift Left + /// Bitwise Shift Left ("<<") Shl, - /// Shift Right + /// Bitwise Shift Right (">>") Shr, - /// Assign value to variable + /// Assign value to variable ("=") Assign, } 
+/// Types for unary operations #[derive(Debug, PartialEq, Eq, Clone)] pub enum UnOpType { - /// Unary Negate + /// Unary Negation ("-") Negate, - /// Bitwise Not + /// Bitwise Not / Bitflip ("~") BNot, - /// Logical Not + /// Logical Not ("!") LNot, } +/// Ast Node for possible Expression variants #[derive(Debug, PartialEq, Eq, Clone)] pub enum Expression { /// Integer literal (64-bit) @@ -82,15 +84,16 @@ pub enum Expression { /// String literal String(Sid), - /// Array with size + /// Array with size as an expression ArrayLiteral(Box), - - /// Array access with name, stackpos and position + /// Array access with name, stackpos and position as expression ArrayAccess(Sid, usize, Box), + /// Function call with name, stackpos and the arguments as a vec of expressions FunCall(Sid, usize, Vec), - /// Variable + /// Variable with name and the stackpos from behind. This means that stackpos 0 refers to the + /// last variable on the stack and not the first Var(Sid, usize), /// Binary operation. 
Consists of type, left hand side and right hand side BinOp(BinOpType, Box, Box), @@ -98,6 +101,7 @@ pub enum Expression { UnOp(UnOpType, Box), } +/// Ast Node for a loop #[derive(Debug, PartialEq, Eq, Clone)] pub struct Loop { /// The condition that determines if the loop should continue @@ -108,6 +112,7 @@ pub struct Loop { pub body: BlockScope, } +/// Ast Node for an if #[derive(Debug, PartialEq, Eq, Clone)] pub struct If { /// The condition @@ -118,40 +123,65 @@ pub struct If { pub body_false: BlockScope, } +/// Ast Node for a function declaration #[derive(Debug, PartialEq, Eq, Clone)] pub struct FunDecl { + /// The function name as StringID, stored in the stringstore pub name: Sid, + /// The absolute position on the function stack where the function is stored pub fun_stackpos: usize, + /// The argument names as StringIDs pub argnames: Vec, + /// The function body pub body: Rc, } +/// Ast Node for a variable declaration #[derive(Debug, PartialEq, Eq, Clone)] pub struct VarDecl { + /// The variable name as StringID, stored in the stringstore pub name: Sid, + /// The absolute position on the variable stack where the variable is stored pub var_stackpos: usize, + /// The right hand side that generates the initial value for the variable pub rhs: Expression, } +/// Ast Node for the possible Statement variants #[derive(Debug, PartialEq, Eq, Clone)] pub enum Statement { + /// Return from a function with the given result value as an expression Return(Expression), + /// Break out of the current loop Break, + /// End the current loop iteration early and continue with the next loop iteration Continue, + /// A variable declaration Declaration(VarDecl), + /// A function declaration FunDeclare(FunDecl), + /// A simple expression. 
This could be a function call or an assignment for example Expr(Expression), + /// A freestanding block scope Block(BlockScope), + /// A loop Loop(Loop), + /// An if If(If), + /// A print statement that will output the value of the given expression to the terminal Print(Expression), } +/// A number of statements that form a block of code together pub type BlockScope = Vec; +/// A full abstract syntax tree #[derive(Clone, Default)] pub struct Ast { + /// The stringstore contains the actual string values which are replaced with StringIDs in the + /// Ast. So this is needed to get the actual strings later pub stringstore: StringStore, + /// The main (top-level) code given as a number of statements pub main: BlockScope, } diff --git a/src/astoptimizer.rs b/src/astoptimizer.rs index 12f1f76..2e0a71a 100644 --- a/src/astoptimizer.rs +++ b/src/astoptimizer.rs @@ -1,9 +1,14 @@ use crate::ast::{Ast, BlockScope, Expression, If, Loop, Statement, BinOpType, UnOpType, VarDecl}; +/// A trait that allows to optimize an abstract syntax tree pub trait AstOptimizer { + /// Consume an abstract syntax tree and return an ast that has the same functionality but with + /// optional optimizations. 
fn optimize(ast: Ast) -> Ast; } +/// A very simple optimizer that applies trivial optimizations like precalculating expressions that +/// have only literals as operands pub struct SimpleAstOptimizer; impl AstOptimizer for SimpleAstOptimizer { diff --git a/src/interpreter.rs b/src/interpreter.rs index 2b8bf7d..a05d1fb 100644 --- a/src/interpreter.rs +++ b/src/interpreter.rs @@ -10,6 +10,7 @@ use crate::{ stringstore::{Sid, StringStore}, }; +/// Runtime errors that can occur during execution #[derive(Debug, Error)] pub enum RuntimeError { #[error("Invalid array Index: {0:?}")] @@ -37,41 +38,62 @@ pub enum RuntimeError { InvalidNumberOfArgs(String, usize, usize), } +/// Possible variants for the values #[derive(Debug, PartialEq, Eq, Clone)] pub enum Value { + /// 64-bit integer value I64(i64), + /// String value String(Sid), + /// Array value Array(Rc>>), + /// Void value Void, } +/// The exit type of a block. When a block ends, the exit type specifies why the block ended. #[derive(Debug, PartialEq, Eq, Clone)] pub enum BlockExit { + /// Normal exit when the block just ends normally (no returns / breaks / continues / etc.) Normal, + /// The block ended through a break statement. This will be propagated up to the next loop + /// and cause it to fully terminate Break, + /// The block ended through a continue statement. This will be propagated up to the next loop + /// and cause it to start the next iteration Continue, + /// The block ended through a return statement. 
This will propagate up to the next function + /// body end Return(Value), } #[derive(Default)] pub struct Interpreter { + /// Run the SimpleAstOptimizer over the Ast before executing pub optimize_ast: bool, + /// Print the tokens after lexing pub print_tokens: bool, + /// Print the ast after parsing pub print_ast: bool, + /// Capture the output values of print statements instead of printing them to the terminal pub capture_output: bool, + /// The stored values that were captured output: Vec, - // Variable table stores the runtime values of variables + /// Variable table stores the runtime values of variables as a stack vartable: Vec, + /// Function table stores the functions during runtime as a stack funtable: Vec, + /// The stringstore contains all strings used throughout the program stringstore: StringStore, } impl Interpreter { + /// Create a new Interpreter pub fn new() -> Self { Self { optimize_ast: true, @@ -79,20 +101,28 @@ impl Interpreter { } } + /// Get the captured output pub fn output(&self) -> &[Value] { &self.output } + /// Try to retrieve a variable value from the varstack. The idx is the index from the back of + /// the stack. So 0 is the last value, not the first fn get_var(&self, idx: usize) -> Option { self.vartable.get(self.vartable.len() - idx - 1).cloned() } + /// Try to retrieve a mutable reference to a variable value from the varstack. The idx is the + /// index from the back of the stack. So 0 is the last value, not the first fn get_var_mut(&mut self, idx: usize) -> Option<&mut Value> { let idx = self.vartable.len() - idx - 1; self.vartable.get_mut(idx) } + /// Lex, parse and then run the given sourcecode. This will terminate the program when an error + /// occurs and print an appropriate error message. 
pub fn run_str(&mut self, code: &str) { + // Lex the tokens let tokens = match lex(code) { Ok(tokens) => tokens, Err(e) => nice_panic!("Lexing error: {}", e), @@ -102,18 +132,22 @@ impl Interpreter { println!("Tokens: {:?}", tokens); } + // Parse the ast let ast = match parse(tokens) { Ok(ast) => ast, Err(e) => nice_panic!("Parsing error: {}", e), }; + // Run the ast match self.run_ast(ast) { Ok(_) => (), Err(e) => nice_panic!("Runtime error: {}", e), } } + /// Execute the given Ast within the interpreter pub fn run_ast(&mut self, mut ast: Ast) -> Result<(), RuntimeError> { + // Optimize the ast if self.optimize_ast { ast = SimpleAstOptimizer::optimize(ast); } @@ -122,16 +156,22 @@ impl Interpreter { println!("{:#?}", ast.main); } + // Take over the stringstore of the given ast self.stringstore = ast.stringstore; + // Run the top level block (the main) self.run_block(&ast.main)?; Ok(()) } + /// Run all statements in the given block pub fn run_block(&mut self, prog: &BlockScope) -> Result { self.run_block_fp_offset(prog, 0) } + /// Same as run_block, but with an additional framepointer offset. 
This allows to free more + /// values from the stack than normally and can be used when passing arguments inside a + /// function body scope from the outside pub fn run_block_fp_offset( &mut self, prog: &BlockScope, @@ -139,7 +179,9 @@ impl Interpreter { ) -> Result { let framepointer = self.vartable.len() - framepointer_offset; - for stmt in prog { + let mut block_exit = BlockExit::Normal; + + 'blockloop: for stmt in prog { match stmt { Statement::Break => return Ok(BlockExit::Break), Statement::Continue => return Ok(BlockExit::Continue), @@ -147,8 +189,8 @@ impl Interpreter { Statement::Return(expr) => { let val = self.resolve_expr(expr)?; - self.vartable.truncate(framepointer); - return Ok(BlockExit::Return(val)); + block_exit = BlockExit::Return(val); + break 'blockloop; } Statement::Expr(expr) => { @@ -163,8 +205,8 @@ impl Interpreter { Statement::Block(block) => match self.run_block(block)? { // Propagate return, continue and break be @ (BlockExit::Return(_) | BlockExit::Continue | BlockExit::Break) => { - self.vartable.truncate(framepointer); - return Ok(be); + block_exit = be; + break 'blockloop; } _ => (), }, @@ -172,23 +214,26 @@ impl Interpreter { Statement::Loop(looop) => { // loop runs as long condition != 0 loop { + // Check the loop condition if let Some(condition) = &looop.condition { if matches!(self.resolve_expr(condition)?, Value::I64(0)) { break; } } + // Run the body let be = self.run_block(&looop.body)?; match be { // Propagate return be @ BlockExit::Return(_) => { - self.vartable.truncate(framepointer); - return Ok(be); + block_exit = be; + break 'blockloop; } BlockExit::Break => break, BlockExit::Continue | BlockExit::Normal => (), } + // Run the advancement if let Some(adv) = &looop.advancement { self.resolve_expr(&adv)?; } @@ -210,6 +255,7 @@ impl Interpreter { body_true, body_false, }) => { + // Run the right block depending on the conditions result being 0 or not let exit = if matches!(self.resolve_expr(condition)?, Value::I64(0)) { 
self.run_block(body_false)? } else { @@ -219,8 +265,8 @@ impl Interpreter { match exit { // Propagate return, continue and break be @ (BlockExit::Return(_) | BlockExit::Continue | BlockExit::Break) => { - self.vartable.truncate(framepointer); - return Ok(be); + block_exit = be; + break 'blockloop; } _ => (), } @@ -234,9 +280,10 @@ impl Interpreter { self.vartable.truncate(framepointer); - Ok(BlockExit::Normal) + Ok(block_exit) } + /// Execute the given expression to retrieve the resulting value fn resolve_expr(&mut self, expr: &Expression) -> Result { let val = match expr { Expression::I64(val) => Value::I64(*val), @@ -271,6 +318,7 @@ impl Interpreter { // Function existance has been verified in the parser, so unwrap here shouldn't fail let expected_num_args = self.funtable.get(*fun_stackpos).unwrap().argnames.len(); + // Check if the number of provided arguments matches the number of expected arguments if expected_num_args != args_len { let fun_name = self .stringstore @@ -284,6 +332,7 @@ impl Interpreter { )); } + // Run the function body and return the BlockExit type match self.run_block_fp_offset( &Rc::clone(&self.funtable.get(*fun_stackpos).unwrap().body), expected_num_args, @@ -297,17 +346,23 @@ impl Interpreter { Ok(val) } + /// Retrieve the value of a given array at the specified index from the varstack. The name is + /// given as a StringID and is used to reference the variable name in case of an error. The + /// idx is the stackpos where the array variable should be located and the arr_idx is the + /// actual array access index, given as an expression. fn resolve_array_access( &mut self, name: Sid, idx: usize, arr_idx: &Expression, ) -> Result { + // Resolve the array index into a value and check if it is a valid array index let arr_idx = match self.resolve_expr(arr_idx)? 
{ Value::I64(size) if !size.is_negative() => size, val => return Err(RuntimeError::InvalidArrayIndex(val)), }; + // Get the array value let val = match self.get_var(idx) { Some(val) => val, None => { @@ -320,6 +375,7 @@ impl Interpreter { } }; + // Make sure it is an array let arr = match val { Value::Array(arr) => arr, _ => { @@ -332,12 +388,16 @@ impl Interpreter { } }; - let arr = arr.borrow_mut(); + // Get the value of the requested cell inside the array + let arr = arr.borrow(); arr.get(arr_idx as usize) .cloned() .ok_or(RuntimeError::ArrayOutOfBounds(arr_idx as usize, arr.len())) } + /// Retrive the value of a given variable from the varstack. The name is given as a StringID + /// and is used to reference the variable name in case of an error. The idx is the stackpos + /// where the variable should be located fn resolve_var(&mut self, name: Sid, idx: usize) -> Result { match self.get_var(idx) { Some(val) => Ok(val), @@ -352,9 +412,12 @@ impl Interpreter { } } + /// Execute a unary operation and get the resulting value fn resolve_unop(&mut self, uo: &UnOpType, operand: &Expression) -> Result { + // Recursively resolve the operands expression into an actual value let operand = self.resolve_expr(operand)?; + // Perform the correct operation, considering the operation and value type Ok(match (operand, uo) { (Value::I64(val), UnOpType::Negate) => Value::I64(-val), (Value::I64(val), UnOpType::BNot) => Value::I64(!val), @@ -363,6 +426,7 @@ impl Interpreter { }) } + /// Execute a binary operation and get the resulting value fn resolve_binop( &mut self, bo: &BinOpType, @@ -371,8 +435,11 @@ impl Interpreter { ) -> Result { let rhs = self.resolve_expr(rhs)?; + // Handle assignments separate from the other binary operations match (&bo, &lhs) { + // Normal variable assignment (BinOpType::Assign, Expression::Var(name, idx)) => { + // Get the variable mutably and assign the right hand side value match self.get_var_mut(*idx) { Some(val) => *val = rhs.clone(), None => { @@ 
-384,14 +451,18 @@ impl Interpreter { )) } } + return Ok(rhs); } + // Array index assignment (BinOpType::Assign, Expression::ArrayAccess(name, idx, arr_idx)) => { + // Calculate the array index let arr_idx = match self.resolve_expr(arr_idx)? { Value::I64(size) if !size.is_negative() => size, val => return Err(RuntimeError::InvalidArrayIndex(val)), }; + // Get the mutable ref to the array variable let val = match self.get_var_mut(*idx) { Some(val) => val, None => { @@ -404,7 +475,9 @@ impl Interpreter { } }; + // Verify that it actually is an array match val { + // Assign the right hand side value to the array at the given index Value::Array(arr) => arr.borrow_mut()[arr_idx as usize] = rhs.clone(), _ => { return Err(RuntimeError::TryingToIndexNonArray( @@ -421,8 +494,14 @@ impl Interpreter { _ => (), } + // This code is only executed if the binop is not an assignment as the assignments return + // early + + // Resolve the left hand side to the value let lhs = self.resolve_expr(lhs)?; + // Perform the appropriate calculations considering the operation type and datatypes of the + // two values let result = match (lhs, rhs) { (Value::I64(lhs), Value::I64(rhs)) => match bo { BinOpType::Add => Value::I64(lhs + rhs), @@ -456,6 +535,8 @@ impl Interpreter { Ok(result) } + /// Get a string representation of the given value. This uses the interpreter's StringStore to + /// retrieve the text values of Strings fn value_to_string(&self, val: &Value) -> String { match val { Value::I64(val) => format!("{}", val), @@ -476,6 +557,8 @@ mod test { use super::{Interpreter, Value}; use crate::ast::{BinOpType, Expression}; + /// Simple test to check if a simple expression is executed properly. 
+ /// Full system tests from lexing to execution can be found in `lib.rs` #[test] fn test_interpreter_expr() { // Expression: 1 + 2 * 3 + 4 diff --git a/src/lexer.rs b/src/lexer.rs index 2dd491d..73b40dd 100644 --- a/src/lexer.rs +++ b/src/lexer.rs @@ -3,6 +3,7 @@ use thiserror::Error; use crate::{token::Token, T}; +/// Errors that can occur while lexing a given string #[derive(Debug, Error)] pub enum LexErr { #[error("Failed to parse '{0}' as i64")] @@ -24,8 +25,11 @@ pub fn lex(code: &str) -> Result, LexErr> { lexer.lex() } +/// The lexer is created from a reference to a sourcecode string and is consumed to create a token +/// buffer from that sourcecode. struct Lexer<'a> { - /// The sourcecode text as an iterator over the chars + /// The sourcecode text as a peekable iterator over the chars. Peekable allows for look-ahead + /// and the use of the Chars iterator allows to support unicode characters code: Peekable>, /// The lexed tokens tokens: Vec, @@ -34,6 +38,8 @@ struct Lexer<'a> { } impl<'a> Lexer<'a> { + + /// Create a new lexer from the given sourcecode fn new(code: &'a str) -> Self { let code = code.chars().peekable(); let tokens = Vec::new(); @@ -45,14 +51,18 @@ impl<'a> Lexer<'a> { } } + /// Consume the lexer and try to lex the contained sourcecode into a token buffer fn lex(mut self) -> Result, LexErr> { + loop { self.current_char = self.next(); + // Match on the current and next character. This gives a 1-char look-ahead and + // can be used to directly match 2-char tokens match (self.current_char, self.peek()) { // Stop lexing at EOF ('\0', _) => break, - // Skip whitespace + // Skip / ignore whitespace (' ' | '\t' | '\n' | '\r', _) => (), // Line comment. 
Consume every char until linefeed (next line) @@ -100,9 +110,10 @@ impl<'a> Lexer<'a> { // Lex multiple characters together as a string ('"', _) => self.lex_str()?, - // Lex multiple characters together as identifier + // Lex multiple characters together as identifier or keyword ('a'..='z' | 'A'..='Z' | '_', _) => self.lex_identifier()?, + // Any character that was not handled otherwise is invalid (ch, _) => Err(LexErr::UnexpectedChar(ch))?, } } @@ -132,7 +143,8 @@ impl<'a> Lexer<'a> { } } - // Try to convert the string representation of the value to i64 + // Try to convert the string representation of the value to i64. The error is mapped to + // the appropriate LexErr let i64val = sval.parse().map_err(|_| LexErr::NumericParse(sval))?; self.push_tok(T![i64(i64val)]); @@ -143,24 +155,28 @@ impl<'a> Lexer<'a> { /// Lex characters as a string until encountering an unescaped closing doublequoute char '"'. /// The successfully lexed string literal token is appended to the stored tokens. fn lex_str(&mut self) -> Result<(), LexErr> { - // Opening " was consumed in match - + // The opening " was consumed in match, so a fresh string can be used let mut text = String::new(); // Read all chars until encountering the closing " loop { match self.peek() { + // An unescaped doublequote ends the current string '"' => break, + // If the end of file is reached while still waiting for '"', error out '\0' => Err(LexErr::MissingClosingString)?, + _ => match self.next() { - // Backshlash indicates an escaped character + // Backslash indicates an escaped character, so consume one more char and + // treat it as the escaped char '\\' => match self.next() { 'n' => text.push('\n'), 'r' => text.push('\r'), 't' => text.push('\t'), '\\' => text.push('\\'), '"' => text.push('"'), + // If the escaped char is not handled, it is unsupported and an error ch => Err(LexErr::InvalidStrEscape(ch))?, }, // All other characters are simply appended to the string @@ -219,18 +235,23 @@ impl<'a> Lexer<'a> { 
self.tokens.push(token); } - /// Same as `push_tok` but also consumes the next token, removing it from the code iter + /// Same as `push_tok` but also consumes the next token, removing it from the code iter. This + /// is useful when lexing double char tokens where the second token has only been peeked. fn push_tok_consume(&mut self, token: Token) { self.next(); self.tokens.push(token); } - /// Advance to next character and return the removed char + /// Advance to next character and return the removed char. When the end of the code is reached, + /// `'\0'` is returned. This is used instead of an Option::None since it allows for much + /// shorter and cleaner code in the main loop. The `'\0'` character would not be valid anyways fn next(&mut self) -> char { self.code.next().unwrap_or('\0') } - /// Get the next character without removing it + /// Get the next character without removing it. When the end of the code is reached, + /// `'\0'` is returned. This is used instead of an Option::None since it allows for much + /// shorter and cleaner code in the main loop. The `'\0'` character would not be valid anyways fn peek(&mut self) -> char { self.code.peek().copied().unwrap_or('\0') } @@ -240,6 +261,7 @@ impl<'a> Lexer<'a> { mod tests { use crate::{lexer::lex, T}; + /// A general test to check if the lexer actually lexes tokens correctly #[test] fn test_lexer() { let code = r#"53+1-567_000 * / % | ~ ! < > & ^ ({[]});= <- >= <= diff --git a/src/lib.rs b/src/lib.rs index a157943..91731d6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -7,18 +7,25 @@ pub mod stringstore; pub mod astoptimizer; pub mod util; +/// A bunch of full program tests using the example code programs as test subjects. #[cfg(test)] mod tests { use crate::interpreter::{Interpreter, Value}; use std::fs::read_to_string; + /// Run a nek program with the given filename from the examples directory and assert the + /// captured output with the expected result. 
This only works if the program just outputs one + /// value as the result fn run_example_check_single_i64_output(filename: &str, correct_result: i64) { let mut interpreter = Interpreter::new(); + // Enable output capturing. This captures all calls to `print` interpreter.capture_output = true; + // Load and run the given program let code = read_to_string(format!("examples/{filename}")).unwrap(); interpreter.run_str(&code); + // Compare the captured output with the expected value let expected_output = [Value::I64(correct_result)]; assert_eq!(interpreter.output(), &expected_output); } diff --git a/src/main.rs b/src/main.rs index 3feea7d..0e4ca8a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,6 +2,8 @@ use std::{env::args, fs, process::exit}; use nek_lang::{interpreter::Interpreter, nice_panic}; +/// Cli configuration flags and arguments. This could be done with `clap`, but since only so few +/// arguments are supported this seems kind of overkill. #[derive(Debug, Default)] struct CliConfig { print_tokens: bool, @@ -38,6 +40,7 @@ fn main() { Ok(code) => code, Err(_) => nice_panic!("Error: Could not read file '{}'", file), }; + // Lex, parse and run the program interpreter.run_str(&code); } else { println!("Error: No file given\n"); diff --git a/src/parser.rs b/src/parser.rs index 6170408..e23b790 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -8,24 +8,34 @@ use crate::{ T, }; +/// Errors that can occur while parsing #[derive(Debug, Error)] pub enum ParseErr { #[error("Unexpected Token \"{0:?}\", expected \"{1}\"")] UnexpectedToken(Token, String), + #[error("Left hand side of declaration is not a variable")] DeclarationOfNonVar, + #[error("Use of undefined variable \"{0}\"")] UseOfUndeclaredVar(String), + #[error("Use of undefined function \"{0}\"")] UseOfUndeclaredFun(String), + #[error("Redeclation of function \"{0}\"")] RedeclarationFun(String), + #[error("Function not declared at top level \"{0}\"")] FunctionOnNonTopLevel(String), } +/// A result that can 
either be Ok, or a ParseErr type ResPE = Result; +/// This macro can be used to quickly and easily assert if the next token is matching the expected +/// token and return an appropriate error if not. Since this is intended to be used inside the +/// parser, the first argument should always be `self`. macro_rules! validate_next { ($self:ident, $expected_tok:pat, $expected_str:expr) => { match $self.next() { @@ -41,6 +51,7 @@ pub fn parse, A: IntoIterator>(tokens: A parser.parse() } +/// A parser that takes in a Token Stream and can create a full abstract syntax tree from it. struct Parser> { tokens: PutBackIter, string_store: StringStore, @@ -65,6 +76,7 @@ impl> Parser { } } + /// Consume the parser and try to create the abstract syntax tree from the token stream pub fn parse(mut self) -> ResPE { let main = self.parse_scoped_block()?; Ok(Ast { @@ -73,25 +85,32 @@ impl> Parser { }) } + /// Parse a series of statements together as a BlockScope. This will continuously parse + /// statements until encountering end-of-file or a block end '}' . fn parse_scoped_block(&mut self) -> ResPE { self.parse_scoped_block_fp_offset(0) } - /// Parse tokens into an abstract syntax tree. This will continuously parse statements until - /// encountering end-of-file or a block end '}' . - fn parse_scoped_block_fp_offset(&mut self, framepoint_offset: usize) -> ResPE { + /// Same as parse_scoped_block, but an offset to the framepointer can be specified to allow + /// for easily passing variables into scopes from the outside. This is used when parsing + /// function calls + fn parse_scoped_block_fp_offset(&mut self, framepointer_offset: usize) -> ResPE { self.nesting_level += 1; - let framepointer = self.var_stack.len() - framepoint_offset; + let framepointer = self.var_stack.len() - framepointer_offset; let mut prog = Vec::new(); loop { match self.peek() { + // Just a semicolon is an empty statement. 
So just consume it T![;] => { self.next(); } + // '}' end the current block and EoF ends everything, as the end of the tokenstream + // is reached T![EoF] | T!['}'] => break, + // Create a new scoped block T!['{'] => { self.next(); prog.push(Statement::Block(self.parse_scoped_block()?)); @@ -99,49 +118,57 @@ impl> Parser { validate_next!(self, T!['}'], "}"); } - // By default try to lex a statement + // By default try to lex statements _ => prog.push(self.parse_stmt()?), } } + // Reset the stack to where it was before entering the scope self.var_stack.truncate(framepointer); self.nesting_level -= 1; Ok(prog) } - /// Parse a single statement from the tokens. + /// Parse a single statement from the tokens fn parse_stmt(&mut self) -> ResPE { let stmt = match self.peek() { + // Break statement T![break] => { self.next(); + // After the statement, there must be a semicolon validate_next!(self, T![;], ";"); Statement::Break } + // Continue statement T![continue] => { self.next(); + // After the statement, there must be a semicolon validate_next!(self, T![;], ";"); Statement::Continue } + // Loop statement T![loop] => Statement::Loop(self.parse_loop()?), + // Print statement T![print] => { self.next(); let expr = self.parse_expr()?; - // After a statement, there must be a semicolon + // After the statement, there must be a semicolon validate_next!(self, T![;], ";"); Statement::Print(expr) } + // Return statement T![return] => { self.next(); let stmt = Statement::Return(self.parse_expr()?); @@ -152,23 +179,29 @@ impl> Parser { stmt } + // If statement T![if] => Statement::If(self.parse_if()?), + // Function definition statement T![fun] => { self.next(); + // Expect an identifier as the function name let fun_name = match self.next() { T![ident(fun_name)] => fun_name, tok => return Err(ParseErr::UnexpectedToken(tok, "".to_string())), }; + // Only allow function definitions on the top level if self.nesting_level > 1 { return Err(ParseErr::FunctionOnNonTopLevel(fun_name)); } + 
// Intern the function name let fun_name = self.string_store.intern_or_lookup(&fun_name); + // Check if the function name already exists if self.fun_stack.contains(&fun_name) { return Err(ParseErr::RedeclarationFun( self.string_store @@ -178,19 +211,24 @@ impl> Parser { )); } + // Put the function name on the function stack for precalculating the stack + // positions let fun_stackpos = self.fun_stack.len(); self.fun_stack.push(fun_name); + let mut arg_names = Vec::new(); validate_next!(self, T!['('], "("); + // Parse the optional arguments inside the parentheses while matches!(self.peek(), T![ident(_)]) { let var_name = match self.next() { T![ident(var_name)] => var_name, _ => unreachable!(), }; + // Intern argument names let var_name = self.string_store.intern_or_lookup(&var_name); arg_names.push(var_name); @@ -221,10 +259,13 @@ impl> Parser { }) } + // Either a variable declaration statement or an expression statement _ => { + // To decide if it is a declaration or an expression, a lookahead is needed let first = self.next(); let stmt = match (first, self.peek()) { + // Identifier and "<-" is a declaration (T![ident(name)], T![<-]) => { self.next(); @@ -240,7 +281,9 @@ impl> Parser { rhs, }) } + // Anything else must be an expression (first, _) => { + // Put the first token back in order for the parse_expr to see it self.putback(first); Statement::Expr(self.parse_expr()?)
} @@ -269,6 +312,7 @@ impl> Parser { let mut body_false = BlockScope::default(); + // Optionally parse the else part if self.peek() == &T![else] { self.next(); @@ -293,9 +337,11 @@ impl> Parser { let mut condition = None; let mut advancement = None; + // Check if the optional condition is present if !matches!(self.peek(), T!['{']) { condition = Some(self.parse_expr()?); + // Check if the optional advancement is present if matches!(self.peek(), T![;]) { self.next(); advancement = Some(self.parse_expr()?); @@ -321,7 +367,9 @@ impl> Parser { self.parse_expr_precedence(lhs, 0) } - /// Parse binary expressions with a precedence equal to or higher than min_prec + /// Parse binary expressions with a precedence equal to or higher than min_prec. + /// This uses the precedence climbing method for dealing with the operator precedences: + /// https://en.wikipedia.org/wiki/Operator-precedence_parser#Precedence_climbing_method fn parse_expr_precedence(&mut self, mut lhs: Expression, min_prec: u8) -> ResPE { while let Some(binop) = &self.peek().try_to_binop() { // Stop if the next operator has a lower binding power @@ -349,7 +397,8 @@ impl> Parser { Ok(lhs) } - /// Parse a primary expression (for now only number) + /// Parse a primary expression. A primary can be a literal value, variable, function call, + /// array indexing, parentheses grouping or a unary operation fn parse_primary(&mut self) -> ResPE { let primary = match self.next() { // Literal i64 @@ -370,6 +419,7 @@ impl> Parser { // Array sccess, aka indexing.
An ident followed by square brackets containing the // index as an expression T![ident(name)] if self.peek() == &T!['['] => { + // Get the stack position of the array variable let sid = self.string_store.intern_or_lookup(&name); let stackpos = self.get_stackpos(sid)?; @@ -382,6 +432,7 @@ impl> Parser { Expression::ArrayAccess(sid, stackpos, index.into()) } + // Identifier followed by parenthesis is a function call T![ident(name)] if self.peek() == &T!['('] => { // Skip the opening parenthesis self.next(); @@ -390,6 +441,7 @@ impl> Parser { let mut args = Vec::new(); + // Parse the arguments as expressions while !matches!(self.peek(), T![')']) { let arg = self.parse_expr()?; args.push(arg); @@ -402,19 +454,24 @@ impl> Parser { validate_next!(self, T![')'], ")"); + // Find the function stack position let fun_stackpos = self.get_fun_stackpos(sid)?; Expression::FunCall(sid, fun_stackpos, args) } + // Just an identifier is a variable T![ident(name)] => { + // Find the variable stack position let sid = self.string_store.intern_or_lookup(&name); let stackpos = self.get_stackpos(sid)?; + Expression::Var(sid, stackpos) } // Parentheses grouping T!['('] => { + // Any other expression can be contained in between the parentheses let inner_expr = self.parse_expr()?; // Verify that there is a closing parenthesis @@ -425,7 +482,10 @@ impl> Parser { // Unary operations or invalid token tok => match tok.try_to_unop() { + // If the token is a valid unary operation, parse it as such Some(uot) => Expression::UnOp(uot, self.parse_primary()?.into()), + + // Otherwise it's an unexpected token None => return Err(ParseErr::UnexpectedToken(tok, "primary".to_string())), }, }; @@ -433,6 +493,8 @@ impl> Parser { Ok(primary) } + /// Try to get the position of a variable on the variable stack.
This is needed to precalculate + /// the stack positions in order to save time when executing fn get_stackpos(&self, varid: Sid) -> ResPE { self.var_stack .iter() @@ -447,6 +509,8 @@ impl> Parser { )) } + /// Try to get the position of a function on the function stack. This is needed to precalculate + /// the stack positions in order to save time when executing fn get_fun_stackpos(&self, varid: Sid) -> ResPE { self.fun_stack .iter() @@ -461,16 +525,19 @@ impl> Parser { )) } - /// Get the next Token without removing it + /// Get the next Token without removing it. If there are no more tokens left, the EoF token is + /// returned. This follows the same reasoning as in the Lexer fn peek(&mut self) -> &Token { self.tokens.peek().unwrap_or(&T![EoF]) } + /// Put a single token back into the token stream fn putback(&mut self, tok: Token) { self.tokens.putback(tok); } - /// Advance to next Token and return the removed Token + /// Advance to next Token and return the removed Token. If there are no more tokens left, the + /// EoF token is returned. This follows the same reasoning as in the Lexer fn next(&mut self) -> Token { self.tokens.next().unwrap_or(T![EoF]) } @@ -484,6 +551,7 @@ mod tests { T, }; + /// A very simple test to check if the parser correctly parses a simple expression #[test] fn test_parser() { // Expression: 1 + 2 * 3 - 4 diff --git a/src/stringstore.rs b/src/stringstore.rs index 23a0429..3869ab7 100644 --- a/src/stringstore.rs +++ b/src/stringstore.rs @@ -1,20 +1,35 @@ use std::collections::HashMap; +/// A StringID that identifies a String inside the stringstore. This is only valid for the +/// StringStore that created the ID. These StringIDs can be trivially and cheaply copied #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] pub struct Sid(usize); +/// A data structure that stores strings, handing out StringIDs that can be used to retrieve the +/// real strings at a later point. This is called interning.
#[derive(Clone, Default)] pub struct StringStore { + /// The actual strings that are stored in the StringStore. The StringIDs match the index of the + /// string inside of this strings vector strings: Vec, + /// A HashMap that allows matching already interned Strings to their StringID. This allows for + /// deduplication since the same string won't be stored twice sids: HashMap, } impl StringStore { + /// Create a new empty StringStore pub fn new() -> Self { Self { strings: Vec::new(), sids: HashMap::new() } } + /// Put the given string into the StringStore and get a StringID in return. If the string is + /// not yet stored, it will be after this. + /// + /// Note: The generated StringIDs are only valid for the StringStore that created them. Using + /// the IDs with another StringStore is a logic error. A lookup there might return wrong + /// Strings or None. pub fn intern_or_lookup(&mut self, text: &str) -> Sid { self.sids.get(text).copied().unwrap_or_else(|| { let sid = Sid(self.strings.len()); @@ -24,6 +39,11 @@ impl StringStore { }) } + /// Look up and retrieve a string by the StringID. If the String is not found, None is returned. + /// + /// Note: The generated StringIDs are only valid for the StringStore that created them. Using + /// the IDs with another StringStore is a logic error. It might return wrong Strings or + /// None. pub fn lookup(&self, sid: Sid) -> Option<&String> { self.strings.get(sid.0) } diff --git a/src/token.rs b/src/token.rs index 0fadc77..4cb285c 100644 --- a/src/token.rs +++ b/src/token.rs @@ -64,6 +64,7 @@ pub enum Combo { LessThanMinus, } +/// A token is a group of one or more source code characters that have a meaning together #[derive(Debug, PartialEq, Eq)] pub enum Token { /// Literal value token @@ -72,7 +73,7 @@ pub enum Token { /// Keyword token Keyword(Keyword), - /// Identifier (name for variables, functions, ...) + /// Identifier token (names for variables, functions, ...)
Ident(String), /// Combined tokens consisting of multiple characters @@ -87,7 +88,8 @@ pub enum Token { /// Semicolon (";") Semicolon, - /// End of file + /// End of file (This is not generated by the lexer, but the parser uses this to find the + /// end of the token stream) EoF, /// Left Bracket ("[") @@ -182,6 +184,8 @@ impl Token { }) } + /// If the token can be used as a unary operation type, get the matching UnOpType. Otherwise + /// return None pub fn try_to_unop(&self) -> Option { Some(match self { T![-] => UnOpType::Negate, @@ -193,7 +197,11 @@ impl Token { } } -/// Macro to quickly create a token of the specified kind +/// Macro to quickly create a token of the specified kind. As this is implemented as a macro, it +/// can be used anywhere including in patterns. +/// +/// An implementation should exist for each token, so that there is no need to ever write out the +/// long token definitions. #[macro_export] macro_rules! T { // Keywords