// loxi: scanner, REPL and entry point of a Lox interpreter, as one Rust file.
//
// Reconstructed from a git patch whose line breaks had been collapsed and whose
// generic parameter lists (`<...>`) had been stripped. Patch metadata:
//
//   commit:  caee5ce154edb5bd646b60983430229f374bb7ac
//   From:    Ishan Jain
//   Date:    Sat, 8 Jun 2024 08:16:25 +0530
//   Subject: [PATCH] added scanner
//
//   Cargo.lock         |  16 +++   (new file)
//   Cargo.toml         |   3 +-
//   src/loxi.rs        |  21 +++   (new file)
//   src/main.rs        |  19 ++-
//   src/repl.rs        |  24 ++++  (new file)
//   src/scanner/mod.rs | 350 +++++ (new file)
//
// Manifest changes carried by the patch:
//   Cargo.toml: package `loxi` 0.1.0, edition 2021; the "See more keys ..."
//               comment was removed and `lazy_static = "1.4.0"` was added under
//               [dependencies].
//   Cargo.lock: version 3; packages `lazy_static` 1.4.0 (crates.io registry,
//               checksum e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646)
//               and `loxi` 0.1.0 depending on it.
//
// NOTE(review): the keyword table below is a plain `match`, so this file no
// longer needs `lazy_static`; the dependency can be dropped from Cargo.toml.
// The original source files are kept as inline modules of the same names.

/// Lexical analysis (originally `src/scanner/mod.rs`).
mod scanner {
    use std::iter::Peekable;
    use std::str::Chars;

    /// Every kind of token the scanner can produce.
    #[derive(Debug, Copy, Clone, PartialEq, Eq)]
    pub enum TokenType {
        LeftParen,
        RightParen,
        LeftBrace,
        RightBrace,
        Comma,
        Dot,
        Minus,
        Plus,
        Semicolon,
        Slash,
        Star,
        Bang,
        BangEqual,
        Equal,
        EqualEqual,
        Greater,
        GreaterEqual,
        Less,
        LessEqual,
        Identifier,
        LString,
        Number,
        And,
        Class,
        Else,
        False,
        Fun,
        For,
        If,
        Nil,
        Or,
        Print,
        Return,
        Super,
        This,
        True,
        Var,
        While,
        Eof,
        Illegal,
    }

    /// A problem found while scanning, tagged with the line of the offending token.
    #[derive(Debug, Clone, PartialEq, Eq)]
    pub struct ScannerError {
        pub line: u64,
        pub message: String,
    }

    /// A single lexical token.
    #[derive(Debug, Clone, PartialEq)]
    pub struct Token {
        ttype: TokenType,
        line: u64,
        lexeme: String,
        // NOTE(review): the payload type was lost with the stripped generics;
        // `Option<String>` is an assumption. Nothing sets or reads it yet.
        #[allow(dead_code)]
        literal: Option<String>,
    }

    impl Token {
        /// Builds a token with an empty lexeme and line 0.
        pub fn new(ttype: TokenType) -> Self {
            Self::with_lexeme(ttype, String::new())
        }

        /// Builds a token carrying the text `l`, with line 0.
        pub fn with_lexeme(ttype: TokenType, l: String) -> Self {
            Token {
                ttype,
                line: 0,
                lexeme: l,
                literal: None,
            }
        }

        /// Returns the token with its source line set to `line`.
        pub fn at_line(mut self, line: u64) -> Self {
            self.line = line;
            self
        }

        /// Kind of this token.
        pub fn token_type(&self) -> TokenType {
            self.ttype
        }

        /// Text attached to the token; empty for punctuation and keywords.
        pub fn lexeme(&self) -> &str {
            &self.lexeme
        }

        /// Line on which the token starts, as set by `at_line`.
        pub fn line(&self) -> u64 {
            self.line
        }
    }

    /// Streaming scanner: iterating yields tokens, ending with exactly one `Eof`.
    ///
    /// Scan errors never stop the stream. Each one is recorded (see
    /// [`Scanner::errors`]) and an `Illegal` token is yielded in its place.
    pub struct Scanner<'a> {
        input: Peekable<Chars<'a>>,
        eof_sent: bool,
        line: u64,
        errors: Vec<ScannerError>,
    }

    impl<'a> Scanner<'a> {
        /// Creates a scanner over `program`; line numbers start at 1.
        pub fn new(program: &'a str) -> Self {
            Self {
                input: program.chars().peekable(),
                eof_sent: false,
                line: 1,
                errors: Vec::new(),
            }
        }

        /// Errors recorded so far, in source order.
        pub fn errors(&self) -> &[ScannerError] {
            &self.errors
        }

        /// Consumes blanks, tabs, carriage returns and newlines, counting newlines.
        fn skip_whitespace(&mut self) {
            while let Some(c) = self
                .input
                .next_if(|&c| matches!(c, ' ' | '\t' | '\r' | '\n'))
            {
                if c == '\n' {
                    self.line += 1;
                }
            }
        }

        fn read_char(&mut self) -> Option<char> {
            self.input.next()
        }

        /// Appends the run of ASCII digits at the cursor to `into`.
        fn read_digits(&mut self, into: &mut String) {
            while let Some(digit) = self.input.next_if(|c| c.is_ascii_digit()) {
                into.push(digit);
            }
        }

        /// Reads the rest of a number whose first digit `first` is already consumed.
        ///
        /// Stops at the first character that cannot belong to the number and
        /// leaves it unread, so `1+2` and `(1)` scan correctly. A `.` that is
        /// not followed by a digit is an error, including at end of input.
        fn read_number(&mut self, first: char) -> Result<String, String> {
            let mut number = first.to_string();
            self.read_digits(&mut number);

            if self.input.next_if_eq(&'.').is_some() {
                if !matches!(self.input.peek(), Some(c) if c.is_ascii_digit()) {
                    return Err("trailing dot when parsing number".to_string());
                }
                number.push('.');
                self.read_digits(&mut number);
            }

            Ok(number)
        }

        /// Reads a string body; the opening quote is already consumed.
        ///
        /// Strings may not span lines. A newline or end of input before the
        /// closing quote is reported as "unterminated string".
        fn read_string(&mut self) -> Result<String, String> {
            let mut out = String::new();

            while let Some(c) = self.read_char() {
                match c {
                    '"' => return Ok(out),
                    '\n' => {
                        // The newline was consumed here, so count it here.
                        self.line += 1;
                        return Err("unterminated string".to_string());
                    }
                    '\\' => {
                        let escaped = self
                            .read_char()
                            .ok_or_else(|| "Unterminated escape sequence".to_string())?;

                        match escaped {
                            'n' => out.push('\n'),
                            'r' => out.push('\r'),
                            't' => out.push('\t'),
                            '"' => out.push('"'),
                            '\\' => out.push('\\'),
                            other => {
                                if other == '\n' {
                                    self.line += 1;
                                }
                                return Err("invalid escape sequence".to_string());
                            }
                        }
                    }
                    _ => out.push(c),
                }
            }

            // Input ended before the closing quote.
            Err("unterminated string".to_string())
        }

        /// Reads the rest of an identifier; letters, `_` and ASCII digits continue it.
        fn read_identifier(&mut self, first: char) -> String {
            let mut ident = first.to_string();
            while let Some(c) = self
                .input
                .next_if(|&c| is_letter(c) || c.is_ascii_digit())
            {
                ident.push(c);
            }
            ident
        }

        /// Yields `with_equal` if the next character is `=` (consuming it), else `single`.
        fn maybe_equal(&mut self, single: TokenType, with_equal: TokenType) -> Token {
            if self.input.next_if_eq(&'=').is_some() {
                Token::new(with_equal)
            } else {
                Token::new(single)
            }
        }

        /// Records an error for `line` and builds the `Illegal` token that stands in for it.
        fn illegal(&mut self, line: u64, lexeme: String, message: String) -> Token {
            self.errors.push(ScannerError { line, message });
            Token::with_lexeme(TokenType::Illegal, lexeme)
        }
    }

    impl<'a> Iterator for Scanner<'a> {
        type Item = Token;

        fn next(&mut self) -> Option<Self::Item> {
            loop {
                self.skip_whitespace();
                // Tokens are stamped with the line they start on.
                let line = self.line;

                let ch = match self.read_char() {
                    Some(ch) => ch,
                    None if self.eof_sent => return None,
                    None => {
                        self.eof_sent = true;
                        return Some(Token::new(TokenType::Eof).at_line(line));
                    }
                };

                let token = match ch {
                    '(' => Token::new(TokenType::LeftParen),
                    ')' => Token::new(TokenType::RightParen),
                    '{' => Token::new(TokenType::LeftBrace),
                    '}' => Token::new(TokenType::RightBrace),
                    ',' => Token::new(TokenType::Comma),
                    '-' => Token::new(TokenType::Minus),
                    '+' => Token::new(TokenType::Plus),
                    ';' => Token::new(TokenType::Semicolon),
                    '*' => Token::new(TokenType::Star),
                    '.' => {
                        if matches!(self.input.peek(), Some(c) if c.is_ascii_digit()) {
                            self.illegal(
                                line,
                                ".".to_string(),
                                "leading dot when parsing number".to_string(),
                            )
                        } else {
                            Token::new(TokenType::Dot)
                        }
                    }
                    '=' => self.maybe_equal(TokenType::Equal, TokenType::EqualEqual),
                    '!' => self.maybe_equal(TokenType::Bang, TokenType::BangEqual),
                    '<' => self.maybe_equal(TokenType::Less, TokenType::LessEqual),
                    '>' => self.maybe_equal(TokenType::Greater, TokenType::GreaterEqual),
                    '/' => {
                        if self.input.next_if_eq(&'/').is_some() {
                            // Line comment: drop everything up to, but not
                            // including, the newline so skip_whitespace counts it.
                            while self.input.next_if(|&c| c != '\n').is_some() {}
                            continue;
                        }
                        Token::new(TokenType::Slash)
                    }
                    '"' => match self.read_string() {
                        Ok(text) => Token::with_lexeme(TokenType::LString, text),
                        Err(message) => self.illegal(line, "\"".to_string(), message),
                    },
                    c if c.is_ascii_digit() => match self.read_number(c) {
                        Ok(digits) => Token::with_lexeme(TokenType::Number, digits),
                        Err(message) => self.illegal(line, c.to_string(), message),
                    },
                    c if is_letter(c) => lookup_ident(&self.read_identifier(c)),
                    c => {
                        let message = format!("unexpected character: {:?}", c);
                        self.illegal(line, c.to_string(), message)
                    }
                };

                return Some(token.at_line(line));
            }
        }
    }

    /// Maps a reserved word to its token type; `None` for ordinary identifiers.
    fn keyword(ident: &str) -> Option<TokenType> {
        let ttype = match ident {
            "and" => TokenType::And,
            "class" => TokenType::Class,
            "else" => TokenType::Else,
            "false" => TokenType::False,
            "for" => TokenType::For,
            "fun" => TokenType::Fun,
            "if" => TokenType::If,
            "nil" => TokenType::Nil,
            "or" => TokenType::Or,
            "print" => TokenType::Print,
            "return" => TokenType::Return,
            "super" => TokenType::Super,
            "this" => TokenType::This,
            "true" => TokenType::True,
            "var" => TokenType::Var,
            "while" => TokenType::While,
            _ => return None,
        };
        Some(ttype)
    }

    /// Builds a keyword token, or an `Identifier` token carrying `ident`.
    fn lookup_ident(ident: &str) -> Token {
        match keyword(ident) {
            Some(ttype) => Token::new(ttype),
            None => Token::with_lexeme(TokenType::Identifier, ident.to_string()),
        }
    }

    /// True for characters that may start an identifier.
    fn is_letter(ch: char) -> bool {
        ch.is_alphabetic() || ch == '_'
    }
}

/// Interpreter driver (originally `src/loxi.rs`).
mod loxi {
    use super::scanner::{Scanner, ScannerError};
    use std::io::{self, Result as IoResult, Write};

    /// Scans `program`, prints every token to stdout, then any scan errors to stderr.
    pub fn run(program: &str) {
        let mut scanner = Scanner::new(program);

        // `by_ref` keeps the scanner alive so its errors can be read afterwards.
        for token in scanner.by_ref() {
            println!("{:?}", token);
        }

        if !scanner.errors().is_empty() {
            let stderr = io::stderr();
            // Best effort: there is nowhere left to report a failing stderr.
            let _ = print_parser_errors(stderr.lock(), scanner.errors());
        }
    }

    /// Writes one line per error to `out`, then flushes it.
    ///
    /// # Errors
    /// Returns the first I/O error raised by `out`.
    fn print_parser_errors<W: Write>(mut out: W, errors: &[ScannerError]) -> IoResult<()> {
        for error in errors {
            writeln!(out, "\tline: {} | error: {}", error.line, error.message)?;
        }
        out.flush()
    }
}

/// Interactive prompt (originally `src/repl.rs`).
mod repl {
    use super::loxi::run;
    use std::io::{self, BufRead, Write};

    const PROMPT: &[u8] = b">> ";

    /// Runs the REPL on the process's stdin and stdout until end of input.
    pub fn init() {
        let stdin = io::stdin();
        let stdout = io::stdout();

        if let Err(err) = start(stdin.lock(), stdout.lock()) {
            eprintln!("repl: {}", err);
        }
    }

    /// Prompts on `out`, reads one line from `ip`, runs it, and repeats.
    ///
    /// Returns `Ok(())` when `ip` reaches end of input.
    ///
    /// # Errors
    /// Returns the first I/O error from reading `ip` or writing `out`.
    pub fn start<R: BufRead, W: Write>(mut ip: R, mut out: W) -> io::Result<()> {
        let mut line = String::new();

        loop {
            out.write_all(PROMPT)?;
            out.flush()?;

            line.clear();
            // Zero bytes read means end of input (e.g. Ctrl-D): stop the loop.
            if ip.read_line(&mut line)? == 0 {
                return Ok(());
            }

            run(&line);
        }
    }
}

/// Entry point (originally `src/main.rs`).
///
/// No argument starts the REPL, one argument runs that script file, and
/// anything else prints the usage line.
fn main() {
    let args: Vec<String> = std::env::args().collect();

    // args[0] is the program name, so a lone script path means two entries.
    match args.as_slice() {
        [_] => repl::init(),
        [_, script] => match std::fs::read_to_string(script) {
            Ok(program) => loxi::run(&program),
            Err(err) => {
                eprintln!("loxi: cannot read {}: {}", script, err);
                std::process::exit(66);
            }
        },
        _ => println!("Usage: loxi [script]"),
    }
}