diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs index 7c1e999..580bdf0 100644 --- a/src/lexer/mod.rs +++ b/src/lexer/mod.rs @@ -1,15 +1,21 @@ use std::collections::HashMap; +use std::str; lazy_static! { static ref IDENTMAP: HashMap<&'static str, Token> = { let mut m = HashMap::new(); m.insert("fn", Token::Function); m.insert("let", Token::Let); + m.insert("true", Token::True); + m.insert("false", Token::False); + m.insert("return", Token::Return); + m.insert("if", Token::If); + m.insert("else", Token::Else); m }; } -#[derive(Debug, PartialEq)] +#[derive(Debug, PartialEq, Clone)] pub enum Token { Illegal, EOF, @@ -23,6 +29,9 @@ pub enum Token { Multiply, Divide, Subtract, + ExclamationMark, + LessThan, + GreaterThan, // Delimiter Comma, @@ -34,30 +43,39 @@ pub enum Token { // Keywords Function, + If, Let, + True, + Else, + False, + Return, + Ident(String), } #[derive(Debug)] pub struct Lexer { - input: Vec, + input: Vec, position: usize, read_position: usize, - ch: char, + ch: u8, } impl Lexer { pub fn new(input: &str) -> Lexer { Lexer { - input: input.chars().collect::>(), + input: input.bytes().collect::>(), position: 0, read_position: 0, - ch: '0', + ch: 0, } } fn read_char(&mut self) { - if self.read_position >= self.input.len() { - self.ch = '0'; + if self.read_position == self.input.len() { + self.ch = 0; + } else if self.read_position > self.input.len() { + // 3 = ETX + self.ch = 3; } else { self.ch = self.input[self.read_position]; } @@ -71,8 +89,26 @@ impl Lexer { while is_letter(self.ch) { self.read_char(); } + self.read_position -= 1; + String::from_utf8_lossy(&self.input[pos..self.position]).to_string() + } - self.input[pos..self.position].iter().collect::() + fn skip_whitespace(&mut self) { + while self.ch == b' ' || self.ch == b'\t' || self.ch == b'\n' || self.ch == b'\r' { + self.read_char(); + } + } + + // use i64 for all numbers for now. + fn read_number(&mut self) -> i64 { + let pos = self.position; + while self.ch.is_ascii_digit() { + self.read_char(); + } + self.read_position -= 1; + String::from_utf8_lossy(&self.input[pos..self.position]) + .parse::() + .unwrap() } } @@ -81,25 +117,47 @@ impl Iterator for Lexer { fn next(&mut self) -> Option { self.read_char(); + self.skip_whitespace(); - match self.ch { - '=' => Some(Token::Assign), - '+' => Some(Token::Plus), - '*' => Some(Token::Multiply), - '/' => Some(Token::Divide), - '-' => Some(Token::Subtract), - ',' => Some(Token::Comma), - ';' => Some(Token::Semicolon), - '(' => Some(Token::LParen), - ')' => Some(Token::RParen), - '[' => Some(Token::LBrace), - ']' => Some(Token::RBrace), - '0' => Some(Token::EOF), - _ => None, - } + let v = match self.ch { + b'=' => Some(Token::Assign), + b'+' => Some(Token::Plus), + b'*' => Some(Token::Multiply), + b'/' => Some(Token::Divide), + b'-' => Some(Token::Subtract), + b',' => Some(Token::Comma), + b';' => Some(Token::Semicolon), + b'(' => Some(Token::LParen), + b')' => Some(Token::RParen), + b'{' => Some(Token::LBrace), + b'}' => Some(Token::RBrace), + b'!' => Some(Token::ExclamationMark), + b'>' => Some(Token::GreaterThan), + b'<' => Some(Token::LessThan), + 0 => Some(Token::EOF), + //ETX-> End of text. It's the value of self.ch after all the text is parsed. + 3 => None, + _ if is_letter(self.ch) => { + let ident = self.read_identifier(); + Some(lookup_ident(&ident)) + } + _ if self.ch.is_ascii_digit() => { + let number = self.read_number(); + Some(Token::Int(number)) + } + _ => Some(Token::Illegal), + }; + v } } -fn is_letter(c: char) -> bool { - c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c == '_' +fn is_letter(c: u8) -> bool { + c.is_ascii_alphabetic() || c == b'_' +} + +fn lookup_ident(ident: &str) -> Token { + match IDENTMAP.get(ident) { + Some(v) => v.clone(), + None => Token::Ident(ident.to_string()), + } } diff --git a/src/main.rs b/src/main.rs index b392176..9882ec2 100644 --- a/src/main.rs +++ b/src/main.rs @@ -8,28 +8,140 @@ fn main() {} #[cfg(test)] mod tests { use lexer::{Lexer, Token}; + use std::collections::HashMap; + #[test] fn new_token() { - let input = "=+()[],;"; - let expected = vec![ - Token::Assign, - Token::Plus, - Token::LParen, - Token::RParen, - Token::LBrace, - Token::RBrace, - Token::Comma, - Token::Semicolon, - Token::EOF, - ]; + let mut tests = HashMap::new(); - let tokenized_output = Lexer::new(input).collect::>(); + tests.insert( + "=+(){},;", + vec![ + Token::Assign, + Token::Plus, + Token::LParen, + Token::RParen, + Token::LBrace, + Token::RBrace, + Token::Comma, + Token::Semicolon, + Token::EOF, + ], + ); + tests.insert( + "let five = 5; + let ten = 10; - assert_eq!(expected.len(), tokenized_output.len()); - println!("{:?}", tokenized_output); + let add = fn(x, y) { + x + y; + }; - for (exp, actual) in expected.into_iter().zip(tokenized_output) { - assert_eq!(actual, exp); + let result = add(five, ten);", + vec![ + Token::Let, + Token::Ident("five".to_string()), + Token::Assign, + Token::Int(5), + Token::Semicolon, + Token::Let, + Token::Ident("ten".to_string()), + Token::Assign, + Token::Int(10), + Token::Semicolon, + Token::Let, + Token::Ident("add".to_string()), + Token::Assign, + Token::Function, + Token::LParen, + Token::Ident("x".to_string()), + Token::Comma, + Token::Ident("y".to_string()), + Token::RParen, + Token::LBrace, + Token::Ident("x".to_string()), + Token::Plus, + Token::Ident("y".to_string()), + Token::Semicolon, + Token::RBrace, + Token::Semicolon, + Token::Let, + Token::Ident("result".to_string()), + Token::Assign, + Token::Ident("add".to_string()), + Token::LParen, + Token::Ident("five".to_string()), + Token::Comma, + Token::Ident("ten".to_string()), + Token::RParen, + Token::Semicolon, + Token::EOF, + ], + ); + tests.insert( + "let result = add(five, ten); + !-/*5; + 5 < 10 > 5; + + if(5 < 10) { + return true; + } else { + return false; + } + ", + vec![ + Token::Let, + Token::Ident("result".to_string()), + Token::Assign, + Token::Ident("add".to_string()), + Token::LParen, + Token::Ident("five".to_string()), + Token::Comma, + Token::Ident("ten".to_string()), + Token::RParen, + Token::Semicolon, + Token::ExclamationMark, + Token::Subtract, + Token::Divide, + Token::Multiply, + Token::Int(5), + Token::Semicolon, + Token::Int(5), + Token::LessThan, + Token::Int(10), + Token::GreaterThan, + Token::Int(5), + Token::Semicolon, + Token::If, + Token::LParen, + Token::Int(5), + Token::LessThan, + Token::Int(10), + Token::RParen, + Token::LBrace, + Token::Return, + Token::True, + Token::Semicolon, + Token::RBrace, + Token::Else, + Token::LBrace, + Token::Return, + Token::False, + Token::Semicolon, + Token::RBrace, + Token::EOF, + ], + ); + + for (k, v) in tests { + let tokenized_output = Lexer::new(k).collect::>(); + // assert_eq!(v.len(), tokenized_output.len()); + + for (exp, actual) in v.into_iter().zip(tokenized_output) { + if actual != exp { + println!("Expect: {:?}, Actual: {:?}", actual, exp); + } + assert_eq!(actual, exp); + } } } }