Added true, false, return, if, else keywords; single-character tokens; tests

and changed Vec<char> to Vec<u8>
Ishan Jain 2019-02-04 17:16:17 +05:30
parent 561c4c6089
commit f31e4bfee1
2 changed files with 212 additions and 42 deletions
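
The Vec<char> → Vec<u8> switch means the lexer now indexes raw UTF-8 bytes instead of decoded scalar values. A standalone sketch (not part of the diff) of what that changes:

fn main() {
    let s = "héllo";
    // Old representation: one element per Unicode scalar value.
    let chars = s.chars().collect::<Vec<char>>();
    // New representation: one element per UTF-8 byte ('é' encodes as 2 bytes).
    let bytes = s.bytes().collect::<Vec<u8>>();
    assert_eq!(chars.len(), 5);
    assert_eq!(bytes.len(), 6);
}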


@@ -1,15 +1,21 @@
 use std::collections::HashMap;
+use std::str;

 lazy_static! {
     static ref IDENTMAP: HashMap<&'static str, Token> = {
         let mut m = HashMap::new();
         m.insert("fn", Token::Function);
         m.insert("let", Token::Let);
+        m.insert("true", Token::True);
+        m.insert("false", Token::False);
+        m.insert("return", Token::Return);
+        m.insert("if", Token::If);
+        m.insert("else", Token::Else);
         m
     };
 }

-#[derive(Debug, PartialEq)]
+#[derive(Debug, PartialEq, Clone)]
 pub enum Token {
     Illegal,
     EOF,
@@ -23,6 +29,9 @@ pub enum Token {
     Multiply,
     Divide,
     Subtract,
+    ExclamationMark,
+    LessThan,
+    GreaterThan,

     // Delimiter
     Comma,
@@ -34,30 +43,39 @@ pub enum Token {
     // Keywords
     Function,
+    If,
     Let,
+    True,
+    Else,
+    False,
+    Return,
+    Ident(String),
 }

 #[derive(Debug)]
 pub struct Lexer {
-    input: Vec<char>,
+    input: Vec<u8>,
     position: usize,
     read_position: usize,
-    ch: char,
+    ch: u8,
 }

 impl Lexer {
     pub fn new(input: &str) -> Lexer {
         Lexer {
-            input: input.chars().collect::<Vec<char>>(),
+            input: input.bytes().collect::<Vec<u8>>(),
             position: 0,
             read_position: 0,
-            ch: '0',
+            ch: 0,
         }
     }

     fn read_char(&mut self) {
-        if self.read_position >= self.input.len() {
-            self.ch = '0';
+        if self.read_position == self.input.len() {
+            self.ch = 0;
+        } else if self.read_position > self.input.len() {
+            // 3 = ETX
+            self.ch = 3;
         } else {
             self.ch = self.input[self.read_position];
         }
@@ -71,8 +89,26 @@ impl Lexer {
         while is_letter(self.ch) {
             self.read_char();
         }
+        self.read_position -= 1;

-        self.input[pos..self.position].iter().collect::<String>()
+        String::from_utf8_lossy(&self.input[pos..self.position]).to_string()
+    }
+
+    fn skip_whitespace(&mut self) {
+        while self.ch == b' ' || self.ch == b'\t' || self.ch == b'\n' || self.ch == b'\r' {
+            self.read_char();
+        }
+    }
+
+    // use i64 for all numbers for now.
+    fn read_number(&mut self) -> i64 {
+        let pos = self.position;
+        while self.ch.is_ascii_digit() {
+            self.read_char();
+        }
+        self.read_position -= 1;
+        String::from_utf8_lossy(&self.input[pos..self.position])
+            .parse::<i64>()
+            .unwrap()
     }
 }
@@ -81,25 +117,47 @@ impl Iterator for Lexer {
     fn next(&mut self) -> Option<Self::Item> {
         self.read_char();
+        self.skip_whitespace();

-        match self.ch {
-            '=' => Some(Token::Assign),
-            '+' => Some(Token::Plus),
-            '*' => Some(Token::Multiply),
-            '/' => Some(Token::Divide),
-            '-' => Some(Token::Subtract),
-            ',' => Some(Token::Comma),
-            ';' => Some(Token::Semicolon),
-            '(' => Some(Token::LParen),
-            ')' => Some(Token::RParen),
-            '[' => Some(Token::LBrace),
-            ']' => Some(Token::RBrace),
-            '0' => Some(Token::EOF),
-            _ => None,
-        }
+        let v = match self.ch {
+            b'=' => Some(Token::Assign),
+            b'+' => Some(Token::Plus),
+            b'*' => Some(Token::Multiply),
+            b'/' => Some(Token::Divide),
+            b'-' => Some(Token::Subtract),
+            b',' => Some(Token::Comma),
+            b';' => Some(Token::Semicolon),
+            b'(' => Some(Token::LParen),
+            b')' => Some(Token::RParen),
+            b'{' => Some(Token::LBrace),
+            b'}' => Some(Token::RBrace),
+            b'!' => Some(Token::ExclamationMark),
+            b'>' => Some(Token::GreaterThan),
+            b'<' => Some(Token::LessThan),
+            0 => Some(Token::EOF),
+            // ETX -> End of text. It's the value of self.ch after all the text is parsed.
+            3 => None,
+            _ if is_letter(self.ch) => {
+                let ident = self.read_identifier();
+                Some(lookup_ident(&ident))
+            }
+            _ if self.ch.is_ascii_digit() => {
+                let number = self.read_number();
+                Some(Token::Int(number))
+            }
+            _ => Some(Token::Illegal),
+        };
+        v
     }
 }

-fn is_letter(c: char) -> bool {
-    c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c == '_'
+fn is_letter(c: u8) -> bool {
+    c.is_ascii_alphabetic() || c == b'_'
+}
+
+fn lookup_ident(ident: &str) -> Token {
+    match IDENTMAP.get(ident) {
+        Some(v) => v.clone(),
+        None => Token::Ident(ident.to_string()),
+    }
 }
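
With the pieces above in place, a caller drives the lexer purely through the Iterator impl: one token per next() call, a single Token::EOF once ch hits the 0 sentinel, then None at the 3 (ETX) sentinel so collect() terminates. A minimal sketch, assuming the lexer module from this diff is in scope and that the Int(i64) variant already exists in the unchanged part of the enum:

use lexer::{Lexer, Token};

fn demo() {
    // Mirrors the second test case below, trimmed to one statement.
    let tokens = Lexer::new("let five = 5;").collect::<Vec<Token>>();
    assert_eq!(
        tokens,
        vec![
            Token::Let,
            Token::Ident("five".to_string()),
            Token::Assign,
            Token::Int(5),
            Token::Semicolon,
            Token::EOF,
        ]
    );
}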


@@ -8,10 +8,15 @@ fn main() {}
 #[cfg(test)]
 mod tests {
     use lexer::{Lexer, Token};
+    use std::collections::HashMap;

     #[test]
     fn new_token() {
-        let input = "=+()[],;";
-        let expected = vec![
+        let mut tests = HashMap::new();
+        tests.insert(
+            "=+(){},;",
+            vec![
                 Token::Assign,
                 Token::Plus,
                 Token::LParen,
@@ -21,15 +26,122 @@ mod tests {
                 Token::Comma,
                 Token::Semicolon,
                 Token::EOF,
-        ];
-
-        let tokenized_output = Lexer::new(input).collect::<Vec<Token>>();
-
-        assert_eq!(expected.len(), tokenized_output.len());
-        println!("{:?}", tokenized_output);
-
-        for (exp, actual) in expected.into_iter().zip(tokenized_output) {
+            ],
+        );
+        tests.insert(
+            "let five = 5;
+            let ten = 10;
+            let add = fn(x, y) {
+                x + y;
+            };
+            let result = add(five, ten);",
+            vec![
+                Token::Let,
+                Token::Ident("five".to_string()),
+                Token::Assign,
+                Token::Int(5),
+                Token::Semicolon,
+                Token::Let,
+                Token::Ident("ten".to_string()),
+                Token::Assign,
+                Token::Int(10),
+                Token::Semicolon,
+                Token::Let,
+                Token::Ident("add".to_string()),
+                Token::Assign,
+                Token::Function,
+                Token::LParen,
+                Token::Ident("x".to_string()),
+                Token::Comma,
+                Token::Ident("y".to_string()),
+                Token::RParen,
+                Token::LBrace,
+                Token::Ident("x".to_string()),
+                Token::Plus,
+                Token::Ident("y".to_string()),
+                Token::Semicolon,
+                Token::RBrace,
+                Token::Semicolon,
+                Token::Let,
+                Token::Ident("result".to_string()),
+                Token::Assign,
+                Token::Ident("add".to_string()),
+                Token::LParen,
+                Token::Ident("five".to_string()),
+                Token::Comma,
+                Token::Ident("ten".to_string()),
+                Token::RParen,
+                Token::Semicolon,
+                Token::EOF,
+            ],
+        );
+        tests.insert(
+            "let result = add(five, ten);
+            !-/*5;
+            5 < 10 > 5;
+            if(5 < 10) {
+                return true;
+            } else {
+                return false;
+            }
+            ",
+            vec![
+                Token::Let,
+                Token::Ident("result".to_string()),
+                Token::Assign,
+                Token::Ident("add".to_string()),
+                Token::LParen,
+                Token::Ident("five".to_string()),
+                Token::Comma,
+                Token::Ident("ten".to_string()),
+                Token::RParen,
+                Token::Semicolon,
+                Token::ExclamationMark,
+                Token::Subtract,
+                Token::Divide,
+                Token::Multiply,
+                Token::Int(5),
+                Token::Semicolon,
+                Token::Int(5),
+                Token::LessThan,
+                Token::Int(10),
+                Token::GreaterThan,
+                Token::Int(5),
+                Token::Semicolon,
+                Token::If,
+                Token::LParen,
+                Token::Int(5),
+                Token::LessThan,
+                Token::Int(10),
+                Token::RParen,
+                Token::LBrace,
+                Token::Return,
+                Token::True,
+                Token::Semicolon,
+                Token::RBrace,
+                Token::Else,
+                Token::LBrace,
+                Token::Return,
+                Token::False,
+                Token::Semicolon,
+                Token::RBrace,
+                Token::EOF,
+            ],
+        );
+
+        for (k, v) in tests {
+            let tokenized_output = Lexer::new(k).collect::<Vec<Token>>();
+            // assert_eq!(v.len(), tokenized_output.len());
+            for (exp, actual) in v.into_iter().zip(tokenized_output) {
+                if actual != exp {
+                    println!("Expect: {:?}, Actual: {:?}", actual, exp);
+                }
                 assert_eq!(actual, exp);
             }
+        }
     }
 }
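
The read_position -= 1 lines in read_identifier and read_number exist because those scan loops only stop after read_char has already consumed one byte past the lexeme, so the cursor must be rewound before the next token is read. A hypothetical, self-contained sketch of that overshoot-and-backtrack pattern (scan_while is an illustration, not part of this crate):

fn scan_while(input: &[u8], start: usize, pred: fn(u8) -> bool) -> (String, usize) {
    let mut end = start;
    // The loop can only stop after inspecting the first byte PAST the lexeme...
    while end < input.len() && pred(input[end]) {
        end += 1;
    }
    // ...so a caller whose cursor already moved one byte further must step it
    // back, which is what `self.read_position -= 1` does in the Lexer above.
    (String::from_utf8_lossy(&input[start..end]).to_string(), end)
}

fn main() {
    let (word, resume_at) = scan_while(b"let five = 5;", 0, |b| b.is_ascii_alphabetic());
    assert_eq!(word, "let");
    assert_eq!(resume_at, 3); // the cursor now sits on the space after "let"
}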