Added true, false, return, if, and else keywords, single-character tokens, and tests

and changed the lexer input from Vec<char> to Vec<u8>
2019-02-04 17:16:17 +05:30 · 2019-02-04 17:16:17 +05:30 · f31e4bfee1
commit f31e4bfee1
parent 561c4c6089
2 changed files with 212 additions and 42 deletions
--- a/src/lexer/mod.rs
+++ b/src/lexer/mod.rs
@ -1,15 +1,21 @@
 use std::collections::HashMap;
+use std::str;

 lazy_static! {
    static ref IDENTMAP: HashMap<&'static str, Token> = {
        let mut m = HashMap::new();
        m.insert("fn", Token::Function);
        m.insert("let", Token::Let);
+        m.insert("true", Token::True);
+        m.insert("false", Token::False);
+        m.insert("return", Token::Return);
+        m.insert("if", Token::If);
+        m.insert("else", Token::Else);
        m
    };
 }

-#[derive(Debug, PartialEq)]
+#[derive(Debug, PartialEq, Clone)]
 pub enum Token {
    Illegal,
    EOF,
@ -23,6 +29,9 @@ pub enum Token {
    Multiply,
    Divide,
    Subtract,
+    ExclamationMark,
+    LessThan,
+    GreaterThan,

    // Delimiter
    Comma,
@ -34,30 +43,39 @@ pub enum Token {

    // Keywords
    Function,
+    If,
    Let,
+    True,
+    Else,
+    False,
+    Return,
+    Ident(String),
 }

 #[derive(Debug)]
 pub struct Lexer {
-    input: Vec<char>,
+    input: Vec<u8>,
    position: usize,
    read_position: usize,
-    ch: char,
+    ch: u8,
 }

 impl Lexer {
    pub fn new(input: &str) -> Lexer {
        Lexer {
-            input: input.chars().collect::<Vec<char>>(),
+            input: input.bytes().collect::<Vec<u8>>(),
            position: 0,
            read_position: 0,
-            ch: '0',
+            ch: 0,
        }
    }

    fn read_char(&mut self) {
-        if self.read_position >= self.input.len() {
-            self.ch = '0';
+        if self.read_position == self.input.len() {
+            self.ch = 0;
+        } else if self.read_position > self.input.len() {
+            // 3 = ETX (end of text): read_position is already past the end of the input.
+            self.ch = 3;
        } else {
            self.ch = self.input[self.read_position];
        }
@ -71,8 +89,26 @@ impl Lexer {
        while is_letter(self.ch) {
            self.read_char();
        }
+        self.read_position -= 1;
+        String::from_utf8_lossy(&self.input[pos..self.position]).to_string()
+    }

-        self.input[pos..self.position].iter().collect::<String>()
+    fn skip_whitespace(&mut self) {
+        while self.ch == b' ' || self.ch == b'\t' || self.ch == b'\n' || self.ch == b'\r' {
+            self.read_char();
+        }
+    }
+
+    // Use i64 for all numbers for now; a digit run that overflows i64 makes the unwrap below panic.
+    fn read_number(&mut self) -> i64 {
+        let pos = self.position;
+        while self.ch.is_ascii_digit() {
+            self.read_char();
+        }
+        self.read_position -= 1;
+        String::from_utf8_lossy(&self.input[pos..self.position])
+            .parse::<i64>()
+            .unwrap()
    }
 }

@ -81,25 +117,47 @@ impl Iterator for Lexer {

    fn next(&mut self) -> Option<Self::Item> {
        self.read_char();
+        self.skip_whitespace();

-        match self.ch {
-            '=' => Some(Token::Assign),
-            '+' => Some(Token::Plus),
-            '*' => Some(Token::Multiply),
-            '/' => Some(Token::Divide),
-            '-' => Some(Token::Subtract),
-            ',' => Some(Token::Comma),
-            ';' => Some(Token::Semicolon),
-            '(' => Some(Token::LParen),
-            ')' => Some(Token::RParen),
-            '[' => Some(Token::LBrace),
-            ']' => Some(Token::RBrace),
-            '0' => Some(Token::EOF),
-            _ => None,
-        }
+        let v = match self.ch {
+            b'=' => Some(Token::Assign),
+            b'+' => Some(Token::Plus),
+            b'*' => Some(Token::Multiply),
+            b'/' => Some(Token::Divide),
+            b'-' => Some(Token::Subtract),
+            b',' => Some(Token::Comma),
+            b';' => Some(Token::Semicolon),
+            b'(' => Some(Token::LParen),
+            b')' => Some(Token::RParen),
+            b'{' => Some(Token::LBrace),
+            b'}' => Some(Token::RBrace),
+            b'!' => Some(Token::ExclamationMark),
+            b'>' => Some(Token::GreaterThan),
+            b'<' => Some(Token::LessThan),
+            0 => Some(Token::EOF),
+            // ETX (end of text): the value of self.ch once all of the input has been consumed.
+            3 => None,
+            _ if is_letter(self.ch) => {
+                let ident = self.read_identifier();
+                Some(lookup_ident(&ident))
+            }
+            _ if self.ch.is_ascii_digit() => {
+                let number = self.read_number();
+                Some(Token::Int(number))
+            }
+            _ => Some(Token::Illegal),
+        };
+        v
    }
 }

-fn is_letter(c: char) -> bool {
-    c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c == '_'
+fn is_letter(c: u8) -> bool {
+    c.is_ascii_alphabetic() || c == b'_'
+}
+
+fn lookup_ident(ident: &str) -> Token {
+    match IDENTMAP.get(ident) {
+        Some(v) => v.clone(),
+        None => Token::Ident(ident.to_string()),
+    }
 }
--- a/src/main.rs
+++ b/src/main.rs
@ -8,28 +8,140 @@ fn main() {}
 #[cfg(test)]
 mod tests {
    use lexer::{Lexer, Token};
+    use std::collections::HashMap;
+
    #[test]
    fn new_token() {
-        let input = "=+()[],;";
-        let expected = vec![
-            Token::Assign,
-            Token::Plus,
-            Token::LParen,
-            Token::RParen,
-            Token::LBrace,
-            Token::RBrace,
-            Token::Comma,
-            Token::Semicolon,
-            Token::EOF,
-        ];
+        let mut tests = HashMap::new();

-        let tokenized_output = Lexer::new(input).collect::<Vec<Token>>();
+        tests.insert(
+            "=+(){},;",
+            vec![
+                Token::Assign,
+                Token::Plus,
+                Token::LParen,
+                Token::RParen,
+                Token::LBrace,
+                Token::RBrace,
+                Token::Comma,
+                Token::Semicolon,
+                Token::EOF,
+            ],
+        );
+        tests.insert(
+            "let five = 5;
+            let ten = 10;

-        assert_eq!(expected.len(), tokenized_output.len());
-        println!("{:?}", tokenized_output);
+        let add = fn(x, y) {
+            x + y;
+        };

-        for (exp, actual) in expected.into_iter().zip(tokenized_output) {
-            assert_eq!(actual, exp);
+        let result = add(five, ten);",
+            vec![
+                Token::Let,
+                Token::Ident("five".to_string()),
+                Token::Assign,
+                Token::Int(5),
+                Token::Semicolon,
+                Token::Let,
+                Token::Ident("ten".to_string()),
+                Token::Assign,
+                Token::Int(10),
+                Token::Semicolon,
+                Token::Let,
+                Token::Ident("add".to_string()),
+                Token::Assign,
+                Token::Function,
+                Token::LParen,
+                Token::Ident("x".to_string()),
+                Token::Comma,
+                Token::Ident("y".to_string()),
+                Token::RParen,
+                Token::LBrace,
+                Token::Ident("x".to_string()),
+                Token::Plus,
+                Token::Ident("y".to_string()),
+                Token::Semicolon,
+                Token::RBrace,
+                Token::Semicolon,
+                Token::Let,
+                Token::Ident("result".to_string()),
+                Token::Assign,
+                Token::Ident("add".to_string()),
+                Token::LParen,
+                Token::Ident("five".to_string()),
+                Token::Comma,
+                Token::Ident("ten".to_string()),
+                Token::RParen,
+                Token::Semicolon,
+                Token::EOF,
+            ],
+        );
+        tests.insert(
+            "let result = add(five, ten);
+        !-/*5;
+        5 < 10 > 5;
+
+        if(5 < 10) {
+            return true;
+        } else {
+            return false;
+        }
+        ",
+            vec![
+                Token::Let,
+                Token::Ident("result".to_string()),
+                Token::Assign,
+                Token::Ident("add".to_string()),
+                Token::LParen,
+                Token::Ident("five".to_string()),
+                Token::Comma,
+                Token::Ident("ten".to_string()),
+                Token::RParen,
+                Token::Semicolon,
+                Token::ExclamationMark,
+                Token::Subtract,
+                Token::Divide,
+                Token::Multiply,
+                Token::Int(5),
+                Token::Semicolon,
+                Token::Int(5),
+                Token::LessThan,
+                Token::Int(10),
+                Token::GreaterThan,
+                Token::Int(5),
+                Token::Semicolon,
+                Token::If,
+                Token::LParen,
+                Token::Int(5),
+                Token::LessThan,
+                Token::Int(10),
+                Token::RParen,
+                Token::LBrace,
+                Token::Return,
+                Token::True,
+                Token::Semicolon,
+                Token::RBrace,
+                Token::Else,
+                Token::LBrace,
+                Token::Return,
+                Token::False,
+                Token::Semicolon,
+                Token::RBrace,
+                Token::EOF,
+            ],
+        );
+
+        for (k, v) in tests {
+            let tokenized_output = Lexer::new(k).collect::<Vec<Token>>();
+            // TODO: re-enable `assert_eq!(v.len(), tokenized_output.len())`; zip() ignores surplus tokens.
+
+            for (exp, actual) in v.into_iter().zip(tokenized_output) {
+                if actual != exp {
+                    println!("Expect: {:?}, Actual: {:?}", exp, actual);
+                }
+                assert_eq!(actual, exp);
+            }
        }
    }
 }