Refactored lexer

1. Changed input from Vec<u8> to Peekable<Chars>
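2. Refactored the methods implemented on Lexer to read from the iterator and peek ahead instead of tracking position/read_position/ch; EOF is now emitted once, tracked by the eof_sent flag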
This commit is contained in:
Ishan Jain 2019-02-04 19:57:41 +05:30
parent f31e4bfee1
commit c0efc2c316
2 changed files with 79 additions and 66 deletions

View File

@ -1,5 +1,6 @@
use std::collections::HashMap; use std::collections::HashMap;
use std::str; use std::iter::Peekable;
use std::str::{self, Chars};
lazy_static! { lazy_static! {
static ref IDENTMAP: HashMap<&'static str, Token> = { static ref IDENTMAP: HashMap<&'static str, Token> = {
@ -53,106 +54,118 @@ pub enum Token {
} }
#[derive(Debug)] #[derive(Debug)]
pub struct Lexer { pub struct Lexer<'a> {
input: Vec<u8>, input: Peekable<Chars<'a>>,
position: usize, eof_sent: bool,
read_position: usize,
ch: u8,
} }
impl Lexer { impl<'a> Lexer<'a> {
pub fn new(input: &str) -> Lexer { pub fn new(input: &'a str) -> Lexer<'a> {
let input = input.chars().peekable();
Lexer { Lexer {
input: input.bytes().collect::<Vec<u8>>(), input,
position: 0, eof_sent: false,
read_position: 0,
ch: 0,
} }
} }
fn read_char(&mut self) { fn read_char(&mut self) -> Option<char> {
if self.read_position == self.input.len() { self.input.next()
self.ch = 0;
} else if self.read_position > self.input.len() {
// 3 = ETX
self.ch = 3;
} else {
self.ch = self.input[self.read_position];
}
self.position = self.read_position;
self.read_position += 1;
} }
fn read_identifier(&mut self) -> String { fn read_identifier(&mut self, first: char) -> String {
let pos = self.position; let mut ident = Vec::new();
while is_letter(self.ch) { ident.push(first);
self.read_char();
while self.peek_is_letter() {
ident.push(self.read_char().unwrap());
}
ident.into_iter().collect::<String>()
}
fn peek_is_letter(&mut self) -> bool {
match self.input.peek() {
Some(v) => is_letter(v),
None => false,
}
}
fn peek_is_ascii_digit(&mut self) -> bool {
match self.input.peek() {
Some(v) => v.is_ascii_digit(),
None => false,
} }
self.read_position -= 1;
String::from_utf8_lossy(&self.input[pos..self.position]).to_string()
} }
fn skip_whitespace(&mut self) { fn skip_whitespace(&mut self) {
while self.ch == b' ' || self.ch == b'\t' || self.ch == b'\n' || self.ch == b'\r' { while let Some(&v) = self.input.peek() {
self.read_char(); if v == ' ' || v == '\t' || v == '\n' || v == '\r' {
self.read_char();
} else {
break;
}
} }
} }
// use i64 for all numbers for now. // use i64 for all numbers for now.
fn read_number(&mut self) -> i64 { fn read_number(&mut self, first: char) -> i64 {
let pos = self.position; let mut number = Vec::new();
while self.ch.is_ascii_digit() { number.push(first);
self.read_char();
while self.peek_is_ascii_digit() {
number.push(self.read_char().unwrap());
} }
self.read_position -= 1;
String::from_utf8_lossy(&self.input[pos..self.position]) number
.into_iter()
.collect::<String>()
.parse::<i64>() .parse::<i64>()
.unwrap() .unwrap()
} }
} }
impl Iterator for Lexer { impl<'a> Iterator for Lexer<'a> {
type Item = Token; type Item = Token;
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {
self.read_char();
self.skip_whitespace(); self.skip_whitespace();
let ch = self.read_char();
let v = match self.ch { let v = match ch {
b'=' => Some(Token::Assign), Some('=') => Some(Token::Assign),
b'+' => Some(Token::Plus), Some('+') => Some(Token::Plus),
b'*' => Some(Token::Multiply), Some('*') => Some(Token::Multiply),
b'/' => Some(Token::Divide), Some('/') => Some(Token::Divide),
b'-' => Some(Token::Subtract), Some('-') => Some(Token::Subtract),
b',' => Some(Token::Comma), Some(',') => Some(Token::Comma),
b';' => Some(Token::Semicolon), Some(';') => Some(Token::Semicolon),
b'(' => Some(Token::LParen), Some('(') => Some(Token::LParen),
b')' => Some(Token::RParen), Some(')') => Some(Token::RParen),
b'{' => Some(Token::LBrace), Some('{') => Some(Token::LBrace),
b'}' => Some(Token::RBrace), Some('}') => Some(Token::RBrace),
b'!' => Some(Token::ExclamationMark), Some('!') => Some(Token::ExclamationMark),
b'>' => Some(Token::GreaterThan), Some('>') => Some(Token::GreaterThan),
b'<' => Some(Token::LessThan), Some('<') => Some(Token::LessThan),
0 => Some(Token::EOF), Some(ch @ _) if is_letter(&ch) => {
//ETX-> End of text. It's the value of self.ch after all the text is parsed. let ident = self.read_identifier(ch);
3 => None,
_ if is_letter(self.ch) => {
let ident = self.read_identifier();
Some(lookup_ident(&ident)) Some(lookup_ident(&ident))
} }
_ if self.ch.is_ascii_digit() => { Some(ch @ _) if ch.is_ascii_digit() => {
let number = self.read_number(); let number = self.read_number(ch);
Some(Token::Int(number)) Some(Token::Int(number))
} }
None if !self.eof_sent => {
self.eof_sent = true;
Some(Token::EOF)
}
None => None,
_ => Some(Token::Illegal), _ => Some(Token::Illegal),
}; };
v v
} }
} }
fn is_letter(c: u8) -> bool { fn is_letter(c: &char) -> bool {
c.is_ascii_alphabetic() || c == b'_' c.is_ascii_alphabetic() || c == &'_'
} }
fn lookup_ident(ident: &str) -> Token { fn lookup_ident(ident: &str) -> Token {

View File

@ -134,11 +134,11 @@ mod tests {
for (k, v) in tests { for (k, v) in tests {
let tokenized_output = Lexer::new(k).collect::<Vec<Token>>(); let tokenized_output = Lexer::new(k).collect::<Vec<Token>>();
// assert_eq!(v.len(), tokenized_output.len()); // assert_eq!(v.len(), tokenized_output.len());
for (exp, actual) in v.into_iter().zip(tokenized_output) { for (exp, actual) in v.into_iter().zip(tokenized_output) {
if actual != exp { if actual != exp {
println!("Expect: {:?}, Actual: {:?}", actual, exp); println!("Expect: {:?}, Actual: {:?}", exp, actual);
} }
assert_eq!(actual, exp); assert_eq!(actual, exp);
} }