tokenizer rewrite
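
Scan characters directly instead of buffering each line. The internal queue now stores Result<Token, TokenizeError>, so errors surface through the Iterator impl rather than being wrapped at pop time. A new InvalidCharacter variant reports characters the scanner cannot classify, `#` comments are skipped to end of line, and string literals gain escape handling (\\, \n, \t, \r, \") with UnclosedString raised when a newline or end of input arrives before the closing quote.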
@@ -1,3 +1,4 @@
+use std::iter::Peekable;
 use std::{error, io};
 use std::collections::VecDeque;
 
@@ -11,6 +12,7 @@ pub enum TokenizeError {
     InvalidNumericConstant(String),
     InvalidIdentifier(String),
     UnableToMatchToken(String),
+    InvalidCharacter(char),
     UnclosedString,
     IO(io::Error),
     Regex(regex::Error),
@@ -27,6 +29,7 @@ impl Display for TokenizeError {
                 => write!(f, "invalid identifier `{ident}`"),
             TokenizeError::UnableToMatchToken(token)
                 => write!(f, "the token `{token}` was unable to be parsed"),
+            TokenizeError::InvalidCharacter(c) => write!(f, "`{c}` is not understood"),
             TokenizeError::UnclosedString => write!(f, "newline was found before string was closed"),
             TokenizeError::IO(io) => write!(f, "{io}"),
             TokenizeError::Regex(re) => write!(f, "{re}"),
@@ -149,7 +152,7 @@ impl Token {
 /// Tokenize an input stream of source code for a Parser
 pub(crate) struct Tokenizer<R: BufRead> {
     reader: R,
-    tokens: VecDeque<Token>,
+    tokens: VecDeque<Result<Token, TokenizeError>>,
 }
 
 impl<R: BufRead> Tokenizer<R> {
@@ -159,6 +162,72 @@ impl<R: BufRead> Tokenizer<R> {
             tokens: VecDeque::new(),
         }
     }
 
+    /// Tokenizes more input and adds them to the internal queue
+    fn tokenize<I: Iterator<Item = char>>(&mut self, mut iter: Peekable<I>) {
+        const OPERATOR_CHARS: &'static str = "!@$%^&*()-=+[]{}|;:,<.>/?";
+
+        let c = if let Some(c) = iter.next() {
+            c
+        } else {
+            return;
+        };
+
+        if c.is_alphanumeric() || c == '.' {
+            let mut token = String::from(c);
+
+            while let Some(c) = iter.next_if(|&c| c.is_alphanumeric() || c == '.' || c == '\'') {
+                token.push(c);
+            }
+
+            self.tokens.push_back(Token::parse(&token));
+            self.tokenize(iter)
+        } else if OPERATOR_CHARS.contains(c) {
+            let mut token = String::from(c);
+
+            while let Some(c) = iter.next_if(|&c| OPERATOR_CHARS.contains(c)) {
+                token.push(c);
+            }
+
+            self.tokens.push_back(Token::parse(&token));
+            self.tokenize(iter)
+        } else if c == '#' {
+            // consume comments
+            let _: String = iter.by_ref().take_while(|&c| c != '\n').collect();
+        } else if c == '\"' {
+            let mut token = String::new();
+
+            while let Some(c) = iter.next() {
+                match c {
+                    '"' => break,
+                    '\n' => {
+                        self.tokens.push_back(Err(TokenizeError::UnclosedString));
+                        return;
+                    }
+                    '\\' => match iter.next() {
+                        Some('\\') => token.push('\\'),
+                        Some('n') => token.push('\n'),
+                        Some('t') => token.push('\t'),
+                        Some('r') => token.push('\r'),
+                        Some('\"') => token.push('"'),
+                        Some(c) => token.push(c),
+                        None => {
+                            self.tokens.push_back(Err(TokenizeError::UnclosedString));
+                            return;
+                        },
+                    }
+                    _ => token.push(c),
+                }
+            }
+
+            self.tokens.push_back(Ok(Token::Constant(Value::String(token))));
+            self.tokenize(iter)
+        } else if c.is_whitespace() {
+            self.tokenize(iter)
+        } else {
+            self.tokens.push_back(Err(TokenizeError::InvalidCharacter(c)));
+        }
+    }
 }
 
 impl std::str::FromStr for Tokenizer<Cursor<String>> {
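
As a rough illustration of the splitting rule above, here is a standalone sketch (not part of the crate; `lexemes` is a hypothetical helper) that reproduces how runs of characters become the raw lexemes Token::parse receives, leaving out strings, comments, and error cases:

    const OPERATOR_CHARS: &str = "!@$%^&*()-=+[]{}|;:,<.>/?";

    // Hypothetical helper mirroring the scanner's run-splitting.
    fn lexemes(src: &str) -> Vec<String> {
        let mut iter = src.chars().peekable();
        let mut out = Vec::new();
        while let Some(c) = iter.next() {
            if c.is_alphanumeric() || c == '.' {
                // identifiers and numeric constants: alphanumeric runs plus '.' and '\''
                let mut tok = String::from(c);
                while let Some(c) = iter.next_if(|&c| c.is_alphanumeric() || c == '.' || c == '\'') {
                    tok.push(c);
                }
                out.push(tok);
            } else if OPERATOR_CHARS.contains(c) {
                // operators greedily take the whole punctuation run
                let mut tok = String::from(c);
                while let Some(c) = iter.next_if(|&c| OPERATOR_CHARS.contains(c)) {
                    tok.push(c);
                }
                out.push(tok);
            }
            // whitespace (and anything else) just separates tokens in this sketch
        }
        out
    }

    // lexemes(": function x ** x 2 function 1200")
    //   => [":", "function", "x", "**", "x", "2", "function", "1200"]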
@@ -175,21 +244,18 @@ impl<R: BufRead> std::iter::Iterator for Tokenizer<R> {
 
     fn next(&mut self) -> Option<Self::Item> {
         if let Some(token) = self.tokens.pop_front() {
-            return Some(Ok(token));
+            return Some(token);
         }
 
         let mut input = String::new();
 
         match self.reader.read_line(&mut input) {
             Ok(0) => None,
+            Ok(_n) => {
+                self.tokenize(input.chars().peekable());
+                self.next()
+            },
             Err(e) => Some(Err(TokenizeError::IO(e))),
-            _ => {
-                let mut buffer = String::new();
-
-                for c in input.chars() {
-
-                }
-            }
         }
     }
 }
@@ -201,7 +267,7 @@ mod tests {
 
     #[test]
     fn tokenizer() {
-        let program = "\"hello\nworld\"";
+        let program = ": function x ** x 2 function 1200";
 
         let tok = Tokenizer::from_str(program).unwrap();
         let tokens: Vec<Token> = tok.collect::<Result<_, TokenizeError>>().expect("tokenizer error");