tokenizer rewrite

2024-10-14 20:36:28 -04:00
parent ffc3be91ee
commit 3fe09ad4e4


@@ -1,3 +1,4 @@
+use std::iter::Peekable;
 use std::{error, io};
 use std::collections::VecDeque;
@@ -11,6 +12,7 @@ pub enum TokenizeError {
     InvalidNumericConstant(String),
     InvalidIdentifier(String),
     UnableToMatchToken(String),
+    InvalidCharacter(char),
     UnclosedString,
     IO(io::Error),
     Regex(regex::Error),
@@ -27,6 +29,7 @@ impl Display for TokenizeError {
                 => write!(f, "invalid identifier `{ident}`"),
             TokenizeError::UnableToMatchToken(token)
                 => write!(f, "the token `{token}` was unable to be parsed"),
+            TokenizeError::InvalidCharacter(c) => write!(f, "`{c}` is not understood"),
             TokenizeError::UnclosedString => write!(f, "newline was found before string was closed"),
             TokenizeError::IO(io) => write!(f, "{io}"),
             TokenizeError::Regex(re) => write!(f, "{re}"),
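
The new InvalidCharacter variant covers any character that falls outside every class the tokenizer recognizes (alphanumerics and '.', the operator set, '#', '"', and whitespace). A minimal sketch of observing it, assuming a backtick stays outside those classes:

    use std::str::FromStr;

    let mut tok = Tokenizer::from_str("`").unwrap();
    // '`' is not alphanumeric, not in OPERATOR_CHARS, and not whitespace,
    // so the iterator yields the new error variant
    assert!(matches!(
        tok.next(),
        Some(Err(TokenizeError::InvalidCharacter('`')))
    ));
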
@@ -149,7 +152,7 @@ impl Token {
 /// Tokenize an input stream of source code for a Parser
 pub(crate) struct Tokenizer<R: BufRead> {
     reader: R,
-    tokens: VecDeque<Token>,
+    tokens: VecDeque<Result<Token, TokenizeError>>,
 }

 impl<R: BufRead> Tokenizer<R> {
@@ -159,6 +162,72 @@ impl<R: BufRead> Tokenizer<R> {
             tokens: VecDeque::new(),
         }
     }
+
+    /// Tokenizes more input and adds them to the internal queue
+    fn tokenize<I: Iterator<Item = char>>(&mut self, mut iter: Peekable<I>) {
+        const OPERATOR_CHARS: &'static str = "!@$%^&*()-=+[]{}|;:,<.>/?";
+
+        let c = if let Some(c) = iter.next() {
+            c
+        } else {
+            return;
+        };
+
+        if c.is_alphanumeric() || c == '.' {
+            let mut token = String::from(c);
+            while let Some(c) = iter.next_if(|&c| c.is_alphanumeric() || c == '.' || c == '\'') {
+                token.push(c);
+            }
+            self.tokens.push_back(Token::parse(&token));
+            self.tokenize(iter)
+        } else if OPERATOR_CHARS.contains(c) {
+            let mut token = String::from(c);
+            while let Some(c) = iter.next_if(|&c| OPERATOR_CHARS.contains(c)) {
+                token.push(c);
+            }
+            self.tokens.push_back(Token::parse(&token));
+            self.tokenize(iter)
+        } else if c == '#' {
+            // consume comments
+            let _: String = iter.by_ref().take_while(|&c| c != '\n').collect();
+        } else if c == '\"' {
+            let mut token = String::new();
+            while let Some(c) = iter.next() {
+                match c {
+                    '"' => break,
+                    '\n' => {
+                        self.tokens.push_back(Err(TokenizeError::UnclosedString));
+                        return;
+                    }
+                    '\\' => match iter.next() {
+                        Some('\\') => token.push('\\'),
+                        Some('n') => token.push('\n'),
+                        Some('t') => token.push('\t'),
+                        Some('r') => token.push('\r'),
+                        Some('\"') => token.push('"'),
+                        Some(c) => token.push(c),
+                        None => {
+                            self.tokens.push_back(Err(TokenizeError::UnclosedString));
+                            return;
+                        },
+                    }
+                    _ => token.push(c),
+                }
+            }
+            self.tokens.push_back(Ok(Token::Constant(Value::String(token))));
+            self.tokenize(iter)
+        } else if c.is_whitespace() {
+            self.tokenize(iter)
+        } else {
+            self.tokens.push_back(Err(TokenizeError::InvalidCharacter(c)));
+        }
+    }
 }

 impl std::str::FromStr for Tokenizer<Cursor<String>> {
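
Each call to tokenize classifies the next character, greedily consumes the run it starts (identifier/number, operator cluster, comment, or string literal), queues the parsed result, and tail-recurses on the remainder of the line. A rough usage sketch, assuming Token implements Debug (the input program here is illustrative):

    use std::str::FromStr;

    let tok = Tokenizer::from_str("x <- 40 + 2 # trailing comment").unwrap();
    for result in tok {
        match result {
            Ok(token) => println!("{token:?}"),         // queued by tokenize()
            Err(e) => eprintln!("tokenize error: {e}"), // e.g. InvalidCharacter
        }
    }
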
@@ -175,21 +244,18 @@ impl<R: BufRead> std::iter::Iterator for Tokenizer<R> {
     fn next(&mut self) -> Option<Self::Item> {
         if let Some(token) = self.tokens.pop_front() {
-            return Some(Ok(token));
+            return Some(token);
         }

         let mut input = String::new();
         match self.reader.read_line(&mut input) {
             Ok(0) => None,
+            Ok(_n) => {
+                self.tokenize(input.chars().peekable());
+                self.next()
+            },
             Err(e) => Some(Err(TokenizeError::IO(e))),
-            _ => {
-                let mut buffer = String::new();
-                for c in input.chars() {
-                }
-            }
         }
     }
 }
@@ -201,7 +267,7 @@ mod tests {
     #[test]
     fn tokenizer() {
-        let program = "\"hello\nworld\"";
+        let program = ": function x ** x 2 function 1200";
         let tok = Tokenizer::from_str(program).unwrap();
         let tokens: Vec<Token> = tok.collect::<Result<_, TokenizeError>>().expect("tokenizer error");
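
With string handling moved out of this test program, a companion test could pin down the new string and error paths; a hedged sketch, assuming the behavior of tokenize above:

    #[test]
    fn unclosed_string_is_an_error() {
        // a raw newline inside a string literal should surface UnclosedString
        let tok = Tokenizer::from_str("\"hello\nworld\"").unwrap();
        let result: Result<Vec<Token>, TokenizeError> = tok.collect();
        assert!(matches!(result, Err(TokenizeError::UnclosedString)));
    }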