major tokenizer fixes
This commit is contained in:
146
src/tokenizer.rs
146
src/tokenizer.rs
@@ -1,6 +1,6 @@
|
|||||||
use std::iter::Peekable;
|
use std::iter::Peekable;
|
||||||
use std::{error, io};
|
use std::{error, io};
|
||||||
use std::collections::VecDeque;
|
use std::collections::{VecDeque, HashMap};
|
||||||
|
|
||||||
use super::Value;
|
use super::Value;
|
||||||
use std::fmt::{Display, Formatter};
|
use std::fmt::{Display, Formatter};
|
||||||
@@ -77,69 +77,41 @@ pub(crate) enum Token {
|
|||||||
Constant(Value),
|
Constant(Value),
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_dot_count(s: &str) -> Option<usize> {
|
fn get_dot_count<I: Iterator<Item = char>>(s: &mut Peekable<I>) -> Option<usize> {
|
||||||
s.chars().fold(Some(0), |acc, c|
|
let mut total = 0;
|
||||||
match c {
|
|
||||||
':' => acc.map(|acc| acc + 2),
|
while let Some(n) = s.next_if(|&c| c == ':' || c == '.').map(|c| match c {
|
||||||
'.' => acc.map(|acc| acc + 1),
|
':' => 2,
|
||||||
_ => None,
|
'.' => 1,
|
||||||
}
|
_ => 0,
|
||||||
)
|
}) {
|
||||||
|
total += n;
|
||||||
|
}
|
||||||
|
|
||||||
|
Some(total)
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Token {
|
impl Token {
|
||||||
/// Parse a single token
|
/// Parse a single token
|
||||||
fn parse(s: &str) -> Result<Self, TokenizeError> {
|
fn parse(s: &str) -> Result<Self, TokenizeError> {
|
||||||
let string = regex::Regex::new(r#"".+""#).map_err(|e| TokenizeError::Regex(e))?;
|
|
||||||
let identifier = regex::Regex::new(r#"[A-Za-z_][A-Za-z0-9_']*"#).map_err(|e| TokenizeError::Regex(e))?;
|
let identifier = regex::Regex::new(r#"[A-Za-z_][A-Za-z0-9_']*"#).map_err(|e| TokenizeError::Regex(e))?;
|
||||||
let number = regex::Regex::new(r#"([0-9]+\.?[0-9]*)|(\.[0-9])"#).map_err(|e| TokenizeError::Regex(e))?;
|
let number = regex::Regex::new(r#"([0-9]+\.?[0-9]*)|(\.[0-9])"#).map_err(|e| TokenizeError::Regex(e))?;
|
||||||
|
|
||||||
match s {
|
match s {
|
||||||
// First check if s is an operator
|
// Match keywords first
|
||||||
"+" => Ok(Token::Operator(Op::Add)),
|
|
||||||
"-" => Ok(Token::Operator(Op::Sub)),
|
|
||||||
"*" => Ok(Token::Operator(Op::Mul)),
|
|
||||||
"/" => Ok(Token::Operator(Op::Div)),
|
|
||||||
"**" => Ok(Token::Operator(Op::Exp)),
|
|
||||||
"%" => Ok(Token::Operator(Op::Mod)),
|
|
||||||
"=" => Ok(Token::Operator(Op::Equ)),
|
|
||||||
"." => Ok(Token::Operator(Op::LazyEqu)),
|
|
||||||
"~" => Ok(Token::Operator(Op::Compose)),
|
|
||||||
"," => Ok(Token::Operator(Op::Id)),
|
|
||||||
"?" => Ok(Token::Operator(Op::If)),
|
|
||||||
"??" => Ok(Token::Operator(Op::IfElse)),
|
|
||||||
">" => Ok(Token::Operator(Op::GreaterThan)),
|
|
||||||
"<" => Ok(Token::Operator(Op::LessThan)),
|
|
||||||
">=" => Ok(Token::Operator(Op::GreaterThanOrEqualTo)),
|
|
||||||
"<=" => Ok(Token::Operator(Op::LessThanOrEqualTo)),
|
|
||||||
"==" => Ok(Token::Operator(Op::EqualTo)),
|
|
||||||
"[" => Ok(Token::Operator(Op::OpenArray)),
|
|
||||||
"]" => Ok(Token::Operator(Op::CloseArray)),
|
|
||||||
|
|
||||||
// then some keywords
|
|
||||||
"true" => Ok(Token::Constant(Value::Bool(true))),
|
"true" => Ok(Token::Constant(Value::Bool(true))),
|
||||||
"false" => Ok(Token::Constant(Value::Bool(false))),
|
"false" => Ok(Token::Constant(Value::Bool(false))),
|
||||||
"not" => Ok(Token::Operator(Op::Not)),
|
"not" => Ok(Token::Operator(Op::Not)),
|
||||||
|
|
||||||
// Type casting
|
|
||||||
"int" => Ok(Token::Operator(Op::IntCast)),
|
"int" => Ok(Token::Operator(Op::IntCast)),
|
||||||
"float" => Ok(Token::Operator(Op::FloatCast)),
|
"float" => Ok(Token::Operator(Op::FloatCast)),
|
||||||
"bool" => Ok(Token::Operator(Op::BoolCast)),
|
"bool" => Ok(Token::Operator(Op::BoolCast)),
|
||||||
"string" => Ok(Token::Operator(Op::StringCast)),
|
"string" => Ok(Token::Operator(Op::StringCast)),
|
||||||
|
|
||||||
// misc
|
|
||||||
"print" => Ok(Token::Operator(Op::Print)),
|
"print" => Ok(Token::Operator(Op::Print)),
|
||||||
"empty" => Ok(Token::Operator(Op::Empty)),
|
"empty" => Ok(Token::Operator(Op::Empty)),
|
||||||
|
|
||||||
// then variable length keywords
|
// then identifiers and numbers
|
||||||
_ => {
|
_ => {
|
||||||
if s.starts_with(":") {
|
if identifier.is_match(s) {
|
||||||
Ok(Token::Operator(Op::FunctionDeclare(
|
|
||||||
get_dot_count(s).map(|x| x - 1).ok_or(TokenizeError::InvalidDynamicOperator(s.to_string()))?
|
|
||||||
)))
|
|
||||||
} else if string.is_match(s) {
|
|
||||||
Ok(Token::Constant(Value::String(s[1..s.len() - 1].to_string())))
|
|
||||||
} else if identifier.is_match(s) {
|
|
||||||
Ok(Token::Identifier(s.to_string()))
|
Ok(Token::Identifier(s.to_string()))
|
||||||
} else if number.is_match(s) {
|
} else if number.is_match(s) {
|
||||||
if let Ok(int) = s.parse::<i64>() {
|
if let Ok(int) = s.parse::<i64>() {
|
||||||
@@ -173,7 +145,28 @@ impl<R: BufRead> Tokenizer<R> {
|
|||||||
|
|
||||||
/// Tokenizes more input and adds them to the internal queue
|
/// Tokenizes more input and adds them to the internal queue
|
||||||
fn tokenize<I: Iterator<Item = char>>(&mut self, mut iter: Peekable<I>) {
|
fn tokenize<I: Iterator<Item = char>>(&mut self, mut iter: Peekable<I>) {
|
||||||
const OPERATOR_CHARS: &'static str = "!@$%^&*()-=+[]{}|;:,<.>/?";
|
let operators: HashMap<&'static str, Op> = HashMap::from([
|
||||||
|
("+", Op::Add),
|
||||||
|
("-", Op::Sub),
|
||||||
|
("*", Op::Mul),
|
||||||
|
("/", Op::Div),
|
||||||
|
("**", Op::Exp),
|
||||||
|
("%", Op::Mod),
|
||||||
|
("=", Op::Equ),
|
||||||
|
(".", Op::LazyEqu),
|
||||||
|
(":", Op::FunctionDeclare(1)),
|
||||||
|
("~", Op::Compose),
|
||||||
|
(",", Op::Id),
|
||||||
|
("?", Op::If),
|
||||||
|
("??", Op::IfElse),
|
||||||
|
(">", Op::GreaterThan),
|
||||||
|
("<", Op::LessThan),
|
||||||
|
(">=", Op::GreaterThanOrEqualTo),
|
||||||
|
("<=", Op::LessThanOrEqualTo),
|
||||||
|
("==", Op::EqualTo),
|
||||||
|
("[", Op::OpenArray),
|
||||||
|
("]", Op::CloseArray),
|
||||||
|
]);
|
||||||
|
|
||||||
let c = if let Some(c) = iter.next() {
|
let c = if let Some(c) = iter.next() {
|
||||||
c
|
c
|
||||||
@@ -188,19 +181,9 @@ impl<R: BufRead> Tokenizer<R> {
|
|||||||
token.push(c);
|
token.push(c);
|
||||||
}
|
}
|
||||||
|
|
||||||
self.tokens.push_back(Token::parse(&token));
|
|
||||||
self.tokenize(iter)
|
|
||||||
} else if OPERATOR_CHARS.contains(c) {
|
|
||||||
let mut token = String::from(c);
|
|
||||||
|
|
||||||
while let Some(c) = iter.next_if(|&c| OPERATOR_CHARS.contains(c)) {
|
|
||||||
token.push(c);
|
|
||||||
}
|
|
||||||
|
|
||||||
self.tokens.push_back(Token::parse(&token));
|
self.tokens.push_back(Token::parse(&token));
|
||||||
self.tokenize(iter)
|
self.tokenize(iter)
|
||||||
} else if c == '#' {
|
} else if c == '#' {
|
||||||
// consume comments
|
|
||||||
let _: String = iter.by_ref().take_while(|&c| c != '\n').collect();
|
let _: String = iter.by_ref().take_while(|&c| c != '\n').collect();
|
||||||
} else if c == '\"' {
|
} else if c == '\"' {
|
||||||
let mut token = String::new();
|
let mut token = String::new();
|
||||||
@@ -229,11 +212,64 @@ impl<R: BufRead> Tokenizer<R> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
self.tokens.push_back(Ok(Token::Constant(Value::String(token))));
|
self.tokens.push_back(Ok(Token::Constant(Value::String(token))));
|
||||||
|
self.tokenize(iter)
|
||||||
|
} else if operators.keys().any(|x| x.starts_with(c)) {
|
||||||
|
let mut token = String::from(c);
|
||||||
|
|
||||||
|
loop {
|
||||||
|
let possible: HashMap<&'static str, Op> = operators
|
||||||
|
.clone().into_iter()
|
||||||
|
.filter(|(key, _)| key.starts_with(&token))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let is_expected = |c: &char|
|
||||||
|
possible.iter().any(|(op, _)| match op.chars().nth(token.len()) {
|
||||||
|
Some(i) => *c == i,
|
||||||
|
None => false,
|
||||||
|
});
|
||||||
|
|
||||||
|
match possible.len() {
|
||||||
|
1 => {
|
||||||
|
self.tokens.push_back(Ok(Token::Operator(match possible.get(token.as_str()).unwrap().clone() {
|
||||||
|
Op::FunctionDeclare(n) => {
|
||||||
|
let count = match get_dot_count(&mut iter) {
|
||||||
|
Some(count) => count,
|
||||||
|
None => {
|
||||||
|
self.tokens.push_back(Err(TokenizeError::InvalidDynamicOperator(token)));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
Op::FunctionDeclare(n + count)
|
||||||
|
}
|
||||||
|
op => op,
|
||||||
|
})));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
0 => {
|
||||||
|
self.tokens.push_back(Err(TokenizeError::InvalidCharacter(c)));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
let next = match iter.next_if(is_expected) {
|
||||||
|
Some(c) => c,
|
||||||
|
None => {
|
||||||
|
// at this point, token must be in the hashmap `possible`, otherwise it wouldn't have any matches
|
||||||
|
self.tokens.push_back(Ok(Token::Operator(possible.get(token.as_str()).unwrap().clone())));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
token.push(next);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
self.tokenize(iter)
|
self.tokenize(iter)
|
||||||
} else if c.is_whitespace() {
|
} else if c.is_whitespace() {
|
||||||
self.tokenize(iter)
|
self.tokenize(iter)
|
||||||
} else {
|
} else {
|
||||||
self.tokens.push_back(Err(TokenizeError::InvalidCharacter(c)));
|
self.tokens.push_back(Err(TokenizeError::InvalidCharacter(c)));
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user