major tokenizer fixes

commit 92f6d43fa0
parent ea61007301
Date:   2024-10-15 16:29:51 -04:00


@@ -1,6 +1,6 @@
 use std::iter::Peekable;
 use std::{error, io};
-use std::collections::VecDeque;
+use std::collections::{VecDeque, HashMap};
 use super::Value;
 use std::fmt::{Display, Formatter};
@@ -77,69 +77,41 @@ pub(crate) enum Token {
     Constant(Value),
 }
 
-fn get_dot_count(s: &str) -> Option<usize> {
-    s.chars().fold(Some(0), |acc, c|
-        match c {
-            ':' => acc.map(|acc| acc + 2),
-            '.' => acc.map(|acc| acc + 1),
-            _ => None,
-        }
-    )
+fn get_dot_count<I: Iterator<Item = char>>(s: &mut Peekable<I>) -> Option<usize> {
+    let mut total = 0;
+
+    while let Some(n) = s.next_if(|&c| c == ':' || c == '.').map(|c| match c {
+        ':' => 2,
+        '.' => 1,
+        _ => 0,
+    }) {
+        total += n;
+    }
+
+    Some(total)
 }
 
 impl Token {
     /// Parse a single token
     fn parse(s: &str) -> Result<Self, TokenizeError> {
-        let string = regex::Regex::new(r#"".+""#).map_err(|e| TokenizeError::Regex(e))?;
         let identifier = regex::Regex::new(r#"[A-Za-z_][A-Za-z0-9_']*"#).map_err(|e| TokenizeError::Regex(e))?;
         let number = regex::Regex::new(r#"([0-9]+\.?[0-9]*)|(\.[0-9])"#).map_err(|e| TokenizeError::Regex(e))?;
 
         match s {
-            // First check if s is an operator
-            "+" => Ok(Token::Operator(Op::Add)),
-            "-" => Ok(Token::Operator(Op::Sub)),
-            "*" => Ok(Token::Operator(Op::Mul)),
-            "/" => Ok(Token::Operator(Op::Div)),
-            "**" => Ok(Token::Operator(Op::Exp)),
-            "%" => Ok(Token::Operator(Op::Mod)),
-            "=" => Ok(Token::Operator(Op::Equ)),
-            "." => Ok(Token::Operator(Op::LazyEqu)),
-            "~" => Ok(Token::Operator(Op::Compose)),
-            "," => Ok(Token::Operator(Op::Id)),
-            "?" => Ok(Token::Operator(Op::If)),
-            "??" => Ok(Token::Operator(Op::IfElse)),
-            ">" => Ok(Token::Operator(Op::GreaterThan)),
-            "<" => Ok(Token::Operator(Op::LessThan)),
-            ">=" => Ok(Token::Operator(Op::GreaterThanOrEqualTo)),
-            "<=" => Ok(Token::Operator(Op::LessThanOrEqualTo)),
-            "==" => Ok(Token::Operator(Op::EqualTo)),
-            "[" => Ok(Token::Operator(Op::OpenArray)),
-            "]" => Ok(Token::Operator(Op::CloseArray)),
-            // then some keywords
+            // Match keywords first
             "true" => Ok(Token::Constant(Value::Bool(true))),
             "false" => Ok(Token::Constant(Value::Bool(false))),
             "not" => Ok(Token::Operator(Op::Not)),
+            // Type casting
             "int" => Ok(Token::Operator(Op::IntCast)),
             "float" => Ok(Token::Operator(Op::FloatCast)),
             "bool" => Ok(Token::Operator(Op::BoolCast)),
             "string" => Ok(Token::Operator(Op::StringCast)),
+            // misc
             "print" => Ok(Token::Operator(Op::Print)),
             "empty" => Ok(Token::Operator(Op::Empty)),
-            // then variable length keywords
+            // then identifiers and numbers
             _ => {
-                if s.starts_with(":") {
-                    Ok(Token::Operator(Op::FunctionDeclare(
-                        get_dot_count(s).map(|x| x - 1).ok_or(TokenizeError::InvalidDynamicOperator(s.to_string()))?
-                    )))
-                } else if string.is_match(s) {
-                    Ok(Token::Constant(Value::String(s[1..s.len() - 1].to_string())))
-                } else if identifier.is_match(s) {
+                if identifier.is_match(s) {
                     Ok(Token::Identifier(s.to_string()))
                 } else if number.is_match(s) {
                     if let Ok(int) = s.parse::<i64>() {
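
Note on the parse() changes: string literals and the variable-length `:`/`.` operator are no longer handled here, and get_dot_count now counts dots straight off the live character stream instead of a finished slice, stopping at the first character that is neither ':' nor '.'. A minimal standalone check of that behavior (the function body is lifted from this diff; the main() harness is only illustrative):

    use std::iter::Peekable;

    fn get_dot_count<I: Iterator<Item = char>>(s: &mut Peekable<I>) -> Option<usize> {
        let mut total = 0;

        while let Some(n) = s.next_if(|&c| c == ':' || c == '.').map(|c| match c {
            ':' => 2,
            '.' => 1,
            _ => 0,
        }) {
            total += n;
        }

        Some(total)
    }

    fn main() {
        let mut it = ":..x".chars().peekable();
        assert_eq!(get_dot_count(&mut it), Some(4)); // ':' counts 2, each '.' counts 1
        assert_eq!(it.next(), Some('x'));            // stops without consuming 'x'
    }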
@@ -173,7 +145,28 @@ impl<R: BufRead> Tokenizer<R> {
     /// Tokenizes more input and adds them to the internal queue
     fn tokenize<I: Iterator<Item = char>>(&mut self, mut iter: Peekable<I>) {
-        const OPERATOR_CHARS: &'static str = "!@$%^&*()-=+[]{}|;:,<.>/?";
+        let operators: HashMap<&'static str, Op> = HashMap::from([
+            ("+", Op::Add),
+            ("-", Op::Sub),
+            ("*", Op::Mul),
+            ("/", Op::Div),
+            ("**", Op::Exp),
+            ("%", Op::Mod),
+            ("=", Op::Equ),
+            (".", Op::LazyEqu),
+            (":", Op::FunctionDeclare(1)),
+            ("~", Op::Compose),
+            (",", Op::Id),
+            ("?", Op::If),
+            ("??", Op::IfElse),
+            (">", Op::GreaterThan),
+            ("<", Op::LessThan),
+            (">=", Op::GreaterThanOrEqualTo),
+            ("<=", Op::LessThanOrEqualTo),
+            ("==", Op::EqualTo),
+            ("[", Op::OpenArray),
+            ("]", Op::CloseArray),
+        ]);
 
         let c = if let Some(c) = iter.next() {
             c
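
The one-off OPERATOR_CHARS string gives way to an explicit operator table, which is what makes prefix-based matching possible in the loop added further down. A rough sketch of the prefix filtering the table enables (u32 stands in for Op here, since Op's full definition is not part of this diff):

    use std::collections::HashMap;

    fn main() {
        // stand-in table: u32 replaces Op for illustration
        let operators: HashMap<&'static str, u32> =
            HashMap::from([("*", 1), ("**", 2), ("<", 3), ("<=", 4)]);

        // after reading '*', both "*" and "**" are still live candidates,
        // which is what lets the tokenizer tell Mul from Exp one char later
        let live: Vec<_> = operators.keys().filter(|k| k.starts_with('*')).collect();
        assert_eq!(live.len(), 2);
    }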
@@ -188,19 +181,9 @@ impl<R: BufRead> Tokenizer<R> {
                 token.push(c);
             }
 
-            self.tokens.push_back(Token::parse(&token));
-            self.tokenize(iter)
-        } else if OPERATOR_CHARS.contains(c) {
-            let mut token = String::from(c);
-
-            while let Some(c) = iter.next_if(|&c| OPERATOR_CHARS.contains(c)) {
-                token.push(c);
-            }
-
             self.tokens.push_back(Token::parse(&token));
             self.tokenize(iter)
         } else if c == '#' {
-            // consume comments
             let _: String = iter.by_ref().take_while(|&c| c != '\n').collect();
         } else if c == '\"' {
             let mut token = String::new();
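
With the operator arms removed from Token::parse above, the old OPERATOR_CHARS branch could only have produced parse errors, so it is deleted rather than rewritten. The retained `#` branch relies on take_while also discarding the newline that ends the comment; a small illustrative snippet (not from the diff):

    fn main() {
        let mut it = "# a comment\nnext".chars();
        assert_eq!(it.next(), Some('#'));

        // same pattern as the '#' branch above
        let _: String = it.by_ref().take_while(|&c| c != '\n').collect();

        // take_while also consumed the '\n' that failed its predicate
        assert_eq!(it.next(), Some('n'));
    }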
@@ -229,11 +212,64 @@ impl<R: BufRead> Tokenizer<R> {
             }
 
             self.tokens.push_back(Ok(Token::Constant(Value::String(token))));
+            self.tokenize(iter)
+        } else if operators.keys().any(|x| x.starts_with(c)) {
+            let mut token = String::from(c);
+
+            loop {
+                let possible: HashMap<&'static str, Op> = operators
+                    .clone().into_iter()
+                    .filter(|(key, _)| key.starts_with(&token))
+                    .collect();
+
+                let is_expected = |c: &char|
+                    possible.iter().any(|(op, _)| match op.chars().nth(token.len()) {
+                        Some(i) => *c == i,
+                        None => false,
+                    });
+
+                match possible.len() {
+                    1 => {
+                        self.tokens.push_back(Ok(Token::Operator(match possible.get(token.as_str()).unwrap().clone() {
+                            Op::FunctionDeclare(n) => {
+                                let count = match get_dot_count(&mut iter) {
+                                    Some(count) => count,
+                                    None => {
+                                        self.tokens.push_back(Err(TokenizeError::InvalidDynamicOperator(token)));
+                                        return;
+                                    }
+                                };
+
+                                Op::FunctionDeclare(n + count)
+                            }
+                            op => op,
+                        })));
+
+                        break;
+                    }
+                    0 => {
+                        self.tokens.push_back(Err(TokenizeError::InvalidCharacter(c)));
+                        return;
+                    }
+                    _ => {
+                        let next = match iter.next_if(is_expected) {
+                            Some(c) => c,
+                            None => {
+                                // at this point, token must be in the hashmap possible, otherwise it wouldn't have any matches
+                                self.tokens.push_back(Ok(Token::Operator(possible.get(token.as_str()).unwrap().clone())));
+                                break;
+                            }
+                        };
+
+                        token.push(next);
+                    }
+                }
+            }
+
             self.tokenize(iter)
         } else if c.is_whitespace() {
             self.tokenize(iter)
         } else {
             self.tokens.push_back(Err(TokenizeError::InvalidCharacter(c)));
+            return;
         }
     }
 }
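
The added branch is a longest-match (maximal-munch) scanner: it keeps extending the token while more than one table entry remains a viable prefix, emits the operator once the candidate set collapses to an exact match, and reports InvalidCharacter when no candidate is left; `:` additionally pulls in trailing dots via get_dot_count. A condensed sketch of the same idea over a bare key list (simplified: no Op values, no FunctionDeclare handling, and it assumes, like the real table, that every ambiguous prefix is itself an operator):

    use std::iter::Peekable;

    // a slice of the operator keys from the table above
    const KEYS: [&str; 4] = ["*", "**", "<", "<="];

    fn longest_match<I: Iterator<Item = char>>(first: char, iter: &mut Peekable<I>) -> Option<String> {
        let mut token = String::from(first);
        loop {
            // operators the current token could still grow into
            let possible: Vec<&&str> = KEYS.iter().filter(|k| k.starts_with(&token)).collect();
            match possible.len() {
                0 => return None, // no operator starts with this text
                1 if *possible[0] == token => return Some(token), // unambiguous, fully matched
                _ => {
                    // accept the next char only if it keeps some candidate alive
                    let expected = |c: &char| possible.iter().any(|k| k.chars().nth(token.len()) == Some(*c));
                    match iter.next_if(expected) {
                        Some(c) => token.push(c),
                        None => return Some(token), // input diverged; token is itself an operator in this table
                    }
                }
            }
        }
    }

    fn main() {
        let mut it = "*2".chars().peekable();
        let c = it.next().unwrap();
        assert_eq!(longest_match(c, &mut it).as_deref(), Some("*"));  // '2' cannot extend '*'

        let mut it = "**2".chars().peekable();
        let c = it.next().unwrap();
        assert_eq!(longest_match(c, &mut it).as_deref(), Some("**")); // maximal munch: Exp, not Mul twice
    }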