half-implement new tokenizer

This commit is contained in:
2024-10-14 19:32:51 -04:00
parent 4b2fefd798
commit cf13c4ef53

View File

@@ -11,7 +11,9 @@ pub enum TokenizeError {
InvalidNumericConstant(String), InvalidNumericConstant(String),
InvalidIdentifier(String), InvalidIdentifier(String),
UnableToMatchToken(String), UnableToMatchToken(String),
UnclosedString,
IO(io::Error), IO(io::Error),
Regex(regex::Error),
} }
impl Display for TokenizeError { impl Display for TokenizeError {
@@ -25,7 +27,9 @@ impl Display for TokenizeError {
=> write!(f, "invalid identifier `{ident}`"), => write!(f, "invalid identifier `{ident}`"),
TokenizeError::UnableToMatchToken(token) TokenizeError::UnableToMatchToken(token)
=> write!(f, "the token `{token}` was unable to be parsed"), => write!(f, "the token `{token}` was unable to be parsed"),
TokenizeError::IO(io) => write!(f, "{io}") TokenizeError::UnclosedString => write!(f, "newline was found before string was closed"),
TokenizeError::IO(io) => write!(f, "{io}"),
TokenizeError::Regex(re) => write!(f, "{re}"),
} }
} }
} }
@@ -76,68 +80,66 @@ fn get_dot_count(s: &str) -> Option<usize> {
) )
} }
/// Reports whether `c` is accepted as an identifier character: any
/// alphanumeric character, an apostrophe (`'`), or an underscore (`_`).
fn valid_identifier(c: char) -> bool {
    matches!(c, '\'' | '_') || c.is_alphanumeric()
}
impl Token { impl Token {
/// Parse a single token
fn parse(s: &str) -> Result<Self, TokenizeError> { fn parse(s: &str) -> Result<Self, TokenizeError> {
let string = regex::Regex::new(r#"".+""#).expect("LOL!"); let string = regex::Regex::new(r#"".+""#).map_err(|e| TokenizeError::Regex(e))?;
let identifier = regex::Regex::new(r#"[A-Za-z_][A-Za-z0-9_']*"#).map_err(|e| TokenizeError::Regex(e))?;
let number = regex::Regex::new(r#"([0-9]+\.?[0-9]*)|(\.[0-9])"#).map_err(|e| TokenizeError::Regex(e))?;
if string.is_match(s) { if string.is_match(s) {
return Ok(Token::Constant(Value::String(s[1..s.len() - 1].to_string()))); Ok(Token::Constant(Value::String(s[1..s.len() - 1].to_string())))
} } else if identifier.is_match(s) {
Ok(Token::Identifier(s.to_string()))
match s { } else if number.is_match(s) {
// First check if s is an operator if let Ok(int) = s.parse::<i64>() {
"+" => Ok(Token::Operator(Op::Add)), Ok(Token::Constant(Value::Int(int)))
"-" => Ok(Token::Operator(Op::Sub)), } else if let Ok(float) = s.parse::<f64>() {
"*" => Ok(Token::Operator(Op::Mul)), Ok(Token::Constant(Value::Float(float)))
"/" => Ok(Token::Operator(Op::Div)), } else {
"**" => Ok(Token::Operator(Op::Exp)), Err(TokenizeError::InvalidNumericConstant(s.to_string()))
"%" => Ok(Token::Operator(Op::Mod)), }
"=" => Ok(Token::Operator(Op::Equ)), } else {
"." => Ok(Token::Operator(Op::LazyEqu)), match s {
"~" => Ok(Token::Operator(Op::Compose)), // First check if s is an operator
"," => Ok(Token::Operator(Op::Id)), "+" => Ok(Token::Operator(Op::Add)),
"?" => Ok(Token::Operator(Op::If)), "-" => Ok(Token::Operator(Op::Sub)),
"??" => Ok(Token::Operator(Op::IfElse)), "*" => Ok(Token::Operator(Op::Mul)),
">" => Ok(Token::Operator(Op::GreaterThan)), "/" => Ok(Token::Operator(Op::Div)),
"<" => Ok(Token::Operator(Op::LessThan)), "**" => Ok(Token::Operator(Op::Exp)),
">=" => Ok(Token::Operator(Op::GreaterThanOrEqualTo)), "%" => Ok(Token::Operator(Op::Mod)),
"<=" => Ok(Token::Operator(Op::LessThanOrEqualTo)), "=" => Ok(Token::Operator(Op::Equ)),
"==" => Ok(Token::Operator(Op::EqualTo)), "." => Ok(Token::Operator(Op::LazyEqu)),
"~" => Ok(Token::Operator(Op::Compose)),
// then some keywords "," => Ok(Token::Operator(Op::Id)),
"true" => Ok(Token::Constant(Value::Bool(true))), "?" => Ok(Token::Operator(Op::If)),
"false" => Ok(Token::Constant(Value::Bool(false))), "??" => Ok(Token::Operator(Op::IfElse)),
"not" => Ok(Token::Operator(Op::Not)), ">" => Ok(Token::Operator(Op::GreaterThan)),
"<" => Ok(Token::Operator(Op::LessThan)),
// Type casting ">=" => Ok(Token::Operator(Op::GreaterThanOrEqualTo)),
"int" => Ok(Token::Operator(Op::IntCast)), "<=" => Ok(Token::Operator(Op::LessThanOrEqualTo)),
"float" => Ok(Token::Operator(Op::FloatCast)), "==" => Ok(Token::Operator(Op::EqualTo)),
"bool" => Ok(Token::Operator(Op::BoolCast)),
"string" => Ok(Token::Operator(Op::StringCast)), // then some keywords
"true" => Ok(Token::Constant(Value::Bool(true))),
// then variable length keywords, constants, and identifiers "false" => Ok(Token::Constant(Value::Bool(false))),
_ => { "not" => Ok(Token::Operator(Op::Not)),
if s.starts_with(':') {
Ok(Token::Operator(Op::FunctionDeclare( // Type casting
get_dot_count(s).map(|x| x - 1).ok_or(TokenizeError::InvalidDynamicOperator(s.to_string()))? "int" => Ok(Token::Operator(Op::IntCast)),
))) "float" => Ok(Token::Operator(Op::FloatCast)),
} else if s.starts_with(|c| char::is_digit(c, 10) || c == '-') { "bool" => Ok(Token::Operator(Op::BoolCast)),
if let Ok(int) = s.parse::<i64>() { "string" => Ok(Token::Operator(Op::StringCast)),
Ok(Token::Constant(Value::Int(int)))
} else if let Ok(float) = s.parse::<f64>() { // then variable length keywords
Ok(Token::Constant(Value::Float(float))) _ => {
if s.starts_with(":") {
Ok(Token::Operator(Op::FunctionDeclare(
get_dot_count(s).map(|x| x - 1).ok_or(TokenizeError::InvalidDynamicOperator(s.to_string()))?
)))
} else { } else {
Err(TokenizeError::InvalidNumericConstant(s.to_string())) Err(TokenizeError::UnableToMatchToken(s.to_string()))
} }
} else if s.starts_with(valid_identifier) {
let valid = s.chars().skip(1).all(valid_identifier);
valid.then(|| Token::Identifier(s.to_string())).ok_or(TokenizeError::InvalidIdentifier(s.to_string()))
} else {
Err(TokenizeError::UnableToMatchToken(s.to_string()))
} }
} }
} }
@@ -178,21 +180,32 @@ impl<R: BufRead> std::iter::Iterator for Tokenizer<R> {
let mut input = String::new(); let mut input = String::new();
match self.reader.read_to_string(&mut input) { match self.reader.read_line(&mut input) {
Ok(0) => None, Ok(0) => None,
Err(e) => Some(Err(TokenizeError::IO(e))), Err(e) => Some(Err(TokenizeError::IO(e))),
_ => { _ => {
let re = regex::Regex::new(r#"[a-zA-Z0-9\.'_]+|[`~!@#\$%\^&\*\(\)\+-=\[\]\{\}\\|;:,<\.>/\?]+|("[^"]+")"#).expect("This wont fail promise :3"); let mut buffer = String::new();
for c in input.chars() {
for token in re.find_iter(input.as_str()).map(|mat| mat.as_str()).map(Token::parse) {
match token {
Ok(token) => self.tokens.push_back(token),
Err(e) => return Some(Err(e)),
}
} }
self.tokens.pop_front().map(|x| Ok(x))
} }
} }
} }
} }
#[cfg(test)]
mod tests {
    use super::*;
    use std::str::FromStr;

    /// Smoke test: runs the tokenizer over a quoted string that contains an
    /// embedded newline, fails if any token yields an error, and prints the
    /// collected tokens for manual inspection.
    #[test]
    fn tokenizer() {
        let source = "\"hello\nworld\"";
        let tokenizer = Tokenizer::from_str(source).unwrap();

        // Collecting into a `Result` stops at the first tokenizer error.
        let collected: Result<Vec<Token>, TokenizeError> = tokenizer.collect();
        let tokens = collected.expect("tokenizer error");

        println!("{tokens:?}");
    }
}