diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index f9cd95a..c29d1e5 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -11,7 +11,9 @@ pub enum TokenizeError {
     InvalidNumericConstant(String),
     InvalidIdentifier(String),
     UnableToMatchToken(String),
+    UnclosedString,
     IO(io::Error),
+    Regex(regex::Error),
 }
 
 impl Display for TokenizeError {
@@ -25,7 +27,9 @@ impl Display for TokenizeError {
                 => write!(f, "invalid identifier `{ident}`"),
             TokenizeError::UnableToMatchToken(token)
                 => write!(f, "the token `{token}` was unable to be parsed"),
-            TokenizeError::IO(io) => write!(f, "{io}")
+            TokenizeError::UnclosedString => write!(f, "newline was found before string was closed"),
+            TokenizeError::IO(io) => write!(f, "{io}"),
+            TokenizeError::Regex(re) => write!(f, "{re}"),
         }
     }
 }
@@ -76,70 +80,71 @@ fn get_dot_count(s: &str) -> Option<usize> {
     )
 }
 
-fn valid_identifier(c: char) -> bool {
-    c.is_alphanumeric() || c == '\'' || c == '_'
-}
-
 impl Token {
+    /// Parse a single token
     fn parse(s: &str) -> Result<Self, TokenizeError> {
-        let string = regex::Regex::new(r#"".+""#).expect("LOL!");
+        // Anchored so a token must match in full, not merely contain a match
+        let string = regex::Regex::new(r#"^".+"$"#).map_err(TokenizeError::Regex)?;
+        let identifier = regex::Regex::new(r#"^[A-Za-z_][A-Za-z0-9_']*$"#).map_err(TokenizeError::Regex)?;
+        let number = regex::Regex::new(r#"^-?(([0-9]+\.?[0-9]*)|(\.[0-9]+))$"#).map_err(TokenizeError::Regex)?;
 
         if string.is_match(s) {
-            return Ok(Token::Constant(Value::String(s[1..s.len() - 1].to_string())));
-        }
-
-        match s {
-            // First check if s is an operator
-            "+" => Ok(Token::Operator(Op::Add)),
-            "-" => Ok(Token::Operator(Op::Sub)),
-            "*" => Ok(Token::Operator(Op::Mul)),
-            "/" => Ok(Token::Operator(Op::Div)),
-            "**" => Ok(Token::Operator(Op::Exp)),
-            "%" => Ok(Token::Operator(Op::Mod)),
-            "=" => Ok(Token::Operator(Op::Equ)),
-            "." => Ok(Token::Operator(Op::LazyEqu)),
-            "~" => Ok(Token::Operator(Op::Compose)),
-            "," => Ok(Token::Operator(Op::Id)),
-            "?" => Ok(Token::Operator(Op::If)),
-            "??" => Ok(Token::Operator(Op::IfElse)),
-            ">" => Ok(Token::Operator(Op::GreaterThan)),
-            "<" => Ok(Token::Operator(Op::LessThan)),
-            ">=" => Ok(Token::Operator(Op::GreaterThanOrEqualTo)),
-            "<=" => Ok(Token::Operator(Op::LessThanOrEqualTo)),
-            "==" => Ok(Token::Operator(Op::EqualTo)),
-
-            // then some keywords
-            "true" => Ok(Token::Constant(Value::Bool(true))),
-            "false" => Ok(Token::Constant(Value::Bool(false))),
-            "not" => Ok(Token::Operator(Op::Not)),
-
-            // Type casting
-            "int" => Ok(Token::Operator(Op::IntCast)),
-            "float" => Ok(Token::Operator(Op::FloatCast)),
-            "bool" => Ok(Token::Operator(Op::BoolCast)),
-            "string" => Ok(Token::Operator(Op::StringCast)),
-
-            // then variable length keywords, constants, and identifiers
-            _ => {
-                if s.starts_with(':') {
-                    Ok(Token::Operator(Op::FunctionDeclare(
-                        get_dot_count(s).map(|x| x - 1).ok_or(TokenizeError::InvalidDynamicOperator(s.to_string()))?
-                    )))
-                } else if s.starts_with(|c| char::is_digit(c, 10) || c == '-') {
-                    if let Ok(int) = s.parse::<i64>() {
-                        Ok(Token::Constant(Value::Int(int)))
-                    } else if let Ok(float) = s.parse::<f64>() {
-                        Ok(Token::Constant(Value::Float(float)))
-                    } else {
-                        Err(TokenizeError::InvalidNumericConstant(s.to_string()))
-                    }
-                } else if s.starts_with(valid_identifier) {
-                    let valid = s.chars().skip(1).all(valid_identifier);
-                    valid.then(|| Token::Identifier(s.to_string())).ok_or(TokenizeError::InvalidIdentifier(s.to_string()))
-                } else {
-                    Err(TokenizeError::UnableToMatchToken(s.to_string()))
-                }
-            }
-        }
+            Ok(Token::Constant(Value::String(s[1..s.len() - 1].to_string())))
+        } else if number.is_match(s) {
+            // Value::Int and Value::Float are assumed to wrap i64 and f64
+            if let Ok(int) = s.parse::<i64>() {
+                Ok(Token::Constant(Value::Int(int)))
+            } else if let Ok(float) = s.parse::<f64>() {
+                Ok(Token::Constant(Value::Float(float)))
+            } else {
+                Err(TokenizeError::InvalidNumericConstant(s.to_string()))
+            }
+        } else {
+            match s {
+                // First check if s is an operator
+                "+" => Ok(Token::Operator(Op::Add)),
+                "-" => Ok(Token::Operator(Op::Sub)),
+                "*" => Ok(Token::Operator(Op::Mul)),
+                "/" => Ok(Token::Operator(Op::Div)),
+                "**" => Ok(Token::Operator(Op::Exp)),
+                "%" => Ok(Token::Operator(Op::Mod)),
+                "=" => Ok(Token::Operator(Op::Equ)),
+                "." => Ok(Token::Operator(Op::LazyEqu)),
+                "~" => Ok(Token::Operator(Op::Compose)),
+                "," => Ok(Token::Operator(Op::Id)),
+                "?" => Ok(Token::Operator(Op::If)),
+                "??" => Ok(Token::Operator(Op::IfElse)),
+                ">" => Ok(Token::Operator(Op::GreaterThan)),
+                "<" => Ok(Token::Operator(Op::LessThan)),
+                ">=" => Ok(Token::Operator(Op::GreaterThanOrEqualTo)),
+                "<=" => Ok(Token::Operator(Op::LessThanOrEqualTo)),
+                "==" => Ok(Token::Operator(Op::EqualTo)),
+
+                // then some keywords, matched before the identifier regex
+                // so they never tokenize as identifiers
+                "true" => Ok(Token::Constant(Value::Bool(true))),
+                "false" => Ok(Token::Constant(Value::Bool(false))),
+                "not" => Ok(Token::Operator(Op::Not)),
+
+                // Type casting
+                "int" => Ok(Token::Operator(Op::IntCast)),
+                "float" => Ok(Token::Operator(Op::FloatCast)),
+                "bool" => Ok(Token::Operator(Op::BoolCast)),
+                "string" => Ok(Token::Operator(Op::StringCast)),
+
+                // then variable length operators and identifiers
+                _ => {
+                    if s.starts_with(':') {
+                        Ok(Token::Operator(Op::FunctionDeclare(
+                            get_dot_count(s).map(|x| x - 1).ok_or(TokenizeError::InvalidDynamicOperator(s.to_string()))?
+                        )))
+                    } else if identifier.is_match(s) {
+                        Ok(Token::Identifier(s.to_string()))
+                    } else {
+                        Err(TokenizeError::UnableToMatchToken(s.to_string()))
+                    }
+                }
+            }
+        }
     }
 }
@@ -178,21 +183,83 @@ impl std::iter::Iterator for Tokenizer {
 
         let mut input = String::new();
 
-        match self.reader.read_to_string(&mut input) {
+        match self.reader.read_line(&mut input) {
             Ok(0) => None,
             Err(e) => Some(Err(TokenizeError::IO(e))),
             _ => {
-                let re = regex::Regex::new(r#"[a-zA-Z0-9\.'_]+|[`~!@#\$%\^&\*\(\)\+-=\[\]\{\}\\|;:,<\.>/\?]+|("[^"]+")"#).expect("This wont fail promise :3");
-
-                for token in re.find_iter(input.as_str()).map(|mat| mat.as_str()).map(Token::parse) {
-                    match token {
-                        Ok(token) => self.tokens.push_back(token),
-                        Err(e) => return Some(Err(e)),
-                    }
-                }
-
-                self.tokens.pop_front().map(|x| Ok(x))
+                // Minimal hand-rolled scan, assuming whitespace-separated
+                // tokens: read_line hands us one line at a time, so a
+                // newline inside a string literal can be reported here.
+                let mut buffer = String::new();
+                let mut in_string = false;
+
+                for c in input.chars() {
+                    if in_string && c == '\n' {
+                        return Some(Err(TokenizeError::UnclosedString));
+                    } else if c == '"' {
+                        in_string = !in_string;
+                        buffer.push(c);
+                    } else if c.is_whitespace() && !in_string {
+                        if !buffer.is_empty() {
+                            match Token::parse(&buffer) {
+                                Ok(token) => self.tokens.push_back(token),
+                                Err(e) => return Some(Err(e)),
+                            }
+                            buffer.clear();
+                        }
+                    } else {
+                        buffer.push(c);
+                    }
+                }
+
+                // a string still open here hit end of input without its
+                // closing quote; anything left in the buffer is a final token
+                if in_string {
+                    return Some(Err(TokenizeError::UnclosedString));
+                }
+
+                if !buffer.is_empty() {
+                    match Token::parse(&buffer) {
+                        Ok(token) => self.tokens.push_back(token),
+                        Err(e) => return Some(Err(e)),
+                    }
+                }
+
+                match self.tokens.pop_front() {
+                    Some(token) => Some(Ok(token)),
+                    // a blank line yields no tokens; move on to the next line
+                    None => self.next(),
+                }
             }
         }
     }
-}
\ No newline at end of file
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::str::FromStr;
+
+    #[test]
+    fn tokenizer() {
+        let program = "\"hello\nworld\"";
+
+        let tok = Tokenizer::from_str(program).unwrap();
+        let result: Result<Vec<Token>, TokenizeError> = tok.collect();
+
+        // the newline before the string's closing quote must be reported
+        assert!(matches!(result, Err(TokenizeError::UnclosedString)));
+    }
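+
+    // A second, minimal spot check of Token::parse on its own. The sample
+    // tokens below are illustrative choices, one per branch of parse:
+    // operator, keyword, number, identifier, and an unmatchable token.
+    #[test]
+    fn parse_single_tokens() {
+        assert!(matches!(Token::parse("+"), Ok(Token::Operator(Op::Add))));
+        assert!(matches!(Token::parse("true"), Ok(Token::Constant(Value::Bool(true)))));
+        assert!(matches!(Token::parse("3.5"), Ok(Token::Constant(Value::Float(_)))));
+        assert!(matches!(Token::parse("x'"), Ok(Token::Identifier(_))));
+        assert!(Token::parse("@").is_err());
+    }
+}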