add more useful error messages (only undefined identifier uses the new information)

2024-10-26 16:58:27 -04:00
parent 0c148ebb2d
commit ef283fd02f
4 changed files with 242 additions and 185 deletions

View File

@@ -95,7 +95,7 @@ impl Value {
 }
 
 impl Display for Value {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         match self {
             Self::Float(x) => write!(f, "{x}"),
             Self::Int(x) => write!(f, "{x}"),
@@ -182,9 +182,9 @@ pub struct Runtime<'a, R: BufRead> {
 }
 
 impl<'a, R: BufRead> Runtime<'a, R> {
-    pub fn new(reader: R) -> Self {
+    pub fn new(reader: R, name: &str) -> Self {
         Self {
-            tokenizer: Tokenizer::new(reader).peekable(),
+            tokenizer: Tokenizer::new(reader, name).peekable(),
             global_types: HashMap::new(),
             globals: HashMap::new(),
             parser: None,
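
`Runtime::new` now takes a display name for the source alongside the reader; the name ends up in every Token and, through it, in error messages. A minimal sketch of the new signature driven by a file instead of stdin (the path and the `main` scaffolding are illustrative, not part of this commit):

    use std::fs::File;
    use std::io::BufReader;

    fn main() -> std::io::Result<()> {
        let path = "examples/length.lamm"; // hypothetical source file
        // passing the path as the name makes errors read `path:line:column`
        let mut runtime = lamm::Runtime::new(BufReader::new(File::open(path)?), path);
        for value in runtime.values() {
            match value {
                Ok(v) => println!("=> {v}"),
                Err(e) => eprintln!("{e}"),
            }
        }
        Ok(())
    }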

View File

@@ -1,12 +1,12 @@
 use std::io::{self, BufReader};
 
 fn main() {
-    let mut runtime = lamm::Runtime::new(BufReader::new(io::stdin()));
+    let mut runtime = lamm::Runtime::new(BufReader::new(io::stdin()), "<stdin>");
 
     for value in runtime.values() {
         match value {
             Ok(v) => println!("=> {v}"),
-            Err(e) => eprintln!("error: {e}"),
+            Err(e) => eprintln!("{e}"),
         }
     }
 }
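
Since the parser's message now carries its own location, the `error: ` prefix would be redundant, so the REPL prints the Display output verbatim. Roughly what an undefined identifier looks like now (the binary name and the exact line/column values are illustrative; the format comes from the ParseError Display impl below):

    $ echo "foo" | lamm
    Undefined identifier `foo` <stdin>:1:3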

View File

@@ -2,7 +2,7 @@
 use crate::executor::Executor;
 use super::{Value, Type, Function, FunctionType};
-use super::tokenizer::{Token, TokenizeError, Op};
+use super::tokenizer::{Token, TokenType, TokenizeError, Op};
 
 use std::borrow::BorrowMut;
 use std::error;
@@ -14,7 +14,7 @@ use std::iter::Peekable;
 pub enum ParseError {
     NoInput,
     UnexpectedEndInput,
-    IdentifierUndefined(String),
+    IdentifierUndefined(Token),
     InvalidIdentifier(Token),
     UnmatchedArrayClose,
     UnwantedToken(Token),
@@ -27,7 +27,7 @@ impl Display for ParseError {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         match self {
             ParseError::UnexpectedEndInput => write!(f, "Input ended unexpectedly"),
-            ParseError::IdentifierUndefined(name) => write!(f, "Undefined identifier `{name}`"),
+            ParseError::IdentifierUndefined(name) => write!(f, "Undefined identifier `{}` {}:{}:{}", name.lexeme, name.file, name.line, name.location.start),
             ParseError::InvalidIdentifier(t) => write!(f, "Invalid identifier `{t:?}`"),
             ParseError::NoInput => write!(f, "No input given"),
             ParseError::UnmatchedArrayClose => write!(f, "there was an unmatched array closing operator `]`"),
@@ -114,12 +114,11 @@ impl<'a, I: Iterator<Item = Result<Token, TokenizeError>>> Parser<'a, I> {
         self
     }
 
-    fn get_object_type(&self, ident: &String) -> Result<&Type, ParseError> {
+    fn get_object_type(&self, ident: &String) -> Option<&Type> {
         self.locals.get(ident).or(self.globals.get(ident))
-            .ok_or(ParseError::IdentifierUndefined(ident.clone()))
     }
 
-    fn _get_object_types<Names: Iterator<Item = String>>(&self, items: Names) -> impl Iterator<Item = Result<&Type, ParseError>> {
+    fn _get_object_types<Names: Iterator<Item = String>>(&self, items: Names) -> impl Iterator<Item = Option<&Type>> {
         items.map(|x| self.get_object_type(&x))
     }
@@ -194,10 +193,10 @@ impl<'a, I: Iterator<Item = Result<Token, TokenizeError>>> Parser<'a, I> {
             .ok_or(ParseError::NoInput)?
             .map_err(|e| ParseError::TokenizeError(e))?;
 
-        match token {
-            Token::Constant(c) => Ok(ParseTree::Value(c)),
-            Token::Identifier(ident) => {
-                match self.get_object_type(&ident)? {
+        match token.token() {
+            TokenType::Constant(c) => Ok(ParseTree::Value(c)),
+            TokenType::Identifier(ident) => {
+                match self.get_object_type(&ident).ok_or(ParseError::IdentifierUndefined(token))? {
                     Type::Function(f) => {
                         let f = f.clone();
                         let args = self.get_args(f.1.len())?;
@@ -228,22 +227,25 @@ impl<'a, I: Iterator<Item = Result<Token, TokenizeError>>> Parser<'a, I> {
                     _ => Ok(ParseTree::Variable(ident)),
                 }
             },
-            Token::Operator(op) => match op {
+            TokenType::Operator(op) => match op {
                 Op::OpenArray => {
                     let mut depth = 1;
 
                     // take tokens until we reach the end of this array
                     // if we don't collect them here it causes rust to overflow computing the types
                     let array_tokens = self.tokens.by_ref().take_while(|t| match t {
-                        Ok(Token::Operator(Op::OpenArray)) => {
+                        Ok(t) => match t.token() {
+                            TokenType::Operator(Op::OpenArray) => {
                             depth += 1;
                             true
                         },
-                        Ok(Token::Operator(Op::CloseArray)) => {
+                            TokenType::Operator(Op::CloseArray) => {
                             depth -= 1;
                             depth > 0
                         }
                         _ => true,
+                        }
+                        _ => true,
                     }).collect::<Result<Vec<_>, TokenizeError>>().map_err(|e| ParseError::TokenizeError(e))?;
 
                     let mut array_tokens = array_tokens
@@ -270,15 +272,18 @@ impl<'a, I: Iterator<Item = Result<Token, TokenizeError>>> Parser<'a, I> {
                     // take tokens until we reach the end of this array
                     // if we don't collect them here it causes rust to overflow computing the types
                     let array_tokens = self.tokens.by_ref().take_while(|t| match t {
-                        Ok(Token::Operator(Op::OpenStatement)) => {
+                        Ok(t) => match t.token() {
+                            TokenType::Operator(Op::OpenStatement) => {
                             depth += 1;
                             true
                         },
-                        Ok(Token::Operator(Op::CloseStatement)) => {
+                            TokenType::Operator(Op::CloseStatement) => {
                             depth -= 1;
                             depth > 0
                         }
                         _ => true,
+                        }
+                        _ => true,
                     }).collect::<Result<Vec<_>, TokenizeError>>().map_err(|e| ParseError::TokenizeError(e))?;
 
                     let mut array_tokens = array_tokens
@@ -304,7 +309,7 @@ impl<'a, I: Iterator<Item = Result<Token, TokenizeError>>> Parser<'a, I> {
                     let body = Box::new(self.parse()?);
 
-                    if let Token::Identifier(ident) = token {
+                    if let TokenType::Identifier(ident) = token.token() {
                         match op {
                             Op::Equ => Ok(ParseTree::Equ(
                                 ident.clone(),
@@ -359,10 +364,7 @@ impl<'a, I: Iterator<Item = Result<Token, TokenizeError>>> Parser<'a, I> {
                     });
 
                     for name in names.clone() {
-                        let t = match self.locals.remove(&name).ok_or(ParseError::IdentifierUndefined(name.clone())) {
-                            Ok(t) => t,
-                            Err(e) => return Err(e),
-                        };
+                        let t = self.locals.remove(&name).ok_or(ParseError::IdentifierUndefined(token.clone()))?;
                         self.globals.insert(name, t);
                     }
@@ -392,7 +394,7 @@ impl<'a, I: Iterator<Item = Result<Token, TokenizeError>>> Parser<'a, I> {
                 },
                 op => self.parse_operator(op),
             },
-            t => Err(ParseError::UnwantedToken(t)),
+            _ => Err(ParseError::UnwantedToken(token)),
         }
     }
@@ -436,7 +438,8 @@ impl<'a, I: Iterator<Item = Result<Token, TokenizeError>>> Parser<'a, I> {
             .collect::<Result<_, _>>()?;
 
         let (types, names): (Vec<_>, Vec<_>) = args.into_iter().unzip();
-        let ret = if tokens.next_if(|x| matches!(x, Ok(Token::Operator(Op::Arrow)))).is_some() {
+
+        let ret = if tokens.next_if(|x| matches!(x.as_ref().unwrap().token(), TokenType::Operator(Op::Arrow))).is_some() {
             Self::parse_type(tokens)?
         } else {
             Type::Any
@@ -445,15 +448,16 @@ impl<'a, I: Iterator<Item = Result<Token, TokenizeError>>> Parser<'a, I> {
         Ok((FunctionType(Box::new(ret), types), names))
     }
 
-    fn parse_function_declaration_parameter(
-        mut tokens: &mut Peekable<I>) -> Result<(Type, String), ParseError>
+    fn parse_function_declaration_parameter(mut tokens: &mut Peekable<I>) -> Result<(Type, String), ParseError>
     {
-        match tokens.next() {
+        let token = tokens.next().ok_or(ParseError::UnexpectedEndInput)?.map_err(|e| ParseError::TokenizeError(e))?;
+
+        match token.token() {
             // untyped variable
-            Some(Ok(Token::Identifier(x))) => Ok((Type::Any, x)),
+            TokenType::Identifier(x) => Ok((Type::Any, x)),
             // typed variable
-            Some(Ok(Token::Operator(Op::TypeDeclaration))) => {
+            TokenType::Operator(Op::TypeDeclaration) => {
                 let name = Self::get_identifier(tokens.next())?;
                 let t = Self::parse_type(&mut tokens)?;
@@ -461,7 +465,7 @@ impl<'a, I: Iterator<Item = Result<Token, TokenizeError>>> Parser<'a, I> {
             }
             // untyped function (all args Any, return type Any)
-            Some(Ok(Token::Operator(Op::FunctionDefine(n)))) => {
+            TokenType::Operator(Op::FunctionDefine(n)) => {
                 let name = Self::get_identifier(tokens.next())?;
 
                 let args = (0..n).map(|_| Type::Any).collect();
@@ -469,7 +473,7 @@ impl<'a, I: Iterator<Item = Result<Token, TokenizeError>>> Parser<'a, I> {
             }
             // typed function
-            Some(Ok(Token::Operator(Op::FunctionDeclare(n)))) => {
+            TokenType::Operator(Op::FunctionDeclare(n)) => {
                 let name = Self::get_identifier(tokens.next())?;
                 let args = (0..n).map(|_| Self::parse_type(&mut tokens)).collect::<Result<_, _>>()?;
 
                 let mut ret = Type::Any;
@@ -477,7 +481,7 @@ impl<'a, I: Iterator<Item = Result<Token, TokenizeError>>> Parser<'a, I> {
                 // this is annoying
                 // inside the next_if closure, we already can know that its an error
                 // and return it, but we cannot return out of a closure
-                if let Some(t) = tokens.next_if(|x| matches!(x, Ok(Token::Operator(Op::Arrow))))
+                if let Some(t) = tokens.next_if(|x| matches!(x.as_ref().unwrap().token(), TokenType::Operator(Op::Arrow)))
                 {
                     // so we just check for an error here. this is the only reason t exists.
                     if let Err(e) = t {
@@ -489,33 +493,35 @@ impl<'a, I: Iterator<Item = Result<Token, TokenizeError>>> Parser<'a, I> {
                 Ok((Type::Function(FunctionType(Box::new(ret), args)), name))
             }
-            Some(Ok(t)) => Err(ParseError::UnwantedToken(t)),
-            Some(Err(e)) => Err(ParseError::TokenizeError(e)),
-            None => Err(ParseError::UnexpectedEndInput),
+            _ => Err(ParseError::UnwantedToken(token)),
         }
     }
 
     // for some dumbass reason,
     // this is the only code that breaks if it doesn't take an impl Iterator instead of simply I ...
     fn parse_type(tokens: &mut Peekable<impl Iterator<Item = Result<Token, TokenizeError>>>) -> Result<Type, ParseError> {
-        match tokens.next() {
-            Some(Ok(Token::Type(t))) => Ok(t),
-            Some(Ok(Token::Operator(Op::OpenArray))) => {
+        let token = tokens.next().ok_or(ParseError::UnexpectedEndInput)?.map_err(|e| ParseError::TokenizeError(e))?;
+
+        match token.token() {
+            TokenType::Type(t) => Ok(t),
+            TokenType::Operator(Op::OpenArray) => {
                 let mut depth = 1;
 
                 // take tokens until we reach the end of this array
                 // if we don't collect them here it causes rust to overflow computing the types
                 let array_tokens = tokens.by_ref().take_while(|t| match t {
-                    Ok(Token::Operator(Op::OpenArray)) => {
+                    Ok(t) => match t.token() {
+                        TokenType::Operator(Op::OpenStatement) => {
                             depth += 1;
                             true
                         },
-                    Ok(Token::Operator(Op::CloseArray)) => {
+                        TokenType::Operator(Op::CloseStatement) => {
                             depth -= 1;
                             depth > 0
                         }
                         _ => true,
+                    }
+                    _ => true,
                 }).collect::<Result<Vec<_>, TokenizeError>>().map_err(|e| ParseError::TokenizeError(e))?;
 
                 // ... thanks to this conversion here. The compiler complains that the types don't
@@ -537,19 +543,17 @@ impl<'a, I: Iterator<Item = Result<Token, TokenizeError>>> Parser<'a, I> {
                 Ok(Type::Array(Box::new(t)))
             },
-            Some(Ok(t)) => Err(ParseError::UnwantedToken(t.clone())),
-            Some(Err(e)) => Err(ParseError::TokenizeError(e)),
-            None => Err(ParseError::UnexpectedEndInput),
+            _ => Err(ParseError::UnwantedToken(token)),
         }
     }
 
     fn get_identifier(t: Option<Result<Token, TokenizeError>>) -> Result<String, ParseError> {
-        match t.ok_or(ParseError::UnexpectedEndInput)?
-            .map_err(|e| ParseError::TokenizeError(e))
-        {
-            Ok(Token::Identifier(ident)) => Ok(ident),
-            Ok(t) => Err(ParseError::InvalidIdentifier(t)),
-            Err(e) => Err(e),
+        let token = t.ok_or(ParseError::UnexpectedEndInput)?
+            .map_err(|e| ParseError::TokenizeError(e))?;
+
+        match token.token() {
+            TokenType::Identifier(ident) => Ok(ident),
+            _ => Err(ParseError::InvalidIdentifier(token)),
         }
     }
 }
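
Because `IdentifierUndefined` now wraps the whole Token instead of a bare String, callers can also recover the span structurally rather than scraping the formatted message. A sketch, assuming `ParseError` and `Token` are importable by the caller, using only the public fields this commit introduces (`lexeme`, `file`, `line`, `location`):

    // hypothetical reporting helper, not part of this commit
    fn report(err: &ParseError) {
        if let ParseError::IdentifierUndefined(tok) = err {
            eprintln!("--> {}:{}:{}", tok.file, tok.line, tok.location.start);
            eprintln!("    offending lexeme: {:?}", tok.lexeme);
        }
    }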

View File

@@ -6,7 +6,9 @@ use crate::Type;
 use super::Value;
 
 use std::fmt::{Display, Formatter};
-use std::io::{BufRead, Cursor};
+use std::io::BufRead;
+use std::sync::Arc;
+use std::ops::Range;
 
 #[derive(Debug)]
 pub enum TokenizeError {
@@ -89,17 +91,114 @@ pub enum Op {
 }
 
 #[derive(Debug, Clone, PartialEq)]
-pub enum Token {
+pub enum TokenType {
     Identifier(String),
     Operator(Op),
     Constant(Value),
     Type(Type),
 }
 
-fn get_dot_count<I: Iterator<Item = char>>(s: &mut Peekable<I>) -> Option<usize> {
+impl TokenType {
+    /// Parse a single token
+    fn parse(s: &str) -> Result<Self, TokenizeError> {
+        let identifier = regex::Regex::new(r#"[A-Za-z_][A-Za-z0-9_']*"#).map_err(|e| TokenizeError::Regex(e))?;
+        let number = regex::Regex::new(r#"([0-9]+\.?[0-9]*)|(\.[0-9])"#).map_err(|e| TokenizeError::Regex(e))?;
+
+        Ok(match s {
+            // Match keywords first
+            "true" => TokenType::Constant(Value::Bool(true)),
+            "false" => TokenType::Constant(Value::Bool(false)),
+            "nil" => TokenType::Constant(Value::Nil),
+            "int" => TokenType::Operator(Op::IntCast),
+            "float" => TokenType::Operator(Op::FloatCast),
+            "bool" => TokenType::Operator(Op::BoolCast),
+            "string" => TokenType::Operator(Op::StringCast),
+            "print" => TokenType::Operator(Op::Print),
+            "empty" => TokenType::Operator(Op::Empty),
+            "head" => TokenType::Operator(Op::Head),
+            "tail" => TokenType::Operator(Op::Tail),
+            "init" => TokenType::Operator(Op::Init),
+            "fini" => TokenType::Operator(Op::Fini),
+            "export" => TokenType::Operator(Op::Export),
+            // Types
+            "Any" => TokenType::Type(Type::Any),
+            "Int" => TokenType::Type(Type::Int),
+            "Float" => TokenType::Type(Type::Float),
+            "Bool" => TokenType::Type(Type::Bool),
+            "String" => TokenType::Type(Type::String),
+            // then identifiers and numbers
+            _ => {
+                if identifier.is_match(s) {
+                    TokenType::Identifier(s.to_string())
+                } else if number.is_match(s) {
+                    if let Ok(int) = s.parse::<i64>() {
+                        TokenType::Constant(Value::Int(int))
+                    } else if let Ok(float) = s.parse::<f64>() {
+                        TokenType::Constant(Value::Float(float))
+                    } else {
+                        return Err(TokenizeError::InvalidNumericConstant(s.to_string()));
+                    }
+                } else {
+                    return Err(TokenizeError::UnableToMatchToken(s.to_string()));
+                }
+            }
+        })
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct Token {
+    t: TokenType,
+    pub lexeme: String,
+    pub line: usize,
+    pub file: Arc<String>,
+    pub location: Range<usize>,
+}
+
+impl Token {
+    pub fn new(t: TokenType, lexeme: String, file: Arc<String>, line: usize, column: usize) -> Self {
+        Self {
+            t,
+            line,
+            file,
+            location: column..column+lexeme.len(),
+            lexeme,
+        }
+    }
+
+    pub fn token(&self) -> TokenType {
+        self.t.clone()
+    }
+}
+
+/// Tokenize an input stream of source code for a Parser
+pub(crate) struct Tokenizer<R: BufRead> {
+    reader: R,
+    line: usize,
+    column: usize,
+    code: String,
+    filename: Arc<String>,
+    tokens: VecDeque<Token>,
+}
+
+impl<R: BufRead> Tokenizer<R> {
+    pub fn new(reader: R, filename: &str) -> Self {
+        Self {
+            reader,
+            line: 0,
+            column: 0,
+            filename: Arc::new(filename.to_string()),
+            code: String::new(),
+            tokens: VecDeque::new(),
+        }
+    }
+
+    fn get_dot_count<I: Iterator<Item = char>>(&mut self, s: &mut Peekable<I>) -> Option<usize> {
     let mut total = 0;
 
-    while let Some(n) = s.next_if(|&c| c == ':' || c == '.').map(|c| match c {
+        while let Some(n) = self.next_char_if(s, |&c| c == ':' || c == '.').map(|c| match c {
         ':' => 2,
         '.' => 1,
         _ => 0,
@@ -108,74 +207,48 @@ fn get_dot_count<I: Iterator<Item = char>>(s: &mut Peekable<I>) -> Option<usize>
     }
 
     Some(total)
 }
 
-impl Token {
-    /// Parse a single token
-    fn parse(s: &str) -> Result<Self, TokenizeError> {
-        let identifier = regex::Regex::new(r#"[A-Za-z_][A-Za-z0-9_']*"#).map_err(|e| TokenizeError::Regex(e))?;
-        let number = regex::Regex::new(r#"([0-9]+\.?[0-9]*)|(\.[0-9])"#).map_err(|e| TokenizeError::Regex(e))?;
-
-        match s {
-            // Match keywords first
-            "true" => Ok(Token::Constant(Value::Bool(true))),
-            "false" => Ok(Token::Constant(Value::Bool(false))),
-            "nil" => Ok(Token::Constant(Value::Nil)),
-            "int" => Ok(Token::Operator(Op::IntCast)),
-            "float" => Ok(Token::Operator(Op::FloatCast)),
-            "bool" => Ok(Token::Operator(Op::BoolCast)),
-            "string" => Ok(Token::Operator(Op::StringCast)),
-            "print" => Ok(Token::Operator(Op::Print)),
-            "empty" => Ok(Token::Operator(Op::Empty)),
-            "head" => Ok(Token::Operator(Op::Head)),
-            "tail" => Ok(Token::Operator(Op::Tail)),
-            "init" => Ok(Token::Operator(Op::Init)),
-            "fini" => Ok(Token::Operator(Op::Fini)),
-            "export" => Ok(Token::Operator(Op::Export)),
-            // Types
-            "Any" => Ok(Token::Type(Type::Any)),
-            "Int" => Ok(Token::Type(Type::Int)),
-            "Float" => Ok(Token::Type(Type::Float)),
-            "Bool" => Ok(Token::Type(Type::Bool)),
-            "String" => Ok(Token::Type(Type::String)),
-            // then identifiers and numbers
-            _ => {
-                if identifier.is_match(s) {
-                    Ok(Token::Identifier(s.to_string()))
-                } else if number.is_match(s) {
-                    if let Ok(int) = s.parse::<i64>() {
-                        Ok(Token::Constant(Value::Int(int)))
-                    } else if let Ok(float) = s.parse::<f64>() {
-                        Ok(Token::Constant(Value::Float(float)))
-                    } else {
-                        Err(TokenizeError::InvalidNumericConstant(s.to_string()))
-                    }
-                } else {
-                    Err(TokenizeError::UnableToMatchToken(s.to_string()))
-                }
-            }
-        }
-    }
-}
-
-/// Tokenize an input stream of source code for a Parser
-pub(crate) struct Tokenizer<R: BufRead> {
-    reader: R,
-    tokens: VecDeque<Result<Token, TokenizeError>>,
-}
-
-impl<R: BufRead> Tokenizer<R> {
-    pub fn new(reader: R) -> Self {
-        Self {
-            reader,
-            tokens: VecDeque::new(),
-        }
-    }
+    fn next_char<I: Iterator<Item = char>>(&mut self, iter: &mut Peekable<I>) -> Option<char> {
+        if let Some(c) = iter.next() {
+            self.column += 1;
+            Some(c)
+        } else {
+            None
+        }
+    }
+
+    fn next_char_if<I: Iterator<Item = char>>(
+        &mut self,
+        iter: &mut Peekable<I>,
+        pred: impl FnOnce(&char) -> bool) -> Option<char>
+    {
+        if let Some(c) = iter.next_if(pred) {
+            self.column += 1;
+            Some(c)
+        } else {
+            None
+        }
+    }
+
+    fn next_char_while<I: Iterator<Item = char>>(
+        &mut self,
+        iter: &mut Peekable<I>,
+        mut pred: impl FnMut(&char) -> bool) -> Option<char>
+    {
+        if let Some(c) = self.next_char(iter) {
+            if (pred)(&c) {
+                Some(c)
+            } else {
+                None
+            }
+        } else {
+            None
+        }
+    }
 
     /// Tokenizes more input and adds them to the internal queue
-    fn tokenize<I: Iterator<Item = char>>(&mut self, mut iter: Peekable<I>) {
+    fn tokenize<I: Iterator<Item = char>>(&mut self, mut iter: Peekable<I>) -> Result<(), TokenizeError> {
         let operators: HashMap<&'static str, Op> = HashMap::from([
             ("+", Op::Add),
             ("-", Op::Sub),
@@ -211,33 +284,31 @@ impl<R: BufRead> Tokenizer<R> {
             ("\\", Op::NonCall),
         ]);
 
-        let c = if let Some(c) = iter.next() {
+        let c = if let Some(c) = self.next_char(&mut iter) {
            c
         } else {
-            return;
+            return Ok(());
         };
 
         if c.is_alphanumeric() {
            let mut token = String::from(c);
 
-            while let Some(c) = iter.next_if(|&c| c.is_alphanumeric() || c == '.' || c == '\'') {
+            while let Some(c) = self.next_char_if(&mut iter, |&c| c.is_alphanumeric() || c == '.' || c == '\'') {
                 token.push(c);
             }
 
-            self.tokens.push_back(Token::parse(&token));
+            self.tokens.push_back(Token::new(TokenType::parse(&token)?, token, self.filename.clone(), self.line, self.column));
 
             self.tokenize(iter)
         } else if c == '#' {
             let _: String = iter.by_ref().take_while(|&c| c != '\n').collect();
+
+            self.tokenize(iter)
         } else if c == '\"' {
             let mut token = String::new();
 
-            while let Some(c) = iter.next() {
+            while let Some(c) = self.next_char(&mut iter) {
                 match c {
                     '"' => break,
-                    '\n' => {
-                        self.tokens.push_back(Err(TokenizeError::UnclosedString));
-                        return;
-                    }
+                    '\n' => return Err(TokenizeError::UnclosedString),
                     '\\' => match iter.next() {
                         Some('\\') => token.push('\\'),
                         Some('n') => token.push('\n'),
@@ -245,16 +316,16 @@ impl<R: BufRead> Tokenizer<R> {
                         Some('r') => token.push('\r'),
                         Some('\"') => token.push('"'),
                         Some(c) => token.push(c),
-                        None => {
-                            self.tokens.push_back(Err(TokenizeError::UnclosedString));
-                            return;
-                        },
+                        None => return Err(TokenizeError::UnclosedString),
                     }
                     _ => token.push(c),
                 }
             }
 
-            self.tokens.push_back(Ok(Token::Constant(Value::String(token))));
+            self.tokens.push_back(
+                Token::new(TokenType::Constant(
+                    Value::String(token.clone())), token, self.filename.clone(), self.line, self.column));
 
             self.tokenize(iter)
         } else if operators.keys().any(|x| x.starts_with(c)) {
             let mut token = String::from(c);
@@ -281,49 +352,39 @@ impl<R: BufRead> Tokenizer<R> {
                     // if not, we need to make sure that the next characters
                     // we grab *actually* match the last operator
                     if let Some(op) = possible.get(token.as_str()) {
-                        self.tokens.push_back(Ok(Token::Operator(match op {
+                        let token = Token::new(TokenType::Operator(match op {
                             // special handling for "dynamic" operators
                             Op::FunctionDefine(n) => {
-                                let count = match get_dot_count(&mut iter) {
+                                let count = match self.get_dot_count(&mut iter) {
                                     Some(count) => count,
-                                    None => {
-                                        self.tokens.push_back(Err(TokenizeError::InvalidDynamicOperator(token)));
-                                        return;
-                                    }
+                                    None => return Err(TokenizeError::InvalidDynamicOperator(token)),
                                 };
 
                                 Op::FunctionDefine(n + count)
                             }
                             Op::FunctionDeclare(n) => {
-                                let count = match get_dot_count(&mut iter) {
+                                let count = match self.get_dot_count(&mut iter) {
                                     Some(count) => count,
-                                    None => {
-                                        self.tokens.push_back(Err(TokenizeError::InvalidDynamicOperator(token)));
-                                        return;
-                                    }
+                                    None => return Err(TokenizeError::InvalidDynamicOperator(token)),
                                 };
 
                                 Op::FunctionDeclare(n + count)
                             }
                             Op::LambdaDefine(n) => {
-                                let count = match get_dot_count(&mut iter) {
+                                let count = match self.get_dot_count(&mut iter) {
                                     Some(count) => count,
-                                    None => {
-                                        self.tokens.push_back(Err(TokenizeError::InvalidDynamicOperator(token)));
-                                        return;
-                                    }
+                                    None => return Err(TokenizeError::InvalidDynamicOperator(token)),
                                 };
 
                                 Op::LambdaDefine(n + count)
                             }
                             op => op.clone(),
-                        })));
+                        }), token, self.filename.clone(), self.line, self.column);
+
+                        self.tokens.push_back(token);
 
                         break;
                     } else {
-                        let next = match iter.next_if(is_expected) {
+                        let next = match self.next_char_if(&mut iter, is_expected) {
                             Some(c) => c,
-                            None => {
-                                self.tokens.push_back(Err(TokenizeError::UnableToMatchToken(format!("{token}"))));
-                                return;
-                            }
+                            None => return Err(TokenizeError::UnableToMatchToken(format!("{token}"))),
                         };
 
                         token.push(next);
@@ -331,45 +392,38 @@ impl<R: BufRead> Tokenizer<R> {
                     }
                     0 => unreachable!(),
                     _ => {
-                        let next = match iter.next_if(is_expected) {
+                        let next = match self.next_char_if(&mut iter, is_expected) {
                             Some(c) => c,
                             None => {
-                                // at this point, token must be in the hashmap possible, otherwise it wouldnt have any matches
-                                self.tokens.push_back(Ok(Token::Operator(match possible.get(token.as_str()).unwrap() {
+                                let token = Token::new(TokenType::Operator(match possible.get(token.as_str()).unwrap() {
                                     // special handling for "dynamic" operators
                                     Op::FunctionDefine(n) => {
-                                        let count = match get_dot_count(&mut iter) {
+                                        let count = match self.get_dot_count(&mut iter) {
                                             Some(count) => count,
-                                            None => {
-                                                self.tokens.push_back(Err(TokenizeError::InvalidDynamicOperator(token)));
-                                                return;
-                                            }
+                                            None => return Err(TokenizeError::InvalidDynamicOperator(token)),
                                         };
 
                                         Op::FunctionDefine(n + count)
                                     }
                                     Op::FunctionDeclare(n) => {
-                                        let count = match get_dot_count(&mut iter) {
+                                        let count = match self.get_dot_count(&mut iter) {
                                             Some(count) => count,
-                                            None => {
-                                                self.tokens.push_back(Err(TokenizeError::InvalidDynamicOperator(token)));
-                                                return;
-                                            }
+                                            None => return Err(TokenizeError::InvalidDynamicOperator(token)),
                                         };
 
                                         Op::FunctionDeclare(n + count)
                                     }
                                     Op::LambdaDefine(n) => {
-                                        let count = match get_dot_count(&mut iter) {
+                                        let count = match self.get_dot_count(&mut iter) {
                                             Some(count) => count,
-                                            None => {
-                                                self.tokens.push_back(Err(TokenizeError::InvalidDynamicOperator(token)));
-                                                return;
-                                            }
+                                            None => return Err(TokenizeError::InvalidDynamicOperator(token)),
                                         };
 
                                         Op::LambdaDefine(n + count)
                                     }
                                     op => op.clone(),
-                                })));
+                                }), token, self.filename.clone(), self.line, self.column);
+                                // at this point, token must be in the hashmap possible, otherwise it wouldn't have any matches
+                                self.tokens.push_back(token);
 
                                 break;
                             }
                         };
@@ -383,27 +437,17 @@ impl<R: BufRead> Tokenizer<R> {
         } else if c.is_whitespace() {
             self.tokenize(iter)
         } else {
-            self.tokens.push_back(Err(TokenizeError::InvalidCharacter(c)));
-            return;
+            return Err(TokenizeError::InvalidCharacter(c));
         }
     }
 }
 
-impl std::str::FromStr for Tokenizer<Cursor<String>> {
-    type Err = ();
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        let cursor = Cursor::new(s.to_string());
-        Ok(Tokenizer::new(cursor))
-    }
-}
-
-impl<R: BufRead> std::iter::Iterator for Tokenizer<R> {
+impl<R: BufRead> Iterator for Tokenizer<R> {
     type Item = Result<Token, TokenizeError>;
 
     fn next(&mut self) -> Option<Self::Item> {
         if let Some(token) = self.tokens.pop_front() {
-            return Some(token);
+            return Some(Ok(token));
         }
 
         let mut input = String::new();
@@ -411,7 +455,15 @@ impl<R: BufRead> std::iter::Iterator for Tokenizer<R> {
         match self.reader.read_line(&mut input) {
             Ok(0) => None,
             Ok(_n) => {
-                self.tokenize(input.chars().peekable());
+                self.code.push_str(&input);
+                self.line += 1;
+                self.column = 0;
+
+                match self.tokenize(input.chars().peekable()) {
+                    Ok(()) => (),
+                    Err(e) => return Some(Err(e)),
+                }
+
                 self.next()
             },
             Err(e) => Some(Err(TokenizeError::IO(e))),
@@ -421,7 +473,8 @@ impl<R: BufRead> std::iter::Iterator for Tokenizer<R> {
 
 #[cfg(test)]
 mod tests {
-    use std::str::FromStr;
+    use io::Cursor;
+
     use crate::parser::Parser;
 
     use super::*;
@@ -429,7 +482,7 @@ mod tests {
     fn tokenizer() {
         let program = ": length ?. x [] -> Int ?? x + 1 length tail x 0 length [ 1 2 3 ]";
-        let tokens: Vec<Token> = Tokenizer::from_str(program).unwrap().collect::<Result<_, _>>().unwrap();
+        let tokens: Vec<Token> = Tokenizer::new(Cursor::new(program), "<tokenizer>").collect::<Result<_, _>>().unwrap();
 
         println!("{tokens:#?}");
     }
@@ -438,7 +491,7 @@ mod tests {
     fn a() {
         let program = ": length ?. x [] -> Int ?? x + 1 length tail x 0 length [ 1 2 3 ]";
-        let mut tokenizer = Tokenizer::from_str(program).unwrap().peekable();
+        let mut tokenizer = Tokenizer::new(Cursor::new(program), "<a>").peekable();
 
         let mut globals = HashMap::new();
         let mut parser = Parser::new(&mut tokenizer, &mut globals);
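
With the `FromStr` impl gone, the crate-internal way to tokenize a string (Tokenizer is pub(crate)) is a Cursor plus a name, as the updated tests show. A small sketch printing each token with its new position data (the program string and the `<demo>` name are made up):

    use std::io::Cursor;

    fn dump_tokens() {
        // Tokenizer yields Result<Token, TokenizeError>
        for t in Tokenizer::new(Cursor::new("+ 1 2"), "<demo>") {
            let t = t.unwrap();
            println!("{}:{}:{} {:?}", t.file, t.line, t.location.start, t.lexeme);
        }
    }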