// (source-listing metadata, not code: 683 lines, 28 KiB, Rust)
use once_cell::sync::Lazy;
|
|
use std::collections::HashMap;
|
|
use std::env;
|
|
use std::fmt;
|
|
use std::fs;
|
|
use std::io::{self, Read, Write};
|
|
use std::process;
|
|
use std::str;
|
|
|
|
// =====================================================================================================================
|
|
// Errors
|
|
// =====================================================================================================================
|
|
|
|
// Define a custom error type for cleaner error handling
|
|
/// Unified error type for this binary: wraps I/O failures and free-form
/// lexing errors so `with_io` and `main` can handle a single `Result` type.
#[derive(Debug)]
enum LexerError {
    // Underlying I/O failure (file read/write, stdin/stdout).
    Io(io::Error),
    // Any other lexer-level error, carried as a message string.
    Generic(String),
}
|
|
|
|
impl fmt::Display for LexerError {
|
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
match self {
|
|
LexerError::Io(e) => write!(f, "I/O Error: {}", e),
|
|
LexerError::Generic(s) => write!(f, "Lexer Error: {}", s),
|
|
}
|
|
}
|
|
}
|
|
|
|
// Marker impl: lets LexerError participate in `Box<dyn Error>` chains.
impl std::error::Error for LexerError {}
|
|
|
|
// Allow easy conversion from io::Error
|
|
impl From<io::Error> for LexerError {
|
|
fn from(err: io::Error) -> Self {
|
|
LexerError::Io(err)
|
|
}
|
|
}
|
|
|
|
// Allow easy conversion from String/&str
|
|
impl From<String> for LexerError {
|
|
fn from(err: String) -> Self {
|
|
LexerError::Generic(err)
|
|
}
|
|
}
|
|
impl From<&str> for LexerError {
|
|
fn from(err: &str) -> Self {
|
|
LexerError::Generic(err.to_string())
|
|
}
|
|
}
|
|
|
|
// Crate-local alias: every fallible function here fails with LexerError.
type Result<T> = std::result::Result<T, LexerError>;
|
|
|
|
// =====================================================================================================================
|
|
// Machinery
|
|
// =====================================================================================================================
|
|
|
|
fn file_to_string(path: &str) -> Result<String> {
|
|
fs::read_to_string(path).map_err(LexerError::Io)
|
|
}
|
|
|
|
fn string_to_file(path: &str, contents: &str) -> Result<()> {
|
|
fs::write(path, contents).map_err(LexerError::Io)
|
|
}
|
|
|
|
// Rust version of with_IO, using closures and Result for error handling
|
|
fn with_io<F>(source: &str, destination: &str, f: F) -> Result<()>
|
|
where
|
|
F: FnOnce(String) -> Result<String>,
|
|
{
|
|
let input = if source == "stdin" {
|
|
let mut buffer = String::new();
|
|
io::stdin().read_to_string(&mut buffer)?;
|
|
buffer
|
|
} else {
|
|
file_to_string(source)?
|
|
};
|
|
|
|
let output = f(input)?; // Execute the processing function
|
|
|
|
if destination == "stdout" {
|
|
print!("{}", output); // Use print! for stdout
|
|
io::stdout().flush()?; // Ensure output is flushed
|
|
} else {
|
|
string_to_file(destination, &output)?;
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
// Add escaped newlines and backslashes back in for printing
|
|
/// Re-escape newlines and backslashes for display inside quoted output.
///
/// Backslashes are escaped first so the backslashes introduced for
/// newlines are not themselves doubled.
fn sanitize(s: &str) -> String {
    s.replace('\\', "\\\\").replace('\n', "\\n")
}
|
|
|
|
// =====================================================================================================================
|
|
// Scanner - Operates on byte slice for closer C++ char* parity
|
|
// =====================================================================================================================
|
|
/// Byte-oriented cursor over the source text, tracking a 1-based
/// line/column position (columns are counted in bytes).
#[derive(Debug, Clone, Copy)] // Clone + Copy needed for pre_state
struct Scanner<'a> {
    bytes: &'a [u8], // raw source bytes
    pos: usize,      // current byte offset into `bytes`
    line: usize,     // 1-based line of `pos`
    column: usize,   // 1-based column of `pos`
}

impl<'a> Scanner<'a> {
    /// Start a scanner at the beginning of `source`.
    fn new(source: &'a str) -> Self {
        Scanner { bytes: source.as_bytes(), pos: 0, line: 1, column: 1 }
    }

    /// Byte at absolute offset `i`, if in bounds.
    fn byte_at(&self, i: usize) -> Option<u8> {
        self.bytes.get(i).copied()
    }

    /// Current byte without consuming it; `None` at end of input.
    fn peek(&self) -> Option<u8> {
        self.byte_at(self.pos)
    }

    /// Byte one past the current one, if any.
    fn peek_next(&self) -> Option<u8> {
        self.byte_at(self.pos + 1)
    }

    /// Consume one byte, keeping line/column bookkeeping in sync.
    /// A no-op once the end of input has been reached.
    fn advance(&mut self) {
        match self.peek() {
            Some(b'\n') => {
                self.pos += 1;
                self.line += 1;
                self.column = 1;
            }
            Some(_) => {
                self.pos += 1;
                self.column += 1;
            }
            None => {} // already at EOF; stay put
        }
    }

    /// Consume one byte and return the byte now current
    /// (mirrors the C++ original's "advance then peek").
    fn next(&mut self) -> Option<u8> {
        self.advance();
        self.peek()
    }

    /// Consume any run of ASCII whitespace.
    fn skip_whitespace(&mut self) {
        while self.peek().map_or(false, |b| b.is_ascii_whitespace()) {
            self.advance();
        }
    }

    /// Bytes spanning [`start_pos`, current position).
    fn slice(&self, start_pos: usize) -> &'a [u8] {
        &self.bytes[start_pos..self.pos]
    }
}
|
|
|
|
// =====================================================================================================================
|
|
// Tokens
|
|
// =====================================================================================================================
|
|
/// Every kind of token the lexer can produce.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum TokenName {
    // Arithmetic and unary operators
    OpMultiply, OpDivide, OpMod, OpAdd, OpSubtract, OpNegate,
    // Comparisons
    OpLess, OpLessEqual, OpGreater, OpGreaterEqual, OpEqual, OpNotEqual,
    // Logic and assignment
    OpNot, OpAssign, OpAnd, OpOr,
    // Punctuation
    LeftParen, RightParen, LeftBrace, RightBrace, Semicolon, Comma,
    // Keywords
    KeywordIf, KeywordElse, KeywordWhile, KeywordPrint, KeywordPutc,
    // Literals and names
    Identifier, Integer, String,
    // Sentinels
    EndOfInput, Error,
}

/// Fixed display names matching the reference lexer's output format.
impl fmt::Display for TokenName {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        use TokenName::*;
        f.write_str(match self {
            OpMultiply => "Op_multiply",
            OpDivide => "Op_divide",
            OpMod => "Op_mod",
            OpAdd => "Op_add",
            OpSubtract => "Op_subtract",
            OpNegate => "Op_negate",
            OpLess => "Op_less",
            OpLessEqual => "Op_lessequal",
            OpGreater => "Op_greater",
            OpGreaterEqual => "Op_greaterequal",
            OpEqual => "Op_equal",
            OpNotEqual => "Op_notequal",
            OpNot => "Op_not",
            OpAssign => "Op_assign",
            OpAnd => "Op_and",
            OpOr => "Op_or",
            LeftParen => "LeftParen",
            RightParen => "RightParen",
            LeftBrace => "LeftBrace",
            RightBrace => "RightBrace",
            Semicolon => "Semicolon",
            Comma => "Comma",
            KeywordIf => "Keyword_if",
            KeywordElse => "Keyword_else",
            KeywordWhile => "Keyword_while",
            KeywordPrint => "Keyword_print",
            KeywordPutc => "Keyword_putc",
            Identifier => "Identifier",
            Integer => "Integer",
            String => "String",
            EndOfInput => "End_of_input",
            Error => "Error",
        })
    }
}
|
|
|
|
// Rust enum for token values (replaces std::variant)
|
|
// Rust enum for token values (replaces std::variant)
/// Payload attached to a `Token`; variant used depends on the token kind.
#[derive(Debug, Clone, PartialEq)]
enum TokenVal {
    // Integer literals, and char literals stored as their byte value.
    Int(i32),
    // Identifier names, string-literal contents, and error messages.
    String(String),
    None, // For tokens without a specific value
}
|
|
|
|
/// One lexed token with the source position where it started.
#[derive(Debug, Clone)]
struct Token {
    // Token kind (operator, keyword, literal, sentinel, ...).
    name: TokenName,
    // Payload; `TokenVal::None` for tokens with no associated value.
    value: TokenVal,
    // 1-based line of the token's first character.
    line: usize,
    // 1-based column of the token's first character.
    column: usize,
}
|
|
|
|
// Use Display trait for formatted token output
|
|
// Use Display trait for formatted token output
/// Renders a token as one table row: "line column name [value]".
/// NOTE(review): the `{:<18}` width is only honored if `TokenName`'s
/// `Display` calls `f.pad` — verify column alignment in actual output.
impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{:<2} {:<2} ", self.line, self.column)?; // Use Rust formatting for width
        match self.name {
            // Identifier/Integer/String tokens print their payload after
            // the name; a mismatched payload prints "<?>" defensively.
            TokenName::Identifier => {
                if let TokenVal::String(s) = &self.value {
                    write!(f, "{:<18}{}", self.name, s) // Left align with width
                } else {
                    write!(f, "{:<18}<?>", self.name) // Should not happen
                }
            }
            TokenName::Integer => {
                if let TokenVal::Int(i) = &self.value {
                    write!(f, "{:<18}{}", self.name, i)
                } else {
                    write!(f, "{:<18}<?>", self.name)
                }
            }
            TokenName::String => {
                // String contents are re-escaped and quoted for display.
                if let TokenVal::String(s) = &self.value {
                    write!(f, "{:<18}\"{}\"", self.name, sanitize(s))
                } else {
                    write!(f, "{:<18}<?>", self.name)
                }
            }
            TokenName::Error => {
                if let TokenVal::String(s) = &self.value {
                    // Error message might be multi-line, handle indentation carefully
                    let lines: Vec<&str> = s.lines().collect();
                    if lines.is_empty() {
                        write!(f, "{}", self.name)
                    } else {
                        write!(f, "{:<18}{}", self.name, lines[0])?;
                        for line in lines.iter().skip(1) {
                            write!(f, "\n{:<28}{}", "", line)?; // Indent subsequent lines
                        }
                        Ok(()) // Return Ok(()) explicitly as write! doesn't cover all paths
                    }
                } else {
                    write!(f, "{}", self.name) // Error without details
                }
            }
            TokenName::EndOfInput => write!(f, "{}", self.name),
            // Default for simple tokens
            _ => write!(f, "{}", self.name),
        }
    }
}
|
|
|
|
|
|
// =====================================================================================================================
|
|
// Lexer
|
|
// =====================================================================================================================
|
|
|
|
// Lazy static initialization for the keywords map
|
|
static KEYWORDS: Lazy<HashMap<String, TokenName>> = Lazy::new(|| {
|
|
let mut m = HashMap::new();
|
|
m.insert("else".to_string(), TokenName::KeywordElse);
|
|
m.insert("if".to_string(), TokenName::KeywordIf);
|
|
m.insert("print".to_string(), TokenName::KeywordPrint);
|
|
m.insert("putc".to_string(), TokenName::KeywordPutc);
|
|
m.insert("while".to_string(), TokenName::KeywordWhile);
|
|
m
|
|
});
|
|
|
|
|
|
/// Pull-based lexer: `next_token()` yields one token per call.
struct Lexer<'a> {
    // Byte cursor with line/column tracking.
    s: Scanner<'a>,
    source: &'a str, // Keep original source for error slicing if needed
    // NOTE(review): `source` is never read in this file — error snippets
    // are sliced from `s.bytes` instead. Candidate for removal.
    // pre_state is implicitly captured before each token attempt
}
|
|
|
|
impl<'a> Lexer<'a> {
|
|
    /// Create a lexer positioned at the start of `source`.
    fn new(source: &'a str) -> Self {
        Lexer {
            s: Scanner::new(source),
            source, // Store the source string slice
        }
    }
|
|
|
|
// Helper to create a token with current pre-state line/column
|
|
    /// Assemble a token stamped with the line/column captured before
    /// scanning began (the token's start position, not the current one).
    fn make_token(&self, name: TokenName, value: TokenVal, line: usize, column: usize) -> Token {
        Token { name, value, line, column }
    }
|
|
|
|
// Helper to create an error token
|
|
fn error_token(&mut self, msg: String, code_snippet: &str, line: usize, column: usize) -> Token {
|
|
let full_msg = format!("{}\n{:>28}({}, {}): {}",
|
|
msg, "", line, column, code_snippet);
|
|
|
|
// Ensure we advance past the problematic character(s) if possible
|
|
// In many cases, advance() might have already happened within the failing logic.
|
|
// If peek is not None, we might advance once more to avoid infinite loops on bad chars.
|
|
if self.s.peek().is_some() {
|
|
// Be cautious here. C++ version advanced unconditionally.
|
|
// Let's only advance if the error didn't consume the char.
|
|
// self.s.advance(); // Maybe remove this, depends on exact error logic
|
|
}
|
|
|
|
self.make_token(TokenName::Error, TokenVal::String(full_msg), line, column)
|
|
}
|
|
|
|
// Helper for simple single-character tokens
|
|
fn simply(&mut self, name: TokenName, line: usize, column: usize) -> Token {
|
|
self.s.advance();
|
|
self.make_token(name, TokenVal::None, line, column)
|
|
}
|
|
|
|
// Helper for tokens like &&, ||
|
|
fn expect(&mut self, expected: u8, name: TokenName, line: usize, column: usize, start_pos: usize) -> Token {
|
|
if self.s.next() == Some(expected) {
|
|
self.s.advance(); // Consume the second character
|
|
self.make_token(name, TokenVal::None, line, column)
|
|
} else {
|
|
// Get the single character that caused the error
|
|
let snippet = str::from_utf8(&self.s.bytes[start_pos..self.s.pos])
|
|
.unwrap_or("<?>"); // Use slice up to current pos
|
|
let current_char = self.s.peek().map_or('?', |b| b as char); // Use peek for error char
|
|
self.error_token(
|
|
format!("Unrecognized character '{}' after '{}'", current_char, self.s.bytes[start_pos] as char),
|
|
snippet,
|
|
line, // Error occurs at the original line/col
|
|
column,
|
|
)
|
|
}
|
|
}
|
|
|
|
// Helper for tokens like <=, >=, ==, !=
|
|
fn follow(&mut self, expected: u8, ifyes: TokenName, ifno: TokenName, line: usize, column: usize) -> Token {
|
|
if self.s.peek_next() == Some(expected) {
|
|
self.s.advance(); // Consume the first char
|
|
self.s.advance(); // Consume the second char '='
|
|
self.make_token(ifyes, TokenVal::None, line, column)
|
|
} else {
|
|
self.s.advance(); // Consume just the first char
|
|
self.make_token(ifno, TokenVal::None, line, column)
|
|
}
|
|
}
|
|
|
|
// Handles / or /* ... */ comments
|
|
fn divide_or_comment(&mut self, line: usize, column: usize, start_pos: usize) -> Token {
|
|
if self.s.peek_next() == Some(b'*') {
|
|
// Start of a block comment
|
|
self.s.advance(); // Consume '/'
|
|
self.s.advance(); // Consume '*'
|
|
|
|
loop {
|
|
match self.s.peek() {
|
|
Some(b'*') => {
|
|
if self.s.peek_next() == Some(b'/') {
|
|
self.s.advance(); // Consume '*'
|
|
self.s.advance(); // Consume '/'
|
|
return self.next_token(); // Return the *next* token after comment
|
|
} else {
|
|
self.s.advance(); // Consume '*' but it wasn't end of comment
|
|
}
|
|
}
|
|
Some(_) => {
|
|
self.s.advance(); // Consume character inside comment
|
|
}
|
|
None => {
|
|
// Reached EOF inside comment
|
|
let snippet = str::from_utf8(&self.s.bytes[start_pos..self.s.pos]).unwrap_or("/*...");
|
|
return self.error_token(
|
|
"End-of-file in comment. Closing '*/' not found.".to_string(),
|
|
snippet,
|
|
line, // Error reported at comment start
|
|
column,
|
|
);
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
// Just a division operator
|
|
self.s.advance(); // Consume '/'
|
|
self.make_token(TokenName::OpDivide, TokenVal::None, line, column)
|
|
}
|
|
}
|
|
|
|
// Handles 'c' character literals -> stored as Integer token
|
|
    // Handles 'c' character literals -> stored as Integer token
    /// Scan a character literal (`'c'`, `'\n'`, `'\\'`) and emit it as an
    /// `Integer` token holding the byte value. Malformed literals produce
    /// `Error` tokens; on a missing closing quote, input is consumed up to
    /// the next quote or newline so lexing can resume without cascading.
    fn char_lit(&mut self, line: usize, column: usize, start_pos: usize) -> Token {
        self.s.advance(); // Consume opening '

        let char_val = match self.s.peek() {
            None => return self.error_token("End-of-file in char literal.".to_string(), "'", line, column),
            Some(b'\'') => return self.error_token("Empty character constant.".to_string(), "''", line, column),
            Some(b'\\') => { // Escape sequence
                self.s.advance(); // Consume '\'
                // Only \n and \\ are legal escapes in this language.
                match self.s.peek() {
                    Some(b'n') => { self.s.advance(); b'\n' }
                    Some(b'\\') => { self.s.advance(); b'\\' }
                    Some(c) => {
                        let snippet = format!("'\\{}...", c as char);
                        // Advance past the unknown escape char before reporting error
                        self.s.advance();
                        return self.error_token(format!("Unknown escape sequence \\{}", c as char), &snippet, line, column);
                    }
                    None => return self.error_token("End-of-file after escape in char literal.".to_string(), "'\\", line, column),
                }
            }
            Some(byte) => { // Normal character
                self.s.advance();
                byte
            }
        };

        // Check for closing quote
        if self.s.peek() == Some(b'\'') {
            self.s.advance(); // Consume closing '
            self.make_token(TokenName::Integer, TokenVal::Int(char_val as i32), line, column)
        } else {
            // Find the extent of the invalid literal for the error message
            let mut end_pos = self.s.pos;
            while let Some(b) = self.s.bytes.get(end_pos) {
                if *b == b'\'' || *b == b'\n' || end_pos > start_pos + 10 { // Limit snippet size
                    break;
                }
                end_pos += 1;
            }
            // Consume until the closing quote or newline to avoid cascading errors
            while let Some(b) = self.s.peek() {
                if b == b'\'' { self.s.advance(); break; }
                if b == b'\n' { break; } // Stop at newline
                self.s.advance();
            }

            let snippet = str::from_utf8(&self.s.bytes[start_pos..end_pos.min(self.s.bytes.len())])
                .unwrap_or("<?>");
            self.error_token("Multi-character constant or missing closing quote.".to_string(), snippet, line, column)
        }
    }
|
|
|
|
|
|
// Handles "..." string literals
|
|
    // Handles "..." string literals
    /// Scan a double-quoted string literal into a `String` token.
    /// Supports the `\n` and `\\` escapes; any other escape, a newline
    /// inside the literal, or EOF yields an `Error` token. Bytes are
    /// collected raw and validated as UTF-8 only at the closing quote.
    fn string_lit(&mut self, line: usize, column: usize, start_pos: usize) -> Token {
        self.s.advance(); // Consume opening "
        let mut content: Vec<u8> = Vec::new();

        loop {
            match self.s.peek() {
                None => {
                    let snippet = str::from_utf8(&self.s.bytes[start_pos..self.s.pos]).unwrap_or("\"...");
                    return self.error_token("End-of-file while scanning string literal. Closing '\"' not found.".to_string(), snippet, line, column);
                }
                Some(b'"') => {
                    self.s.advance(); // Consume closing "
                    // Attempt to convert collected bytes to UTF-8 String
                    match String::from_utf8(content) {
                        Ok(s) => return self.make_token(TokenName::String, TokenVal::String(s), line, column),
                        Err(e) => {
                            let snippet = str::from_utf8(&self.s.bytes[start_pos..self.s.pos]).unwrap_or("<?>");
                            return self.error_token(format!("Invalid UTF-8 sequence in string literal: {}", e), snippet, line, column);
                        }
                    }
                }
                Some(b'\n') => {
                    let snippet = str::from_utf8(&self.s.bytes[start_pos..self.s.pos]).unwrap_or("\"...");
                    // Don't consume newline, error points before it
                    return self.error_token("End-of-line while scanning string literal. Closing '\"' not found.".to_string(), snippet, self.s.line, self.s.column); // Report error at current line/col
                }
                Some(b'\\') => { // Escape sequence
                    self.s.advance(); // Consume '\'
                    match self.s.peek() {
                        Some(b'n') => { self.s.advance(); content.push(b'\n'); }
                        Some(b'\\') => { self.s.advance(); content.push(b'\\'); }
                        Some(c) => {
                            let snippet = format!("...\\{}...", c as char);
                            // Consume the unknown escape char before reporting error
                            self.s.advance();
                            return self.error_token(format!("Unknown escape sequence \\{}", c as char), &snippet, self.s.line, self.s.column); // Use current line/col
                        }
                        None => {
                            let snippet = str::from_utf8(&self.s.bytes[start_pos..self.s.pos]).unwrap_or("\"...\\");
                            return self.error_token("End-of-file after escape in string literal.".to_string(), snippet, line, column);
                        }
                    }
                }
                Some(byte) => { // Normal character
                    content.push(byte);
                    self.s.advance();
                }
            }
        }
    }
|
|
|
|
// Helper predicate functions (using u8 methods)
|
|
#[inline] fn is_id_start(c: u8) -> bool { c.is_ascii_alphabetic() || c == b'_' }
|
|
#[inline] fn is_id_end(c: u8) -> bool { c.is_ascii_alphanumeric() || c == b'_' }
|
|
#[inline] fn is_digit(c: u8) -> bool { c.is_ascii_digit() }
|
|
|
|
// Handles identifiers and keywords
|
|
fn identifier(&mut self, line: usize, column: usize, start_pos: usize) -> Token {
|
|
while let Some(byte) = self.s.peek() {
|
|
if Self::is_id_end(byte) {
|
|
self.s.advance();
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
let ident_bytes = self.s.slice(start_pos);
|
|
// Identifiers must be valid UTF-8 in Rust
|
|
match str::from_utf8(ident_bytes) {
|
|
Ok(ident_str) => {
|
|
// Check if it's a keyword
|
|
if let Some(keyword_token) = KEYWORDS.get(ident_str) {
|
|
self.make_token(*keyword_token, TokenVal::None, line, column)
|
|
} else {
|
|
self.make_token(TokenName::Identifier, TokenVal::String(ident_str.to_string()), line, column)
|
|
}
|
|
}
|
|
Err(_) => {
|
|
// This shouldn't happen if is_id_end checks ASCII, but handle defensively
|
|
self.error_token("Invalid UTF-8 sequence in identifier.".to_string(), "<?>", line, column)
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
// Handles integer literals
|
|
fn integer_lit(&mut self, line: usize, column: usize, start_pos: usize) -> Token {
|
|
while let Some(byte) = self.s.peek() {
|
|
if Self::is_digit(byte) {
|
|
self.s.advance();
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Check if it's followed by an identifier character (invalid number)
|
|
if let Some(byte) = self.s.peek() {
|
|
if Self::is_id_start(byte) {
|
|
// Consume the invalid part to show in error
|
|
while let Some(b) = self.s.peek() {
|
|
if Self::is_id_end(b) { self.s.advance(); } else { break; }
|
|
}
|
|
let snippet = str::from_utf8(self.s.slice(start_pos)).unwrap_or("<?>");
|
|
return self.error_token("Invalid number. Contains non-numeric characters.".to_string(), snippet, line, column);
|
|
}
|
|
}
|
|
|
|
|
|
let num_bytes = self.s.slice(start_pos);
|
|
match str::from_utf8(num_bytes) {
|
|
Ok(num_str) => {
|
|
match num_str.parse::<i32>() {
|
|
Ok(n) => self.make_token(TokenName::Integer, TokenVal::Int(n), line, column),
|
|
Err(e) => {
|
|
// Could be overflow or other parse error
|
|
let snippet = str::from_utf8(num_bytes).unwrap_or("<?>");
|
|
let msg = if e.kind() == &std::num::IntErrorKind::PosOverflow || e.kind() == &std::num::IntErrorKind::NegOverflow {
|
|
"Number exceeds maximum value.".to_string()
|
|
} else {
|
|
format!("Invalid integer literal: {}", e)
|
|
};
|
|
self.error_token(msg, snippet, line, column)
|
|
}
|
|
}
|
|
}
|
|
Err(_) => {
|
|
// Should not happen for digits, but handle defensively
|
|
self.error_token("Invalid UTF-8 sequence in number.".to_string(), "<?>", line, column)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Get the next token from the input stream
|
|
    // Get the next token from the input stream
    /// Skip leading whitespace, then dispatch on the first byte of the
    /// next token. Position info is captured *before* scanning so every
    /// token is stamped with its starting line/column. Returns an
    /// `EndOfInput` token at EOF and an `Error` token on bad input.
    pub fn next_token(&mut self) -> Token {
        self.s.skip_whitespace();

        // Capture state *before* processing the token
        let pre_line = self.s.line;
        let pre_column = self.s.column;
        let start_pos = self.s.pos; // Start byte position of the token

        // Use peek() to decide what kind of token comes next
        match self.s.peek() {
            // Single-character tokens.
            Some(b'*') => self.simply(TokenName::OpMultiply, pre_line, pre_column),
            Some(b'%') => self.simply(TokenName::OpMod, pre_line, pre_column),
            Some(b'+') => self.simply(TokenName::OpAdd, pre_line, pre_column),
            Some(b'-') => self.simply(TokenName::OpSubtract, pre_line, pre_column),
            Some(b'{') => self.simply(TokenName::LeftBrace, pre_line, pre_column),
            Some(b'}') => self.simply(TokenName::RightBrace, pre_line, pre_column),
            Some(b'(') => self.simply(TokenName::LeftParen, pre_line, pre_column),
            Some(b')') => self.simply(TokenName::RightParen, pre_line, pre_column),
            Some(b';') => self.simply(TokenName::Semicolon, pre_line, pre_column),
            Some(b',') => self.simply(TokenName::Comma, pre_line, pre_column),

            // Two-character operators whose second byte is mandatory.
            Some(b'&') => self.expect(b'&', TokenName::OpAnd, pre_line, pre_column, start_pos),
            Some(b'|') => self.expect(b'|', TokenName::OpOr, pre_line, pre_column, start_pos),

            // One-or-two-character operators ('=' optional second byte).
            Some(b'<') => self.follow(b'=', TokenName::OpLessEqual, TokenName::OpLess, pre_line, pre_column),
            Some(b'>') => self.follow(b'=', TokenName::OpGreaterEqual, TokenName::OpGreater, pre_line, pre_column),
            Some(b'=') => self.follow(b'=', TokenName::OpEqual, TokenName::OpAssign, pre_line, pre_column),
            Some(b'!') => self.follow(b'=', TokenName::OpNotEqual, TokenName::OpNot, pre_line, pre_column),

            // Multi-character constructs with their own sub-scanners.
            Some(b'/') => self.divide_or_comment(pre_line, pre_column, start_pos),
            Some(b'\'') => self.char_lit(pre_line, pre_column, start_pos),
            Some(b'"') => self.string_lit(pre_line, pre_column, start_pos),

            Some(c) if Self::is_id_start(c) => self.identifier(pre_line, pre_column, start_pos),
            Some(c) if Self::is_digit(c) => self.integer_lit(pre_line, pre_column, start_pos),

            Some(c) => {
                // Unrecognized character
                let snippet = str::from_utf8(&self.s.bytes[start_pos ..= start_pos]).unwrap_or("?"); // Just the char
                self.s.advance(); // Consume the bad character
                self.error_token(format!("Unrecognized character '{}'", c as char), snippet, pre_line, pre_column)
            }

            None => self.make_token(TokenName::EndOfInput, TokenVal::None, pre_line, pre_column),
        }
    }
|
|
|
|
// Check if there are more characters (excluding EOF)
|
|
pub fn has_more(&self) -> bool {
|
|
self.s.peek().is_some()
|
|
}
|
|
}
|
|
|
|
// =====================================================================================================================
|
|
// Main Function
|
|
// =====================================================================================================================
|
|
fn run_lexer(input: String) -> Result<String> {
|
|
let mut lexer = Lexer::new(&input); // Pass input by reference
|
|
let mut output = String::new();
|
|
|
|
output.push_str("Location Token name Value\n");
|
|
output.push_str("--------------------------------------\n");
|
|
|
|
let mut token = lexer.next_token();
|
|
loop {
|
|
output.push_str(&format!("{}\n", token)); // Use the Display impl for Token
|
|
|
|
if token.name == TokenName::EndOfInput || token.name == TokenName::Error {
|
|
// If it's an error token, we might stop or continue depending on desired behavior.
|
|
// This version stops on the first error or EOF.
|
|
if token.name == TokenName::Error {
|
|
// Optionally, return an error instead of just stopping the output string
|
|
// return Err(LexerError::Generic(format!("Lexing failed at line {}, column {}", token.line, token.column)));
|
|
}
|
|
break;
|
|
}
|
|
|
|
token = lexer.next_token();
|
|
}
|
|
|
|
Ok(output)
|
|
}
|
|
|
|
fn main() {
|
|
let args: Vec<String> = env::args().collect();
|
|
let in_path = args.get(1).map_or("stdin", |s| s.as_str());
|
|
let out_path = args.get(2).map_or("stdout", |s| s.as_str());
|
|
|
|
if let Err(e) = with_io(in_path, out_path, run_lexer) {
|
|
eprintln!("Error: {}", e);
|
|
process::exit(1);
|
|
}
|
|
}
|