// Translated from python source
|
|
|
|
import java.io.File;
|
|
import java.io.FileNotFoundException;
|
|
import java.util.HashMap;
|
|
import java.util.Map;
|
|
import java.util.Scanner;
|
|
|
|
/**
 * Hand-written lexer for a small C-like language.
 *
 * <p>The lexer walks the source text one character at a time and produces
 * {@link Token}s carrying the line and column at which they start. Columns are
 * counted from 0 for the very first character of the source and from 1 after
 * every newline; {@link #main(String[])} therefore prepends a single space to
 * the text it reads so that the first real character of a file is reported in
 * column 1 as well.
 *
 * <p>All lexical errors are reported through {@link #error(int, int, String)},
 * which prints a message and terminates the JVM.
 */
public class Lexer {
    /** Current line number (starts at 1). */
    private int line;
    /** Current column within the line; reset to 0 when a newline is read. */
    private int pos;
    /** Absolute index of the current character in {@link #s}. */
    private int position;
    /** Current character, or {@code '\u0000'} once the end of the source is reached. */
    private char chr;
    /** The complete source text being scanned. */
    private String s;

    /** Reserved words and the token type each one maps to. */
    Map<String, TokenType> keywords = new HashMap<>();

    /** A lexical token together with the source location where it starts. */
    static class Token {
        public TokenType tokentype;
        /** Lexeme text for identifiers, integers and strings; empty for all other tokens. */
        public String value;
        public int line;
        public int pos;

        Token(TokenType token, String value, int line, int pos) {
            this.tokentype = token;
            this.value = value;
            this.line = line;
            this.pos = pos;
        }

        /**
         * Formats the token as {@code line pos type [value]}; the value is only
         * appended for integers, identifiers and (double-quoted) strings.
         */
        @Override
        public String toString() {
            String result = String.format("%5d %5d %-15s", this.line, this.pos, this.tokentype);
            switch (this.tokentype) {
                case Integer:
                    result += String.format(" %4s", value);
                    break;
                case Identifier:
                    result += String.format(" %s", value);
                    break;
                case String:
                    result += String.format(" \"%s\"", value);
                    break;
                default:
                    // Operators, keywords and punctuation carry no value.
                    break;
            }
            return result;
        }
    }

    /** Every kind of token the lexer can produce. */
    static enum TokenType {
        End_of_input, Op_multiply, Op_divide, Op_mod, Op_add, Op_subtract,
        Op_negate, Op_not, Op_less, Op_lessequal, Op_greater, Op_greaterequal,
        Op_equal, Op_notequal, Op_assign, Op_and, Op_or, Keyword_if,
        Keyword_else, Keyword_while, Keyword_print, Keyword_putc, LeftParen, RightParen,
        LeftBrace, RightBrace, Semicolon, Comma, Identifier, Integer, String
    }

    /**
     * Prints an error message to standard output and terminates the JVM with
     * exit status 1.
     *
     * @param line line of the error; the location is only printed when both
     *             {@code line} and {@code pos} are positive
     * @param pos  column of the error
     * @param msg  message to print
     */
    static void error(int line, int pos, String msg) {
        if (line > 0 && pos > 0) {
            System.out.printf("%s in line %d, pos %d\n", msg, line, pos);
        } else {
            System.out.println(msg);
        }
        System.exit(1);
    }

    /**
     * Creates a lexer positioned on the first character of {@code source}.
     *
     * @param source text to tokenize; {@code null} or an empty string is
     *               treated as empty input and yields {@code End_of_input}
     *               immediately
     */
    Lexer(String source) {
        this.line = 1;
        this.pos = 0;
        this.position = 0;
        this.s = (source == null) ? "" : source;
        // An empty source has no first character: start directly at end of input.
        this.chr = this.s.isEmpty() ? '\u0000' : this.s.charAt(0);
        if (this.chr == '\n') {
            // Same bookkeeping getNextChar() performs for every later newline,
            // so a source that begins with a line break is counted correctly.
            this.line++;
        }
        this.keywords.put("if", TokenType.Keyword_if);
        this.keywords.put("else", TokenType.Keyword_else);
        this.keywords.put("print", TokenType.Keyword_print);
        this.keywords.put("putc", TokenType.Keyword_putc);
        this.keywords.put("while", TokenType.Keyword_while);
    }

    /**
     * Resolves a one- or two-character operator by looking at the character
     * after the current one.
     *
     * @param expect second character of the two-character form
     * @param ifyes  token type when the next character equals {@code expect}
     * @param ifno   token type otherwise; {@code End_of_input} means the
     *               single-character form is not valid and an error is raised
     * @param line   line where the operator starts
     * @param pos    column where the operator starts
     * @return the recognized operator token
     */
    Token follow(char expect, TokenType ifyes, TokenType ifno, int line, int pos) {
        if (getNextChar() == expect) {
            getNextChar();
            return new Token(ifyes, "", line, pos);
        }
        if (ifno == TokenType.End_of_input) {
            error(line, pos, String.format("follow: unrecognized character: (%d) '%c'", (int) this.chr, this.chr));
        }
        return new Token(ifno, "", line, pos);
    }

    /**
     * Scans a character literal; the current character must be the opening
     * single quote. Only the escapes {@code \n} and {@code \\} are accepted.
     *
     * @param line line where the literal starts
     * @param pos  column where the literal starts
     * @return an {@code Integer} token whose value is the character's code
     */
    Token char_lit(int line, int pos) {
        char c = getNextChar(); // skip opening quote
        int n = (int) c;
        if (c == '\'') {
            error(line, pos, "empty character constant");
        } else if (c == '\\') {
            c = getNextChar();
            if (c == 'n') {
                n = 10;
            } else if (c == '\\') {
                n = '\\';
            } else {
                error(line, pos, String.format("unknown escape sequence \\%c", c));
            }
        }
        if (getNextChar() != '\'') {
            error(line, pos, "multi-character constant");
        }
        getNextChar(); // step past the closing quote
        return new Token(TokenType.Integer, "" + n, line, pos);
    }

    /**
     * Scans a string literal; the current character must be the opening
     * delimiter. The text between the delimiters is copied verbatim (escape
     * sequences are not interpreted). Reaching a newline or the end of the
     * source before the closing delimiter is an error.
     *
     * @param start delimiter character that terminates the literal
     * @param line  line where the literal starts
     * @param pos   column where the literal starts
     * @return a {@code String} token holding the text between the delimiters
     */
    Token string_lit(char start, int line, int pos) {
        StringBuilder result = new StringBuilder();
        while (getNextChar() != start) {
            if (this.chr == '\u0000') {
                error(line, pos, "EOF while scanning string literal");
            }
            if (this.chr == '\n') {
                error(line, pos, "EOL while scanning string literal");
            }
            result.append(this.chr);
        }
        getNextChar(); // step past the closing delimiter
        return new Token(TokenType.String, result.toString(), line, pos);
    }

    /**
     * Handles a {@code '/'}: either the division operator or the start of a
     * block comment. A comment is skipped entirely and the token that follows
     * it is returned instead.
     *
     * @param line line where the slash was found
     * @param pos  column where the slash was found
     * @return {@code Op_divide}, or the first token after the comment
     */
    Token div_or_comment(int line, int pos) {
        if (getNextChar() != '*') {
            return new Token(TokenType.Op_divide, "", line, pos);
        }
        getNextChar();
        while (true) {
            if (this.chr == '\u0000') {
                error(line, pos, "EOF in comment");
            } else if (this.chr == '*') {
                // Do not advance on a mismatch: the character just read may
                // itself be a '*' that starts the real terminator.
                if (getNextChar() == '/') {
                    getNextChar();
                    return getToken();
                }
            } else {
                getNextChar();
            }
        }
    }

    /**
     * Scans a run of letters, digits and underscores starting at the current
     * character and classifies it as an integer, a keyword or an identifier.
     * A run that starts with a digit but contains a non-digit is an error, as
     * is a character that cannot start any token.
     *
     * @param line line where the lexeme starts
     * @param pos  column where the lexeme starts
     * @return the recognized token
     */
    Token identifier_or_integer(int line, int pos) {
        boolean is_number = true;
        StringBuilder lexeme = new StringBuilder();

        while (Character.isAlphabetic(this.chr) || Character.isDigit(this.chr) || this.chr == '_') {
            lexeme.append(this.chr);
            if (!Character.isDigit(this.chr)) {
                is_number = false;
            }
            getNextChar();
        }
        String text = lexeme.toString();

        if (text.equals("")) {
            error(line, pos, String.format("identifer_or_integer unrecognized character: (%d) %c", (int) this.chr, this.chr));
        }

        if (Character.isDigit(text.charAt(0))) {
            if (!is_number) {
                error(line, pos, String.format("invalid number: %s", text));
            }
            return new Token(TokenType.Integer, text, line, pos);
        }

        if (this.keywords.containsKey(text)) {
            return new Token(this.keywords.get(text), "", line, pos);
        }
        return new Token(TokenType.Identifier, text, line, pos);
    }

    /**
     * Skips leading whitespace and returns the next token.
     *
     * @return the next token, or an {@code End_of_input} token once the source
     *         is exhausted
     */
    Token getToken() {
        int line, pos;
        while (Character.isWhitespace(this.chr)) {
            getNextChar();
        }
        // Remember where the token starts before any character is consumed.
        line = this.line;
        pos = this.pos;

        switch (this.chr) {
            case '\u0000': return new Token(TokenType.End_of_input, "", this.line, this.pos);
            case '/': return div_or_comment(line, pos);
            case '\'': return char_lit(line, pos);
            case '<': return follow('=', TokenType.Op_lessequal, TokenType.Op_less, line, pos);
            case '>': return follow('=', TokenType.Op_greaterequal, TokenType.Op_greater, line, pos);
            case '=': return follow('=', TokenType.Op_equal, TokenType.Op_assign, line, pos);
            case '!': return follow('=', TokenType.Op_notequal, TokenType.Op_not, line, pos);
            // '&' and '|' are only valid doubled; End_of_input tells follow() to reject the single form.
            case '&': return follow('&', TokenType.Op_and, TokenType.End_of_input, line, pos);
            case '|': return follow('|', TokenType.Op_or, TokenType.End_of_input, line, pos);
            case '"': return string_lit(this.chr, line, pos);
            case '{': return singleCharToken(TokenType.LeftBrace, line, pos);
            case '}': return singleCharToken(TokenType.RightBrace, line, pos);
            case '(': return singleCharToken(TokenType.LeftParen, line, pos);
            case ')': return singleCharToken(TokenType.RightParen, line, pos);
            case '+': return singleCharToken(TokenType.Op_add, line, pos);
            case '-': return singleCharToken(TokenType.Op_subtract, line, pos);
            case '*': return singleCharToken(TokenType.Op_multiply, line, pos);
            case '%': return singleCharToken(TokenType.Op_mod, line, pos);
            case ';': return singleCharToken(TokenType.Semicolon, line, pos);
            case ',': return singleCharToken(TokenType.Comma, line, pos);

            default: return identifier_or_integer(line, pos);
        }
    }

    /** Consumes the current character and wraps it in a value-less token of the given type. */
    private Token singleCharToken(TokenType type, int line, int pos) {
        getNextChar();
        return new Token(type, "", line, pos);
    }

    /**
     * Advances to the next character of the source, updating the line and
     * column counters.
     *
     * @return the new current character, or {@code '\u0000'} at end of source
     */
    char getNextChar() {
        this.pos++;
        this.position++;
        if (this.position >= this.s.length()) {
            this.chr = '\u0000';
            return this.chr;
        }
        this.chr = this.s.charAt(this.position);
        if (this.chr == '\n') {
            this.line++;
            this.pos = 0;
        }
        return this.chr;
    }

    /** Prints every remaining token, one per line, ending with the {@code End_of_input} token. */
    void printTokens() {
        Token t;
        while ((t = getToken()).tokentype != TokenType.End_of_input) {
            System.out.println(t);
        }
        System.out.println(t);
    }

    /**
     * Tokenizes the file named by the first command-line argument and prints
     * the tokens to standard output.
     *
     * @param args {@code args[0]} is the path of the file to tokenize
     */
    public static void main(String[] args) {
        if (args.length > 0) {
            // try-with-resources closes the scanner (and the file behind it).
            try (Scanner s = new Scanner(new File(args[0]))) {
                // The leading space makes the first character of the file column 1.
                StringBuilder source = new StringBuilder(" ");
                // hasNext() (not hasNextLine()) is intentional: it stops once only
                // whitespace remains, so trailing blank lines do not shift the
                // line reported for End_of_input.
                while (s.hasNext()) {
                    source.append(s.nextLine()).append("\n");
                }
                Lexer l = new Lexer(source.toString());
                l.printTokens();
            } catch (FileNotFoundException e) {
                error(-1, -1, "Exception: " + e.getMessage());
            }
        } else {
            error(-1, -1, "No args");
        }
    }
}
|