RosettaCodeData/Task/Compiler-lexical-analyzer/Java/compiler-lexical-analyzer.java

241 lines
8.6 KiB
Java

// Translated from the Python source
import java.io.File;
import java.io.FileNotFoundException;
import java.util.HashMap;
import java.util.Map;
import java.util.Scanner;
/**
 * Hand-written lexical analyzer: turns a source string into a stream of
 * {@link Token}s, one per call to {@link #getToken()}.
 *
 * <p>The lexer keeps a single current character ({@code chr}); the value
 * {@code '\u0000'} marks the end of the input. Every lexical error is
 * reported through {@link #error(int, int, String)}, which terminates the JVM.
 */
public class Lexer {
    /** Sentinel stored in {@code chr} once the input is exhausted. */
    private static final char EOF = '\u0000';

    /** Line number of the current character (starts at 1). */
    private int line;
    /** Column counter; reset to 0 whenever a newline is read. */
    private int pos;
    /** Index of the current character inside {@code s}. */
    private int position;
    /** The current character, or {@link #EOF} past the end of the input. */
    private char chr;
    /** The complete source text being scanned. */
    private String s;
    /** Reserved words mapped to their token types. */
    Map<String, TokenType> keywords = new HashMap<>();

    /** A single token together with the line/column where it started. */
    static class Token {
        public TokenType tokentype;
        public String value;
        public int line;
        public int pos;

        Token(TokenType token, String value, int line, int pos) {
            this.tokentype = token;
            this.value = value;
            this.line = line;
            this.pos = pos;
        }

        /**
         * Formats the token as {@code line pos type}, followed by the value
         * for integers, identifiers and (double-quoted) strings.
         */
        @Override
        public String toString() {
            String result = String.format("%5d %5d %-15s", this.line, this.pos, this.tokentype);
            switch (this.tokentype) {
                case Integer:
                    result += String.format(" %4s", value);
                    break;
                case Identifier:
                    result += String.format(" %s", value);
                    break;
                case String:
                    result += String.format(" \"%s\"", value);
                    break;
                default:
                    // All other token types carry no printable value.
                    break;
            }
            return result;
        }
    }

    /** Every kind of token the lexer can produce. */
    enum TokenType {
        End_of_input, Op_multiply, Op_divide, Op_mod, Op_add, Op_subtract,
        Op_negate, Op_not, Op_less, Op_lessequal, Op_greater, Op_greaterequal,
        Op_equal, Op_notequal, Op_assign, Op_and, Op_or, Keyword_if,
        Keyword_else, Keyword_while, Keyword_print, Keyword_putc, LeftParen, RightParen,
        LeftBrace, RightBrace, Semicolon, Comma, Identifier, Integer, String
    }

    /**
     * Prints {@code msg} to standard output and exits with status 1.
     * The location is appended only when both {@code line} and {@code pos}
     * are positive.
     */
    static void error(int line, int pos, String msg) {
        if (line > 0 && pos > 0) {
            System.out.printf("%s in line %d, pos %d\n", msg, line, pos);
        } else {
            System.out.println(msg);
        }
        System.exit(1);
    }

    /**
     * Creates a lexer positioned on the first character of {@code source}.
     * An empty source is valid and immediately yields {@code End_of_input}.
     */
    Lexer(String source) {
        this.line = 1;
        this.pos = 0;
        this.position = 0;
        this.s = source;
        // Guard against an empty source: charAt(0) would throw.
        this.chr = source.isEmpty() ? EOF : source.charAt(0);
        this.keywords.put("if", TokenType.Keyword_if);
        this.keywords.put("else", TokenType.Keyword_else);
        this.keywords.put("print", TokenType.Keyword_print);
        this.keywords.put("putc", TokenType.Keyword_putc);
        this.keywords.put("while", TokenType.Keyword_while);
    }

    /**
     * Resolves one- versus two-character operators. Advances past the current
     * character; if the next one equals {@code expect} it is consumed as well
     * and {@code ifyes} is produced, otherwise {@code ifno}.
     * Passing {@code End_of_input} as {@code ifno} means the single-character
     * form is not a valid token and is reported as an error.
     */
    Token follow(char expect, TokenType ifyes, TokenType ifno, int line, int pos) {
        if (getNextChar() == expect) {
            getNextChar();
            return new Token(ifyes, "", line, pos);
        }
        if (ifno == TokenType.End_of_input) {
            error(line, pos, String.format("follow: unrecognized character: (%d) '%c'", (int) this.chr, this.chr));
        }
        return new Token(ifno, "", line, pos);
    }

    /**
     * Scans a character literal such as {@code 'a'}, {@code '\n'} or
     * {@code '\\'} and returns it as an {@code Integer} token holding the
     * character code.
     */
    Token char_lit(int line, int pos) {
        char c = getNextChar(); // skip opening quote
        int n = (int) c;
        if (c == '\'') {
            error(line, pos, "empty character constant");
        } else if (c == '\\') {
            c = getNextChar();
            if (c == 'n') {
                n = 10;
            } else if (c == '\\') {
                n = '\\';
            } else {
                error(line, pos, String.format("unknown escape sequence \\%c", c));
            }
        }
        if (getNextChar() != '\'') {
            error(line, pos, "multi-character constant");
        }
        getNextChar(); // step past the closing quote
        return new Token(TokenType.Integer, "" + n, line, pos);
    }

    /**
     * Scans a string literal delimited by {@code start}. The characters
     * between the delimiters are copied verbatim (escape sequences are not
     * translated). Reaching end of input or a newline first is an error.
     */
    Token string_lit(char start, int line, int pos) {
        StringBuilder result = new StringBuilder();
        while (getNextChar() != start) {
            if (this.chr == EOF) {
                error(line, pos, "EOF while scanning string literal");
            }
            if (this.chr == '\n') {
                error(line, pos, "EOL while scanning string literal");
            }
            result.append(this.chr);
        }
        getNextChar(); // step past the closing delimiter
        return new Token(TokenType.String, result.toString(), line, pos);
    }

    /**
     * Called on {@code '/'}: returns {@code Op_divide} unless the next
     * character is {@code '*'}, in which case the comment is skipped up to
     * the closing {@code *}{@code /} and the token following it is returned.
     */
    Token div_or_comment(int line, int pos) {
        if (getNextChar() != '*') {
            return new Token(TokenType.Op_divide, "", line, pos);
        }
        getNextChar();
        while (true) {
            if (this.chr == EOF) {
                error(line, pos, "EOF in comment");
            } else if (this.chr == '*') {
                // Do not advance again on a mismatch: the character just read
                // may itself be the '*' that starts the real terminator.
                if (getNextChar() == '/') {
                    getNextChar();
                    return getToken();
                }
            } else {
                getNextChar();
            }
        }
    }

    /**
     * Scans a run of letters, digits and underscores and classifies it as an
     * integer, a keyword or an identifier. A run that starts with a digit but
     * contains a non-digit is an invalid number; an empty run means the
     * current character cannot start any token.
     */
    Token identifier_or_integer(int line, int pos) {
        boolean is_number = true;
        StringBuilder buffer = new StringBuilder();
        while (Character.isAlphabetic(this.chr) || Character.isDigit(this.chr) || this.chr == '_') {
            buffer.append(this.chr);
            if (!Character.isDigit(this.chr)) {
                is_number = false;
            }
            getNextChar();
        }
        String text = buffer.toString();
        if (text.equals("")) {
            error(line, pos, String.format("identifer_or_integer unrecognized character: (%d) %c", (int) this.chr, this.chr));
        }
        if (Character.isDigit(text.charAt(0))) {
            if (!is_number) {
                error(line, pos, String.format("invalid number: %s", text));
            }
            return new Token(TokenType.Integer, text, line, pos);
        }
        if (this.keywords.containsKey(text)) {
            return new Token(this.keywords.get(text), "", line, pos);
        }
        return new Token(TokenType.Identifier, text, line, pos);
    }

    /**
     * Skips whitespace and returns the next token. Once the input is
     * exhausted every call returns an {@code End_of_input} token.
     */
    Token getToken() {
        while (Character.isWhitespace(this.chr)) {
            getNextChar();
        }
        // Remember where the token starts; scanning it moves the cursor.
        int line = this.line;
        int pos = this.pos;
        switch (this.chr) {
            case EOF: return new Token(TokenType.End_of_input, "", this.line, this.pos);
            case '/': return div_or_comment(line, pos);
            case '\'': return char_lit(line, pos);
            case '<': return follow('=', TokenType.Op_lessequal, TokenType.Op_less, line, pos);
            case '>': return follow('=', TokenType.Op_greaterequal, TokenType.Op_greater, line, pos);
            case '=': return follow('=', TokenType.Op_equal, TokenType.Op_assign, line, pos);
            case '!': return follow('=', TokenType.Op_notequal, TokenType.Op_not, line, pos);
            case '&': return follow('&', TokenType.Op_and, TokenType.End_of_input, line, pos);
            case '|': return follow('|', TokenType.Op_or, TokenType.End_of_input, line, pos);
            case '"': return string_lit(this.chr, line, pos);
            case '{': getNextChar(); return new Token(TokenType.LeftBrace, "", line, pos);
            case '}': getNextChar(); return new Token(TokenType.RightBrace, "", line, pos);
            case '(': getNextChar(); return new Token(TokenType.LeftParen, "", line, pos);
            case ')': getNextChar(); return new Token(TokenType.RightParen, "", line, pos);
            case '+': getNextChar(); return new Token(TokenType.Op_add, "", line, pos);
            case '-': getNextChar(); return new Token(TokenType.Op_subtract, "", line, pos);
            case '*': getNextChar(); return new Token(TokenType.Op_multiply, "", line, pos);
            case '%': getNextChar(); return new Token(TokenType.Op_mod, "", line, pos);
            case ';': getNextChar(); return new Token(TokenType.Semicolon, "", line, pos);
            case ',': getNextChar(); return new Token(TokenType.Comma, "", line, pos);
            default: return identifier_or_integer(line, pos);
        }
    }

    /**
     * Advances to the next character and returns it, or {@code '\u0000'} once
     * the end of the source has been reached. Reading a newline increments
     * the line counter and resets the column counter to 0.
     */
    char getNextChar() {
        this.pos++;
        this.position++;
        if (this.position >= this.s.length()) {
            this.chr = EOF;
            return this.chr;
        }
        this.chr = this.s.charAt(this.position);
        if (this.chr == '\n') {
            this.line++;
            this.pos = 0;
        }
        return this.chr;
    }

    /** Prints every token, one per line, up to and including {@code End_of_input}. */
    void printTokens() {
        Token t;
        while ((t = getToken()).tokentype != TokenType.End_of_input) {
            System.out.println(t);
        }
        System.out.println(t);
    }

    /**
     * Lexes the file named by the first command-line argument and prints its
     * tokens. Exits with an error message when no argument is given or the
     * file cannot be found.
     */
    public static void main(String[] args) {
        if (args.length == 0) {
            error(-1, -1, "No args");
            return;
        }
        // The source is prefixed with one blank so that the first real
        // character of the file is reported at column 1 rather than 0.
        StringBuilder source = new StringBuilder(" ");
        // try-with-resources closes the Scanner (and its file) on every path.
        try (Scanner in = new Scanner(new File(args[0]))) {
            while (in.hasNext()) {
                source.append(in.nextLine()).append("\n");
            }
            Lexer l = new Lexer(source.toString());
            l.printTokens();
        } catch (FileNotFoundException e) {
            error(-1, -1, "Exception: " + e.getMessage());
        }
    }
}