RosettaCodeData/Task/Compiler-lexical-analyzer/Nim/compiler-lexical-analyzer-3...

222 lines
6.0 KiB
Nim

import strutils
type
  TokenKind* = enum
    ## Every kind of token the lexer can emit. Each member's string value is
    ## the exact name printed in the token listing.
    tokMult = "Op_multiply", tokDiv = "Op_divide", tokMod = "Op_mod",
    tokAdd = "Op_add", tokSub = "Op_subtract", tokLess = "Op_less",
    tokLessEq = "Op_lessequal", tokGreater = "Op_greater",
    tokGreaterEq = "Op_greaterequal", tokEq = "Op_equal",
    tokNotEq = "Op_notequal", tokNot = "Op_not", tokAssign = "Op_assign",
    tokAnd = "Op_and", tokOr = "Op_or"
    tokLPar = "LeftParen", tokRPar = "RightParen"
    tokLBrace = "LeftBrace", tokRBrace = "RightBrace"
    tokSemi = "Semicolon", tokComma = "Comma"
    tokIf = "Keyword_if", tokElse = "Keyword_else", tokWhile = "Keyword_while",
    tokPrint = "Keyword_print", tokPutc = "Keyword_putc"
    # tokChar deliberately shares the "Integer" display string with tokInt:
    # character literals are reported as Integer tokens (their ordinal value).
    tokIdent = "Identifier", tokInt = "Integer", tokChar = "Integer",
    tokString = "String"
    tokEnd = "End_of_input"

  Token* = object
    ## One lexed token with the 1-based line/column where it started.
    ## The payload field depends on the token kind (object variant).
    ln*, col*: int
    case kind*: TokenKind
    of tokIdent: ident*: string       # identifier spelling
    of tokInt: intVal*: int           # parsed integer value
    of tokChar: charVal*: char        # character-literal value
    of tokString: stringVal*: string  # string contents, escapes decoded
    else: discard

  Lexer* = object
    ## Lexer state: the full input text and a cursor (byte position plus
    ## line/column counters for diagnostics).
    input: string
    pos: int
    ln, col: int

  LexicalError* = object of CatchableError
    ## Raised on malformed input; carries the position of the offence.
    ln*, col*: int
proc error(lexer: var Lexer, message: string) =
  ## Raise a `LexicalError` with `message`, annotated with the lexer's
  ## current line/column.
  var err = newException(LexicalError, message)
  err.ln = lexer.ln
  err.col = lexer.col
  # Bug fix: the exception was constructed but never raised, so every
  # lexical error was silently ignored and lexing continued in a bad state.
  raise err
template current: char =
  ## Character under the cursor of the call site's `lexer`,
  ## or NUL (`'\x00'`) once the input is exhausted.
  if lexer.pos >= lexer.input.len: '\x00'
  else: lexer.input[lexer.pos]
template get(n: int): string =
  ## Up to `n` characters starting at the cursor of the call site's `lexer`;
  ## returns "" once the input is exhausted. Used to look ahead for two-char
  ## sequences such as "/*" and "*/".
  if lexer.pos < lexer.input.len:
    # Bug fix: the upper slice bound was clamped to `lexer.input.len` instead
    # of `lexer.input.high`, so requesting more characters than remain (e.g.
    # get(2) at the last character) sliced one past the end -> IndexDefect.
    # The lower-bound min() was redundant: the guard already ensures
    # pos < len.
    lexer.input[lexer.pos .. min(lexer.pos + n - 1, lexer.input.high)]
  else:
    ""
template next() =
  ## Advance the call site's `lexer` by one character, keeping the
  ## line/column counters up to date. Column resets to 0 on '\n' and '\r'
  ## (so the character after the line break lands on column 1).
  lexer.pos += 1
  lexer.col += 1
  case current()
  of '\n':
    lexer.ln += 1
    lexer.col = 0
  of '\r':
    lexer.col = 0
  else:
    discard
proc skip(lexer: var Lexer) =
  ## Consume whitespace and "/* ... */" comments until the cursor rests on
  ## the start of the next token (or end of input).
  while true:
    if current() in Whitespace:
      next()
    elif get(2) == "/*":
      next(); next()                  # step over the opening "/*"
      while get(2) != "*/":
        if current() == '\x00':
          lexer.error("Unterminated comment")
        next()
      next(); next()                  # step over the closing "*/"
    else:
      break                           # neither whitespace nor a comment
proc charOrEscape(lexer: var Lexer): char =
  ## Consume and return one character, translating the two supported escape
  ## sequences `\n` and `\\`. Any other escape is a lexical error.
  if current() == '\\':
    next()                            # step over the backslash
    case current()
    of 'n': result = '\n'
    of '\\': result = '\\'
    else: lexer.error("Unknown escape sequence '\\" & current() & "'")
    next()
  else:
    result = current()
    next()
proc next*(lexer: var Lexer): Token =
  ## Scan and return the next token; the token records the line/column at
  ## which it started. Trailing whitespace/comments are skipped afterwards,
  ## so the lexer is always positioned on a token start (or end of input).
  ## Raises `LexicalError` (via `error`) on malformed input.
  let
    ln = lexer.ln
    col = lexer.col
  case current()
  of '*': result = Token(kind: tokMult); next()
  of '/': result = Token(kind: tokDiv); next()
  of '%': result = Token(kind: tokMod); next()
  of '+': result = Token(kind: tokAdd); next()
  of '-': result = Token(kind: tokSub); next()
  of '<':
    next()
    if current() == '=':
      result = Token(kind: tokLessEq)
      # Bug fix: the '=' was not consumed, so "<=" lexed as Op_lessequal
      # followed by a spurious token starting at '='. All sibling two-char
      # operators ('>=', '==', '!=') already advanced here.
      next()
    else:
      result = Token(kind: tokLess)
  of '>':
    next()
    if current() == '=':
      result = Token(kind: tokGreaterEq)
      next()
    else:
      result = Token(kind: tokGreater)
  of '=':
    next()
    if current() == '=':
      result = Token(kind: tokEq)
      next()
    else:
      result = Token(kind: tokAssign)
  of '!':
    next()
    if current() == '=':
      result = Token(kind: tokNotEq)
      next()
    else:
      result = Token(kind: tokNot)
  of '&':
    next()
    if current() == '&':
      result = Token(kind: tokAnd)
      next()
    else:
      lexer.error("'&&' expected")
  of '|':
    next()
    if current() == '|':
      result = Token(kind: tokOr)
      next()
    else:
      lexer.error("'||' expected")
  of '(': result = Token(kind: tokLPar); next()
  of ')': result = Token(kind: tokRPar); next()
  of '{': result = Token(kind: tokLBrace); next()
  of '}': result = Token(kind: tokRBrace); next()
  of ';': result = Token(kind: tokSemi); next()
  of ',': result = Token(kind: tokComma); next()
  of '\'':
    # Character literal: exactly one character or escape between quotes.
    next()
    if current() == '\'': lexer.error("Empty character literal")
    let ch = lexer.charOrEscape()
    if current() != '\'':
      lexer.error("Character literal must contain a single character or " &
                  "escape sequence")
    result = Token(kind: tokChar, charVal: ch)
    next()
  of '0'..'9':
    var number = ""
    while current() in Digits:
      number.add(current())
      next()
    # A letter/underscore immediately after digits is malformed, e.g. "12ab".
    if current() in IdentStartChars:
      lexer.error("Integer literal ends in non-digit characters")
    result = Token(kind: tokInt, intVal: parseInt(number))
  of '"':
    # String literal; escapes decoded, no embedded line feeds allowed.
    next()
    var str = ""
    while current() notin {'"', '\x00', '\n'}:
      str.add(lexer.charOrEscape())
    if current() == '\x00':
      lexer.error("Unterminated string literal")
    elif current() == '\n':
      lexer.error("Line feed in string literal")
    else:
      next()
    result = Token(kind: tokString, stringVal: str)
  of IdentStartChars:
    var ident = $current()
    next()
    while current() in IdentChars:
      ident.add(current())
      next()
    # Keywords are distinguished from plain identifiers by spelling.
    case ident
    of "if": result = Token(kind: tokIf)
    of "else": result = Token(kind: tokElse)
    of "while": result = Token(kind: tokWhile)
    of "print": result = Token(kind: tokPrint)
    of "putc": result = Token(kind: tokPutc)
    else: result = Token(kind: tokIdent, ident: ident)
  of '\x00':
    result = Token(kind: tokEnd)      # cursor not advanced: tokEnd repeats
  else:
    lexer.error("Unexpected character: '" & current() & "'")
  result.ln = ln
  result.col = col
  lexer.skip()
proc peek*(lexer: var Lexer): Token =
  ## Return the next token WITHOUT consuming it.
  ##
  ## Bug fix: the body was a bare `discard`, so peek always returned a
  ## default-initialized token. `Lexer` has value semantics, so we can lex
  ## against a saved copy of the state and then restore it.
  let saved = lexer
  result = lexer.next()
  lexer = saved
proc initLexer*(input: string): Lexer =
  ## Create a lexer over `input`, already positioned at the first token
  ## (leading whitespace and comments skipped).
  var lexer = Lexer(input: input, pos: 0, ln: 1, col: 1)
  lexer.skip()
  lexer
when isMainModule:
  # Driver: lex all of stdin and print one line per token in the form
  #   <line> <col> <kind> [<value>]
  # stopping after the End_of_input token.
  var lexer = initLexer(stdin.readAll())
  while true:
    let token = lexer.next()
    stdout.write(token.ln, ' ', token.col, ' ', token.kind)
    case token.kind
    of tokInt:
      stdout.write(' ', token.intVal)
    of tokChar:
      stdout.write(' ', ord(token.charVal))
    of tokString:
      # Re-escape backslashes first, then newlines, and requote.
      let escaped = token.stringVal.replace("\\", "\\\\").replace("\n", "\\n")
      stdout.write(" \"", escaped, '"')
    of tokIdent:
      stdout.write(' ', token.ident)
    else:
      discard
    stdout.write('\n')
    if token.kind == tokEnd:
      break