RosettaCodeData/Task/Compiler-lexical-analyzer/Nim/compiler-lexical-analyzer-1...

import re, strformat, strutils

type
  TokenKind = enum
    tkUnknown = "UNKNOWN_TOKEN",
    tkMul = "Op_multiply",
    tkDiv = "Op_divide",
    tkMod = "Op_mod",
    tkAdd = "Op_add",
    tkSub = "Op_subtract",
    tkNeg = "Op_negate",
    tkLt = "Op_less",
    tkLte = "Op_lessequal",
    tkGt = "Op_greater",
    tkGte = "Op_greaterequal",
    tkEq = "Op_equal",
    tkNeq = "Op_notequal",
    tkNot = "Op_not",
    tkAsgn = "Op_assign",
    tkAnd = "Op_and",
    tkOr = "Op_or",
    tkLpar = "LeftParen",
    tkRpar = "RightParen",
    tkLbra = "LeftBrace",
    tkRbra = "RightBrace",
    tkSmc = "Semicolon",
    tkCom = "Comma",
    tkIf = "Keyword_if",
    tkElse = "Keyword_else",
    tkWhile = "Keyword_while",
    tkPrint = "Keyword_print",
    tkPutc = "Keyword_putc",
    tkId = "Identifier",
    tkInt = "Integer",
    tkChar = "Integer",  # deliberate duplicate: char literals are reported as integers
    tkStr = "String",
    tkEof = "End_of_input"

  Token = object
    kind: TokenKind
    value: string

  TokenAnn = object
    ## Annotated token with position information for the compiler.
    token: Token
    line, column: int
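
# For example, the identifier `count` at the very start of the input is
# represented as
#   TokenAnn(token: Token(kind: tkId, value: "count"), line: 1, column: 1)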

proc getSymbols(table: openArray[(char, TokenKind)]): seq[char] =
  ## Collect the single-character token symbols from `table`.
  result = newSeq[char]()
  for ch, tokenKind in items(table):
    result.add ch

const
  tkSymbols = { # single-char tokens
    '*': tkMul,
    '%': tkMod,
    '+': tkAdd,
    '-': tkSub,
    '(': tkLpar,
    ')': tkRpar,
    '{': tkLbra,
    '}': tkRbra,
    ';': tkSmc,
    ',': tkCom,
    '/': tkDiv, # the comment case /* ... */ is handled in `stripUnimportant`
  }
  symbols = getSymbols(tkSymbols)
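
# Note: `{...}` above is Nim's table-constructor syntax, which in a `const`
# simply yields an `array[(char, TokenKind)]` of pairs; that is why the procs
# here can take it as `openArray[(char, TokenKind)]`. For instance,
# `tkSymbols[0]` is `('*', tkMul)`.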

proc findTokenKind(table: openArray[(char, TokenKind)]; needle: char): TokenKind =
  ## Linear search of `table` for `needle`; `tkUnknown` if absent.
  for ch, tokenKind in items(table):
    if ch == needle: return tokenKind
  tkUnknown

proc stripComment(text: var string; lineNo, colNo: var int) =
  ## Remove a leading /* ... */ comment from `text`, updating the line
  ## and column counters as it goes.
  var matches: array[1, string]
  if match(text, re"\A(/\*[\s\S]*?\*/)", matches):
    text = text[matches[0].len..^1]
    for s in matches[0]:
      if s == '\n':
        inc lineNo
        colNo = 1
      else:
        inc colNo

proc stripUnimportant(text: var string; lineNo, colNo: var int) =
  ## Skip whitespace and comments preceding the next token.
  while true:
    if text.len == 0: return
    elif text[0] == '\n':
      inc lineNo
      colNo = 1
      text = text[1..^1]
    elif text[0] == ' ':
      inc colNo
      text = text[1..^1]
    elif text.len >= 2 and text[0] == '/' and text[1] == '*':
      stripComment(text, lineNo, colNo)
    else: return

proc lookAhead(text: string; expected: char; tkMatch, tkNoMatch: TokenKind): (TokenKind, int) =
  ## Decide between a two-character and a one-character operator by peeking
  ## at the character after `text[0]`. Checking `text.len` first avoids an
  ## out-of-range access when the operator is the last character of input.
  if text.len > 1 and text[1] == expected: (tkMatch, 2)
  else: (tkNoMatch, 1)

proc consumeToken(text: var string; tkl: var int): Token =
  ## Return the next token, removing it from `text`, and write its length
  ## to `tkl`. If the token cannot be recognized, return `tkUnknown`,
  ## shrink `text` by one character and write 1 to `tkl`.
  var
    matches: array[1, string]
    tKind: TokenKind
    val: string

  if text.len == 0:
    (tKind, tkl) = (tkEof, 0)
  # Single-character tokens
  elif text[0] in symbols: (tKind, tkl) = (tkSymbols.findTokenKind(text[0]), 1)
  # One- or two-character operators
  elif text[0] == '<': (tKind, tkl) = lookAhead(text, '=', tkLte, tkLt)
  elif text[0] == '>': (tKind, tkl) = lookAhead(text, '=', tkGte, tkGt)
  elif text[0] == '=': (tKind, tkl) = lookAhead(text, '=', tkEq, tkAsgn)
  elif text[0] == '!': (tKind, tkl) = lookAhead(text, '=', tkNeq, tkNot)
  elif text[0] == '&': (tKind, tkl) = lookAhead(text, '&', tkAnd, tkUnknown)
  elif text[0] == '|': (tKind, tkl) = lookAhead(text, '|', tkOr, tkUnknown)
  # Keywords
  elif match(text, re"\Aif\b"): (tKind, tkl) = (tkIf, 2)
  elif match(text, re"\Aelse\b"): (tKind, tkl) = (tkElse, 4)
  elif match(text, re"\Awhile\b"): (tKind, tkl) = (tkWhile, 5)
  elif match(text, re"\Aprint\b"): (tKind, tkl) = (tkPrint, 5)
  elif match(text, re"\Aputc\b"): (tKind, tkl) = (tkPutc, 4)
  # Literals and identifiers
  elif match(text, re"\A([0-9]+)", matches):
    (tKind, tkl) = (tkInt, matches[0].len)
    val = matches[0]
  elif match(text, re"\A([_a-zA-Z][_a-zA-Z0-9]*)", matches):
    (tKind, tkl) = (tkId, matches[0].len)
    val = matches[0]
  elif match(text, re"\A('(?:[^'\n]|\\\\|\\n)')", matches):
    (tKind, tkl) = (tkChar, matches[0].len)
    val =
      case matches[0]
      of r"' '": $ord(' ')
      of r"'\n'": $ord('\n')
      of r"'\\'": $ord('\\')
      else: $ord(matches[0][1]) # "'a'"[1] == 'a'
  elif match(text, re"\A(""[^""\n]*"")", matches):
    (tKind, tkl) = (tkStr, matches[0].len)
    val = matches[0]
  else: (tKind, tkl) = (tkUnknown, 1)

  text = text[tkl..^1]
  Token(kind: tKind, value: val)
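
# Example: for text = "count = 10;" the first call returns
# Token(kind: tkId, value: "count") and leaves " = 10;" in `text`;
# `tokenize` below strips the blanks in between, so the remaining calls
# produce Op_assign, Integer "10", Semicolon and finally End_of_input.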

proc tokenize*(text: string): seq[TokenAnn] =
  ## Split `text` into a sequence of annotated tokens, always ending
  ## with an `End_of_input` token.
  result = newSeq[TokenAnn]()
  var
    lineNo, colNo: int = 1
    text = text
    token: Token
    tokenLength: int

  while true:
    stripUnimportant(text, lineNo, colNo)
    token = consumeToken(text, tokenLength)
    result.add TokenAnn(token: token, line: lineNo, column: colNo)
    # Stop on the EOF token so End_of_input is emitted even when the
    # input lacks a trailing newline.
    if token.kind == tkEof: break
    inc colNo, tokenLength
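
# For instance, `tokenize("print(42);")` yields Keyword_print annotated with
# line 1, column 1, followed by LeftParen, Integer "42", RightParen,
# Semicolon and End_of_input, each carrying its own line/column pair.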

proc output*(s: seq[TokenAnn]): string =
  ## Format annotated tokens one per line: line number, column,
  ## token name and, where present, its value.
  var
    tokenKind: TokenKind
    value: string
    line, column: int

  for tokenAnn in items(s):
    line = tokenAnn.line
    column = tokenAnn.column
    tokenKind = tokenAnn.token.kind
    value = tokenAnn.token.value
    result.add(
      fmt"{line:>5}{column:>7} {tokenKind:<15}{value}"
      .strip(leading = false) & "\n")

when isMainModule:
  import os

  let input = if paramCount() > 0: readFile paramStr(1)
              else: readAll stdin

  echo input.tokenize.output
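
# Usage sketch (assuming this file is saved as lexer.nim and test.t holds a
# source program in the task's toy language):
#   nim c -r lexer.nim test.t
#   echo 'print("Hello");' | nim c -r lexer.nim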