RosettaCodeData/Task/Compiler-lexical-analyzer/Pluto/compiler-lexical-analyzer.p...

259 lines
10 KiB
Plaintext

do -- Compiler/Lexical analyser
-- formatted-output helper used by writeToken; declared local so that it is
-- not created as a global variable
local fmt = require( "fmt" )
-- source position of the character held in currChar, plus the end-of-input flag
local lineNumber, columnNumber, atEof = 0, 0, false
-- text of the current source line, its length and the current character;
-- lineWidth starts at -1 so that the first nextChar call reads a line
local line, lineWidth, currChar = "", -1, "\n"
-- details of the most recently scanned token
local tkValue, tkType, tkLine, tkColumn, tkIntegerValue
-- NOTE(review): MAX_TOKEN_LENGTH is not referenced by the code visible here
local MAX_TOKEN_LENGTH <const> = 256
-- largest integer literal accepted by integerLiteral; the quotient and
-- remainder let it test for overflow before multiplying by ten
local MAX_INTEGER <const> = 2^32
local MAXINTEGER_OVER_10 <const> = MAX_INTEGER // 10
local MAXINTEGER_MOD_10 <const> = MAX_INTEGER % 10
-- token types: every entry maps a token type name to that same name, so
-- tk.Op_add == "Op_add" and the value can be printed directly
local tk = {}
for _, tknName in ipairs( { "Op_multiply",     "Op_divide",     "Op_mod"
                          , "Op_add",          "Op_subtract",   "Op_negate"
                          , "Op_less",         "Op_lessequal",  "Op_greater"
                          , "Op_greaterequal", "Op_equal",      "Op_notequal"
                          , "Op_not",          "Op_assign",     "Op_and"
                          , "Op_or",           "LeftParen",     "RightParen"
                          , "LeftBrace",       "RightBrace",    "Semicolon"
                          , "Comma",           "Keyword_if",    "Keyword_else"
                          , "Keyword_while",   "Keyword_print", "Keyword_putc"
                          , "Identifier",      "Integer",       "String"
                          , "End_of_input",    "Comment"
                          }
                        ) do
    tk[ tknName ] = tknName
end
-- writes an error message, tagged with the current line and column, to the
-- default output; scanning carries on afterwards
local function lexError( message )
    local report = $"**** Error at({lineNumber},{columnNumber}): {message}\n"
    io.write( report )
end
-- gets the next source character
-- Advances columnNumber and stores the character found there in currChar.
-- One position past the end of a line yields a "\n"; beyond that the next
-- line is read from the default input. When the input is exhausted atEof is
-- set and currChar becomes a space.
local function nextChar()
    ++ columnNumber
    if columnNumber == ( lineWidth + 1 ) then
        -- just past the last character of the line: supply the line end
        currChar = "\n"
    elseif columnNumber > lineWidth then
        -- the current line is used up (or nothing has been read yet)
        line = io.read( "l" )
        if line == nil then
            -- end of input; lineWidth of -1 keeps later calls on this branch
            -- (the original assigned to a misspelt "curChar" here, which
            --  created a global and left currChar unchanged)
            atEof, currChar, lineWidth = true, " ", -1
        else
            -- an empty line is treated as a single space so that there is
            -- always a character at column 1
            if line == "" then line = " " end
            ++ lineNumber
            lineWidth, columnNumber, currChar = #line, 1, line[ 1 ]
        end
    else
        -- still inside the current line
        currChar = line[ columnNumber ]
    end
end
-- gets the next token, returns the token type
local function nextToken()
-- returns true if currChar is in the inclusive range lowerValue to upperValue
-- false otherwise
-- Spelled as two comparisons joined with "and" rather than a chained
-- "a <= b <= c", which plain Lua evaluates as "( a <= b ) <= c".
local function inRange( lowerValue, upperValue )
    return lowerValue <= currChar and currChar <= upperValue
end
-- true when currChar is an underscore or a letter a-z / A-Z, i.e. a
-- character that may begin an identifier; false otherwise
local function identifierStartChar()
    if currChar == "_" then
        return true
    end
    return inRange( "a", "z" ) or inRange( "A", "Z" )
end
-- a token made of just the current character: step past it and record
-- tokenType as the type of the token
local function singleCharToken( tokenType )
    nextChar()
    tkType = tokenType
end
-- a token that is the same character twice: && or ||
-- tkType is set to tokenType even when the second character is missing; in
-- that case an error is reported and the following character is not consumed
local function doubleCharToken( tokenType )
    local expected = currChar
    tkType = tokenType
    nextChar()
    if currChar != expected then
        -- the character wasn't doubled
        lexError( "Unrecognised character." )
        return
    end
    nextChar()
end
-- an operator that may be followed by "=": the token is opEqualToken when
-- the "=" is present (and is consumed), opToken otherwise
local function opOrOpEqual( opToken, opEqualToken )
    nextChar()
    if currChar == "=" then
        tkType = opEqualToken
        nextChar()
    else
        tkType = opToken
    end
end
-- a "/" is either the divide operator or, when followed by "*", the start
-- of a /*...*/ comment, which is skipped and reported as a Comment token
local function divideOrComment()
    nextChar()
    if currChar != "*" then
        -- a lone "/"
        tkType = tk.Op_divide
        return
    end
    -- have a comment
    tkType = tk.Comment
    repeat
        -- step over the character we stopped on (the opening "*" first time
        -- round), then skip to the next run of "*"s and past that run
        nextChar()
        while not atEof and currChar != "*" do nextChar() end
        while not atEof and currChar == "*" do nextChar() end
        -- the comment ends only if a "/" directly follows the "*"s
    until currChar == "/" or atEof
    if atEof then
        lexError( "End-of-file in comment." )
    else
        -- skip the closing "/"
        nextChar()
    end
end
-- handle an identifier or keyword
-- Consumes letters, digits and underscores; tkValue holds the text for an
-- identifier and is left empty for a keyword.
local function identifierOrKeyword()
    -- the five keywords and their token types
    local keywordTypes = { [ "if"    ] = tk.Keyword_if
                         , [ "else"  ] = tk.Keyword_else
                         , [ "while" ] = tk.Keyword_while
                         , [ "print" ] = tk.Keyword_print
                         , [ "putc"  ] = tk.Keyword_putc
                         }
    while identifierStartChar() or inRange( "0", "9" ) do
        nextChar()
    end
    -- the token runs from its start column to just before the current one
    local text = line:sub( tkColumn, columnNumber - 1 )
    local keywordType = keywordTypes[ text ]
    if keywordType != nil then
        tkType, tkValue = keywordType, ""
    else
        tkType, tkValue = tk.Identifier, text
    end
end
-- handle an integer literal
-- Accumulates the digits into tkIntegerValue, which the caller has set to 0.
-- Digits that would take the value past MAX_INTEGER are consumed but not
-- accumulated and an error is reported once the literal has been read.
local function integerLiteral()
    local tooLarge = false
    tkType = tk.Integer
    while inRange( "0", "9" ) do
        local digit = tonumber( currChar )
        -- can the digit be appended without exceeding MAX_INTEGER ?
        local fits = tkIntegerValue < MAXINTEGER_OVER_10
                  or (   tkIntegerValue == MAXINTEGER_OVER_10
                     and digit <= MAXINTEGER_MOD_10
                     )
        if fits then
            tkIntegerValue = ( tkIntegerValue * 10 ) + digit
        else
            tooLarge = true
        end
        nextChar()
    end
    if tooLarge then
        lexError( "Number too large." )
    end
    -- a literal must not run straight into an identifier
    if identifierStartChar() then
        lexError( "Number followed by letter or underscore." )
    end
end
-- handle a char literal: 'c', '\n' or '\\'
-- The result is an Integer token whose value is the byte value of the
-- character. Errors are reported but scanning continues.
local function charLiteral()
    -- skip the opening quote
    nextChar()
    local literalChar = currChar
    if currChar == "'" or currChar == "\n" then
        lexError( "Invalid character constant." )
    elseif currChar == "\\" then
        -- have an escape - only \n and \\ are known
        nextChar()
        literalChar = currChar
        if currChar == "n" then
            literalChar = "\n"
        elseif currChar != "\\" then
            lexError( "Unknown escape sequence." )
        end
    end
    tkType = tk.Integer
    tkIntegerValue = literalChar:byte()
    -- should have a closing quote next
    nextChar()
    if currChar == "'" then
        nextChar()
    else
        lexError( "Multi-character constant." )
    end
end
-- handle a string literal
-- Scans up to the closing double quote, which must be on the same line;
-- tkValue is the source text from the opening quote to the stopping point.
local function stringLiteral()
    tkType = tk.String
    -- step over the opening quote and then everything up to the closing
    -- quote, the end of the line or the end of the input
    repeat
        nextChar()
    until currChar == "\"" or currChar == "\n" or atEof
    if currChar == "\n" then
        lexError( "End-of-line while scanning string literal." )
    elseif atEof then
        lexError( "End-of-file while scanning string literal." )
    else
        -- currChar must be the closing double quote - skip it
        nextChar()
    end
    tkValue = line:sub( tkColumn, columnNumber - 1 )
end
-- Scan tokens until one that is not a comment has been found, then return
-- its type. The position and values of the token are left in tkLine,
-- tkColumn, tkValue and tkIntegerValue.
repeat
    -- skip white space: as well as spaces and line ends this covers tabs,
    -- vertical tabs, form feeds and the carriage return left at the end of
    -- each line of CR LF terminated input, all of which would otherwise be
    -- reported as unrecognised characters
    while not atEof
      and (  currChar == " "  or currChar == "\n" or currChar == "\t"
          or currChar == "\r" or currChar == "\v" or currChar == "\f"
          )
    do
        nextChar()
    end
    -- get the token, starting with no value
    tkLine = lineNumber
    tkColumn = columnNumber
    tkValue = ""
    tkIntegerValue = 0
    if atEof then tkType = tk.End_of_input
    elseif currChar == "*" then singleCharToken( tk.Op_multiply )
    elseif currChar == "/" then divideOrComment()
    elseif currChar == "%" then singleCharToken( tk.Op_mod )
    elseif currChar == "+" then singleCharToken( tk.Op_add )
    elseif currChar == "-" then singleCharToken( tk.Op_subtract )
    elseif currChar == "<" then opOrOpEqual( tk.Op_less, tk.Op_lessequal )
    elseif currChar == ">" then opOrOpEqual( tk.Op_greater, tk.Op_greaterequal )
    elseif currChar == "=" then opOrOpEqual( tk.Op_assign, tk.Op_equal )
    elseif currChar == "!" then opOrOpEqual( tk.Op_not, tk.Op_notequal )
    elseif currChar == "&" then doubleCharToken( tk.Op_and )
    elseif currChar == "|" then doubleCharToken( tk.Op_or )
    elseif currChar == "(" then singleCharToken( tk.LeftParen )
    elseif currChar == ")" then singleCharToken( tk.RightParen )
    elseif currChar == "{" then singleCharToken( tk.LeftBrace )
    elseif currChar == "}" then singleCharToken( tk.RightBrace )
    elseif currChar == ";" then singleCharToken( tk.Semicolon )
    elseif currChar == "," then singleCharToken( tk.Comma )
    elseif identifierStartChar() then identifierOrKeyword()
    elseif inRange( "0", "9" ) then integerLiteral()
    elseif currChar == "'" then charLiteral()
    elseif currChar == "\"" then stringLiteral()
    else
        -- report the character and then discard it by treating it as a comment
        lexError( "Unrecognised character." )
        singleCharToken( tk.Comment )
    end
    -- continue until we get something other than a comment
until tkType != tk.Comment
return tkType
end
-- prints one line for the current token: line, column and token type,
-- followed by the integer value or the text of the token if it has one
local function writeToken()
    fmt.write( "%5d %5d ", tkLine, tkColumn )
    if tkType == tk.Integer then
        fmt.write( "%-16s%6d", tkType, tkIntegerValue )
    elseif tkValue == "" then
        -- no value - no need to pad the token type
        io.write( tkType )
    else
        -- token has a value
        fmt.write( "%-16s%s", tkType, tkValue )
    end
    io.write( "\n" )
end
-- get and print all tokens from standard input, finishing with (and
-- including) the End_of_input token
repeat
    local latestType = nextToken()
    writeToken()
until latestType == tk.End_of_input
end