RosettaCodeData/Task/Compiler-lexical-analyzer/Lua/compiler-lexical-analyzer-2...

85 lines
2.7 KiB
Lua

-- module lpeg_token_finder
-- Exposes find_token, which uses LPeg patterns to locate the next lexical
-- token within a single line of source text.
local M = {} -- only items added to M will be public (via 'return M' at end)
-- Capture the standard-library names needed later as locals NOW: once _ENV is
-- replaced below, a free name such as 'error' or 'tonumber' would be looked up
-- in the new (initially empty) environment table and evaluate to nil.
local table, concat = table, table.concat
local error, tonumber = error, tonumber
local lpeg = require 'lpeg' -- see http://www.inf.puc-rio.br/~roberto/lpeg/
-- token_name is indexed below with a token's text (operator, symbol or word)
-- to obtain the token's name; presumably a plain lookup table -- confirm in
-- the token_name module.
local token_name = require 'token_name'
-- From this point on every free (non-local) name in this chunk is a field of
-- this private table instead of a real global, so the pattern definitions
-- below do not touch the global table.
_ENV = {}
-- Copy the listed LPeg functions into the private environment so they can be
-- called unqualified (P'x', R'az', ...).
local imports = 'P R S C Carg Cb Cc Cf Cg Cp Cs Ct Cmt V'
for w in imports:gmatch('%a+') do _ENV[w] = lpeg[w] end
------------------- Define patterns to match tokens -----------------------
-- Character classes shared by the token patterns that follow.
digit = R('09')                 -- one decimal digit
alpha = R('az', 'AZ') + P('_')  -- one ASCII letter or an underscore
alnum = alpha + digit           -- anything allowed after the first identifier character
space = S(' \t\r\n')            -- blank, tab, carriage return or newline
-- Builds the token record for a fixed-text token (operator or symbol).
-- The record has a single field, 'name', looked up in token_name under the
-- matched text.
function ptok(lexeme)
  local tok = {}
  tok.name = token_name[lexeme]
  return tok
end
-- Two-character operators. Ctoken tries these before op1c, so '<=' is taken
-- as one token rather than as '<' followed by '='.
local two_char_operator = P('<=') + P('>=') + P('==') + P('!=') + P('&&') + P('||')
op2c = C(two_char_operator) / ptok
-- Single-character operators.
op1c = C(S('*/%+-<>!=')) / ptok
-- Punctuation: parentheses, braces, semicolon and comma.
symbol = C(S('(){};,')) / ptok
-- A word: a letter or underscore followed by any number of letters, digits or
-- underscores. If token_name has an entry for the word, the token takes that
-- name; otherwise it is an 'Identifier' token carrying the word as its value.
keyword_or_identifier = C(alpha * alnum^0) / function(word)
  local reserved = token_name[word]
  if reserved then
    return {name = reserved}
  end
  return {name = 'Identifier', value = word}
end
-- An integer literal: one or more digits. The '-alpha' predicate makes the
-- pattern fail when the digits are immediately followed by a letter or
-- underscore (e.g. "12ab"), so such text is not lexed as an integer.
integer = (C(digit^1) * -alpha) / function(digits)
  local tok = {name = 'Integer'}
  tok.value = tonumber(digits)
  return tok
end
-- Produces the first extra argument given to 'match', which find_token sets
-- to the source line number.
Cline = Carg(1)
-- Match-time capture that never succeeds: it raises an error table describing
-- an unsupported escape sequence. It runs right after the backslash has been
-- consumed, so the reported column (position - 1) is that of the backslash.
bad_escseq_err = Cmt(Cline, function (_subject, after_backslash, line_no)
  error({err = 'bad_escseq', line = line_no, column = after_backslash - 1})
end)
-- Maps the character following a backslash to the character it stands for.
esc_subst = {n = '\n', ['\\'] = '\\'}
-- An escape sequence: '\\' followed by '\\' or 'n'; anything else raises
-- bad_escseq. The capture is the substituted character.
escseq = (P('\\') * C(S('\\n') + bad_escseq_err)) / esc_subst
-- A character literal: a single quote, then exactly one ordinary character
-- (anything but a single quote, newline or backslash) or one escape sequence,
-- then a closing single quote. It becomes an 'Integer' token whose value is
-- the byte code of that character.
local plain_char = C(P(1) - S("'\n\\"))
qchar = (P("'") * (plain_char + escseq) * P("'")) / function (ch)
  return {name = 'Integer', value = ch:byte()}
end
-- A string literal: a double quote, any mix of ordinary-character runs
-- (anything but a double quote, newline or backslash) and escape sequences,
-- then a closing double quote. It becomes a 'String' token whose value is the
-- decoded text without the surrounding quotes.
--
-- The pieces are collected with Ct so the handler always receives a table,
-- even for the empty literal "". Without it, the empty literal produces no
-- captures at all, and LPeg then calls a function capture with the WHOLE match
-- as its argument -- which made "" yield the two quote characters as its value
-- instead of the empty string.
qstr = P'"' * Ct(( C((P(1) - S'"\n\\')^1) + escseq )^0) * P'"' / function(pieces)
  return {name='String', value=concat(pieces)}
end
-- Any single token. The alternatives form an ordered choice and are tried in
-- the order listed, so two-character operators are attempted before
-- one-character operators.
Ctoken = symbol
  + op2c
  + op1c
  + keyword_or_identifier
  + integer
  + qstr
  + qchar
-- Opening of a comment: consumes '/*' and records the position where it
-- starts in the named group 'SOC' (no value is added to the match results).
commentstart = Cg(Cp() * P('/*'), 'SOC')
-- Match-time capture that never succeeds: it raises an error table for a
-- comment that is not closed within the subject text. The column reported is
-- the position stored in 'SOC', not the position where the text ran out.
unfinished_comment_err = Cmt(Cline * Cb('SOC'), function (_subject, _pos, line_no, start_col)
  error({err = 'unfinished_comment', line = line_no, column = start_col})
end)
-- Body of a comment up to and including the closing '*/'; raises
-- unfinished_comment if no '*/' is found.
commentrest = (1 - P('*/'))^0 * (P('*/') + unfinished_comment_err)
-- A complete comment that starts in the text being matched.
comment = commentstart * commentrest
-- Continuation of a comment that was opened earlier: 'SOC' is set to the
-- position where matching starts.
morecomment = Cg(Cp(), 'SOC') * commentrest
-- Skippable text between tokens: any run of whitespace and comments.
ws = (space^1 + comment)^0
-- Match-time capture that never succeeds: it raises an error table for text
-- that matches none of the token patterns. The column reported is the
-- position at which a token was expected.
bad_token_err = Cmt(Cline, function (_subject, here, line_no)
  error({err = 'invalid_token', line = line_no, column = here})
end)
-- Skips leading whitespace and comments, then matches one token.
-- Captures handed to the handler: the line number, the position where the
-- token starts, the token itself (or an empty string if only the end of the
-- text was reached), and the position just past the token.
-- Results of a match: the token table (with 'line' and 'column' filled in)
-- and the position to continue from, or nil when no token remains.
tokenpat = (ws * Cline * Cp() * (C(-1) + Ctoken + bad_token_err) * Cp()) /
  function (line_no, start_pos, tok, end_pos)
    if start_pos ~= end_pos then
      tok.line = line_no
      tok.column = start_pos
      return tok, end_pos
    end
    -- Nothing was consumed: end of line, no token. The nil must be returned
    -- explicitly; a capture function that returns no value at all would make
    -- 'match' return the end position instead.
    return nil
  end
-- Variant of tokenpat for a line that begins inside a comment opened on an
-- earlier line: first finishes that comment, then looks for a token.
closecomment_tokenpat = morecomment * tokenpat

--- Finds the next token in one line of source text.
-- @param line         the text of the source line
-- @param line_pos     position in 'line' at which to start scanning
-- @param line_number  number of this line; copied into the token and into
--                     any error raised
-- @param in_comment   true if the line starts inside a comment that was
--                     opened on a previous line
-- @return the token table (fields: name, line, column and, for some tokens,
--         value) followed by the position just past the token; or nil when
--         the rest of the line holds no token
-- Raises an error table {err=..., line=..., column=...} where err is one of
-- 'invalid_token', 'bad_escseq' or 'unfinished_comment'.
function M.find_token(line, line_pos, line_number, in_comment)
  -- 'local' keeps this per-call choice out of the module environment table;
  -- both alternatives are pattern objects, so the and/or idiom is safe here.
  local pattern = in_comment and closecomment_tokenpat or tokenpat
  return lpeg.match(pattern, line, line_pos, line_number)
end
return M