-- RosettaCodeData/Task/Compiler-lexical-analyzer/Lua/compiler-lexical-analyzer-2...

-- module lpeg_token_finder
local M = {} -- only items added to M will be public (via 'return M' at end)
-- Standard-library values needed later are captured as locals here, because
-- the global environment is replaced by an empty table below.
local table, concat = table, table.concat
local error, tonumber = error, tonumber

local lpeg = require 'lpeg' -- see http://www.inf.puc-rio.br/~roberto/lpeg/
-- Table indexed by lexeme text to obtain a token name (see ptok and
-- keyword_or_identifier).
local token_name = require 'token_name'
-- From here on, free names (alpha, digit, Ctoken, ...) are fields of this
-- fresh table rather than true globals, so the patterns defined below stay
-- private to the module. Reassigning _ENV requires Lua 5.2 or later.
_ENV = {}

-- Copy the LPeg constructors and capture functions used below into the module
-- environment so patterns can be written without the 'lpeg.' prefix.
local imports = 'P R S C Carg Cb Cc Cf Cg Cp Cs Ct Cmt V'
for fname in imports:gmatch('%a+') do
    _ENV[fname] = lpeg[fname]
end

------------------- Define patterns to match tokens -----------------------

-- Character classes shared by the token patterns below.
digit = R('09')
alpha = R('az', 'AZ') + P('_')   -- letters and underscore
alnum = alpha + digit
space = S(' \t\r\n')

-- Builds a value-less token whose name is looked up from its lexeme.
function ptok(lexeme)
    local tok = {}
    tok.name = token_name[lexeme]
    return tok
end

-- Two-character operators, single-character operators and punctuation.
-- Each captures its own text and converts it to a token through ptok.
op2c = C(P'<=' + '>=' + '==' + '!=' + '&&' + '||') / ptok
op1c = C(S('*/%+-<>!=')) / ptok
symbol = C(S('(){};,')) / ptok

-- A word made of a letter/underscore followed by letters, digits or
-- underscores. If token_name has an entry for the word, that entry names the
-- token; otherwise the word is an Identifier carrying its own text.
keyword_or_identifier = C(alpha * alnum^0) / function(word)
    local reserved = token_name[word]
    if reserved then
        return {name=reserved}
    end
    return {name='Identifier', value=word}
end

-- A run of digits that is not immediately followed by a letter or
-- underscore (so text such as '123abc' does not match as an integer).
integer = (C(digit^1) * -alpha) / function(digits)
    local tok = {name='Integer'}
    tok.value = tonumber(digits)
    return tok
end

-- The first extra argument handed to 'match' is the source line number;
-- Cline produces it as a capture without consuming input.
Cline = Carg(1)

-- Never matches: raises a 'bad_escseq' error table instead. It is tried just
-- after a backslash, so the reported column (position - 1) is the backslash.
bad_escseq_err = Cmt(Cline, function (_subject, after_backslash, lineno)
    error({err='bad_escseq', line=lineno, column=after_backslash - 1})
end)

-- Escape sequences: a backslash followed by '\' or 'n'. The character after
-- the backslash is captured and translated through esc_subst; any other
-- character after a backslash raises bad_escseq_err.
esc_subst = {n = '\n', ['\\'] = '\\'}
escseq = (P'\\' * C(S'\\n' + bad_escseq_err)) / esc_subst

-- Character literal: a single quote, then either one plain character (not a
-- quote, newline or backslash) or one escape sequence, then a closing quote.
-- The result is an Integer token holding the character's byte value.
qchar = (P"'" * (C(P(1) - S"'\n\\") + escseq) * P"'") / function (ch)
    local tok = {name='Integer'}
    tok.value = ch:byte()
    return tok
end

-- String literal: a double quote, any mix of plain-character runs (no quote,
-- newline or backslash) and escape sequences, then a closing double quote.
-- The captured pieces are concatenated into the String token's value.
--
-- The leading Cc'' guarantees at least one capture. Without it, the empty
-- literal "" produces no captures, and LPeg then passes the WHOLE match
-- (including both quote characters) to the function, so the token value
-- would wrongly be the two-character string '""' instead of ''.
qstr = (P'"' * Cc'' * ( C((P(1) - S'"\n\\')^1) + escseq )^0 * P'"') / function(...)
    return {name='String', value=concat{...}}
end

-- Any single token. This is an ordered choice, so alternatives are tried top
-- to bottom: two-character operators come before single-character ones so
-- that e.g. '<=' is not read as '<' followed by '='.
Ctoken = symbol
       + op2c
       + op1c
       + keyword_or_identifier
       + integer
       + qstr
       + qchar

-- Never matches: raises an 'unfinished_comment' error table instead.
-- Cline supplies the line number and Cb('SOC') retrieves the position that
-- was saved under the group name 'SOC' ("start of comment"), which is
-- reported as the error column.
unfinished_comment_err = Cmt(Cline * Cb('SOC'), function (_, pos, line, socpos)
    error{err='unfinished_comment', line=line, column=socpos}
end)
-- Matches the opener '/*' and records the position where it starts as 'SOC'.
commentstart = Cg(Cp() * P'/*', 'SOC')
-- Skips everything up to the closing '*/'. If the subject runs out before a
-- '*/' is found, unfinished_comment_err is raised.
commentrest  =  (P(1) - P'*/')^0 * (P'*/' + unfinished_comment_err)
-- A comment that both opens and closes within the subject.
comment      = commentstart * commentrest
-- Remainder of a comment whose opener is not in the subject: saves the
-- current position as 'SOC' (no '/*' required), then behaves like commentrest.
morecomment  = Cg(Cp(), 'SOC') * commentrest

-- Skippable input between tokens: any sequence of whitespace runs and
-- comments (possibly empty).
local blank_run = space^1
ws = (blank_run + comment)^0

-- Never matches: raises an 'invalid_token' error table carrying the line
-- number and the position at which it was tried.
bad_token_err = Cmt(Cline, function (_subject, here, lineno)
    error({err='invalid_token', line=lineno, column=here})
end)

-- Skips whitespace/comments, then matches one token. The function receives
-- the line number, the position where the token starts, the token itself and
-- the position just past it. C(-1) matches only at the end of the subject and
-- captures an empty string, which is the one case where both positions are
-- equal; anything that is neither end-of-subject nor a token raises
-- bad_token_err.
tokenpat = (ws * Cline * Cp() * (C(-1) + Ctoken + bad_token_err) * Cp()) /
    function (lineno, startpos, tok, endpos)
        if startpos ~= endpos then
            tok.line, tok.column = lineno, startpos
            return tok, endpos
        end
        -- End of line reached with no token. An explicit nil is returned on
        -- purpose: with no returned value at all, the match would yield the
        -- end position instead of nil.
        return nil
    end

-- Same as tokenpat, but first consumes the rest of a comment that is already
-- open at the starting position.
closecomment_tokenpat = morecomment * tokenpat

--- Find the next token in one line of source text.
-- @param line         the text to scan (the match subject)
-- @param line_pos     position in `line` at which scanning starts
-- @param line_number  passed to the patterns as the extra match argument;
--                     it becomes the `line` field of tokens and errors
-- @param in_comment   when truthy, the rest of an already-open comment is
--                     skipped before looking for a token (presumably set by
--                     the caller after an 'unfinished_comment' error on a
--                     previous line -- confirm against the caller)
-- @return the token table (fields `name`, `line`, `column` and, for some
--         tokens, `value`) followed by the position just past the token;
--         nil when only whitespace/comments remain before the end of `line`
-- Raises an error table {err=..., line=..., column=...} with err being
-- 'invalid_token', 'bad_escseq' or 'unfinished_comment'.
function M.find_token(line, line_pos, line_number, in_comment)
    -- 'local' keeps the selection out of the shared module environment; the
    -- original assigned an environment-level variable on every call.
    local pattern = in_comment and closecomment_tokenpat or tokenpat
    return lpeg.match(pattern, line, line_pos, line_number)
end

return M