RosettaCodeData/Task/Compiler-lexical-analyzer/Elixir/compiler-lexical-analyzer.e...

470 lines
13 KiB
Elixir

#!/bin/env elixir
# -*- elixir -*-
defmodule Lex do
def main args do
{inpf_name, outf_name, exit_status} =
case args do
[] -> {"-", "-", 0}
[name] -> {name, "-", 0}
[name1, name2] -> {name1, name2, 0}
[name1, name2 | _] -> {name1, name2, usage_error()}
end
{inpf, outf, exit_status} =
case {inpf_name, outf_name, exit_status} do
{"-", "-", 0} -> {:stdio, :stdio, 0}
{name1, "-", 0} ->
{inpf, exit_status} = open_file(name1, [:read])
{inpf, :stdio, exit_status}
{"-", name2, 0} ->
{outf, exit_status} = open_file(name2, [:write])
{:stdio, outf, exit_status}
{name1, name2, 0} ->
{inpf, exit_status} = open_file(name1, [:read])
if exit_status != 0 do
{inpf, name2, exit_status}
else
{outf, exit_status} = open_file(name2, [:write])
{inpf, outf, exit_status}
end
_ -> {inpf_name, outf_name, exit_status}
end
exit_status =
case exit_status do
0 -> main_program inpf, outf
_ -> exit_status
end
# Choose one.
System.halt exit_status # Fast exit.
#System.stop exit_status # Laborious cleanup.
end
def main_program inpf, outf do
inp = make_inp inpf
scan_text outf, inp
exit_status = 0
exit_status
end
def open_file name, rw do
case File.open name, rw do
{:ok, f} -> {f, 0}
_ ->
IO.write :stderr, "Cannot open "
IO.write :stderr, name
case rw do
[:read] -> IO.puts " for input"
[:write] -> IO.puts " for output"
end
{name, 1}
end
end
def scan_text outf, inp do
{toktup, inp} = get_next_token inp
print_token outf, toktup
case toktup do
{"End_of_input", _, _, _} -> :ok
_ -> scan_text outf, inp
end
end
def print_token outf, {tok, arg, line_no, column_no} do
IO.write outf, (String.pad_leading "#{line_no}", 5)
IO.write outf, " "
IO.write outf, (String.pad_leading "#{column_no}", 5)
IO.write outf, " "
IO.write outf, tok
case tok do
"Identifier" ->
IO.write outf, " "
IO.write outf, arg
"Integer" ->
IO.write outf, " "
IO.write outf, arg
"String" ->
IO.write outf, " "
IO.write outf, arg
_ -> :ok
end
IO.puts outf, ""
end
###-------------------------------------------------------------------
###
### The token dispatcher.
###
def get_next_token inp do
inp = skip_spaces_and_comments inp
{ch, inp} = get_ch inp
{chr, line_no, column_no} = ch
ln = line_no
cn = column_no
case chr do
:eof -> {{"End_of_input", "", ln, cn}, inp}
"," -> {{"Comma", ",", ln, cn}, inp}
";" -> {{"Semicolon", ";", ln, cn}, inp}
"(" -> {{"LeftParen", "(", ln, cn}, inp}
")" -> {{"RightParen", ")", ln, cn}, inp}
"{" -> {{"LeftBrace", "{", ln, cn}, inp}
"}" -> {{"RightBrace", "}", ln, cn}, inp}
"*" -> {{"Op_multiply", "*", ln, cn}, inp}
"/" -> {{"Op_divide", "/", ln, cn}, inp}
"%" -> {{"Op_mod", "%", ln, cn}, inp}
"+" -> {{"Op_add", "+", ln, cn}, inp}
"-" -> {{"Op_subtract", "-", ln, cn}, inp}
"<" ->
{ch1, inp} = get_ch inp
{chr1, _, _} = ch1
case chr1 do
"=" -> {{"Op_lessequal", "<=", ln, cn}, inp}
_ -> {{"Op_less", "<", ln, cn}, (push_back ch1, inp)}
end
">" ->
{ch1, inp} = get_ch inp
{chr1, _, _} = ch1
case chr1 do
"=" -> {{"Op_greaterequal", ">=", ln, cn}, inp}
_ -> {{"Op_greater", ">", ln, cn}, (push_back ch1, inp)}
end
"=" ->
{ch1, inp} = get_ch inp
{chr1, _, _} = ch1
case chr1 do
"=" -> {{"Op_equal", "==", ln, cn}, inp}
_ -> {{"Op_assign", "=", ln, cn}, (push_back ch1, inp)}
end
"!" ->
{ch1, inp} = get_ch inp
{chr1, _, _} = ch1
case chr1 do
"=" -> {{"Op_notequal", "!=", ln, cn}, inp}
_ -> {{"Op_not", "!", ln, cn}, (push_back ch1, inp)}
end
"&" ->
{ch1, inp} = get_ch inp
{chr1, _, _} = ch1
case chr1 do
"&" -> {{"Op_and", "&&", ln, cn}, inp}
_ -> unexpected_character ln, cn, chr
end
"|" ->
{ch1, inp} = get_ch inp
{chr1, _, _} = ch1
case chr1 do
"|" -> {{"Op_or", "||", ln, cn}, inp}
_ -> unexpected_character ln, cn, chr
end
"\"" ->
inp = push_back ch, inp
scan_string_literal inp
"'" ->
inp = push_back ch, inp
scan_character_literal inp
_ ->
cond do
String.match? chr, ~r/^[[:digit:]]$/u ->
inp = push_back ch, inp
scan_integer_literal inp
String.match? chr, ~r/^[[:alpha:]_]$/u ->
inp = push_back ch, inp
scan_identifier_or_reserved_word inp
true -> unexpected_character ln, cn, chr
end
end
end
###-------------------------------------------------------------------
###
### Skipping past spaces and /* ... */ comments.
###
### Comments are treated exactly like a bit of whitespace. They never
### make it to the dispatcher.
###
def skip_spaces_and_comments inp do
{ch, inp} = get_ch inp
{chr, line_no, column_no} = ch
cond do
chr == :eof -> push_back ch, inp
String.match? chr, ~r/^[[:space:]]$/u ->
skip_spaces_and_comments inp
chr == "/" ->
{ch1, inp} = get_ch inp
case ch1 do
{"*", _, _} ->
inp = scan_comment inp, line_no, column_no
skip_spaces_and_comments inp
_ -> push_back ch, (push_back ch1, inp)
end
true -> push_back ch, inp
end
end
def scan_comment inp, line_no, column_no do
{ch, inp} = get_ch inp
case ch do
{:eof, _, _} -> unterminated_comment line_no, column_no
{"*", _, _} ->
{ch1, inp} = get_ch inp
case ch1 do
{:eof, _, _} -> unterminated_comment line_no, column_no
{"/", _, _} -> inp
_ -> scan_comment inp, line_no, column_no
end
_ -> scan_comment inp, line_no, column_no
end
end
###-------------------------------------------------------------------
###
### Scanning of integer literals, identifiers, and reserved words.
###
### These three types of token are very similar to each other.
###
def scan_integer_literal inp do
# Scan an entire word, not just digits. This way we detect
# erroneous text such as "23skidoo".
{line_no, column_no, inp} = get_position inp
{word, inp} = scan_word inp
if String.match? word, (~r/^[[:digit:]]+$/u) do
{{"Integer", word, line_no, column_no}, inp}
else
invalid_integer_literal line_no, column_no, word
end
end
def scan_identifier_or_reserved_word inp do
# It is assumed that the first character is of the correct type,
# thanks to the dispatcher.
{line_no, column_no, inp} = get_position inp
{word, inp} = scan_word inp
tok =
case word do
"if" -> "Keyword_if"
"else" -> "Keyword_else"
"while" -> "Keyword_while"
"print" -> "Keyword_print"
"putc" -> "Keyword_putc"
_ -> "Identifier"
end
{{tok, word, line_no, column_no}, inp}
end
def scan_word inp, word\\"" do
{ch, inp} = get_ch inp
{chr, _, _} = ch
if String.match? chr, (~r/^[[:alnum:]_]$/u) do
scan_word inp, (word <> chr)
else
{word, (push_back ch, inp)}
end
end
def get_position inp do
{ch, inp} = get_ch inp
{_, line_no, column_no} = ch
inp = push_back ch, inp
{line_no, column_no, inp}
end
###-------------------------------------------------------------------
###
### Scanning of string literals.
###
### It is assumed that the first character is the opening quote, and
### that the closing quote is the same character.
###
def scan_string_literal inp do
{ch, inp} = get_ch inp
{quote_mark, line_no, column_no} = ch
{contents, inp} = scan_str_lit inp, ch
{{"String", quote_mark <> contents <> quote_mark,
line_no, column_no},
inp}
end
def scan_str_lit inp, ch, contents\\"" do
{quote_mark, line_no, column_no} = ch
{ch1, inp} = get_ch inp
{chr1, line_no1, column_no1} = ch1
if chr1 == quote_mark do
{contents, inp}
else
case chr1 do
:eof -> eoi_in_string_literal line_no, column_no
"\n" -> eoln_in_string_literal line_no, column_no
"\\" ->
{ch2, inp} = get_ch inp
{chr2, _, _} = ch2
case chr2 do
"n" -> scan_str_lit inp, ch, (contents <> "\\n")
"\\" -> scan_str_lit inp, ch, (contents <> "\\\\")
_ -> unsupported_escape line_no1, column_no1, chr2
end
_ -> scan_str_lit inp, ch, (contents <> chr1)
end
end
end
###-------------------------------------------------------------------
###
### Scanning of character literals.
###
### It is assumed that the first character is the opening quote, and
### that the closing quote is the same character.
###
### The tedious part of scanning a character literal is distinguishing
### between the kinds of lexical error. (One might wish to modify the
### code to detect, as a distinct kind of error, end of line within a
### character literal.)
###
def scan_character_literal inp do
{ch, inp} = get_ch inp
{_, line_no, column_no} = ch
{ch1, inp} = get_ch inp
{chr1, line_no1, column_no1} = ch1
{intval, inp} =
case chr1 do
:eof -> unterminated_character_literal line_no, column_no
"\\" ->
{ch2, inp} = get_ch inp
{chr2, _, _} = ch2
case chr2 do
:eof -> unterminated_character_literal line_no, column_no
"n" -> {(:binary.first "\n"), inp}
"\\" -> {(:binary.first "\\"), inp}
_ -> unsupported_escape line_no1, column_no1, chr2
end
_ -> {(:binary.first chr1), inp}
end
inp = check_character_literal_end inp, ch
{{"Integer", "#{intval}", line_no, column_no}, inp}
end
def check_character_literal_end inp, ch do
{chr, _, _} = ch
{{chr1, _, _}, inp} = get_ch inp
if chr1 == chr do
inp
else
# Lexical error.
find_char_lit_end inp, ch
end
end
def find_char_lit_end inp, ch do
{chr, line_no, column_no} = ch
{{chr1, _, _}, inp} = get_ch inp
if chr1 == chr do
multicharacter_literal line_no, column_no
else
case chr1 do
:eof -> unterminated_character_literal line_no, column_no
_ -> find_char_lit_end inp, ch
end
end
end
###-------------------------------------------------------------------
###
### Character-at-a-time input, with unrestricted pushback, and with
### line and column numbering.
###
def make_inp inpf do
{inpf, [], 1, 1}
end
def get_ch {inpf, pushback, line_no, column_no} do
case pushback do
[head | tail] ->
{head, {inpf, tail, line_no, column_no}}
[] ->
case IO.read(inpf, 1) do
:eof ->
{{:eof, line_no, column_no},
{inpf, pushback, line_no, column_no}}
{:error, _} ->
{{:eof, line_no, column_no},
{inpf, pushback, line_no, column_no}}
chr ->
case chr do
"\n" ->
{{chr, line_no, column_no},
{inpf, pushback, line_no + 1, 1}}
_ ->
{{chr, line_no, column_no},
{inpf, pushback, line_no, column_no + 1}}
end
end
end
end
def push_back ch, {inpf, pushback, line_no, column_no} do
{inpf, [ch | pushback], line_no, column_no}
end
###-------------------------------------------------------------------
###
### Lexical and usage errors.
###
def unterminated_comment line_no, column_no do
raise "#{scriptname()}: unterminated comment at #{line_no}:#{column_no}"
end
def invalid_integer_literal line_no, column_no, word do
raise "#{scriptname()}: invalid integer literal #{word} at #{line_no}:#{column_no}"
end
def unsupported_escape line_no, column_no, chr do
raise "#{scriptname()}: unsupported escape \\#{chr} at #{line_no}:#{column_no}"
end
def eoi_in_string_literal line_no, column_no do
raise "#{scriptname()}: end of input in string literal starting at #{line_no}:#{column_no}"
end
def eoln_in_string_literal line_no, column_no do
raise "#{scriptname()}: end of line in string literal starting at #{line_no}:#{column_no}"
end
def multicharacter_literal line_no, column_no do
raise "#{scriptname()}: unsupported multicharacter literal at #{line_no}:#{column_no}"
end
def unterminated_character_literal line_no, column_no do
raise "#{scriptname()}: unterminated character literal starting at #{line_no}:#{column_no}"
end
def unexpected_character line_no, column_no, chr do
raise "#{scriptname()}: unexpected character '#{chr}' at #{line_no}:#{column_no}"
end
def usage_error() do
IO.puts "Usage: #{scriptname()} [INPUTFILE [OUTPUTFILE]]"
IO.puts "If either of INPUTFILE or OUTPUTFILE is not present or is \"-\","
IO.puts "standard input or standard output is used, respectively."
exit_status = 2
exit_status
end
def scriptname() do
Path.basename(__ENV__.file)
end
#---------------------------------------------------------------------
end ## module Lex
Lex.main(System.argv)