#!/bin/env elixir # -*- elixir -*- defmodule Lex do def main args do {inpf_name, outf_name, exit_status} = case args do [] -> {"-", "-", 0} [name] -> {name, "-", 0} [name1, name2] -> {name1, name2, 0} [name1, name2 | _] -> {name1, name2, usage_error()} end {inpf, outf, exit_status} = case {inpf_name, outf_name, exit_status} do {"-", "-", 0} -> {:stdio, :stdio, 0} {name1, "-", 0} -> {inpf, exit_status} = open_file(name1, [:read]) {inpf, :stdio, exit_status} {"-", name2, 0} -> {outf, exit_status} = open_file(name2, [:write]) {:stdio, outf, exit_status} {name1, name2, 0} -> {inpf, exit_status} = open_file(name1, [:read]) if exit_status != 0 do {inpf, name2, exit_status} else {outf, exit_status} = open_file(name2, [:write]) {inpf, outf, exit_status} end _ -> {inpf_name, outf_name, exit_status} end exit_status = case exit_status do 0 -> main_program inpf, outf _ -> exit_status end # Choose one. System.halt exit_status # Fast exit. #System.stop exit_status # Laborious cleanup. end def main_program inpf, outf do inp = make_inp inpf scan_text outf, inp exit_status = 0 exit_status end def open_file name, rw do case File.open name, rw do {:ok, f} -> {f, 0} _ -> IO.write :stderr, "Cannot open " IO.write :stderr, name case rw do [:read] -> IO.puts " for input" [:write] -> IO.puts " for output" end {name, 1} end end def scan_text outf, inp do {toktup, inp} = get_next_token inp print_token outf, toktup case toktup do {"End_of_input", _, _, _} -> :ok _ -> scan_text outf, inp end end def print_token outf, {tok, arg, line_no, column_no} do IO.write outf, (String.pad_leading "#{line_no}", 5) IO.write outf, " " IO.write outf, (String.pad_leading "#{column_no}", 5) IO.write outf, " " IO.write outf, tok case tok do "Identifier" -> IO.write outf, " " IO.write outf, arg "Integer" -> IO.write outf, " " IO.write outf, arg "String" -> IO.write outf, " " IO.write outf, arg _ -> :ok end IO.puts outf, "" end ###------------------------------------------------------------------- ### ### The token dispatcher. ### def get_next_token inp do inp = skip_spaces_and_comments inp {ch, inp} = get_ch inp {chr, line_no, column_no} = ch ln = line_no cn = column_no case chr do :eof -> {{"End_of_input", "", ln, cn}, inp} "," -> {{"Comma", ",", ln, cn}, inp} ";" -> {{"Semicolon", ";", ln, cn}, inp} "(" -> {{"LeftParen", "(", ln, cn}, inp} ")" -> {{"RightParen", ")", ln, cn}, inp} "{" -> {{"LeftBrace", "{", ln, cn}, inp} "}" -> {{"RightBrace", "}", ln, cn}, inp} "*" -> {{"Op_multiply", "*", ln, cn}, inp} "/" -> {{"Op_divide", "/", ln, cn}, inp} "%" -> {{"Op_mod", "%", ln, cn}, inp} "+" -> {{"Op_add", "+", ln, cn}, inp} "-" -> {{"Op_subtract", "-", ln, cn}, inp} "<" -> {ch1, inp} = get_ch inp {chr1, _, _} = ch1 case chr1 do "=" -> {{"Op_lessequal", "<=", ln, cn}, inp} _ -> {{"Op_less", "<", ln, cn}, (push_back ch1, inp)} end ">" -> {ch1, inp} = get_ch inp {chr1, _, _} = ch1 case chr1 do "=" -> {{"Op_greaterequal", ">=", ln, cn}, inp} _ -> {{"Op_greater", ">", ln, cn}, (push_back ch1, inp)} end "=" -> {ch1, inp} = get_ch inp {chr1, _, _} = ch1 case chr1 do "=" -> {{"Op_equal", "==", ln, cn}, inp} _ -> {{"Op_assign", "=", ln, cn}, (push_back ch1, inp)} end "!" -> {ch1, inp} = get_ch inp {chr1, _, _} = ch1 case chr1 do "=" -> {{"Op_notequal", "!=", ln, cn}, inp} _ -> {{"Op_not", "!", ln, cn}, (push_back ch1, inp)} end "&" -> {ch1, inp} = get_ch inp {chr1, _, _} = ch1 case chr1 do "&" -> {{"Op_and", "&&", ln, cn}, inp} _ -> unexpected_character ln, cn, chr end "|" -> {ch1, inp} = get_ch inp {chr1, _, _} = ch1 case chr1 do "|" -> {{"Op_or", "||", ln, cn}, inp} _ -> unexpected_character ln, cn, chr end "\"" -> inp = push_back ch, inp scan_string_literal inp "'" -> inp = push_back ch, inp scan_character_literal inp _ -> cond do String.match? chr, ~r/^[[:digit:]]$/u -> inp = push_back ch, inp scan_integer_literal inp String.match? chr, ~r/^[[:alpha:]_]$/u -> inp = push_back ch, inp scan_identifier_or_reserved_word inp true -> unexpected_character ln, cn, chr end end end ###------------------------------------------------------------------- ### ### Skipping past spaces and /* ... */ comments. ### ### Comments are treated exactly like a bit of whitespace. They never ### make it to the dispatcher. ### def skip_spaces_and_comments inp do {ch, inp} = get_ch inp {chr, line_no, column_no} = ch cond do chr == :eof -> push_back ch, inp String.match? chr, ~r/^[[:space:]]$/u -> skip_spaces_and_comments inp chr == "/" -> {ch1, inp} = get_ch inp case ch1 do {"*", _, _} -> inp = scan_comment inp, line_no, column_no skip_spaces_and_comments inp _ -> push_back ch, (push_back ch1, inp) end true -> push_back ch, inp end end def scan_comment inp, line_no, column_no do {ch, inp} = get_ch inp case ch do {:eof, _, _} -> unterminated_comment line_no, column_no {"*", _, _} -> {ch1, inp} = get_ch inp case ch1 do {:eof, _, _} -> unterminated_comment line_no, column_no {"/", _, _} -> inp _ -> scan_comment inp, line_no, column_no end _ -> scan_comment inp, line_no, column_no end end ###------------------------------------------------------------------- ### ### Scanning of integer literals, identifiers, and reserved words. ### ### These three types of token are very similar to each other. ### def scan_integer_literal inp do # Scan an entire word, not just digits. This way we detect # erroneous text such as "23skidoo". {line_no, column_no, inp} = get_position inp {word, inp} = scan_word inp if String.match? word, (~r/^[[:digit:]]+$/u) do {{"Integer", word, line_no, column_no}, inp} else invalid_integer_literal line_no, column_no, word end end def scan_identifier_or_reserved_word inp do # It is assumed that the first character is of the correct type, # thanks to the dispatcher. {line_no, column_no, inp} = get_position inp {word, inp} = scan_word inp tok = case word do "if" -> "Keyword_if" "else" -> "Keyword_else" "while" -> "Keyword_while" "print" -> "Keyword_print" "putc" -> "Keyword_putc" _ -> "Identifier" end {{tok, word, line_no, column_no}, inp} end def scan_word inp, word\\"" do {ch, inp} = get_ch inp {chr, _, _} = ch if String.match? chr, (~r/^[[:alnum:]_]$/u) do scan_word inp, (word <> chr) else {word, (push_back ch, inp)} end end def get_position inp do {ch, inp} = get_ch inp {_, line_no, column_no} = ch inp = push_back ch, inp {line_no, column_no, inp} end ###------------------------------------------------------------------- ### ### Scanning of string literals. ### ### It is assumed that the first character is the opening quote, and ### that the closing quote is the same character. ### def scan_string_literal inp do {ch, inp} = get_ch inp {quote_mark, line_no, column_no} = ch {contents, inp} = scan_str_lit inp, ch {{"String", quote_mark <> contents <> quote_mark, line_no, column_no}, inp} end def scan_str_lit inp, ch, contents\\"" do {quote_mark, line_no, column_no} = ch {ch1, inp} = get_ch inp {chr1, line_no1, column_no1} = ch1 if chr1 == quote_mark do {contents, inp} else case chr1 do :eof -> eoi_in_string_literal line_no, column_no "\n" -> eoln_in_string_literal line_no, column_no "\\" -> {ch2, inp} = get_ch inp {chr2, _, _} = ch2 case chr2 do "n" -> scan_str_lit inp, ch, (contents <> "\\n") "\\" -> scan_str_lit inp, ch, (contents <> "\\\\") _ -> unsupported_escape line_no1, column_no1, chr2 end _ -> scan_str_lit inp, ch, (contents <> chr1) end end end ###------------------------------------------------------------------- ### ### Scanning of character literals. ### ### It is assumed that the first character is the opening quote, and ### that the closing quote is the same character. ### ### The tedious part of scanning a character literal is distinguishing ### between the kinds of lexical error. (One might wish to modify the ### code to detect, as a distinct kind of error, end of line within a ### character literal.) ### def scan_character_literal inp do {ch, inp} = get_ch inp {_, line_no, column_no} = ch {ch1, inp} = get_ch inp {chr1, line_no1, column_no1} = ch1 {intval, inp} = case chr1 do :eof -> unterminated_character_literal line_no, column_no "\\" -> {ch2, inp} = get_ch inp {chr2, _, _} = ch2 case chr2 do :eof -> unterminated_character_literal line_no, column_no "n" -> {(:binary.first "\n"), inp} "\\" -> {(:binary.first "\\"), inp} _ -> unsupported_escape line_no1, column_no1, chr2 end _ -> {(:binary.first chr1), inp} end inp = check_character_literal_end inp, ch {{"Integer", "#{intval}", line_no, column_no}, inp} end def check_character_literal_end inp, ch do {chr, _, _} = ch {{chr1, _, _}, inp} = get_ch inp if chr1 == chr do inp else # Lexical error. find_char_lit_end inp, ch end end def find_char_lit_end inp, ch do {chr, line_no, column_no} = ch {{chr1, _, _}, inp} = get_ch inp if chr1 == chr do multicharacter_literal line_no, column_no else case chr1 do :eof -> unterminated_character_literal line_no, column_no _ -> find_char_lit_end inp, ch end end end ###------------------------------------------------------------------- ### ### Character-at-a-time input, with unrestricted pushback, and with ### line and column numbering. ### def make_inp inpf do {inpf, [], 1, 1} end def get_ch {inpf, pushback, line_no, column_no} do case pushback do [head | tail] -> {head, {inpf, tail, line_no, column_no}} [] -> case IO.read(inpf, 1) do :eof -> {{:eof, line_no, column_no}, {inpf, pushback, line_no, column_no}} {:error, _} -> {{:eof, line_no, column_no}, {inpf, pushback, line_no, column_no}} chr -> case chr do "\n" -> {{chr, line_no, column_no}, {inpf, pushback, line_no + 1, 1}} _ -> {{chr, line_no, column_no}, {inpf, pushback, line_no, column_no + 1}} end end end end def push_back ch, {inpf, pushback, line_no, column_no} do {inpf, [ch | pushback], line_no, column_no} end ###------------------------------------------------------------------- ### ### Lexical and usage errors. ### def unterminated_comment line_no, column_no do raise "#{scriptname()}: unterminated comment at #{line_no}:#{column_no}" end def invalid_integer_literal line_no, column_no, word do raise "#{scriptname()}: invalid integer literal #{word} at #{line_no}:#{column_no}" end def unsupported_escape line_no, column_no, chr do raise "#{scriptname()}: unsupported escape \\#{chr} at #{line_no}:#{column_no}" end def eoi_in_string_literal line_no, column_no do raise "#{scriptname()}: end of input in string literal starting at #{line_no}:#{column_no}" end def eoln_in_string_literal line_no, column_no do raise "#{scriptname()}: end of line in string literal starting at #{line_no}:#{column_no}" end def multicharacter_literal line_no, column_no do raise "#{scriptname()}: unsupported multicharacter literal at #{line_no}:#{column_no}" end def unterminated_character_literal line_no, column_no do raise "#{scriptname()}: unterminated character literal starting at #{line_no}:#{column_no}" end def unexpected_character line_no, column_no, chr do raise "#{scriptname()}: unexpected character '#{chr}' at #{line_no}:#{column_no}" end def usage_error() do IO.puts "Usage: #{scriptname()} [INPUTFILE [OUTPUTFILE]]" IO.puts "If either of INPUTFILE or OUTPUTFILE is not present or is \"-\"," IO.puts "standard input or standard output is used, respectively." exit_status = 2 exit_status end def scriptname() do Path.basename(__ENV__.file) end #--------------------------------------------------------------------- end ## module Lex Lex.main(System.argv)