RosettaCodeData/Task/Compiler-lexical-analyzer/Erlang/compiler-lexical-analyzer.erl

#!/bin/env escript
%%%-------------------------------------------------------------------

-record (inp_t, {inpf, pushback, line_no, column_no}).

main (Args) ->
   main_program (Args).

main_program ([]) ->
   scan_from_inpf_to_outf ("-", "-"),
   halt (0);
main_program ([Inpf_filename]) ->
   scan_from_inpf_to_outf (Inpf_filename, "-"),
   halt (0);
main_program ([Inpf_filename, Outf_filename]) ->
   scan_from_inpf_to_outf (Inpf_filename, Outf_filename),
   halt (0);
main_program ([_, _ | _]) ->
   ProgName = escript:script_name (),
   io:put_chars (standard_error, "Usage: "),
   io:put_chars (standard_error, ProgName),
   io:put_chars (standard_error, " [INPUTFILE [OUTPUTFILE]]\n"),
   halt (1).

scan_from_inpf_to_outf ("-", "-") ->
   scan_input (standard_io, standard_io);
scan_from_inpf_to_outf (Inpf_filename, "-") ->
   case file:open (Inpf_filename, [read]) of
      {ok, Inpf} -> scan_input (Inpf, standard_io);
      _ -> open_failure (Inpf_filename, "input")
   end;
scan_from_inpf_to_outf ("-", Outf_filename) ->
   case file:open (Outf_filename, [write]) of
      {ok, Outf} -> scan_input (standard_io, Outf);
      _ -> open_failure (Outf_filename, "output")
   end;
scan_from_inpf_to_outf (Inpf_filename, Outf_filename) ->
   case file:open(Inpf_filename, [read]) of
      {ok, Inpf} ->
         case file:open (Outf_filename, [write]) of
            {ok, Outf} -> scan_input (Inpf, Outf);
            _ -> open_failure (Outf_filename, "output")
         end;
      _ -> open_failure (Inpf_filename, "input")
   end.

open_failure (Filename, ForWhat) ->
   ProgName = escript:script_name (),
   io:put_chars (standard_error, ProgName),
   io:put_chars (standard_error, ": failed to open \""),
   io:put_chars (standard_error, Filename),
   io:put_chars (standard_error, "\" for "),
   io:put_chars (standard_error, ForWhat),
   io:put_chars (standard_error, "\n"),
   halt (1).

scan_input (Inpf, Outf) ->
   scan_text (Outf, make_inp (Inpf)).

scan_text (Outf, Inp) ->
   {TokTup, Inp1} = get_next_token (Inp),
   print_token (Outf, TokTup),
   case TokTup of
      {"End_of_input", _, _, _} -> ok;
      _ -> scan_text (Outf, Inp1)
   end.

print_token (Outf, {Tok, Arg, Line_no, Column_no}) ->
   S_line_no = erlang:integer_to_list (Line_no),
   S_column_no = erlang:integer_to_list (Column_no),
   io:put_chars (Outf, string:pad (S_line_no, 5, leading)),
   io:put_chars (Outf, " "),
   io:put_chars (Outf, string:pad (S_column_no, 5, leading)),
   io:put_chars (Outf, "  "),
   io:put_chars (Outf, Tok),
   {Padding, Arg1} =
      case Tok of
         "Identifier" -> {"     ", Arg};
         "Integer" -> {"        ", Arg};
         "String" -> {"         ", Arg};
         _ -> {"", ""}
      end,
   io:put_chars (Outf, Padding),
   io:put_chars (Outf, Arg1),
   io:put_chars ("\n").

%%%-------------------------------------------------------------------
%%%
%%% The token dispatcher.
%%%

get_next_token (Inp) ->
   Inp00 = skip_spaces_and_comments (Inp),
   {Ch, Inp0} = get_ch (Inp00),
   {Char, Line_no, Column_no} = Ch,
   Ln = Line_no,
   Cn = Column_no,
   case Char of
      eof -> {{"End_of_input", "", Ln, Cn}, Inp0};
      "," -> {{"Comma", ",", Ln, Cn}, Inp0};
      ";" -> {{"Semicolon", ";", Ln, Cn}, Inp0};
      "(" -> {{"LeftParen", "(", Ln, Cn}, Inp0};
      ")" -> {{"RightParen", ")", Ln, Cn}, Inp0};
      "{" -> {{"LeftBrace", "{", Ln, Cn}, Inp0};
      "}" -> {{"RightBrace", "}", Ln, Cn}, Inp0};
      "*" -> {{"Op_multiply", "*", Ln, Cn}, Inp0};
      "/" -> {{"Op_divide", "/", Ln, Cn}, Inp0};
      "%" -> {{"Op_mod", "%", Ln, Cn}, Inp0};
      "+" -> {{"Op_add", "+", Ln, Cn}, Inp0};
      "-" -> {{"Op_subtract", "-", Ln, Cn}, Inp0};
      "<" ->
         {Ch1, Inp1} = get_ch (Inp0),
         {Char1, _, _} = Ch1,
         case Char1 of
            "=" -> {{"Op_lessequal", "<=", Ln, Cn}, Inp1};
            _ -> {{"Op_less", "<", Ln, Cn}, push_back (Ch1, Inp1)}
         end;
      ">" ->
         {Ch1, Inp1} = get_ch (Inp0),
         {Char1, _, _} = Ch1,
         case Char1 of
            "=" -> {{"Op_greaterequal", ">=", Ln, Cn}, Inp1};
            _ -> {{"Op_greater", ">", Ln, Cn}, push_back (Ch1, Inp1)}
         end;
      "=" ->
         {Ch1, Inp1} = get_ch (Inp0),
         {Char1, _, _} = Ch1,
         case Char1 of
            "=" -> {{"Op_equal", "==", Ln, Cn}, Inp1};
            _ -> {{"Op_assign", "=", Ln, Cn}, push_back (Ch1, Inp1)}
         end;
      "!" ->
         {Ch1, Inp1} = get_ch (Inp0),
         {Char1, _, _} = Ch1,
         case Char1 of
            "=" -> {{"Op_notequal", "!=", Ln, Cn}, Inp1};
            _ -> {{"Op_not", "!", Ln, Cn}, push_back (Ch1, Inp1)}
         end;
      "&" ->
         {Ch1, Inp1} = get_ch (Inp0),
         {Char1, _, _} = Ch1,
         case Char1 of
            "&" -> {{"Op_and", "&&", Ln, Cn}, Inp1};
            _ -> unexpected_character (Ln, Cn, Char)
         end;
      "|" ->
         {Ch1, Inp1} = get_ch (Inp0),
         {Char1, _, _} = Ch1,
         case Char1 of
            "|" -> {{"Op_or", "||", Ln, Cn}, Inp1};
            _ -> unexpected_character (Ln, Cn, Char)
         end;
      "\"" ->
         Inp1 = push_back (Ch, Inp0),
         scan_string_literal (Inp1);
      "'" ->
         Inp1 = push_back (Ch, Inp0),
         scan_character_literal (Inp1);
      _ ->
         case is_digit (Char) of
            true ->
               Inp1 = push_back (Ch, Inp0),
               scan_integer_literal (Inp1);
            false ->
               case is_alpha_or_underscore (Char) of
                  true ->
                     Inp1 = push_back (Ch, Inp0),
                     scan_identifier_or_reserved_word (Inp1);
                  false ->
                     unexpected_character (Ln, Cn, Char)
               end
         end
   end.

%%%-------------------------------------------------------------------
%%%
%%% Skipping past spaces and /* ... */ comments.
%%%
%%% Comments are treated exactly like a bit of whitespace. They never
%%% make it to the dispatcher.
%%%

skip_spaces_and_comments (Inp) ->
   {Ch, Inp0} = get_ch (Inp),
   {Char, Line_no, Column_no} = Ch,
   case classify_char (Char) of
      eof -> push_back (Ch, Inp0);
      space -> skip_spaces_and_comments (Inp0);
      slash ->
         {Ch1, Inp1} = get_ch (Inp0),
         case Ch1 of
            {"*", _, _} ->
               Inp2 = scan_comment (Inp1, Line_no, Column_no),
               skip_spaces_and_comments (Inp2);
            _ -> push_back (Ch, (push_back (Ch1, Inp1)))
         end;
      other -> push_back (Ch, Inp0)
   end.

classify_char (Char) ->
   case Char of
      eof -> eof;
      "/" -> slash;
      _ -> case is_space (Char) of
              true -> space;
              false -> other
           end
   end.

scan_comment (Inp, Line_no, Column_no) ->
   {Ch0, Inp0} = get_ch (Inp),
   case Ch0 of
      {eof, _, _} -> unterminated_comment (Line_no, Column_no);
      {"*", _, _} ->
         {Ch1, Inp1} = get_ch (Inp0),
         case Ch1 of
            {eof, _, _} ->
               unterminated_comment (Line_no, Column_no);
            {"/", _, _} -> Inp1;
            _ -> scan_comment (Inp1, Line_no, Column_no)
         end;
      _ -> scan_comment (Inp0, Line_no, Column_no)
   end.

is_space (S) ->
   case re:run (S, "^[[:space:]]+$") of
      {match, _} -> true;
      _ -> false
   end.

%%%-------------------------------------------------------------------
%%%
%%% Scanning of integer literals, identifiers, and reserved words.
%%%
%%% These three types of token are very similar to each other.
%%%

scan_integer_literal (Inp) ->
   %% Scan an entire word, not just digits. This way we detect
   %% erroneous text such as "23skidoo".
   {Line_no, Column_no, Inp1} = get_position (Inp),
   {Word, Inp2} = scan_word (Inp1),
   case is_digit (Word) of
      true -> {{"Integer", Word, Line_no, Column_no}, Inp2};
      false -> invalid_integer_literal (Line_no, Column_no, Word)
   end.

scan_identifier_or_reserved_word (Inp) ->
   %% It is assumed that the first character is of the correct type,
   %% thanks to the dispatcher.
   {Line_no, Column_no, Inp1} = get_position (Inp),
   {Word, Inp2} = scan_word (Inp1),
   Tok =
      case Word of
         "if" -> "Keyword_if";
         "else" -> "Keyword_else";
         "while" -> "Keyword_while";
         "print" -> "Keyword_print";
         "putc" -> "Keyword_putc";
         _ -> "Identifier"
      end,
   {{Tok, Word, Line_no, Column_no}, Inp2}.

scan_word (Inp) ->
   scan_word_loop (Inp, "").

scan_word_loop (Inp, Word0) ->
   {Ch1, Inp1} = get_ch (Inp),
   {Char1, _, _} = Ch1,
   case is_alnum_or_underscore (Char1) of
      true -> scan_word_loop (Inp1, Word0 ++ Char1);
      false -> {Word0, push_back (Ch1, Inp1)}
   end.

get_position (Inp) ->
   {Ch1, Inp1} = get_ch (Inp),
   {_, Line_no, Column_no} = Ch1,
   Inp2 = push_back (Ch1, Inp1),
   {Line_no, Column_no, Inp2}.

is_digit (S) ->
   case re:run (S, "^[[:digit:]]+$") of
      {match, _} -> true;
      _ -> false
   end.

is_alpha_or_underscore (S) ->
   case re:run (S, "^[[:alpha:]_]+$") of
      {match, _} -> true;
      _ -> false
   end.

is_alnum_or_underscore (S) ->
   case re:run (S, "^[[:alnum:]_]+$") of
      {match, _} -> true;
      _ -> false
   end.

%%%-------------------------------------------------------------------
%%%
%%% Scanning of string literals.
%%%
%%% It is assumed that the first character is the opening quote, and
%%% that the closing quote is the same character.
%%%


scan_string_literal (Inp) ->
   {Ch1, Inp1} = get_ch (Inp),
   {Quote_mark, Line_no, Column_no} = Ch1,
   {Contents, Inp2} = scan_str_lit (Inp1, Ch1),
   Toktup = {"String", Quote_mark ++ Contents ++ Quote_mark,
             Line_no, Column_no},
   {Toktup, Inp2}.

scan_str_lit (Inp, Ch) -> scan_str_lit_loop (Inp, Ch, "").

scan_str_lit_loop (Inp, Ch, Contents) ->
   {Quote_mark, Line_no, Column_no} = Ch,
   {Ch1, Inp1} = get_ch (Inp),
   {Char1, Line_no1, Column_no1} = Ch1,
   case Char1 of
      Quote_mark -> {Contents, Inp1};
      eof -> eoi_in_string_literal (Line_no, Column_no);
      "\n" -> eoln_in_string_literal (Line_no, Column_no);
      "\\" ->
         {Ch2, Inp2} = get_ch (Inp1),
         {Char2, _, _} = Ch2,
         case Char2 of
            "n" ->
               scan_str_lit_loop (Inp2, Ch, Contents ++ "\\n");
            "\\" ->
               scan_str_lit_loop (Inp2, Ch, Contents ++ "\\\\");
            _ ->
               unsupported_escape (Line_no1, Column_no1, Char2)
         end;
      _ -> scan_str_lit_loop (Inp1, Ch, Contents ++ Char1)
   end.

%%%-------------------------------------------------------------------
%%%
%%% Scanning of character literals.
%%%
%%% It is assumed that the first character is the opening quote, and
%%% that the closing quote is the same character.
%%%
%%% The tedious part of scanning a character literal is distinguishing
%%% between the kinds of lexical error. (One might wish to modify the
%%% code to detect, as a distinct kind of error, end of line within a
%%% character literal.)
%%%

scan_character_literal (Inp) ->
   {Ch, Inp0} = get_ch (Inp),
   {_, Line_no, Column_no} = Ch,
   {Ch1, Inp1} = get_ch (Inp0),
   {Char1, Line_no1, Column_no1} = Ch1,
   {Intval, Inp3} =
      case Char1 of
         eof -> unterminated_character_literal (Line_no, Column_no);
         "\\" ->
            {Ch2, Inp2} = get_ch (Inp1),
            {Char2, _, _} = Ch2,
            case Char2 of
               eof -> unterminated_character_literal (Line_no,
                                                      Column_no);
               "n" -> {char_to_code ("\n"), Inp2};
               "\\" -> {char_to_code ("\\"), Inp2};
               _ -> unsupported_escape (Line_no1, Column_no1,
                                        Char2)
            end;
         _ -> {char_to_code (Char1), Inp1}
      end,
   Inp4 = check_character_literal_end (Inp3, Ch),
   {{"Integer", Intval, Line_no, Column_no}, Inp4}.

char_to_code (Char) ->
   %% Hat tip to https://archive.ph/BxZRS
   lists:flatmap (fun erlang:integer_to_list/1, Char).

check_character_literal_end (Inp, Ch) ->
   {Char, _, _} = Ch,
   {{Char1, _, _}, Inp1} = get_ch (Inp),
   case Char1 of
      Char -> Inp1;
      _ -> find_char_lit_end (Inp1, Ch)    % Handle a lexical error.
   end.

find_char_lit_end (Inp, Ch) ->
   %% There is a lexical error. Determine which kind it fits into.
   {Char, Line_no, Column_no} = Ch,
   {{Char1, _, _}, Inp1} = get_ch (Inp),
   case Char1 of
      Char -> multicharacter_literal (Line_no, Column_no);
      eof -> unterminated_character_literal (Line_no, Column_no);
      _ -> find_char_lit_end (Inp1, Ch)
   end.

%%%-------------------------------------------------------------------
%%%
%%% Character-at-a-time input, with unrestricted pushback, and with
%%% line and column numbering.
%%%

make_inp (Inpf) ->
   #inp_t{inpf = Inpf,
          pushback = [],
          line_no = 1,
          column_no = 1}.

get_ch (Inp) ->
   #inp_t{inpf = Inpf,
          pushback = Pushback,
          line_no = Line_no,
          column_no = Column_no} = Inp,
   case Pushback of
      [Ch | Tail] ->
         Inp1 = Inp#inp_t{pushback = Tail},
         {Ch, Inp1};
      [] ->
         case io:get_chars (Inpf, "", 1) of
            eof ->
               Ch = {eof, Line_no, Column_no},
               {Ch, Inp};
            {error, _} ->
               Ch = {eof, Line_no, Column_no},
               {Ch, Inp};
            Char ->
               case Char of
                  "\n" ->
                     Ch = {Char, Line_no, Column_no},
                     Inp1 = Inp#inp_t{line_no = Line_no + 1,
                                      column_no = 1},
                     {Ch, Inp1};
                  _ ->
                     Ch = {Char, Line_no, Column_no},
                     Inp1 =
                        Inp#inp_t{column_no = Column_no + 1},
                     {Ch, Inp1}
               end
         end
   end.

push_back (Ch, Inp) ->
   Inp#inp_t{pushback = [Ch | Inp#inp_t.pushback]}.

%%%-------------------------------------------------------------------

invalid_integer_literal (Line_no, Column_no, Word) ->
   error_abort ("invalid integer literal \"" ++
                   Word ++ "\" at " ++
                   integer_to_list (Line_no) ++ ":" ++
                   integer_to_list (Column_no)).

unsupported_escape (Line_no, Column_no, Char) ->
   error_abort ("unsupported escape \\" ++
                   Char ++ " at " ++
                   integer_to_list (Line_no) ++ ":" ++
                   integer_to_list (Column_no)).

unexpected_character (Line_no, Column_no, Char) ->
   error_abort ("unexpected character '" ++
                   Char ++ "' at " ++
                   integer_to_list (Line_no) ++ ":" ++
                   integer_to_list (Column_no)).

eoi_in_string_literal (Line_no, Column_no) ->
   error_abort ("end of input in string literal starting at " ++
                   integer_to_list (Line_no) ++ ":" ++
                   integer_to_list (Column_no)).

eoln_in_string_literal (Line_no, Column_no) ->
   error_abort ("end of line in string literal starting at " ++
                   integer_to_list (Line_no) ++ ":" ++
                   integer_to_list (Column_no)).

unterminated_character_literal (Line_no, Column_no) ->
   error_abort ("unterminated character literal starting at " ++
                   integer_to_list (Line_no) ++ ":" ++
                   integer_to_list (Column_no)).

multicharacter_literal (Line_no, Column_no) ->
   error_abort ("unsupported multicharacter literal starting at " ++
                   integer_to_list (Line_no) ++ ":" ++
                   integer_to_list (Column_no)).

unterminated_comment (Line_no, Column_no) ->
   error_abort ("unterminated comment starting at " ++
                   integer_to_list (Line_no) ++ ":" ++
                   integer_to_list (Column_no)).

error_abort (Message) ->
   ProgName = escript:script_name (),
   io:put_chars (standard_error, ProgName),
   io:put_chars (standard_error, ": "),
   io:put_chars (standard_error, Message),
   io:put_chars (standard_error, "\n"),
   halt (1).

%%%-------------------------------------------------------------------
%%% Instructions to GNU Emacs --
%%% local variables:
%%% mode: erlang
%%% erlang-indent-level: 3
%%% end:
%%%-------------------------------------------------------------------