RosettaCodeData/Task/Compiler-lexical-analyzer/ATS/compiler-lexical-analyzer.ats

(********************************************************************)
(* Usage: lex [INPUTFILE [OUTPUTFILE]]
   If INPUTFILE or OUTPUTFILE is "-" or missing, then standard input
   or standard output is used, respectively. *)

#define ATS_DYNLOADFLAG 0

#include "share/atspre_staload.hats"
staload UN = "prelude/SATS/unsafe.sats"

#define NIL list_nil ()
#define ::  list_cons

%{^
/* alloca(3) is needed for ATS exceptions. */
#include <alloca.h>
%}

(********************************************************************)

#define NUM_TOKENS 31
#define RESERVED_WORD_HASHTAB_SIZE 9

#define TOKEN_ELSE          0
#define TOKEN_IF            1
#define TOKEN_PRINT         2
#define TOKEN_PUTC          3
#define TOKEN_WHILE         4
#define TOKEN_MULTIPLY      5
#define TOKEN_DIVIDE        6
#define TOKEN_MOD           7
#define TOKEN_ADD           8
#define TOKEN_SUBTRACT      9
#define TOKEN_NEGATE       10
#define TOKEN_LESS         11
#define TOKEN_LESSEQUAL    12
#define TOKEN_GREATER      13
#define TOKEN_GREATEREQUAL 14
#define TOKEN_EQUAL        15
#define TOKEN_NOTEQUAL     16
#define TOKEN_NOT          17
#define TOKEN_ASSIGN       18
#define TOKEN_AND          19
#define TOKEN_OR           20
#define TOKEN_LEFTPAREN    21
#define TOKEN_RIGHTPAREN   22
#define TOKEN_LEFTBRACE    23
#define TOKEN_RIGHTBRACE   24
#define TOKEN_SEMICOLON    25
#define TOKEN_COMMA        26
#define TOKEN_IDENTIFIER   27
#define TOKEN_INTEGER      28
#define TOKEN_STRING       29
#define TOKEN_END_OF_INPUT 30

typedef token_t =
  [i : int | TOKEN_ELSE <= i; i <= TOKEN_END_OF_INPUT]
  int i
typedef tokentuple_t = (token_t, String, ullint, ullint)
typedef token_names_vt = @[string][NUM_TOKENS]

vtypedef reserved_words_vt =
  @[String][RESERVED_WORD_HASHTAB_SIZE]
vtypedef reserved_word_tokens_vt =
  @[token_t][RESERVED_WORD_HASHTAB_SIZE]

vtypedef lookups_vt =
  [p_toknames : addr]
  [p_wordtab  : addr]
  [p_toktab   : addr]
  @{
    pf_toknames = token_names_vt @ p_toknames,
    pf_wordtab = reserved_words_vt @ p_wordtab,
    pf_toktab = reserved_word_tokens_vt @ p_toktab |
    toknames = ptr p_toknames,
    wordtab = ptr p_wordtab,
    toktab = ptr p_toktab
  }

fn
reserved_word_lookup
          (s         : String,
           lookups   : !lookups_vt,
           line_no   : ullint,
           column_no : ullint) : tokentuple_t =
  if string_length s < 2 then
    (TOKEN_IDENTIFIER, s, line_no, column_no)
  else
    let
      macdef wordtab = !(lookups.wordtab)
      macdef toktab = !(lookups.toktab)
      val hashval =
        g1uint_mod (g1ofg0 (char2ui s[0] + char2ui s[1]),
                    g1i2u RESERVED_WORD_HASHTAB_SIZE)
      val token = toktab[hashval]
    in
      if token = TOKEN_IDENTIFIER || s <> wordtab[hashval] then
        (TOKEN_IDENTIFIER, s, line_no, column_no)
      else
        (token, s, line_no, column_no)
    end

(********************************************************************)
(* Input allows pushback into a buffer. *)

typedef ch_t =
  @{
    ichar = int,
    line_no = ullint,
    column_no = ullint
  }

typedef inp_t (n : int) =
  [0 <= n]
  @{
    file = FILEref,
    pushback = list (ch_t, n),
    line_no = ullint,
    column_no = ullint
  }
typedef inp_t = [n : int] inp_t n

fn
get_ch (inp : inp_t) : (ch_t, inp_t) =
  case+ (inp.pushback) of
  | NIL =>
    let
      val c = fileref_getc (inp.file)
      val ch =
        @{
          ichar = c,
          line_no = inp.line_no,
          column_no = inp.column_no
        }
    in
      if c = char2i '\n' then
        let
          val inp =
            @{
              file = inp.file,
              pushback = inp.pushback,
              line_no = succ (inp.line_no),
              column_no = 1ULL
            }
        in
          (ch, inp)
        end
      else
        let
          val inp =
            @{
              file = inp.file,
              pushback = inp.pushback,
              line_no = inp.line_no,
              column_no = succ (inp.column_no)
            }
        in
          (ch, inp)
        end
    end
  | ch :: pushback =>
    let
      val inp =
        @{
          file = inp.file,
          pushback = pushback,
          line_no = inp.line_no,
          column_no = inp.column_no
        }
    in
      (ch, inp)
    end

fn
push_back_ch (ch  : ch_t,
              inp : inp_t) : [n : pos] inp_t n =
  let
    prval _ = lemma_list_param (inp.pushback)
  in
    @{
      file = inp.file,
      pushback = ch :: (inp.pushback),
      line_no = inp.line_no,
      column_no = inp.column_no
    }
  end

(********************************************************************)

exception unterminated_comment of (ullint, ullint)
exception unterminated_character_literal of (ullint, ullint)
exception multicharacter_literal of (ullint, ullint)
exception unterminated_string_literal of (ullint, ullint, bool)
exception unsupported_escape of (ullint, ullint, int)
exception invalid_integer_literal of (ullint, ullint, String)
exception unexpected_character of (ullint, ullint, int)

fn
scan_comment (inp       : inp_t,
              line_no   : ullint,
              column_no : ullint) : inp_t =
  let
    fun
    loop (inp : inp_t) : inp_t =
      let
        val (ch, inp) = get_ch inp
      in
        if (ch.ichar) < 0 then
          $raise unterminated_comment (line_no, column_no)
        else if (ch.ichar) = char2i '*' then
          let
            val (ch1, inp) = get_ch inp
          in
            if (ch.ichar) < 0 then
              $raise unterminated_comment (line_no, column_no)
            else if (ch1.ichar) = char2i '/' then
              inp
            else
              loop inp
          end
        else
          loop inp
      end
  in
    loop inp
  end

fn
skip_spaces_and_comments (inp : inp_t) : [n : pos] inp_t n =
  let
    fun
    loop (inp : inp_t) : [n : pos] inp_t n =
      let
        val (ch, inp) = get_ch inp
      in
        if isspace (ch.ichar) then
          loop inp
        else if (ch.ichar) = char2i '/' then
          let
            val (ch1, inp) = get_ch inp
          in
            if (ch1.ichar) = char2i '*' then
              loop (scan_comment (inp, ch.line_no, ch.column_no))
            else
              let
                val inp = push_back_ch (ch1, inp)
                val inp = push_back_ch (ch, inp)
              in
                inp
              end
          end
        else
          push_back_ch (ch, inp)
      end
  in
    loop inp
  end

fn
reverse_list_to_string
          {m   : int}
          (lst : list (char, m)) : string m =
  let
    fun
    fill_array {n : nat | n <= m} .<n>.
               (arr : &(@[char][m + 1]),
                lst : list (char, n),
                n   : size_t n) : void =
      case+ lst of
      | NIL => ()
      | c :: tail =>
        begin
          arr[pred n] := c;
          fill_array (arr, tail, pred n)
        end

    prval _ = lemma_list_param lst
    val m : size_t m = i2sz (list_length lst)
    val (pf, pfgc | p) = array_ptr_alloc<char> (succ m)
    val _ = array_initize_elt<char> (!p, succ m, '\0')
    val _ = fill_array (!p, lst, m)
  in
    $UN.castvwtp0 @(pf, pfgc | p)
  end

extern fun {}
simple_scan$pred : int -> bool
fun {}
simple_scan {u : nat}
            (lst : list (char, u),
             inp : inp_t) :
    [m : nat]
    [n : pos]
    (list (char, m), inp_t n) =
  let
    val (ch, inp) = get_ch inp
  in
    if simple_scan$pred (ch.ichar) then
      simple_scan<> (int2char0 (ch.ichar) :: lst, inp)
    else
      let
        val inp = push_back_ch (ch, inp)
      in
        (lst, inp)
      end
  end

fn
is_ident_start (c : int) :<> bool =
  isalpha (c) || c = char2i '_'

fn
is_ident_continuation (c : int) :<> bool =
  isalnum (c) || c = char2i '_'

fn
scan_identifier_or_reserved_word
          (inp     : inp_t,
           lookups : !lookups_vt) :
    (tokentuple_t, [n : pos] inp_t n) =
  let
    val (ch, inp) = get_ch inp
    val _ = assertloc (is_ident_start (ch.ichar))

    implement simple_scan$pred<> c = is_ident_continuation c
    val (lst, inp) = simple_scan (int2char0 (ch.ichar) :: NIL, inp)

    val s = reverse_list_to_string lst
    val toktup =
      reserved_word_lookup (s, lookups, ch.line_no, ch.column_no)
  in
    (toktup, inp)
  end

fn
scan_integer_literal
          (inp     : inp_t,
           lookups : !lookups_vt) :
    (tokentuple_t, [n : pos] inp_t n) =
  let
    val (ch, inp) = get_ch inp
    val _ = assertloc (isdigit (ch.ichar))

    implement simple_scan$pred<> c = is_ident_continuation c
    val (lst, inp) = simple_scan (int2char0 (ch.ichar) :: NIL, inp)

    val s = reverse_list_to_string lst

    fun
    check_they_are_all_digits
              {n : nat} .<n>.
              (lst : list (char, n)) : void =
      case+ lst of
      | NIL => ()
      | c :: tail =>
        if isdigit c then
          check_they_are_all_digits tail
        else
          $raise invalid_integer_literal (ch.line_no, ch.column_no, s)

    val _ = check_they_are_all_digits lst
  in
    ((TOKEN_INTEGER, s, ch.line_no, ch.column_no), inp)
  end

fn
ichar2integer_literal (c : int) : String0 =
  let
    var buf = @[char][100] ('\0')
    val _ = $extfcall (int, "snprintf", addr@ buf, i2sz 99, "%d", c)
    val s = string1_copy ($UN.castvwtp0{String0} buf)
  in
    strnptr2string s
  end

fn
scan_character_literal_without_checking_end (inp : inp_t) :
    (tokentuple_t, inp_t) =
  let
    val (ch, inp) = get_ch inp
    val _ = assertloc ((ch.ichar) = '\'')

    val (ch1, inp) = get_ch inp
  in
    if (ch1.ichar) < 0 then
      $raise unterminated_character_literal (ch.line_no, ch.column_no)
    else if (ch1.ichar) = char2i '\\' then
      let
        val (ch2, inp) = get_ch inp
      in
        if (ch2.ichar) < 0 then
          $raise unterminated_character_literal (ch.line_no,
                                                 ch.column_no)
        else if (ch2.ichar) = char2i 'n' then
          let
            val s = ichar2integer_literal (char2i '\n')
          in
            ((TOKEN_INTEGER, s, ch.line_no, ch.column_no), inp)
          end
        else if (ch2.ichar) = char2i '\\' then
          let
            val s = ichar2integer_literal (char2i '\\')
          in
            ((TOKEN_INTEGER, s, ch.line_no, ch.column_no), inp)
          end
        else
          $raise unsupported_escape (ch1.line_no, ch1.column_no,
                                     ch2.ichar)
      end
    else
      let
        val s = ichar2integer_literal (ch1.ichar)
      in
        ((TOKEN_INTEGER, s, ch.line_no, ch.column_no), inp)
      end
  end

fn
scan_character_literal (inp : inp_t) : (tokentuple_t, inp_t) =
  let
    val (tok, inp) =
      scan_character_literal_without_checking_end inp
    val line_no = (tok.2)
    val column_no = (tok.3)

    fun
    check_end (inp : inp_t) : inp_t =
      let
        val (ch, inp) = get_ch inp
      in
        if (ch.ichar) = char2i '\'' then
          inp
        else
          let
            fun
            loop_to_end (ch1 : ch_t,
                         inp : inp_t) : inp_t =
              if (ch1.ichar) < 0 then
                $raise unterminated_character_literal (line_no,
                                                       column_no)
              else if (ch1.ichar) = char2i '\'' then
                $raise multicharacter_literal (line_no, column_no)
              else
                let
                  val (ch1, inp) = get_ch inp
                in
                  loop_to_end (ch1, inp)
                end

            val inp = loop_to_end (ch, inp)
          in
            inp
          end
      end

    val inp = check_end inp
  in
    (tok, inp)
  end

fn
scan_string_literal (inp : inp_t) : (tokentuple_t, inp_t) =
  let
    val (ch, inp) = get_ch inp
    val _ = assertloc ((ch.ichar) = '"')

    fun
    scan {u : pos}
         (lst : list (char, u),
          inp : inp_t) :
        [m : pos] (list (char, m), inp_t) =
      let
        val (ch1, inp) = get_ch inp
      in
        if (ch1.ichar) < 0 then
          $raise unterminated_string_literal (ch.line_no,
                                              ch.column_no, false)
        else if (ch1.ichar) = char2i '\n' then
          $raise unterminated_string_literal (ch.line_no,
                                              ch.column_no, true)
        else if (ch1.ichar) = char2i '"' then
          (lst, inp)
        else if (ch1.ichar) <> char2i '\\' then
          scan (int2char0 (ch1.ichar) :: lst, inp)
        else
          let
            val (ch2, inp) = get_ch inp
          in
            if (ch2.ichar) = char2i 'n' then
              scan ('n' :: '\\' :: lst, inp)
            else if (ch2.ichar) = char2i '\\' then
              scan ('\\' :: '\\' :: lst, inp)
            else
              $raise unsupported_escape (ch1.line_no, ch1.column_no,
                                         ch2.ichar)
          end
      end

    val lst = '"' :: NIL
    val (lst, inp) = scan (lst, inp)
    val lst = '"' :: lst
    val s = reverse_list_to_string lst
  in
    ((TOKEN_STRING, s, ch.line_no, ch.column_no), inp)
  end

fn
get_next_token (inp     : inp_t,
                lookups : !lookups_vt) : (tokentuple_t, inp_t) =
  let
    val inp = skip_spaces_and_comments inp
    val (ch, inp) = get_ch inp
    val ln = ch.line_no
    val cn = ch.column_no
  in
    if ch.ichar < 0 then
      ((TOKEN_END_OF_INPUT, "", ln, cn), inp)
    else
      case+ int2char0 (ch.ichar) of
      | ',' => ((TOKEN_COMMA, ",", ln, cn), inp)
      | ';' => ((TOKEN_SEMICOLON, ";", ln, cn), inp)
      | '\(' => ((TOKEN_LEFTPAREN, "(", ln, cn), inp)
      | ')' => ((TOKEN_RIGHTPAREN, ")", ln, cn), inp)
      | '\{' => ((TOKEN_LEFTBRACE, "{", ln, cn), inp)
      | '}' => ((TOKEN_RIGHTBRACE, "}", ln, cn), inp)
      | '*' => ((TOKEN_MULTIPLY, "*", ln, cn), inp)
      | '/' => ((TOKEN_DIVIDE, "/", ln, cn), inp)
      | '%' => ((TOKEN_MOD, "%", ln, cn), inp)
      | '+' => ((TOKEN_ADD, "+", ln, cn), inp)
      | '-' => ((TOKEN_SUBTRACT, "-", ln, cn), inp)
      | '<' =>
        let
          val (ch1, inp) = get_ch inp
        in
          if (ch1.ichar) = char2i '=' then
            ((TOKEN_LESSEQUAL, "<=", ln, cn), inp)
          else
            let
              val inp = push_back_ch (ch1, inp)
            in
              ((TOKEN_LESS, "<", ln, cn), inp)
            end
        end
      | '>' =>
        let
          val (ch1, inp) = get_ch inp
        in
          if (ch1.ichar) = char2i '=' then
            ((TOKEN_GREATEREQUAL, ">=", ln, cn), inp)
          else
            let
              val inp = push_back_ch (ch1, inp)
            in
              ((TOKEN_GREATER, ">", ln, cn), inp)
            end
        end
      | '=' =>
        let
          val (ch1, inp) = get_ch inp
        in
          if (ch1.ichar) = char2i '=' then
            ((TOKEN_EQUAL, "==", ln, cn), inp)
          else
            let
              val inp = push_back_ch (ch1, inp)
            in
              ((TOKEN_ASSIGN, "=", ln, cn), inp)
            end
        end
      | '!' =>
        let
          val (ch1, inp) = get_ch inp
        in
          if (ch1.ichar) = char2i '=' then
            ((TOKEN_NOTEQUAL, "!=", ln, cn), inp)
          else
            let
              val inp = push_back_ch (ch1, inp)
            in
              ((TOKEN_NOT, "!", ln, cn), inp)
            end
        end
      | '&' =>
        let
          val (ch1, inp) = get_ch inp
        in
          if (ch1.ichar) = char2i '&' then
            ((TOKEN_AND, "&&", ln, cn), inp)
          else
            $raise unexpected_character (ch.line_no, ch.column_no,
                                         ch.ichar)
        end
      | '|' =>
        let
          val (ch1, inp) = get_ch inp
        in
          if (ch1.ichar) = char2i '|' then
            ((TOKEN_OR, "||", ln, cn), inp)
          else
            $raise unexpected_character (ch.line_no, ch.column_no,
                                         ch.ichar)
        end
      | '"' =>
        let
          val inp = push_back_ch (ch, inp)
        in
          scan_string_literal inp
        end
      | '\'' =>
        let
          val inp = push_back_ch (ch, inp)
        in
          scan_character_literal inp
        end
      | _ when isdigit (ch.ichar) =>
        let
          val inp = push_back_ch (ch, inp)
        in
          scan_integer_literal (inp, lookups)
        end
      | _ when is_ident_start (ch.ichar) =>
        let
          val inp = push_back_ch (ch, inp)
        in
          scan_identifier_or_reserved_word (inp, lookups)
        end
      | _ => $raise unexpected_character (ch.line_no, ch.column_no,
                                          ch.ichar)
  end

fn
fprint_ullint_rightjust (outf : FILEref,
                         num  : ullint) : void =
  if num < 10ULL then
    fprint! (outf, "    ", num)
  else if num < 100ULL then
    fprint! (outf, "   ", num)
  else if num < 1000ULL then
    fprint! (outf, "  ", num)
  else if num < 10000ULL then
    fprint! (outf, " ", num)
  else
    fprint! (outf, num)

fn
print_token (outf    : FILEref,
             toktup  : tokentuple_t,
             lookups : !lookups_vt) : void =
  let
    macdef toknames = !(lookups.toknames)
    val name = toknames[toktup.0]
    val str = (toktup.1)
    val line_no = (toktup.2)
    val column_no = (toktup.3)

    val _ = fprint_ullint_rightjust (outf, line_no)
    val _ = fileref_puts (outf, " ")
    val _ = fprint_ullint_rightjust (outf, column_no)
    val _ = fileref_puts (outf, "  ")
    val _ = fileref_puts (outf, name)
  in
    begin
      case+ toktup.0 of
      | TOKEN_IDENTIFIER => fprint! (outf, "     ", str)
      | TOKEN_INTEGER => fprint! (outf, "        ", str)
      | TOKEN_STRING => fprint! (outf, "         ", str)
      | _ => ()
    end;

    fileref_putc (outf, '\n')
  end

fn
scan_text (outf    : FILEref,
           inp     : inp_t,
           lookups : !lookups_vt) : void =
  let
    fun
    loop (inp     : inp_t,
          lookups : !lookups_vt) : void =
      let
        val (toktup, inp) = get_next_token (inp, lookups)
      in
        print_token (outf, toktup, lookups);
        if toktup.0 <> TOKEN_END_OF_INPUT then
          loop (inp, lookups)
      end
  in
    loop (inp, lookups)
  end

(********************************************************************)

fn
main_program (inpf : FILEref,
              outf : FILEref) : int =
  let
    (* Using a simple Scheme program, I found the following perfect
       hash for the reserved words, using the sum of the first two
       characters as the hash value. *)
    var reserved_words =
      @[String][RESERVED_WORD_HASHTAB_SIZE]
        ("if", "print", "else", "", "putc", "", "", "while", "")
    var reserved_word_tokens =
      @[token_t][RESERVED_WORD_HASHTAB_SIZE]
        (TOKEN_IF, TOKEN_PRINT, TOKEN_ELSE, TOKEN_IDENTIFIER,
         TOKEN_PUTC, TOKEN_IDENTIFIER, TOKEN_IDENTIFIER, TOKEN_WHILE,
         TOKEN_IDENTIFIER)

    var token_names =
      @[string][NUM_TOKENS]
        ("Keyword_else",
         "Keyword_if",
         "Keyword_print",
         "Keyword_putc",
         "Keyword_while",
         "Op_multiply",
         "Op_divide",
         "Op_mod",
         "Op_add",
         "Op_subtract",
         "Op_negate",
         "Op_less",
         "Op_lessequal",
         "Op_greater",
         "Op_greaterequal",
         "Op_equal",
         "Op_notequal",
         "Op_not",
         "Op_assign",
         "Op_and",
         "Op_or",
         "LeftParen",
         "RightParen",
         "LeftBrace",
         "RightBrace",
         "Semicolon",
         "Comma",
         "Identifier",
         "Integer",
         "String",
         "End_of_input")

    var lookups : lookups_vt =
      @{
        pf_toknames = view@ token_names,
        pf_wordtab = view@ reserved_words,
        pf_toktab = view@ reserved_word_tokens |
        toknames = addr@ token_names,
        wordtab = addr@ reserved_words,
        toktab = addr@ reserved_word_tokens
      }

    val inp =
      @{
        file = inpf,
        pushback = NIL,
        line_no = 1ULL,
        column_no = 1ULL
      }

    val _ = scan_text (outf, inp, lookups)

    val @{
          pf_toknames = pf_toknames,
          pf_wordtab = pf_wordtab,
          pf_toktab = pf_toktab |
          toknames = toknames,
          wordtab = wordtab,
          toktab = toktab
        } = lookups
    prval _ = view@ token_names := pf_toknames
    prval _ = view@ reserved_words := pf_wordtab
    prval _ = view@ reserved_word_tokens := pf_toktab
  in
    0
  end

macdef lex_error = "Lexical error: "

implement
main (argc, argv) =
  let
    val inpfname =
      if 2 <= argc then
        $UN.cast{string} argv[1]
      else
        "-"
    val outfname =
      if 3 <= argc then
        $UN.cast{string} argv[2]
      else
        "-"
  in
    try
      let
        val inpf =
          if (inpfname : string) = "-" then
            stdin_ref
          else
            fileref_open_exn (inpfname, file_mode_r)

        val outf =
          if (outfname : string) = "-" then
            stdout_ref
          else
            fileref_open_exn (outfname, file_mode_w)
      in
        main_program (inpf, outf)
      end
    with
    | ~ unterminated_comment (line_no, column_no) =>
      begin
        fprintln! (stderr_ref, lex_error,
                   "unterminated comment starting at ",
                   line_no, ":", column_no);
        1
      end
    | ~ unterminated_character_literal (line_no, column_no) =>
      begin
        fprintln! (stderr_ref, lex_error,
                   "unterminated character literal starting at ",
                   line_no, ":", column_no);
        1
      end
    | ~ multicharacter_literal (line_no, column_no) =>
      begin
        fprintln! (stderr_ref, lex_error,
                   "unsupported multicharacter literal starting at ",
                   line_no, ":", column_no);
        1
      end
    | ~ unterminated_string_literal (line_no, column_no,
                                     end_of_line) =>
      let
        val s =
          begin
            if end_of_line then
              "end of line"
            else
              "end of input"
          end : String
      in
        fprintln! (stderr_ref, lex_error,
                   "unterminated string literal (", s,
                   ") starting at ", line_no, ":", column_no);
        1
      end
    | ~ unsupported_escape (line_no, column_no, c) =>
      begin
        fprintln! (stderr_ref, lex_error,
                   "unsupported escape \\",
                   int2char0 c, " starting at ",
                   line_no, ":", column_no);
        1
      end
    | ~ invalid_integer_literal (line_no, column_no, s) =>
      begin
        fprintln! (stderr_ref, lex_error,
                   "invalid integer literal ", s,
                   " starting at ", line_no, ":", column_no);
        1
      end
    | ~ unexpected_character (line_no, column_no, c) =>
      begin
        fprintln! (stderr_ref, lex_error,
                   "unexpected character '", int2char0 c,
                   "' at ", line_no, ":", column_no);
        1
      end
end

(********************************************************************)