RosettaCodeData/Task/Compiler-lexical-analyzer/ObjectIcon/compiler-lexical-analyzer.oi

# -*- ObjectIcon -*-
#
# The Rosetta Code lexical analyzer in Object Icon. Based upon the ATS
# implementation.
#
# Usage: lex [INPUTFILE [OUTPUTFILE]]
#   If INPUTFILE or OUTPUTFILE is "-" or missing, then standard input
#   or standard output is used, respectively. *)
#

import io

$define EOF -1

$define TOKEN_ELSE          0
$define TOKEN_IF            1
$define TOKEN_PRINT         2
$define TOKEN_PUTC          3
$define TOKEN_WHILE         4
$define TOKEN_MULTIPLY      5
$define TOKEN_DIVIDE        6
$define TOKEN_MOD           7
$define TOKEN_ADD           8
$define TOKEN_SUBTRACT      9
$define TOKEN_NEGATE       10
$define TOKEN_LESS         11
$define TOKEN_LESSEQUAL    12
$define TOKEN_GREATER      13
$define TOKEN_GREATEREQUAL 14
$define TOKEN_EQUAL        15
$define TOKEN_NOTEQUAL     16
$define TOKEN_NOT          17
$define TOKEN_ASSIGN       18
$define TOKEN_AND          19
$define TOKEN_OR           20
$define TOKEN_LEFTPAREN    21
$define TOKEN_RIGHTPAREN   22
$define TOKEN_LEFTBRACE    23
$define TOKEN_RIGHTBRACE   24
$define TOKEN_SEMICOLON    25
$define TOKEN_COMMA        26
$define TOKEN_IDENTIFIER   27
$define TOKEN_INTEGER      28
$define TOKEN_STRING       29
$define TOKEN_END_OF_INPUT 30

global whitespace
global ident_start
global ident_continuation

procedure main(args)
  local inpf, outf
  local pushback_buffer, inp, pushback

  initial {
    whitespace := ' \t\v\f\r\n'
    ident_start := '_' ++ &letters
    ident_continuation := ident_start ++ &digits
  }

  inpf := FileStream.stdin
  outf := FileStream.stdout
  if 1 <= *args & args[1] ~== "-" then {
    inpf := FileStream(args[1], FileOpt.RDONLY) | stop(&why)
  }
  if 2 <= *args & args[2] ~== "-" then {
    outf := FileStream(args[2], ior(FileOpt.WRONLY,
                                    FileOpt.TRUNC,
                                    FileOpt.CREAT)) | stop(&why)
  }

  pushback_buffer := []
  inp := create inputter(inpf, pushback_buffer)
  pushback := create repeat push(pushback_buffer, \@&source)
  @pushback                     #  The first invocation does nothing.

  scan_text(outf, inp, pushback)
end

procedure scan_text(outf, inp, pushback)
  local ch

  while /ch | ch[1] ~=== EOF do {
    skip_spaces_and_comments(inp, pushback)
    ch := @inp
    if ch[1] === EOF then {
      print_token(outf, [TOKEN_END_OF_INPUT, "", ch[2], ch[3]])
    } else {
      ch @pushback
      print_token(outf, get_next_token(inp, pushback))
    }
  }
end

procedure get_next_token(inp, pushback)
  local ch, ch1
  local ln, cn

  skip_spaces_and_comments(inp, pushback)
  ch := @inp
  ln := ch[2]                   # line number
  cn := ch[3]                   # column number
  case ch[1] of {
    "," : return [TOKEN_COMMA, ",", ln, cn]
    ";" : return [TOKEN_SEMICOLON, ";", ln, cn]
    "(" : return [TOKEN_LEFTPAREN, "(", ln, cn]
    ")" : return [TOKEN_RIGHTPAREN, ")", ln, cn]
    "{" : return [TOKEN_LEFTBRACE, "{", ln, cn]
    "}" : return [TOKEN_RIGHTBRACE, "}", ln, cn]
    "*" : return [TOKEN_MULTIPLY, "*", ln, cn]
    "/" : return [TOKEN_DIVIDE, "/", ln, cn]
    "%" : return [TOKEN_MOD, "%", ln, cn]
    "+" : return [TOKEN_ADD, "+", ln, cn]
    "-" : return [TOKEN_SUBTRACT, "-", ln, cn]
    "<" : {
      ch1 := @inp
      if ch1[1] === "=" then {
        return [TOKEN_LESSEQUAL, "<=", ln, cn]
      } else {
        ch1 @pushback
        return [TOKEN_LESS, "<", ln, cn]
      }
    }
    ">" : {
      ch1 := @inp
      if ch1[1] === "=" then {
        return [TOKEN_GREATEREQUAL, ">=", ln, cn]
      } else {
        ch1 @pushback
        return [TOKEN_GREATER, ">", ln, cn]
      }
    }
    "=" : {
      ch1 := @inp
      if ch1[1] === "=" then {
        return [TOKEN_EQUAL, "==", ln, cn]
      } else {
        ch1 @pushback
        return [TOKEN_ASSIGN, "=", ln, cn]
      }
    }
    "!" : {
      ch1 := @inp
      if ch1[1] === "=" then {
        return [TOKEN_NOTEQUAL, "!=", ln, cn]
      } else {
        ch1 @pushback
        return [TOKEN_NOT, "!", ln, cn]
      }
    }
    "&" : {
      ch1 := @inp
      if ch1[1] === "&" then {
        return [TOKEN_AND, "&&", ln, cn]
      } else {
        unexpected_character(ln, cn, ch)
      }
    }
    "|" : {
      ch1 := @inp
      if ch1[1] === "|" then {
        return [TOKEN_OR, "||", ln, cn]
      } else {
        unexpected_character(ln, cn, ch)
      }
    }
    "\"" : {
      ch @pushback
      return scan_string_literal(inp)
    }
    "'" : {
      ch @pushback
      return scan_character_literal(inp, pushback)
    }
    default : {
      if any(&digits, ch[1]) then {
        ch @pushback
        return scan_integer_literal(inp, pushback)
      } else if any(ident_start, ch[1]) then {
        ch @pushback
        return scan_identifier_or_reserved_word (inp, pushback)
      } else {
        unexpected_character(ln, cn, ch)
      }
    }
  }
end

procedure scan_identifier_or_reserved_word(inp, pushback)
  local ch
  local s
  local line_no, column_no

  s := ""
  ch := @inp
  line_no := ch[2]
  column_no := ch[3]
  while EOF ~=== ch[1] & any(ident_continuation, ch[1]) do {
    s ||:= ch[1]
    ch := @inp
  }
  ch @pushback
  return reserved_word_lookup (s, line_no, column_no)
end

procedure scan_integer_literal(inp, pushback)
  local ch
  local s
  local line_no, column_no

  s := ""
  ch := @inp
  line_no := ch[2]
  column_no := ch[3]
  while EOF ~=== ch[1] & any(ident_continuation, ch[1]) do {
    s ||:= ch[1]
    ch := @inp
  }
  ch @pushback
  not upto(~&digits, s) | invalid_integer_literal(line_no, column_no, s)
  return [TOKEN_INTEGER, s, line_no, column_no]
end

procedure scan_character_literal(inp, pushback)
  local ch, ch1
  local close_quote
  local toktup
  local line_no, column_no

  ch := @inp                    # The opening quote.
  close_quote := ch[1]          # Same as the opening quote.
  ch @pushback

  line_no := ch[2]
  column_no := ch[3]

  toktup := scan_character_literal_without_checking_end(inp)
  ch1 := @inp
  if ch1[1] ~=== close_quote then {
    repeat {
      case ch1[1] of {
        EOF : unterminated_character_literal(line_no, column_no)
        close_quote : multicharacter_literal(line_no, column_no)
        default : ch1 := @inp
      }
    }
  }
  return toktup
end

procedure scan_character_literal_without_checking_end(inp)
  local ch, ch1, ch2

  ch := @inp                    # The opening quote.
  ch1 := @inp
  EOF ~=== ch1[1] | unterminated_character_literal(ch[2], ch[3])
  if ch1[1] == "\\" then {
    ch2 := @inp
    EOF ~=== ch2[1] | unterminated_character_literal(ch[2], ch[3])
    case ch2[1] of {
      "n" : return [TOKEN_INTEGER, string(ord("\n")), ch[2], ch[3]]
      "\\" : return [TOKEN_INTEGER, string(ord("\\")), ch[2], ch[3]]
      default : unsupported_escape(ch1[2], ch1[3], ch2)
    }
  } else {
    return [TOKEN_INTEGER, string(ord(ch1[1])), ch[2], ch[3]]
  }
end

procedure scan_string_literal(inp)
  local ch, ch1, ch2
  local line_no, column_no
  local close_quote
  local s
  local retval

  ch := @inp                    # The opening quote
  close_quote := ch[1]          # Same as the opening quote.
  line_no := ch[2]
  column_no := ch[3]

  s := ch[1]
  until \retval do {
    ch1 := @inp
    ch1[1] ~=== EOF |
        unterminated_string_literal (line_no, column_no,
                                     "end of input")
    ch1[1] ~== "\n" |
        unterminated_string_literal (line_no, column_no,
                                     "end of line")
    if ch1[1] == close_quote then {
      retval := [TOKEN_STRING, s || close_quote, line_no, column_no]
    } else if ch1[1] ~== "\\" then {
      s ||:= ch1[1]
    } else {
      ch2 := @inp
      EOF ~=== ch2[1] | unsupported_escape(line_no, column_no, ch2)
      case ch2[1] of {
        "n" : s ||:= "\\n"
        "\\" : s ||:= "\\\\"
        default : unsupported_escape(line_no, column_no, ch2)
      }
    }
  }
  return retval
end

procedure skip_spaces_and_comments(inp, pushback)
  local ch, ch1

  repeat {
    ch := @inp
    (EOF === ch[1]) & { ch @pushback; return }
    if not any(whitespace, ch[1]) then {
      (ch[1] == "/") | { ch @pushback; return }
      (ch1 := @inp) | { ch @pushback; return }
      (ch1[1] == "*") | { ch1 @pushback; ch @pushback; return }
      scan_comment(inp, ch[2], ch[3])
    }
  }
end

procedure scan_comment(inp, line_no, column_no)
  local ch, ch1

  until (\ch)[1] == "*" & (\ch1)[1] == "/" do {
    ch := @inp
    (EOF === ch[1]) & unterminated_comment(line_no, column_no)
    if ch[1] == "*" then {
      ch1 := @inp
      (EOF === ch[1]) & unterminated_comment(line_no, column_no)
    }
  }
  return
end

procedure reserved_word_lookup(s, line_no, column_no)

  # Lookup is by an extremely simple perfect hash.

  static reserved_words
  static reserved_word_tokens
  local hashval, token, toktup

  initial {
    reserved_words := ["if", "print", "else",
                       "", "putc", "",
                       "", "while", ""]
    reserved_word_tokens :=
        [TOKEN_IF, TOKEN_PRINT, TOKEN_ELSE,
         TOKEN_IDENTIFIER, TOKEN_PUTC, TOKEN_IDENTIFIER,
         TOKEN_IDENTIFIER, TOKEN_WHILE, TOKEN_IDENTIFIER]
  }

  if *s < 2 then {
    toktup := [TOKEN_IDENTIFIER, s, line_no, column_no]
  } else {
    hashval := ((ord(s[1]) + ord(s[2])) % (*reserved_words)) + 1
    token := reserved_word_tokens[hashval]
    if token = TOKEN_IDENTIFIER | s ~== reserved_words[hashval] then {
      toktup := [TOKEN_IDENTIFIER, s, line_no, column_no]
    } else {
      toktup := [token, s, line_no, column_no]
    }
  }
  return toktup
end

procedure print_token(outf, toktup)
  static token_names
  local s_line, s_column

  initial {
    token_names := ["Keyword_else",
                    "Keyword_if",
                    "Keyword_print",
                    "Keyword_putc",
                    "Keyword_while",
                    "Op_multiply",
                    "Op_divide",
                    "Op_mod",
                    "Op_add",
                    "Op_subtract",
                    "Op_negate",
                    "Op_less",
                    "Op_lessequal",
                    "Op_greater",
                    "Op_greaterequal",
                    "Op_equal",
                    "Op_notequal",
                    "Op_not",
                    "Op_assign",
                    "Op_and",
                    "Op_or",
                    "LeftParen",
                    "RightParen",
                    "LeftBrace",
                    "RightBrace",
                    "Semicolon",
                    "Comma",
                    "Identifier",
                    "Integer",
                    "String",
                    "End_of_input"]
  }

  /outf := FileStream.stdout
  s_line := string(toktup[3])
  s_column := string(toktup[4])
  writes(outf, right (s_line, max(5, *s_line)))
  writes(outf, " ")
  writes(outf, right (s_column, max(5, *s_column)))
  writes(outf, "  ")
  writes(outf, token_names[toktup[1] + 1])
  case toktup[1] of {
    TOKEN_IDENTIFIER : writes(outf, "     ", toktup[2])
    TOKEN_INTEGER : writes(outf, "        ", toktup[2])
    TOKEN_STRING : writes(outf, "         ", toktup[2])
  }
  write(outf)
  return
end

procedure inputter(inpf, pushback_buffer)
  local buffer
  local line_no, column_no
  local c

  buffer := ""
  line_no := 1
  column_no := 1

  repeat {
      buffer? {
        until *pushback_buffer = 0 & pos(0) do {
          if *pushback_buffer ~= 0 then {
            suspend pop(pushback_buffer)
          } else {
            c := move(1)
            suspend [c, line_no, column_no]
            if c == "\n" then {
              line_no +:= 1
              column_no := 1
            } else {
              column_no +:= 1
            }
          }
        }
      }
      (buffer := reads(inpf, 2048)) |
          suspend [EOF, line_no, column_no]
    }
end

procedure unterminated_comment(line_no, column_no)
  error("unterminated comment starting at ",
        line_no, ":", column_no)
end

procedure unexpected_character(line_no, column_no, ch)
  error("unexpected character '", ch[1], "' starting at ",
        line_no, ":", column_no)
end

procedure unterminated_string_literal (line_no, column_no, cause)
  error("unterminated string literal (", cause, ") starting at ",
        line_no, ":", column_no)
end

procedure unsupported_escape (line_no, column_no, ch)
  if ch[1] === EOF then {
    error("unexpected \\ at end of input",
          " starting at ", line_no, ":", column_no)
  } else {
    error("unsupported escape \\", ch[1],
          " starting at ", line_no, ":", column_no)
  }
end

procedure invalid_integer_literal(line_no, column_no, s)
  error("invalid integer literal ", s,
        " starting at ", line_no, ":", column_no)
end

procedure unterminated_character_literal(line_no, column_no)
  error("unterminated character literal starting at ",
        line_no, ":", column_no)
end

procedure multicharacter_literal(line_no, column_no)
  error("unsupported multicharacter literal starting at ",
        line_no, ":", column_no)
end

procedure error(args[])
  write!([FileStream.stderr] ||| args)
  exit(1)
end