1080 lines
35 KiB
Fortran
1080 lines
35 KiB
Fortran
!!!
|
||
!!! An implementation of the Rosetta Code lexical analyzer task:
|
||
!!! https://rosettacode.org/wiki/Compiler/lexical_analyzer
|
||
!!!
|
||
!!! The C implementation was used as a reference on behavior, but was
|
||
!!! not adhered to for the implementation.
|
||
!!!
|
||
|
||
module string_buffers
|
||
use, intrinsic :: iso_fortran_env, only: error_unit
|
||
use, intrinsic :: iso_fortran_env, only: int64
|
||
|
||
implicit none
|
||
private
|
||
|
||
public :: strbuf_t
|
||
public :: strbuf_t_length_kind
|
||
public :: strbuf_t_character_kind
|
||
|
||
integer, parameter :: strbuf_t_length_kind = int64
|
||
|
||
! String buffers can handle Unicode.
|
||
integer, parameter :: strbuf_t_character_kind = selected_char_kind ('ISO_10646')
|
||
|
||
! Private abbreviations.
|
||
integer, parameter :: nk = strbuf_t_length_kind
|
||
integer, parameter :: ck = strbuf_t_character_kind
|
||
|
||
type :: strbuf_t
|
||
integer(kind = nk), private :: len = 0
|
||
!
|
||
! ‘chars’ is made public for efficient access to the individual
|
||
! characters.
|
||
!
|
||
character(1, kind = ck), allocatable, public :: chars(:)
|
||
contains
|
||
procedure, pass, private :: ensure_storage => strbuf_t_ensure_storage
|
||
procedure, pass :: to_unicode => strbuf_t_to_unicode
|
||
procedure, pass :: length => strbuf_t_length
|
||
procedure, pass :: set => strbuf_t_set
|
||
procedure, pass :: append => strbuf_t_append
|
||
generic :: assignment(=) => set
|
||
end type strbuf_t
|
||
|
||
contains
|
||
|
||
function strbuf_t_to_unicode (strbuf) result (s)
|
||
class(strbuf_t), intent(in) :: strbuf
|
||
character(:, kind = ck), allocatable :: s
|
||
|
||
!
|
||
! This does not actually ensure that the string is valid Unicode;
|
||
! any 31-bit ‘character’ is supported.
|
||
!
|
||
|
||
integer(kind = nk) :: i
|
||
|
||
allocate (character(len = strbuf%len, kind = ck) :: s)
|
||
do i = 1, strbuf%len
|
||
s(i:i) = strbuf%chars(i)
|
||
end do
|
||
end function strbuf_t_to_unicode
|
||
|
||
elemental function strbuf_t_length (strbuf) result (n)
|
||
class(strbuf_t), intent(in) :: strbuf
|
||
integer(kind = nk) :: n
|
||
|
||
n = strbuf%len
|
||
end function strbuf_t_length
|
||
|
||
elemental function next_power_of_two (x) result (y)
|
||
integer(kind = nk), intent(in) :: x
|
||
integer(kind = nk) :: y
|
||
|
||
!
|
||
! It is assumed that no more than 64 bits are used.
|
||
!
|
||
! The branch-free algorithm is that of
|
||
! https://archive.is/nKxAc#RoundUpPowerOf2
|
||
!
|
||
! Fill in bits until one less than the desired power of two is
|
||
! reached, and then add one.
|
||
!
|
||
|
||
y = x - 1
|
||
y = ior (y, ishft (y, -1))
|
||
y = ior (y, ishft (y, -2))
|
||
y = ior (y, ishft (y, -4))
|
||
y = ior (y, ishft (y, -8))
|
||
y = ior (y, ishft (y, -16))
|
||
y = ior (y, ishft (y, -32))
|
||
y = y + 1
|
||
end function next_power_of_two
|
||
|
||
elemental function new_storage_size (length_needed) result (size)
|
||
integer(kind = nk), intent(in) :: length_needed
|
||
integer(kind = nk) :: size
|
||
|
||
! Increase storage by orders of magnitude.
|
||
|
||
if (2_nk**32 < length_needed) then
|
||
size = huge (1_nk)
|
||
else
|
||
size = next_power_of_two (length_needed)
|
||
end if
|
||
end function new_storage_size
|
||
|
||
subroutine strbuf_t_ensure_storage (strbuf, length_needed)
|
||
class(strbuf_t), intent(inout) :: strbuf
|
||
integer(kind = nk), intent(in) :: length_needed
|
||
|
||
integer(kind = nk) :: new_size
|
||
type(strbuf_t) :: new_strbuf
|
||
|
||
if (.not. allocated (strbuf%chars)) then
|
||
! Initialize a new strbuf%chars array.
|
||
new_size = new_storage_size (length_needed)
|
||
allocate (strbuf%chars(1:new_size))
|
||
else if (ubound (strbuf%chars, 1) < length_needed) then
|
||
! Allocate a new strbuf%chars array, larger than the current
|
||
! one, but containing the same characters.
|
||
new_size = new_storage_size (length_needed)
|
||
allocate (new_strbuf%chars(1:new_size))
|
||
new_strbuf%chars(1:strbuf%len) = strbuf%chars(1:strbuf%len)
|
||
call move_alloc (new_strbuf%chars, strbuf%chars)
|
||
end if
|
||
end subroutine strbuf_t_ensure_storage
|
||
|
||
subroutine strbuf_t_set (dst, src)
|
||
class(strbuf_t), intent(inout) :: dst
|
||
class(*), intent(in) :: src
|
||
|
||
integer(kind = nk) :: n
|
||
integer(kind = nk) :: i
|
||
|
||
select type (src)
|
||
type is (character(*, kind = ck))
|
||
n = len (src, kind = nk)
|
||
call dst%ensure_storage(n)
|
||
do i = 1, n
|
||
dst%chars(i) = src(i:i)
|
||
end do
|
||
dst%len = n
|
||
type is (character(*))
|
||
n = len (src, kind = nk)
|
||
call dst%ensure_storage(n)
|
||
do i = 1, n
|
||
dst%chars(i) = src(i:i)
|
||
end do
|
||
dst%len = n
|
||
class is (strbuf_t)
|
||
n = src%len
|
||
call dst%ensure_storage(n)
|
||
dst%chars(1:n) = src%chars(1:n)
|
||
dst%len = n
|
||
class default
|
||
error stop
|
||
end select
|
||
end subroutine strbuf_t_set
|
||
|
||
subroutine strbuf_t_append (dst, src)
|
||
class(strbuf_t), intent(inout) :: dst
|
||
class(*), intent(in) :: src
|
||
|
||
integer(kind = nk) :: n_dst, n_src, n
|
||
integer(kind = nk) :: i
|
||
|
||
select type (src)
|
||
type is (character(*, kind = ck))
|
||
n_dst = dst%len
|
||
n_src = len (src, kind = nk)
|
||
n = n_dst + n_src
|
||
call dst%ensure_storage(n)
|
||
do i = 1, n_src
|
||
dst%chars(n_dst + i) = src(i:i)
|
||
end do
|
||
dst%len = n
|
||
type is (character(*))
|
||
n_dst = dst%len
|
||
n_src = len (src, kind = nk)
|
||
n = n_dst + n_src
|
||
call dst%ensure_storage(n)
|
||
do i = 1, n_src
|
||
dst%chars(n_dst + i) = src(i:i)
|
||
end do
|
||
dst%len = n
|
||
class is (strbuf_t)
|
||
n_dst = dst%len
|
||
n_src = src%len
|
||
n = n_dst + n_src
|
||
call dst%ensure_storage(n)
|
||
dst%chars((n_dst + 1):n) = src%chars(1:n_src)
|
||
dst%len = n
|
||
class default
|
||
error stop
|
||
end select
|
||
end subroutine strbuf_t_append
|
||
|
||
end module string_buffers
|
||
|
||
module lexical_analysis
|
||
use, intrinsic :: iso_fortran_env, only: input_unit
|
||
use, intrinsic :: iso_fortran_env, only: output_unit
|
||
use, intrinsic :: iso_fortran_env, only: error_unit
|
||
use, intrinsic :: iso_fortran_env, only: int32
|
||
use, non_intrinsic :: string_buffers
|
||
|
||
implicit none
|
||
private
|
||
|
||
public :: lexer_input_t
|
||
public :: lexer_output_t
|
||
public :: run_lexer
|
||
|
||
integer, parameter :: input_file_unit_no = 100
|
||
integer, parameter :: output_file_unit_no = 101
|
||
|
||
! Private abbreviations.
|
||
integer, parameter :: nk = strbuf_t_length_kind
|
||
integer, parameter :: ck = strbuf_t_character_kind
|
||
|
||
! Integers large enough for a Unicode code point. Unicode code
|
||
! points (and UCS-4) have never been allowed to go higher than
|
||
! 7FFFFFFF, and are even further restricted now.
|
||
integer, parameter :: ichar_kind = int32
|
||
|
||
character(1, kind = ck), parameter :: horizontal_tab_char = char (9, kind = ck)
|
||
character(1, kind = ck), parameter :: linefeed_char = char (10, kind = ck)
|
||
character(1, kind = ck), parameter :: vertical_tab_char = char (11, kind = ck)
|
||
character(1, kind = ck), parameter :: formfeed_char = char (12, kind = ck)
|
||
character(1, kind = ck), parameter :: carriage_return_char = char (13, kind = ck)
|
||
character(1, kind = ck), parameter :: space_char = ck_' '
|
||
|
||
! The following is correct for Unix and its relatives.
|
||
character(1, kind = ck), parameter :: newline_char = linefeed_char
|
||
|
||
character(1, kind = ck), parameter :: backslash_char = char (92, kind = ck)
|
||
|
||
character(*, kind = ck), parameter :: newline_intstring = ck_'10'
|
||
character(*, kind = ck), parameter :: backslash_intstring = ck_'92'
|
||
|
||
integer, parameter :: tk_EOI = 0
|
||
integer, parameter :: tk_Mul = 1
|
||
integer, parameter :: tk_Div = 2
|
||
integer, parameter :: tk_Mod = 3
|
||
integer, parameter :: tk_Add = 4
|
||
integer, parameter :: tk_Sub = 5
|
||
integer, parameter :: tk_Negate = 6
|
||
integer, parameter :: tk_Not = 7
|
||
integer, parameter :: tk_Lss = 8
|
||
integer, parameter :: tk_Leq = 9
|
||
integer, parameter :: tk_Gtr = 10
|
||
integer, parameter :: tk_Geq = 11
|
||
integer, parameter :: tk_Eq = 12
|
||
integer, parameter :: tk_Neq = 13
|
||
integer, parameter :: tk_Assign = 14
|
||
integer, parameter :: tk_And = 15
|
||
integer, parameter :: tk_Or = 16
|
||
integer, parameter :: tk_If = 17
|
||
integer, parameter :: tk_Else = 18
|
||
integer, parameter :: tk_While = 19
|
||
integer, parameter :: tk_Print = 20
|
||
integer, parameter :: tk_Putc = 21
|
||
integer, parameter :: tk_Lparen = 22
|
||
integer, parameter :: tk_Rparen = 23
|
||
integer, parameter :: tk_Lbrace = 24
|
||
integer, parameter :: tk_Rbrace = 25
|
||
integer, parameter :: tk_Semi = 26
|
||
integer, parameter :: tk_Comma = 27
|
||
integer, parameter :: tk_Ident = 28
|
||
integer, parameter :: tk_Integer = 29
|
||
integer, parameter :: tk_String = 30
|
||
|
||
character(len = 16), parameter :: token_names(0:30) = &
|
||
& (/ "End_of_input ", "Op_multiply ", "Op_divide ", "Op_mod ", "Op_add ", &
|
||
& "Op_subtract ", "Op_negate ", "Op_not ", "Op_less ", "Op_lessequal ", &
|
||
& "Op_greater ", "Op_greaterequal ", "Op_equal ", "Op_notequal ", "Op_assign ", &
|
||
& "Op_and ", "Op_or ", "Keyword_if ", "Keyword_else ", "Keyword_while ", &
|
||
& "Keyword_print ", "Keyword_putc ", "LeftParen ", "RightParen ", "LeftBrace ", &
|
||
& "RightBrace ", "Semicolon ", "Comma ", "Identifier ", "Integer ", &
|
||
& "String " /)
|
||
|
||
type :: token_t
|
||
integer :: token_no
|
||
|
||
! Our implementation stores the value of a tk_Integer as a
|
||
! string. The C reference implementation stores it as an int.
|
||
character(:, kind = ck), allocatable :: val
|
||
|
||
integer(nk) :: line_no
|
||
integer(nk) :: column_no
|
||
end type token_t
|
||
|
||
type :: lexer_input_t
|
||
logical, private :: using_input_unit = .true.
|
||
integer, private :: unit_no = -(huge (1))
|
||
integer(kind = nk) :: line_no = 1
|
||
integer(kind = nk) :: column_no = 0
|
||
integer, private :: unget_count = 0
|
||
|
||
! The maximum lookahead is 2, although I believe we are using
|
||
! only 1. In principle, the lookahead could be any finite number.
|
||
character(1, kind = ck), private :: unget_buffer(1:2)
|
||
logical, private :: unget_eof_buffer(1:2)
|
||
|
||
! Using the same strbuf_t multiple times reduces the need for
|
||
! reallocations. Putting that strbuf_t in the lexer_input_t is
|
||
! simply for convenience.
|
||
type(strbuf_t), private :: strbuf
|
||
|
||
contains
|
||
!
|
||
! Note: There is currently no facility for closing one input and
|
||
! switching to another.
|
||
!
|
||
! Note: There is currently no facility to decode inputs into
|
||
! Unicode codepoints. Instead, what happens is raw bytes of
|
||
! input get stored as strbuf_t_character_kind values. This
|
||
! behavior is adequate for ASCII inputs.
|
||
!
|
||
procedure, pass :: use_file => lexer_input_t_use_file
|
||
procedure, pass :: get_next_ch => lexer_input_t_get_next_ch
|
||
procedure, pass :: unget_ch => lexer_input_t_unget_ch
|
||
procedure, pass :: unget_eof => lexer_input_t_unget_eof
|
||
end type lexer_input_t
|
||
|
||
type :: lexer_output_t
|
||
integer, private :: unit_no = output_unit
|
||
contains
|
||
procedure, pass :: use_file => lexer_output_t_use_file
|
||
procedure, pass :: output_token => lexer_output_t_output_token
|
||
end type lexer_output_t
|
||
|
||
contains
|
||
|
||
subroutine lexer_input_t_use_file (inputter, filename)
|
||
class(lexer_input_t), intent(inout) :: inputter
|
||
character(*), intent(in) :: filename
|
||
|
||
integer :: stat
|
||
|
||
inputter%using_input_unit = .false.
|
||
inputter%unit_no = input_file_unit_no
|
||
inputter%line_no = 1
|
||
inputter%column_no = 0
|
||
|
||
open (unit = input_file_unit_no, file = filename, status = 'old', &
|
||
& action = 'read', access = 'stream', form = 'unformatted', &
|
||
& iostat = stat)
|
||
if (stat /= 0) then
|
||
write (error_unit, '("Error: failed to open ", A, " for input")') filename
|
||
stop 1
|
||
end if
|
||
end subroutine lexer_input_t_use_file
|
||
|
||
!!!
|
||
!!! If you tell gfortran you want -std=f2008 or -std=f2018, you likely
|
||
!!! will need to add also -fall-intrinsics or -U__GFORTRAN__
|
||
!!!
|
||
!!! The first way, you get the FGETC intrinsic. The latter way, you
|
||
!!! get the C interface code that uses getchar(3).
|
||
!!!
|
||
#ifdef __GFORTRAN__
|
||
|
||
subroutine get_input_unit_char (c, stat)
|
||
!
|
||
! The following works if you are using gfortran.
|
||
!
|
||
! (FGETC is considered a feature for backwards compatibility with
|
||
! g77. However, I know of no way to reconfigure input_unit as a
|
||
! Fortran 2003 stream, for use with ordinary ‘read’.)
|
||
!
|
||
character, intent(inout) :: c
|
||
integer, intent(out) :: stat
|
||
|
||
call fgetc (input_unit, c, stat)
|
||
end subroutine get_input_unit_char
|
||
|
||
#else
|
||
|
||
subroutine get_input_unit_char (c, stat)
|
||
!
|
||
! An alternative implementation of get_input_unit_char. This
|
||
! actually reads input from the C standard input, which might not
|
||
! be the same as input_unit.
|
||
!
|
||
use, intrinsic :: iso_c_binding, only: c_int
|
||
character, intent(inout) :: c
|
||
integer, intent(out) :: stat
|
||
|
||
interface
|
||
!
|
||
! Use getchar(3) to read characters from standard input. This
|
||
! assumes there is actually such a function available, and that
|
||
! getchar(3) does not exist solely as a macro. (One could write
|
||
! one’s own getchar() if necessary, of course.)
|
||
!
|
||
function getchar () result (c) bind (c, name = 'getchar')
|
||
use, intrinsic :: iso_c_binding, only: c_int
|
||
integer(kind = c_int) :: c
|
||
end function getchar
|
||
end interface
|
||
|
||
integer(kind = c_int) :: i_char
|
||
|
||
i_char = getchar ()
|
||
!
|
||
! The C standard requires that EOF have a negative value. If the
|
||
! value returned by getchar(3) is not EOF, then it will be
|
||
! representable as an unsigned char. Therefore, to check for end
|
||
! of file, one need only test whether i_char is negative.
|
||
!
|
||
if (i_char < 0) then
|
||
stat = -1
|
||
else
|
||
stat = 0
|
||
c = char (i_char)
|
||
end if
|
||
end subroutine get_input_unit_char
|
||
|
||
#endif
|
||
|
||
subroutine lexer_input_t_get_next_ch (inputter, eof, ch)
|
||
class(lexer_input_t), intent(inout) :: inputter
|
||
logical, intent(out) :: eof
|
||
character(1, kind = ck), intent(inout) :: ch
|
||
|
||
integer :: stat
|
||
character(1) :: c = '*'
|
||
|
||
if (0 < inputter%unget_count) then
|
||
if (inputter%unget_eof_buffer(inputter%unget_count)) then
|
||
eof = .true.
|
||
else
|
||
eof = .false.
|
||
ch = inputter%unget_buffer(inputter%unget_count)
|
||
end if
|
||
inputter%unget_count = inputter%unget_count - 1
|
||
else
|
||
if (inputter%using_input_unit) then
|
||
call get_input_unit_char (c, stat)
|
||
else
|
||
read (unit = inputter%unit_no, iostat = stat) c
|
||
end if
|
||
|
||
ch = char (ichar (c, kind = ichar_kind), kind = ck)
|
||
|
||
if (0 < stat) then
|
||
write (error_unit, '("Input error with status code ", I0)') stat
|
||
stop 1
|
||
else if (stat < 0) then
|
||
eof = .true.
|
||
! The C reference code increases column number on end of file;
|
||
! therefore, so shall we.
|
||
inputter%column_no = inputter%column_no + 1
|
||
else
|
||
eof = .false.
|
||
if (ch == newline_char) then
|
||
inputter%line_no = inputter%line_no + 1
|
||
inputter%column_no = 0
|
||
else
|
||
inputter%column_no = inputter%column_no + 1
|
||
end if
|
||
end if
|
||
end if
|
||
end subroutine lexer_input_t_get_next_ch
|
||
|
||
subroutine lexer_input_t_unget_ch (inputter, ch)
|
||
class(lexer_input_t), intent(inout) :: inputter
|
||
character(1, kind = ck), intent(in) :: ch
|
||
|
||
if (ubound (inputter%unget_buffer, 1) <= inputter%unget_count) then
|
||
write (error_unit, '("class(lexer_input_t) unget buffer overflow")')
|
||
stop 1
|
||
else
|
||
inputter%unget_count = inputter%unget_count + 1
|
||
inputter%unget_buffer(inputter%unget_count) = ch
|
||
inputter%unget_eof_buffer(inputter%unget_count) = .false.
|
||
end if
|
||
end subroutine lexer_input_t_unget_ch
|
||
|
||
subroutine lexer_input_t_unget_eof (inputter)
|
||
class(lexer_input_t), intent(inout) :: inputter
|
||
|
||
if (ubound (inputter%unget_buffer, 1) <= inputter%unget_count) then
|
||
write (error_unit, '("class(lexer_input_t) unget buffer overflow")')
|
||
stop 1
|
||
else
|
||
inputter%unget_count = inputter%unget_count + 1
|
||
inputter%unget_buffer(inputter%unget_count) = ck_'*'
|
||
inputter%unget_eof_buffer(inputter%unget_count) = .true.
|
||
end if
|
||
end subroutine lexer_input_t_unget_eof
|
||
|
||
subroutine lexer_output_t_use_file (outputter, filename)
|
||
class(lexer_output_t), intent(inout) :: outputter
|
||
character(*), intent(in) :: filename
|
||
|
||
integer :: stat
|
||
|
||
outputter%unit_no = output_file_unit_no
|
||
open (unit = output_file_unit_no, file = filename, action = 'write', iostat = stat)
|
||
if (stat /= 0) then
|
||
write (error_unit, '("Error: failed to open ", A, " for output")') filename
|
||
stop 1
|
||
end if
|
||
end subroutine lexer_output_t_use_file
|
||
|
||
subroutine lexer_output_t_output_token (outputter, token)
|
||
class(lexer_output_t), intent(inout) :: outputter
|
||
class(token_t), intent(in) :: token
|
||
|
||
select case (token%token_no)
|
||
case (tk_Integer, tk_Ident, tk_String)
|
||
write (outputter%unit_no, '(1X, I20, 1X, I20, 1X, A, 1X, A)') &
|
||
& token%line_no, token%column_no, &
|
||
& token_names(token%token_no), token%val
|
||
case default
|
||
write (outputter%unit_no, '(1X, I20, 1X, I20, 1X, A)') &
|
||
& token%line_no, token%column_no, &
|
||
& trim (token_names(token%token_no))
|
||
end select
|
||
end subroutine lexer_output_t_output_token
|
||
|
||
subroutine run_lexer (inputter, outputter)
|
||
class(lexer_input_t), intent(inout) :: inputter
|
||
class(lexer_output_t), intent(inout) :: outputter
|
||
|
||
type(token_t) :: token
|
||
|
||
token = get_token (inputter)
|
||
do while (token%token_no /= tk_EOI)
|
||
call outputter%output_token (token)
|
||
token = get_token (inputter)
|
||
end do
|
||
call outputter%output_token (token)
|
||
end subroutine run_lexer
|
||
|
||
function get_token (inputter) result (token)
|
||
class(lexer_input_t), intent(inout) :: inputter
|
||
type(token_t) :: token
|
||
|
||
logical :: eof
|
||
character(1, kind = ck) :: ch
|
||
|
||
call skip_spaces_and_comments (inputter, eof, ch, &
|
||
& token%line_no, token%column_no)
|
||
|
||
if (eof) then
|
||
token%token_no = tk_EOI
|
||
else
|
||
select case (ch)
|
||
case (ck_'{')
|
||
token%token_no = tk_Lbrace
|
||
case (ck_'}')
|
||
token%token_no = tk_Rbrace
|
||
case (ck_'(')
|
||
token%token_no = tk_Lparen
|
||
case (ck_')')
|
||
token%token_no = tk_Rparen
|
||
case (ck_'+')
|
||
token%token_no = tk_Add
|
||
case (ck_'-')
|
||
token%token_no = tk_Sub
|
||
case (ck_'*')
|
||
token%token_no = tk_Mul
|
||
case (ck_'%')
|
||
token%token_no = tk_Mod
|
||
case (ck_';')
|
||
token%token_no = tk_Semi
|
||
case (ck_',')
|
||
token%token_no = tk_Comma
|
||
case (ck_'/')
|
||
token%token_no = tk_Div
|
||
|
||
case (ck_"'")
|
||
call read_character_literal
|
||
|
||
case (ck_'<')
|
||
call distinguish_operators (ch, ck_'=', tk_Leq, tk_Lss)
|
||
case (ck_'>')
|
||
call distinguish_operators (ch, ck_'=', tk_Geq, tk_Gtr)
|
||
case (ck_'=')
|
||
call distinguish_operators (ch, ck_'=', tk_Eq, tk_Assign)
|
||
case (ck_'!')
|
||
call distinguish_operators (ch, ck_'=', tk_Neq, tk_Not)
|
||
case (ck_'&')
|
||
call distinguish_operators (ch, ck_'&', tk_And, tk_EOI)
|
||
case (ck_'|')
|
||
call distinguish_operators (ch, ck_'|', tk_Or, tk_EOI)
|
||
|
||
case (ck_'"')
|
||
call read_string_literal (ch, ch)
|
||
|
||
case default
|
||
if (isdigit (ch)) then
|
||
call read_numeric_literal (ch)
|
||
else if (isalpha_or_underscore (ch)) then
|
||
call read_identifier_or_keyword (ch)
|
||
else
|
||
call start_error_message (inputter)
|
||
write (error_unit, '("unrecognized character ''", A, "''")') ch
|
||
stop 1
|
||
end if
|
||
end select
|
||
end if
|
||
contains
|
||
|
||
subroutine read_character_literal
|
||
character(1, kind = ck) :: ch
|
||
logical :: eof
|
||
character(20, kind = ck) :: buffer
|
||
|
||
token%token_no = tk_Integer
|
||
|
||
call inputter%get_next_ch (eof, ch)
|
||
if (eof) then
|
||
call start_error_message (inputter)
|
||
write (error_unit, '("end of input in character literal")')
|
||
stop 1
|
||
else if (ch == ck_"'") then
|
||
call start_error_message (inputter)
|
||
write (error_unit, '("empty character literal")')
|
||
stop 1
|
||
else if (ch == backslash_char) then
|
||
call inputter%get_next_ch (eof, ch)
|
||
if (eof) then
|
||
call start_error_message (inputter)
|
||
write (error_unit, '("end of input in character literal, after backslash")')
|
||
stop 1
|
||
else if (ch == ck_'n') then
|
||
allocate (token%val, source = newline_intstring)
|
||
else if (ch == backslash_char) then
|
||
allocate (token%val, source = backslash_intstring)
|
||
else
|
||
call start_error_message (inputter)
|
||
write (error_unit, '("unknown escape sequence ''", A, A, "'' in character literal")') &
|
||
& backslash_char, ch
|
||
stop 1
|
||
end if
|
||
call read_character_literal_close_quote
|
||
else
|
||
call read_character_literal_close_quote
|
||
write (buffer, '(I0)') ichar (ch, kind = ichar_kind)
|
||
allocate (token%val, source = trim (buffer))
|
||
end if
|
||
end subroutine read_character_literal
|
||
|
||
subroutine read_character_literal_close_quote
|
||
logical :: eof
|
||
character(1, kind = ck) :: close_quote
|
||
|
||
call inputter%get_next_ch (eof, close_quote)
|
||
if (eof) then
|
||
call start_error_message (inputter)
|
||
write (error_unit, '("end of input in character literal")')
|
||
stop 1
|
||
else if (close_quote /= ck_"'") then
|
||
call start_error_message (inputter)
|
||
write (error_unit, '("multi-character literal")')
|
||
stop 1
|
||
end if
|
||
end subroutine read_character_literal_close_quote
|
||
|
||
subroutine distinguish_operators (first_ch, second_ch, &
|
||
& token_no_if_second_ch, &
|
||
& token_no_if_no_second_ch)
|
||
character(1, kind = ck), intent(in) :: first_ch
|
||
character(1, kind = ck), intent(in) :: second_ch
|
||
integer, intent(in) :: token_no_if_second_ch
|
||
integer, intent(in) :: token_no_if_no_second_ch
|
||
|
||
character(1, kind = ck) :: ch
|
||
logical :: eof
|
||
|
||
call inputter%get_next_ch (eof, ch)
|
||
if (eof) then
|
||
call inputter%unget_eof
|
||
token%token_no = token_no_if_no_second_ch
|
||
else if (ch == second_ch) then
|
||
token%token_no = token_no_if_second_ch
|
||
else if (token_no_if_no_second_ch == tk_EOI) then
|
||
call start_error_message (inputter)
|
||
write (error_unit, '("unrecognized character ''", A, "''")') first_ch
|
||
stop 1
|
||
else
|
||
call inputter%unget_ch (ch)
|
||
token%token_no = token_no_if_no_second_ch
|
||
end if
|
||
end subroutine distinguish_operators
|
||
|
||
subroutine read_string_literal (opening_quote, closing_quote)
|
||
character(1, kind = ck), intent(in) :: opening_quote
|
||
character(1, kind = ck), intent(in) :: closing_quote
|
||
|
||
character(1, kind = ck) :: ch
|
||
logical :: done
|
||
|
||
inputter%strbuf = opening_quote
|
||
done = .false.
|
||
do while (.not. done)
|
||
call inputter%get_next_ch (eof, ch)
|
||
if (eof) then
|
||
call start_error_message (inputter)
|
||
write (error_unit, '("end of input in string literal")')
|
||
stop 1
|
||
else if (ch == closing_quote) then
|
||
call inputter%strbuf%append(ch)
|
||
done = .true.
|
||
else if (ch == newline_char) then
|
||
call start_error_message (inputter)
|
||
write (error_unit, '("end of line in string literal")')
|
||
stop 1
|
||
else
|
||
call inputter%strbuf%append(ch)
|
||
end if
|
||
end do
|
||
allocate (token%val, source = inputter%strbuf%to_unicode())
|
||
token%token_no = tk_String
|
||
end subroutine read_string_literal
|
||
|
||
subroutine read_numeric_literal (first_ch)
|
||
character(1, kind = ck), intent(in) :: first_ch
|
||
|
||
character(1, kind = ck) :: ch
|
||
|
||
token%token_no = tk_Integer
|
||
|
||
inputter%strbuf = first_ch
|
||
call inputter%get_next_ch (eof, ch)
|
||
do while (isdigit (ch))
|
||
call inputter%strbuf%append (ch)
|
||
call inputter%get_next_ch (eof, ch)
|
||
end do
|
||
if (isalpha_or_underscore (ch)) then
|
||
call start_error_message (inputter)
|
||
write (error_unit, '("invalid numeric literal """, A, """")') &
|
||
& inputter%strbuf%to_unicode()
|
||
stop 1
|
||
else
|
||
call inputter%unget_ch (ch)
|
||
allocate (token%val, source = inputter%strbuf%to_unicode())
|
||
end if
|
||
end subroutine read_numeric_literal
|
||
|
||
subroutine read_identifier_or_keyword (first_ch)
|
||
character(1, kind = ck), intent(in) :: first_ch
|
||
|
||
character(1, kind = ck) :: ch
|
||
|
||
inputter%strbuf = first_ch
|
||
call inputter%get_next_ch (eof, ch)
|
||
do while (isalnum_or_underscore (ch))
|
||
call inputter%strbuf%append (ch)
|
||
call inputter%get_next_ch (eof, ch)
|
||
end do
|
||
|
||
call inputter%unget_ch (ch)
|
||
|
||
!
|
||
! The following is a handwritten ‘implicit radix tree’ search
|
||
! for keywords, first partitioning the set of keywords according
|
||
! to their lengths.
|
||
!
|
||
! I did it this way for fun. One could, of course, write a
|
||
! program to generate code for such a search.
|
||
!
|
||
! Perfect hashes are another method one could use.
|
||
!
|
||
! The reference C implementation uses a binary search.
|
||
!
|
||
token%token_no = tk_Ident
|
||
select case (inputter%strbuf%length())
|
||
case (2)
|
||
select case (inputter%strbuf%chars(1))
|
||
case (ck_'i')
|
||
select case (inputter%strbuf%chars(2))
|
||
case (ck_'f')
|
||
token%token_no = tk_If
|
||
case default
|
||
continue
|
||
end select
|
||
case default
|
||
continue
|
||
end select
|
||
case (4)
|
||
select case (inputter%strbuf%chars(1))
|
||
case (ck_'e')
|
||
select case (inputter%strbuf%chars(2))
|
||
case (ck_'l')
|
||
select case (inputter%strbuf%chars(3))
|
||
case (ck_'s')
|
||
select case (inputter%strbuf%chars(4))
|
||
case (ck_'e')
|
||
token%token_no = tk_Else
|
||
case default
|
||
continue
|
||
end select
|
||
case default
|
||
continue
|
||
end select
|
||
case default
|
||
continue
|
||
end select
|
||
case (ck_'p')
|
||
select case (inputter%strbuf%chars(2))
|
||
case (ck_'u')
|
||
select case (inputter%strbuf%chars(3))
|
||
case (ck_'t')
|
||
select case (inputter%strbuf%chars(4))
|
||
case (ck_'c')
|
||
token%token_no = tk_Putc
|
||
case default
|
||
continue
|
||
end select
|
||
case default
|
||
continue
|
||
end select
|
||
case default
|
||
continue
|
||
end select
|
||
case default
|
||
continue
|
||
end select
|
||
case (5)
|
||
select case (inputter%strbuf%chars(1))
|
||
case (ck_'p')
|
||
select case (inputter%strbuf%chars(2))
|
||
case (ck_'r')
|
||
select case (inputter%strbuf%chars(3))
|
||
case (ck_'i')
|
||
select case (inputter%strbuf%chars(4))
|
||
case (ck_'n')
|
||
select case (inputter%strbuf%chars(5))
|
||
case (ck_'t')
|
||
token%token_no = tk_Print
|
||
case default
|
||
continue
|
||
end select
|
||
case default
|
||
continue
|
||
end select
|
||
case default
|
||
continue
|
||
end select
|
||
case default
|
||
continue
|
||
end select
|
||
case (ck_'w')
|
||
select case (inputter%strbuf%chars(2))
|
||
case (ck_'h')
|
||
select case (inputter%strbuf%chars(3))
|
||
case (ck_'i')
|
||
select case (inputter%strbuf%chars(4))
|
||
case (ck_'l')
|
||
select case (inputter%strbuf%chars(5))
|
||
case (ck_'e')
|
||
token%token_no = tk_While
|
||
case default
|
||
continue
|
||
end select
|
||
case default
|
||
continue
|
||
end select
|
||
case default
|
||
continue
|
||
end select
|
||
case default
|
||
continue
|
||
end select
|
||
case default
|
||
continue
|
||
end select
|
||
case default
|
||
continue
|
||
end select
|
||
if (token%token_no == tk_Ident) then
|
||
allocate (token%val, source = inputter%strbuf%to_unicode ())
|
||
end if
|
||
end subroutine read_identifier_or_keyword
|
||
|
||
end function get_token
|
||
|
||
subroutine skip_spaces_and_comments (inputter, eof, ch, line_no, column_no)
|
||
!
|
||
! This procedure skips spaces and comments, and also captures the
|
||
! line and column numbers at the correct moment to indicate the
|
||
! start of a token.
|
||
!
|
||
class(lexer_input_t), intent(inout) :: inputter
|
||
logical, intent(out) :: eof
|
||
character(1, kind = ck), intent(inout) :: ch
|
||
integer(kind = nk), intent(out) :: line_no
|
||
integer(kind = nk), intent(out) :: column_no
|
||
|
||
integer(kind = nk), parameter :: not_done = -(huge (1_nk))
|
||
|
||
line_no = not_done
|
||
do while (line_no == not_done)
|
||
call inputter%get_next_ch (eof, ch)
|
||
if (eof) then
|
||
line_no = inputter%line_no
|
||
column_no = inputter%column_no
|
||
else if (ch == ck_'/') then
|
||
line_no = inputter%line_no
|
||
column_no = inputter%column_no
|
||
call inputter%get_next_ch (eof, ch)
|
||
if (eof) then
|
||
call inputter%unget_eof
|
||
ch = ck_'/'
|
||
else if (ch /= ck_'*') then
|
||
call inputter%unget_ch (ch)
|
||
ch = ck_'/'
|
||
else
|
||
call read_to_end_of_comment
|
||
line_no = not_done
|
||
end if
|
||
else if (.not. isspace (ch)) then
|
||
line_no = inputter%line_no
|
||
column_no = inputter%column_no
|
||
end if
|
||
end do
|
||
|
||
contains
|
||
|
||
subroutine read_to_end_of_comment
|
||
logical :: done
|
||
|
||
done = .false.
|
||
do while (.not. done)
|
||
call inputter%get_next_ch (eof, ch)
|
||
if (eof) then
|
||
call end_of_input_in_comment
|
||
else if (ch == ck_'*') then
|
||
call inputter%get_next_ch (eof, ch)
|
||
if (eof) then
|
||
call end_of_input_in_comment
|
||
else if (ch == ck_'/') then
|
||
done = .true.
|
||
end if
|
||
end if
|
||
end do
|
||
end subroutine read_to_end_of_comment
|
||
|
||
subroutine end_of_input_in_comment
|
||
call start_error_message (inputter)
|
||
write (error_unit, '("end of input in comment")')
|
||
stop 1
|
||
end subroutine end_of_input_in_comment
|
||
|
||
end subroutine skip_spaces_and_comments
|
||
|
||
subroutine start_error_message (inputter)
|
||
class(lexer_input_t), intent(inout) :: inputter
|
||
|
||
write (error_unit, '("Lexical error at ", I0, ".", I0, ": ")', advance = 'no') &
|
||
& inputter%line_no, inputter%column_no
|
||
end subroutine start_error_message
|
||
|
||
elemental function isspace (ch) result (bool)
|
||
character(1, kind = ck), intent(in) :: ch
|
||
logical :: bool
|
||
|
||
bool = (ch == horizontal_tab_char) .or. &
|
||
& (ch == linefeed_char) .or. &
|
||
& (ch == vertical_tab_char) .or. &
|
||
& (ch == formfeed_char) .or. &
|
||
& (ch == carriage_return_char) .or. &
|
||
& (ch == space_char)
|
||
end function isspace
|
||
|
||
elemental function isupper (ch) result (bool)
|
||
character(1, kind = ck), intent(in) :: ch
|
||
logical :: bool
|
||
|
||
integer(kind = ichar_kind), parameter :: uppercase_A = ichar (ck_'A', kind = ichar_kind)
|
||
integer(kind = ichar_kind), parameter :: uppercase_Z = ichar (ck_'Z', kind = ichar_kind)
|
||
|
||
integer(kind = ichar_kind) :: i_ch
|
||
|
||
i_ch = ichar (ch, kind = ichar_kind)
|
||
bool = (uppercase_A <= i_ch .and. i_ch <= uppercase_Z)
|
||
end function isupper
|
||
|
||
elemental function islower (ch) result (bool)
|
||
character(1, kind = ck), intent(in) :: ch
|
||
logical :: bool
|
||
|
||
integer(kind = ichar_kind), parameter :: lowercase_a = ichar (ck_'a', kind = ichar_kind)
|
||
integer(kind = ichar_kind), parameter :: lowercase_z = ichar (ck_'z', kind = ichar_kind)
|
||
|
||
integer(kind = ichar_kind) :: i_ch
|
||
|
||
i_ch = ichar (ch, kind = ichar_kind)
|
||
bool = (lowercase_a <= i_ch .and. i_ch <= lowercase_z)
|
||
end function islower
|
||
|
||
elemental function isalpha (ch) result (bool)
|
||
character(1, kind = ck), intent(in) :: ch
|
||
logical :: bool
|
||
|
||
bool = isupper (ch) .or. islower (ch)
|
||
end function isalpha
|
||
|
||
elemental function isdigit (ch) result (bool)
|
||
character(1, kind = ck), intent(in) :: ch
|
||
logical :: bool
|
||
|
||
integer(kind = ichar_kind), parameter :: zero = ichar (ck_'0', kind = ichar_kind)
|
||
integer(kind = ichar_kind), parameter :: nine = ichar (ck_'9', kind = ichar_kind)
|
||
|
||
integer(kind = ichar_kind) :: i_ch
|
||
|
||
i_ch = ichar (ch, kind = ichar_kind)
|
||
bool = (zero <= i_ch .and. i_ch <= nine)
|
||
end function isdigit
|
||
|
||
elemental function isalnum (ch) result (bool)
|
||
character(1, kind = ck), intent(in) :: ch
|
||
logical :: bool
|
||
|
||
bool = isalpha (ch) .or. isdigit (ch)
|
||
end function isalnum
|
||
|
||
elemental function isalpha_or_underscore (ch) result (bool)
|
||
character(1, kind = ck), intent(in) :: ch
|
||
logical :: bool
|
||
|
||
bool = isalpha (ch) .or. (ch == ck_'_')
|
||
end function isalpha_or_underscore
|
||
|
||
elemental function isalnum_or_underscore (ch) result (bool)
|
||
character(1, kind = ck), intent(in) :: ch
|
||
logical :: bool
|
||
|
||
bool = isalnum (ch) .or. (ch == ck_'_')
|
||
end function isalnum_or_underscore
|
||
|
||
end module lexical_analysis
|
||
|
||
program lex
|
||
use, intrinsic :: iso_fortran_env, only: output_unit
|
||
use, non_intrinsic :: lexical_analysis
|
||
|
||
implicit none
|
||
|
||
integer :: arg_count
|
||
character(200) :: arg
|
||
type(lexer_input_t) :: inputter
|
||
type(lexer_output_t) :: outputter
|
||
|
||
arg_count = command_argument_count ()
|
||
if (3 <= arg_count) then
|
||
call print_usage
|
||
else if (arg_count == 0) then
|
||
call run_lexer (inputter, outputter)
|
||
else if (arg_count == 1) then
|
||
call get_command_argument (1, arg)
|
||
call inputter%use_file(trim (arg))
|
||
call run_lexer (inputter, outputter)
|
||
else if (arg_count == 2) then
|
||
call get_command_argument (1, arg)
|
||
call inputter%use_file(trim (arg))
|
||
call get_command_argument (2, arg)
|
||
call outputter%use_file(trim (arg))
|
||
call run_lexer (inputter, outputter)
|
||
end if
|
||
|
||
contains
|
||
|
||
subroutine print_usage
|
||
character(200) :: progname
|
||
|
||
call get_command_argument (0, progname)
|
||
write (output_unit, '("Usage: ", A, " [INPUT_FILE [OUTPUT_FILE]]")') &
|
||
& trim (progname)
|
||
end subroutine print_usage
|
||
|
||
end program lex
|