RosettaCodeData/Task/Compiler-lexical-analyzer/Prolog/compiler-lexical-analyzer.pro

162 lines
5.1 KiB
Prolog

/*
Test harness for the analyzer, not needed if we are actually using the output.
*/
load_file(File, Input) :-
read_file_to_codes(File, Codes, []),
maplist(char_code, Chars, Codes),
atom_chars(Input,Chars).
test_file(File) :-
load_file(File, Input),
tester(Input).
tester(S) :-
atom_chars(S,Chars),
tokenize(Chars,L),
maplist(print_tok, L),
!.
print_tok(L) :-
L =.. [Op,Line,Pos],
format('~d\t~d\t~p~n', [Line,Pos,Op]).
print_tok(string(Value,Line,Pos)) :-
format('~d\t~d\tstring\t\t"~w"~n', [Line,Pos,Value]).
print_tok(identifier(Value,Line,Pos)) :-
format('~d\t~d\tidentifier\t~p~n', [Line,Pos,Value]).
print_tok(integer(Value,Line,Pos)) :-
format('~d\t~d\tinteger\t\t~p~n', [Line,Pos,Value]).
/*
Tokenize
run the input over a DCG to get out the tokens.
In - a list of chars to tokenize
Tokens = a list of tokens (excluding spaces).
*/
tokenize(In,RelTokens) :-
newline_positions(In,1,NewLines),
tokenize(In,[0|NewLines],1,1,Tokens),
check_for_exceptions(Tokens),
exclude(token_name(space),Tokens,RelTokens).
tokenize([],NewLines,Pos,LineNo,[end_of_input(LineNo,Offset)]) :-
position_offset(NewLines,Pos,Offset).
tokenize(In,NewLines,Pos,LineNo,Out) :-
position_offset(NewLines,Pos,Offset),
phrase(tok(Tok,TokLen,LineNo,Offset),In,T),
(
Tok = [] -> Out = Toks
; Out = [Tok|Toks]
),
Pos1 is Pos + TokLen,
update_line_no(LineNo,NewLines,Pos1,NewLineNo,NewNewLines),
tokenize(T,NewNewLines,Pos1,NewLineNo,Toks).
update_line_no(LNo,[L],_,LNo,[L]).
update_line_no(LNo,[L,Nl|T],Pos,LNo,[L,Nl|T]) :-
Pos =< Nl.
update_line_no(LNo,[_,Nl|T],Pos,LNo2,Nlines) :-
Pos > Nl,
succ(LNo,LNo1),
update_line_no(LNo1,[Nl|T],Pos,LNo2,Nlines).
position_offset([Line|_],Pos,Offset) :- Offset is Pos - Line.
token_name(Name,Tok) :- functor(Tok,Name,_).
% Get a list of all the newlines and their position in the data
% This is used to create accurate row/column numbers.
newline_positions([],N,[N]).
newline_positions(['\n'|T],N,[N|Nt]) :- succ(N,N1), newline_positions(T,N1,Nt).
newline_positions([C|T],N,Nt) :- dif(C,'\n'), succ(N,N1), newline_positions(T,N1,Nt).
% The tokenizer can tokenize some things that it shouldn't, deal with them here.
check_for_exceptions([]). % all ok
check_for_exceptions([op_divide(L,P),op_multiply(_,_)|_]) :-
format(atom(Error), 'Unclosed comment at line ~d,~d', [L,P]),
throw(Error).
check_for_exceptions([integer(_,L,P),identifier(_,_,_)|_]) :-
format(atom(Error), 'Invalid identifier at line ~d,~d', [L,P]),
throw(Error).
check_for_exceptions([_|T]) :- check_for_exceptions(T).
/*
A set of helper DCGs for the more complicated token types
*/
:- set_prolog_flag(double_quotes, chars).
identifier(I) --> c_types(I,csym).
identifier(['_']) --> ['_'].
identifier([]) --> [].
integer_(I,L) --> c_types(N,digit), { number_codes(I,N), length(N,L) }.
% get a sequence of characters of the same type (https://www.swi-prolog.org/pldoc/doc_for?object=char_type/2)
c_types([C|T],Type) --> c_type(C,Type), c_types(T,Type).
c_types([C],Type) --> c_type(C,Type).
c_type(C,Type) --> [C],{ char_type(C,Type) }.
anything([]) --> [].
anything([A|T]) --> [A], anything(T).
string_([]) --> [].
string_([A|T]) --> [A], { dif(A,'\n') }, string_(T).
/*
The token types are all handled by the tok DCG, order of predicates is important here.
*/
% comment
tok([],CLen,_,_) --> "/*", anything(A), "*/", { length(A,Len), CLen is Len + 4 }.
% toks
tok(op_and(L,P),2,L,P) --> "&&".
tok(op_or(L,P),2,L,P) --> "||".
tok(op_lessequal(L,P),2,L,P) --> "<=".
tok(op_greaterequal(L,P),2,L,P) --> ">=".
tok(op_greaterequal(L,P),2,L,P) --> ">=".
tok(op_equal(L,P),2,L,P) --> "==".
tok(op_notequal(L,P),2,L,P) --> "!=".
tok(op_assign(L,P),1,L,P) --> "=".
tok(op_multiply(L,P),1,L,P) --> "*".
tok(op_divide(L,P),1,L,P) --> "/".
tok(op_mod(L,P),1,L,P) --> "%".
tok(op_add(L,P),1,L,P) --> "+".
tok(op_subtract(L,P),1,L,P) --> "-".
tok(op_negate(L,P),1,L,P) --> "-".
tok(op_less(L,P),1,L,P) --> "<".
tok(op_greater(L,P),1,L,P) --> ">".
tok(op_not(L,P),1,L,P) --> "!".
% symbols
tok(left_paren(L,P),1,L,P) --> "(".
tok(right_paren(L,P),1,L,P) --> ")".
tok(left_brace(L,P),1,L,P) --> "{".
tok(right_brace(L,P),1,L,P) --> "}".
tok(semicolon(L,P),1,L,P) --> ";".
tok(comma(L,P),1,L,P) --> ",".
% keywords
tok(keyword_if(L,P),2,L,P) --> "if".
tok(keyword_else(L,P),4,L,P) --> "else".
tok(keyword_while(L,P),5,L,P) --> "while".
tok(keyword_print(L,P),5,L,P) --> "print".
tok(keyword_putc(L,P),4,L,P) --> "putc".
% identifier and literals
tok(identifier(I,L,P),Len,L,P) --> c_type(S,csymf), identifier(T), { atom_chars(I,[S|T]), length([S|T],Len) }.
tok(integer(V,L,P),Len,L,P) --> integer_(V,Len).
tok(integer(I,L,P),4,L,P) --> "'\\\\'", { char_code('\\', I) }.
tok(integer(I,L,P),4,L,P) --> "'\\n'", { char_code('\n', I) }.
tok(integer(I,L,P),3,L,P) --> ['\''], [C], ['\''], { dif(C,'\n'), dif(C,'\''), char_code(C,I) }.
tok(string(S,L,P),SLen,L,P) --> ['"'], string_(A),['"'], { atom_chars(S,A), length(A,Len), SLen is Len + 2 }.
% spaces
tok(space(L,P),Len,L,P) --> c_types(S,space), { length(S,Len) }.
% anything else is an error
tok(_,_,L,P) --> { format(atom(Error), 'Invalid token at line ~d,~d', [L,P]), throw(Error) }.