#include <cctype>      // std::isspace, std::isalpha, std::isalnum, std::isdigit
#include <cerrno>      // errno
#include <charconv>    // std::from_chars
#include <fstream>     // file_to_string, string_to_file
#include <functional>  // std::invoke
#include <iomanip>     // std::setw
#include <ios>         // std::left
#include <iostream>
#include <iterator>    // std::istreambuf_iterator
#include <map>         // keywords
#include <sstream>
#include <string>
#include <utility>     // std::forward
#include <variant>     // TokenVal

using namespace std;

// =====================================================================================================================
// Machinery
// =====================================================================================================================

// Read the entire file at `path` (opened in binary mode) into a string.
// Throws the current `errno` value (an int) if the file cannot be opened or its size cannot be determined.
string file_to_string (const string& path)
{
    // Open positioned at the end so that tellg() reports the file size
    ifstream file {path, ios::in | ios::binary | ios::ate};
    if (!file) throw (errno);

    // tellg() yields -1 on failure; refuse to turn that into a huge allocation
    const streamoff size = file.tellg();
    if (size < 0) throw (errno);

    // Allocate string memory
    string contents (static_cast<string::size_type>(size), '\0');

    // Read file contents into string
    file.seekg(0);
    file.read(contents.data(), static_cast<streamsize>(contents.size()));

    return contents;
}

// Write `contents` to the file at `path` (binary mode, replacing existing content).
// Throws the current `errno` value (an int) if the file cannot be opened.
void string_to_file (const string& path, const string& contents)
{
    ofstream file {path, ios::out | ios::binary};
    if (!file) throw (errno);

    file.write(contents.data(), static_cast<streamsize>(contents.size()));
}

// Obtain input text, pass it to `f`, and deliver the string `f` returns.
//   source      : "stdin" to read standard input, otherwise a path handed to file_to_string
//   destination : "stdout" to write to standard output, otherwise a path handed to string_to_file
//   f           : callable invoked with the input string; its result is converted to string
template <class F>
void with_IO (const string& source, const string& destination, F&& f)
{
    string input;

    // Consume standard input up to end-of-file so multi-line programs are seen in full
    if (source == "stdin") input.assign(istreambuf_iterator<char>{cin}, istreambuf_iterator<char>{});
    else                   input = file_to_string(source);

    string output = invoke(forward<F>(f), input);

    if (destination == "stdout") cout << output;
    else                         string_to_file(destination, output);
}

// Add escaped newlines and backslashes back in for printing:
// each newline becomes the two characters \n and each backslash becomes two backslashes.
string sanitize (const string& s)
{
    string escaped;
    escaped.reserve(s.size());

    for (const char c : s)
    {
        if      (c == '\n') escaped += "\\n";
        else if (c == '\\') escaped += "\\\\";
        else                escaped += c;
    }

    return escaped;
}

// Cursor over a NUL-terminated character buffer that tracks a 1-based line and column.
// The buffer is not owned; it must outlive the Scanner.
class Scanner
{
public:
    const char* pos;
    int         line   = 1;
    int         column = 1;

    Scanner (const char* source) : pos {source} {}

    // Character under the cursor
    inline char peek () const { return *pos; }

    // Move one character forward, updating line/column.
    // Does nothing at the terminating NUL so the cursor can never leave the buffer.
    void advance ()
    {
        if (*pos == '\0') return;

        if (*pos == '\n') { ++line; column = 1; }
        else              ++column;

        ++pos;
    }

    // Advance, then return the character now under the cursor
    char next ()
    {
        advance();
        return peek();
    }

    // Advance past any run of whitespace (as classified by isspace)
    void skip_whitespace ()
    {
        while (isspace(static_cast<unsigned char>(peek())))
            advance();
    }
}; // class Scanner
===================================================================================================================== // Tokens // ===================================================================================================================== enum class TokenName { OP_MULTIPLY, OP_DIVIDE, OP_MOD, OP_ADD, OP_SUBTRACT, OP_NEGATE, OP_LESS, OP_LESSEQUAL, OP_GREATER, OP_GREATEREQUAL, OP_EQUAL, OP_NOTEQUAL, OP_NOT, OP_ASSIGN, OP_AND, OP_OR, LEFTPAREN, RIGHTPAREN, LEFTBRACE, RIGHTBRACE, SEMICOLON, COMMA, KEYWORD_IF, KEYWORD_ELSE, KEYWORD_WHILE, KEYWORD_PRINT, KEYWORD_PUTC, IDENTIFIER, INTEGER, STRING, END_OF_INPUT, ERROR }; using TokenVal = variant; struct Token { TokenName name; TokenVal value; int line; int column; }; const char* to_cstring (TokenName name) { static const char* s[] = { "Op_multiply", "Op_divide", "Op_mod", "Op_add", "Op_subtract", "Op_negate", "Op_less", "Op_lessequal", "Op_greater", "Op_greaterequal", "Op_equal", "Op_notequal", "Op_not", "Op_assign", "Op_and", "Op_or", "LeftParen", "RightParen", "LeftBrace", "RightBrace", "Semicolon", "Comma", "Keyword_if", "Keyword_else", "Keyword_while", "Keyword_print", "Keyword_putc", "Identifier", "Integer", "String", "End_of_input", "Error" }; return s[static_cast(name)]; } string to_string (Token t) { ostringstream out; out << setw(2) << t.line << " " << setw(2) << t.column << " "; switch (t.name) { case (TokenName::IDENTIFIER) : out << "Identifier " << get(t.value); break; case (TokenName::INTEGER) : out << "Integer " << left << get(t.value); break; case (TokenName::STRING) : out << "String \"" << sanitize(get(t.value)) << '"'; break; case (TokenName::END_OF_INPUT) : out << "End_of_input"; break; case (TokenName::ERROR) : out << "Error " << get(t.value); break; default : out << to_cstring(t.name); } out << '\n'; return out.str(); } // ===================================================================================================================== // Lexer // 
===================================================================================================================== class Lexer { public: Lexer (const char* source) : s {source}, pre_state {s} {} bool has_more () { return s.peek() != '\0'; } Token next_token () { s.skip_whitespace(); pre_state = s; switch (s.peek()) { case '*' : return simply(TokenName::OP_MULTIPLY); case '%' : return simply(TokenName::OP_MOD); case '+' : return simply(TokenName::OP_ADD); case '-' : return simply(TokenName::OP_SUBTRACT); case '{' : return simply(TokenName::LEFTBRACE); case '}' : return simply(TokenName::RIGHTBRACE); case '(' : return simply(TokenName::LEFTPAREN); case ')' : return simply(TokenName::RIGHTPAREN); case ';' : return simply(TokenName::SEMICOLON); case ',' : return simply(TokenName::COMMA); case '&' : return expect('&', TokenName::OP_AND); case '|' : return expect('|', TokenName::OP_OR); case '<' : return follow('=', TokenName::OP_LESSEQUAL, TokenName::OP_LESS); case '>' : return follow('=', TokenName::OP_GREATEREQUAL, TokenName::OP_GREATER); case '=' : return follow('=', TokenName::OP_EQUAL, TokenName::OP_ASSIGN); case '!' : return follow('=', TokenName::OP_NOTEQUAL, TokenName::OP_NOT); case '/' : return divide_or_comment(); case '\'' : return char_lit(); case '"' : return string_lit(); default : if (is_id_start(s.peek())) return identifier(); if (is_digit(s.peek())) return integer_lit(); return error("Unrecognized character '", s.peek(), "'"); case '\0' : return make_token(TokenName::END_OF_INPUT); } } private: Scanner s; Scanner pre_state; static const map keywords; template Token error (Args&&... ostream_args) { string code {pre_state.pos, (string::size_type) s.column - pre_state.column}; ostringstream msg; (msg << ... 
<< forward(ostream_args)) << '\n' << string(28, ' ') << "(" << s.line << ", " << s.column << "): " << code; if (s.peek() != '\0') s.advance(); return make_token(TokenName::ERROR, msg.str()); } inline Token make_token (TokenName name, TokenVal value = 0) { return {name, value, pre_state.line, pre_state.column}; } Token simply (TokenName name) { s.advance(); return make_token(name); } Token expect (char expected, TokenName name) { if (s.next() == expected) return simply(name); else return error("Unrecognized character '", s.peek(), "'"); } Token follow (char expected, TokenName ifyes, TokenName ifno) { if (s.next() == expected) return simply(ifyes); else return make_token(ifno); } Token divide_or_comment () { if (s.next() != '*') return make_token(TokenName::OP_DIVIDE); while (s.next() != '\0') { if (s.peek() == '*' && s.next() == '/') { s.advance(); return next_token(); } } return error("End-of-file in comment. Closing comment characters not found."); } Token char_lit () { int n = s.next(); if (n == '\'') return error("Empty character constant"); if (n == '\\') switch (s.next()) { case 'n' : n = '\n'; break; case '\\' : n = '\\'; break; default : return error("Unknown escape sequence \\", s.peek()); } if (s.next() != '\'') return error("Multi-character constant"); s.advance(); return make_token(TokenName::INTEGER, n); } Token string_lit () { string text = ""; while (s.next() != '"') switch (s.peek()) { case '\\' : switch (s.next()) { case 'n' : text += '\n'; continue; case '\\' : text += '\\'; continue; default : return error("Unknown escape sequence \\", s.peek()); } case '\n' : return error("End-of-line while scanning string literal." " Closing string character not found before end-of-line."); case '\0' : return error("End-of-file while scanning string literal." 
" Closing string character not found."); default : text += s.peek(); } s.advance(); return make_token(TokenName::STRING, text); } static inline bool is_id_start (char c) { return isalpha(static_cast(c)) || c == '_'; } static inline bool is_id_end (char c) { return isalnum(static_cast(c)) || c == '_'; } static inline bool is_digit (char c) { return isdigit(static_cast(c)); } Token identifier () { string text (1, s.peek()); while (is_id_end(s.next())) text += s.peek(); auto i = keywords.find(text); if (i != keywords.end()) return make_token(i->second); return make_token(TokenName::IDENTIFIER, text); } Token integer_lit () { while (is_digit(s.next())); if (is_id_start(s.peek())) return error("Invalid number. Starts like a number, but ends in non-numeric characters."); int n; auto r = from_chars(pre_state.pos, s.pos, n); if (r.ec == errc::result_out_of_range) return error("Number exceeds maximum value"); return make_token(TokenName::INTEGER, n); } }; // class Lexer const map Lexer::keywords = { {"else", TokenName::KEYWORD_ELSE}, {"if", TokenName::KEYWORD_IF}, {"print", TokenName::KEYWORD_PRINT}, {"putc", TokenName::KEYWORD_PUTC}, {"while", TokenName::KEYWORD_WHILE} }; int main (int argc, char* argv[]) { string in = (argc > 1) ? argv[1] : "stdin"; string out = (argc > 2) ? argv[2] : "stdout"; with_IO(in, out, [](string input) { Lexer lexer {input.data()}; string s = "Location Token name Value\n" "--------------------------------------\n"; while (lexer.has_more()) s += to_string(lexer.next_token()); return s; }); }