[Tolk] Rewrite lexer, spaces are not mandatory anymore

The new lexer is noticeably faster and more memory-efficient (although splitting a file into tokens takes a negligible share of the whole pipeline). The purpose of rewriting the lexer, however, was not just to speed it up, but to allow writing code without spaces: `2+2` is now 4, not a valid identifier as it was earlier. The variety of symbols allowed in an identifier has been greatly reduced and is now similar to other languages. SrcLocation is now 8 bytes on the stack everywhere. Command-line flags were also reworked: - the input for the Tolk compiler is now only a single file; it is parsed, and parsing continues as new #include directives are resolved - flags like -A, -P and so on are actually no longer needed
2025-03-09 15:40:10 +00:00 · 2024-10-31 10:59:23 +04:00 · 2024-10-31 10:59:23 +04:00 · f0e6470d0b
commit f0e6470d0b
parent 0bcc0b3c12
26 changed files with 2042 additions and 2129 deletions
--- a/tolk/lexer.h
+++ b/tolk/lexer.h
@ -15,104 +15,225 @@
    along with TON Blockchain Library.  If not, see <http://www.gnu.org/licenses/>.
 */
 #pragma once
-#include "srcread.h"
-#include <array>
-#include <memory>
-#include <cstring>
+
+#include "platform-utils.h"
+#include "src-file.h"
+#include <string>

 namespace tolk {

-/*
- *
- *   LEXER
- *
- */
+enum TokenType {  // kind of a lexed token; stored in Token::type
+  tok_empty,  // value of Token::type in a default-constructed Token

-struct Lexem {
-  enum { Undefined = -2, Eof = -1, Unknown = 0, Ident = 0, Number = 1, Special = 2, String = 3 };
-  int tp;
-  int val;
-  std::string str;
-  SrcLocation loc;
-  int classify();
-  Lexem(std::string _str = "", const SrcLocation& _loc = {}, int _tp = Unknown, int _val = 0)
-      : tp(_tp), val(_val), str(_str), loc(_loc) {
-    classify();
-  }
-  int set(std::string _str = "", const SrcLocation& _loc = {}, int _tp = Unknown, int _val = 0);
-  Lexem& clear(const SrcLocation& _loc = {}, int _tp = Unknown, int _val = 0) {
-    tp = _tp;
-    val = _val;
-    loc = _loc;
-    str = "";
-    return *this;
-  }
-  bool valid() const {
-    return tp != Undefined;
-  }
-  std::string name_str() const;
-  void error(std::string _str) const {
-    throw ParseError{loc, _str};
-  }
-  void error_at(std::string str1, std::string str2) const {
-    error(str1 + str + str2);
-  }
+  tok_int_const,
+  tok_string_const,
+  tok_string_modifier,

-  static std::string lexem_name_str(int idx);
+  tok_identifier,
+
+  tok_plus,
+  tok_minus,
+  tok_mul,
+  tok_div,
+  tok_mod,
+  tok_question,
+  tok_colon,
+  tok_comma,
+  tok_semicolon,
+  tok_oppar,
+  tok_clpar,
+  tok_opbracket,
+  tok_clbracket,
+  tok_opbrace,
+  tok_clbrace,
+  tok_assign,
+  tok_underscore,
+  tok_lt,
+  tok_gt,
+  tok_bitwise_and,
+  tok_bitwise_or,
+  tok_bitwise_xor,
+  tok_bitwise_not,
+  tok_dot,
+
+  tok_eq,
+  tok_neq,
+  tok_leq,
+  tok_geq,
+  tok_spaceship,
+  tok_lshift,
+  tok_rshift,
+  tok_rshiftR,
+  tok_rshiftC,
+  tok_divR,
+  tok_divC,
+  tok_modR,
+  tok_modC,
+  tok_divmod,
+  tok_set_plus,
+  tok_set_minus,
+  tok_set_mul,
+  tok_set_div,
+  tok_set_divR,
+  tok_set_divC,
+  tok_set_mod,
+  tok_set_modR,
+  tok_set_modC,
+  tok_set_lshift,
+  tok_set_rshift,
+  tok_set_rshiftR,
+  tok_set_rshiftC,
+  tok_set_bitwise_and,
+  tok_set_bitwise_or,
+  tok_set_bitwise_xor,
+
+  tok_return,
+  tok_var,
+  tok_repeat,
+  tok_do,
+  tok_while,
+  tok_until,
+  tok_try,
+  tok_catch,
+  tok_if,
+  tok_ifnot,
+  tok_then,
+  tok_else,
+  tok_elseif,
+  tok_elseifnot,
+
+  tok_int,
+  tok_cell,
+  tok_slice,
+  tok_builder,
+  tok_cont,
+  tok_tuple,
+  tok_type,
+  tok_mapsto,
+  tok_forall,
+
+  tok_extern,
+  tok_global,
+  tok_asm,
+  tok_impure,
+  tok_pure,
+  tok_inline,
+  tok_inlineref,
+  tok_builtin,
+  tok_autoapply,
+  tok_method_id,
+  tok_get,
+  tok_operator,
+  tok_infix,
+  tok_infixl,
+  tok_infixr,
+  tok_const,
+
+  tok_pragma,
+  tok_pragma_name,
+  tok_semver,
+  tok_include,
+
+  tok_eof  // NOTE(review): presumably produced at end of input - confirm in lexer.cpp
 };

+// The whole Tolk source is split into tokens.
+// Lexer::next() itself returns void; the current Token is read via Lexer::tok() and Lexer::cur_str().
+struct Token {
+  TokenType type = tok_empty;
+  std::string_view str_val;  // non-owning view; NOTE(review): presumably points into the source text - confirm lifetime
+
+  Token() = default;
+  Token(TokenType type, std::string_view str_val): type(type), str_val(str_val) {}
+};
+
+// Lexer::next() is the method to be used externally (while parsing a Tolk file to AST).
+// It's streaming: `next()` parses a token on demand.
+// For details, see the comment above the Lexer constructor in lexer.cpp.
 class Lexer {
-  SourceReader& src;
-  bool eof;
-  Lexem lexem, peek_lexem;
-  unsigned char char_class[128];
-  std::array<int, 3> eol_cmt, cmt_op, cmt_cl;    // for ;; {- -}
-  std::array<int, 3> eol_cmt2, cmt_op2, cmt_cl2; // for // /* */
-  std::string multiline_quote;
-  enum cc { left_active = 2, right_active = 1, active = 3, allow_repeat = 4, quote_char = 8 };
+  Token tokens_circularbuf[8]{};  // ring buffer of 8 tokens; slots are addressed as `idx & 7`
+  int last_token_idx = -1;  // pre-incremented by add_token(), so the first added token gets index 0
+  int cur_token_idx = -1;
+  Token cur_token;  // = tokens_circularbuf[cur_token_idx & 7]

- public:
-  bool eof_found() const {
-    return eof;
-  }
-  explicit Lexer(SourceReader& _src, std::string active_chars = ";,() ~.",
-    std::string quote_chars = "\"", std::string multiline_quote = "\"\"\"");
+  const SrcFile* file;
+  const char *p_start, *p_end, *p_next;  // p_next is the read cursor; is_eof() tests `p_next >= p_end`
+  SrcLocation location;  // char_offset is set to `p_next - p_start` by update_location()

-  void set_comment_tokens(const std::string &eol_cmts, const std::string &open_cmts, const std::string &close_cmts);
-  void set_comment2_tokens(const std::string &eol_cmts2, const std::string &open_cmts2, const std::string &close_cmts2);
-  void start_parsing();
-
-  const Lexem& next();
-  const Lexem& cur() const {
-    return lexem;
-  }
-  const Lexem& peek();
-  int tp() const {
-    return lexem.tp;
-  }
-  void expect(int exp_tp, const char* msg = 0);
-  int classify_char(unsigned c) const {
-    return c < 0x80 ? char_class[c] : 0;
-  }
-  bool is_active(int c) const {
-    return (classify_char(c) & cc::active) == cc::active;
-  }
-  bool is_left_active(int c) const {
-    return (classify_char(c) & cc::left_active);
-  }
-  bool is_right_active(int c) const {
-    return (classify_char(c) & cc::right_active);
-  }
-  bool is_repeatable(int c) const {
-    return (classify_char(c) & cc::allow_repeat);
-  }
-  bool is_quote_char(int c) const {
-    return (classify_char(c) & cc::quote_char);
+  void update_location() {  // stores the cursor's distance from p_start into location.char_offset
+    location.char_offset = static_cast<int>(p_next - p_start);
  }

- private:
-  void set_spec(std::array<int, 3>& arr, std::string setup);
-  bool is_multiline_quote(const char* begin, const char* end);
+  GNU_ATTRIBUTE_NORETURN GNU_ATTRIBUTE_COLD
+  void on_expect_call_failed(const char* str_expected) const;  // failure path of check()/expect(), kept out of line
+
+public:
+
+  explicit Lexer(const SrcFile* file);
+  Lexer(const Lexer&) = delete;  // copy construction is disabled
+  Lexer &operator=(const Lexer&) = delete;  // copy assignment is disabled
+
+  void add_token(TokenType type, std::string_view str) {  // writes into the next ring slot, overwriting the token added 8 calls earlier
+    tokens_circularbuf[++last_token_idx & 7] = Token(type, str);
+  }
+
+  void skip_spaces() {  // advances p_next past whitespace, never beyond p_end
+    while (p_next < p_end && std::isspace(static_cast<unsigned char>(*p_next))) {  // cast: isspace() is UB for negative char values
+      ++p_next;
+    }
+  }
+
+  void skip_line() {  // skips the rest of the current line, then every consecutive '\n' / '\r' after it
+    while (p_next < p_end && *p_next != '\n' && *p_next != '\r') {
+      ++p_next;
+    }
+    while (p_next < p_end && (*p_next == '\n' || *p_next == '\r')) {  // bounded like the loop above, so p_end is never dereferenced
+      ++p_next;
+    }
+  }
+
+  void skip_chars(int n) {  // moves the cursor by n chars; no bounds check is performed here
+    p_next += n;
+  }
+
+  bool is_eof() const {  // true once the cursor has reached (or passed) p_end
+    return p_next >= p_end;
+  }
+
+  char char_at() const { return *p_next; }  // char under the cursor; no bounds check
+  char char_at(int shift) const { return *(p_next + shift); }  // char `shift` positions from the cursor; no bounds check
+  const char* c_str() const { return p_next; }  // the cursor itself; NOTE(review): name suggests a null-terminated buffer - confirm
+
+  TokenType tok() const { return cur_token.type; }  // type of the current token
+  std::string_view cur_str() const { return cur_token.str_val; }  // non-owning view of the current token's text
+  std::string cur_str_std_string() const { return static_cast<std::string>(cur_token.str_val); }  // same text, copied into an owning std::string
+  SrcLocation cur_location() const { return location; }  // returns a copy of `location`
+  int cur_sym_idx() const;
+
+  void next();  // returns nothing; NOTE(review): presumably refreshes cur_token - see lexer.cpp
+  void next_special(TokenType parse_next_as, const char* str_expected);  // defined out of line; see lexer.cpp
+
+  void check(TokenType next_tok, const char* str_expected) const {  // verifies the current token's type without advancing
+    if (cur_token.type != next_tok) {
+      on_expect_call_failed(str_expected); // unlikely path, not inlined
+    }
+  }
+  void expect(TokenType next_tok, const char* str_expected) {  // same verification as check(), then advances via next()
+    if (cur_token.type != next_tok) {
+      on_expect_call_failed(str_expected);
+    }
+    next();
+  }
+
+  GNU_ATTRIBUTE_NORETURN GNU_ATTRIBUTE_COLD
+  void error(const std::string& err_msg) const;  // NOTE(review): marked NORETURN, presumably throws - confirm in lexer.cpp
+  GNU_ATTRIBUTE_NORETURN GNU_ATTRIBUTE_COLD
+  void error_at(const std::string& prefix, const std::string& suffix) const;  // NOTE(review): presumably wraps the current token text between prefix and suffix - confirm
 };

+void lexer_init();  // defined out of line; see lexer.cpp
+
+// todo #ifdef TOLK_PROFILING
+void lexer_measure_performance(const std::vector<SrcFile*>& files_to_just_parse);  // NOTE(review): uses std::vector, but <vector> is not included directly by this header
+
 }  // namespace tolk