From 30572c77d62c2573d0bb556800c79fe4532503b5 Mon Sep 17 00:00:00 2001
From: Aleksandr Kirsanov
Date: Tue, 30 Apr 2024 20:32:07 +0300
Subject: [PATCH] [FunC] Support traditional // and /**/ comments

They work alongside the Lisp-style ;; and {- -}, without any #pragma.

Conceptually, new syntax should be disabled by default and activated by a
special compiler option. But right now we have no easy way to pass compiler
options through func-js, blueprint, etc.

Note that introducing a per-file #pragma is the wrong approach here: if we
want to fire a human-readable error on using '//' without the pragma, the
lexer would nevertheless have to work differently.
(This could be controlled by a launch option, but see above.)
---
 crypto/func/auto-tests/tests/comments.fc      | 39 ++++++++++++++
 .../func/auto-tests/tests/invalid-cmt-eof.fc  | 11 ++++
 crypto/func/parse-func.cpp                    |  9 +++-
 crypto/parser/lexer.cpp                       | 53 +++++++++++--------
 crypto/parser/lexer.h                         | 16 ++++--
 crypto/tl/tlbc.cpp                            |  4 +-
 6 files changed, 105 insertions(+), 27 deletions(-)
 create mode 100644 crypto/func/auto-tests/tests/comments.fc
 create mode 100644 crypto/func/auto-tests/tests/invalid-cmt-eof.fc

diff --git a/crypto/func/auto-tests/tests/comments.fc b/crypto/func/auto-tests/tests/comments.fc
new file mode 100644
index 00000000..24ff0102
--- /dev/null
+++ b/crypto/func/auto-tests/tests/comments.fc
@@ -0,0 +1,39 @@
+
+_ get10();
+
+int {-
+    block comment
+    /*
+        nested
+    */
+;;;; -} main()
+
+// inside a comment, {- doesn't start a new one
+{- but if ;; is inside, a comment may end at this line-} {
+    var cc = "a string may contain {- or // or /*, not parsed";
+    // return 1;
+    return get10() + /*
+        traditional comment /* may be also nested */
+        // line comment
+        // ends */1 +
+    1;
+    {- moreover, different comment styles
+       may be used for opening and closing
+    */
+}
+
+/*
+  first line
+//* nested
+  //two-lined*/
+*/
+
+int get10() method_id(10) {
+    return 10;
+}
+
+
+/*
+TESTCASE | 0 | | 12
+TESTCASE | 10 | | 10
+*/
diff --git a/crypto/func/auto-tests/tests/invalid-cmt-eof.fc b/crypto/func/auto-tests/tests/invalid-cmt-eof.fc
new file mode 100644
index 00000000..f9e79021
--- /dev/null
+++ b/crypto/func/auto-tests/tests/invalid-cmt-eof.fc
@@ -0,0 +1,11 @@
+int main() {
+    return 0;
+}
+
+{-
+int ...
+
+/*
+@compilation_should_fail
+@stderr comment extends past end of file
+*/
diff --git a/crypto/func/parse-func.cpp b/crypto/func/parse-func.cpp
index 80cd3568..abba6068 100644
--- a/crypto/func/parse-func.cpp
+++ b/crypto/func/parse-func.cpp
@@ -1822,7 +1822,14 @@ void parse_include(Lexer& lex, const src::FileDescr* fdescr) {
 
 bool parse_source(std::istream* is, src::FileDescr* fdescr) {
   src::SourceReader reader{is, fdescr};
-  Lexer lex{reader, true, ";,()[] ~."};
+  Lexer lex{reader, ";,()[] ~."};
+  // previously, FunC had only Lisp-style comments,
+  // but starting from v0.5.0, it supports traditional (slash) comments alongside them
+  // (in the IDE, the user has a setting for the preferred comment style);
+  // maybe, in some far future, we'll stop supporting Lisp-style comments
+  lex.set_comment_tokens(";;", "{-", "-}");
+  lex.set_comment2_tokens("//", "/*", "*/");
+  lex.start_parsing();
   while (lex.tp() != _Eof) {
     if (lex.tp() == _PragmaHashtag) {
       parse_pragma(lex);
diff --git a/crypto/parser/lexer.cpp b/crypto/parser/lexer.cpp
index 117f1df5..418860eb 100644
--- a/crypto/parser/lexer.cpp
+++ b/crypto/parser/lexer.cpp
@@ -124,8 +124,7 @@ int Lexem::set(std::string _str, const SrcLocation& _loc, int _tp, int _val) {
   return classify();
 }
 
-Lexer::Lexer(SourceReader& _src, bool init, std::string active_chars, std::string eol_cmts, std::string open_cmts,
-             std::string close_cmts, std::string quote_chars, std::string multiline_quote)
+Lexer::Lexer(SourceReader& _src, std::string active_chars, std::string quote_chars, std::string multiline_quote)
     : src(_src), eof(false), lexem("", src.here(), Lexem::Undefined), peek_lexem("", {}, Lexem::Undefined),
       multiline_quote(std::move(multiline_quote)) {
   std::memset(char_class, 0, sizeof(char_class));
@@ -139,17 +138,27 @@ Lexer::Lexer(SourceReader& _src, bool init, std::string active_chars, std::strin
       char_class[(unsigned)c] |= activity;
     }
   }
-  set_spec(eol_cmt, eol_cmts);
-  set_spec(cmt_op, open_cmts);
-  set_spec(cmt_cl, close_cmts);
   for (int c : quote_chars) {
     if (c > ' ' && c <= 0x7f) {
       char_class[(unsigned)c] |= cc::quote_char;
     }
   }
-  if (init) {
-    next();
-  }
+}
+
+void Lexer::set_comment_tokens(const std::string &eol_cmts, const std::string &open_cmts, const std::string &close_cmts) {
+  set_spec(eol_cmt, eol_cmts);
+  set_spec(cmt_op, open_cmts);
+  set_spec(cmt_cl, close_cmts);
+}
+
+void Lexer::set_comment2_tokens(const std::string &eol_cmts2, const std::string &open_cmts2, const std::string &close_cmts2) {
+  set_spec(eol_cmt2, eol_cmts2);
+  set_spec(cmt_op2, open_cmts2);
+  set_spec(cmt_cl2, close_cmts2);
+}
+
+void Lexer::start_parsing() {
+  next();
 }
 
 void Lexer::set_spec(std::array<int, 3>& arr, std::string setup) {
@@ -206,24 +215,30 @@ const Lexem& Lexer::next() {
   long long comm = 1;
   while (!src.seek_eof()) {
     int cc = src.cur_char(), nc = src.next_char();
-    if (cc == eol_cmt[0] || (cc == eol_cmt[1] && nc == eol_cmt[2])) {
-      src.load_line();
-    } else if (cc == cmt_op[1] && nc == cmt_op[2]) {
+    // note that in practice (both in FunC and tlbc), the [0]-th element is -256, so the conditions on it are always false
+    if (cc == eol_cmt[0] || (cc == eol_cmt[1] && nc == eol_cmt[2]) || cc == eol_cmt2[0] || (cc == eol_cmt2[1] && nc == eol_cmt2[2])) {
+      if (comm == 1) {  // just "//" - skip the whole line
+        src.load_line();
+      } else {  // if "//" is nested inside "/*", continue reading, since "*/" may still be met
+        src.advance(1);
+      }
+    } else if (cc == cmt_op[1] && nc == cmt_op[2] || cc == cmt_op2[1] && nc == cmt_op2[2]) {
       src.advance(2);
       comm = comm * 2 + 1;
-    } else if (cc == cmt_op[0]) {
+    } else if (cc == cmt_op[0] || cc == cmt_op2[0]) {  // always false
       src.advance(1);
       comm *= 2;
     } else if (comm == 1) {
-      break;
-    } else if (cc == cmt_cl[1] && nc == cmt_cl[2]) {
-      if (!(comm & 1)) {
+      break;  // means that we are not inside a comment
+    } else if (cc == cmt_cl[1] && nc == cmt_cl[2] || cc == cmt_cl2[1] && nc == cmt_cl2[2]) {
+      if (!(comm & 1)) {  // always false
         src.error(std::string{"a `"} + (char)cmt_op[0] + "` comment closed by `" + (char)cmt_cl[1] + (char)cmt_cl[2] +
                   "`");
       }
+      // note that in FunC, {- may be closed with */, but we assume that's ok (we'll get rid of {- in the future)
       comm >>= 1;
       src.advance(2);
-    } else if (cc == cmt_cl[0]) {
+    } else if (cc == cmt_cl[0] || cc == cmt_cl2[0]) {  // always false
       if (!(comm & 1)) {
         src.error(std::string{"a `"} + (char)cmt_op[1] + (char)cmt_op[2] + "` comment closed by `" + (char)cmt_cl[0] +
                   "`");
@@ -240,11 +255,7 @@ const Lexem& Lexer::next() {
   if (src.seek_eof()) {
     eof = true;
     if (comm > 1) {
-      if (comm & 1) {
-        src.error(std::string{"`"} + (char)cmt_op[1] + (char)cmt_op[2] + "` comment extends past end of file");
-      } else {
-        src.error(std::string{"`"} + (char)cmt_op[0] + "` comment extends past end of file");
-      }
+      src.error("comment extends past end of file");
     }
     return lexem.clear(src.here(), Lexem::Eof);
   }
diff --git a/crypto/parser/lexer.h b/crypto/parser/lexer.h
index 686d8eac..904d8b31 100644
--- a/crypto/parser/lexer.h
+++ b/crypto/parser/lexer.h
@@ -65,12 +65,16 @@ struct Lexem {
   static std::string lexem_name_str(int idx);
 };
 
+// todo: this class (like all sources in /ton/crypto/parser) is shared between FunC and tlbc;
+// this "sharedness" and "generalization" is weird and annoying rather than solving any problem;
+// later on, I'll get rid of this (parser/) folder, copying and adapting its sources to FunC and tlbc
 class Lexer {
   SourceReader& src;
   bool eof;
   Lexem lexem, peek_lexem;
   unsigned char char_class[128];
-  std::array<int, 3> eol_cmt, cmt_op, cmt_cl;
+  std::array<int, 3> eol_cmt, cmt_op, cmt_cl;      // for FunC < 0.5.0:  ;;  {-  -}
+  std::array<int, 3> eol_cmt2, cmt_op2, cmt_cl2;   // for FunC >= 0.5.0: //  /*  */
   std::string multiline_quote;
   enum cc { left_active = 2, right_active = 1, active = 3, allow_repeat = 4, quote_char = 8 };
 
@@ -78,9 +82,13 @@ class Lexer {
   bool eof_found() const {
     return eof;
   }
-  Lexer(SourceReader& _src, bool init = false, std::string active_chars = ";,() ~.", std::string eol_cmts = ";;",
-        std::string open_cmts = "{-", std::string close_cmts = "-}", std::string quote_chars = "\"",
-        std::string multiline_quote = "\"\"\"");
+  explicit Lexer(SourceReader& _src, std::string active_chars = ";,() ~.",
+                 std::string quote_chars = "\"", std::string multiline_quote = "\"\"\"");
+
+  void set_comment_tokens(const std::string &eol_cmts, const std::string &open_cmts, const std::string &close_cmts);
+  void set_comment2_tokens(const std::string &eol_cmts2, const std::string &open_cmts2, const std::string &close_cmts2);
+  void start_parsing();
+
   const Lexem& next();
   const Lexem& cur() const {
     return lexem;
diff --git a/crypto/tl/tlbc.cpp b/crypto/tl/tlbc.cpp
index b48bc472..9f8fdb0f 100644
--- a/crypto/tl/tlbc.cpp
+++ b/crypto/tl/tlbc.cpp
@@ -2421,7 +2421,9 @@ std::vector source_fdescr;
 
 bool parse_source(std::istream* is, src::FileDescr* fdescr) {
   src::SourceReader reader{is, fdescr};
-  src::Lexer lex{reader, true, "(){}:;? #$. ^~ #", "//", "/*", "*/", ""};
+  src::Lexer lex{reader, "(){}:;? #$. ^~ #", ""};
+  lex.set_comment_tokens("//", "/*", "*/");
+  lex.start_parsing();
   while (lex.tp() != src::_Eof) {
     parse_constructor_def(lex);
     // std::cerr << lex.cur().str << '\t' << lex.cur().name_str() << std::endl;
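
A note on the nesting logic touched by this patch, for readers new to the lexer: every branch in Lexer::next() manipulates a single counter, comm (1 means "outside any comment"; it grows when a block comment opens and shrinks when one closes, for either token set). The snippet below is a minimal standalone sketch of just that bookkeeping and is not code from this repository: comments_balanced is a made-up helper, there is no SourceReader, char_class table, string/quote handling or error reporting, and the single-character [0] branches (always false in practice, as noted in the patch) are omitted.

#include <cassert>
#include <cstdio>
#include <string>

// Returns true if all block comments in `src` are properly closed, using the
// same bookkeeping as Lexer::next(): comm == 1 means "outside any comment",
// opening a block comment does comm = comm * 2 + 1, closing does comm >>= 1.
// Both token sets are recognized, mirroring cmt_op/cmt_cl and cmt_op2/cmt_cl2.
bool comments_balanced(const std::string& src) {
  long long comm = 1;
  std::size_t i = 0;
  auto at = [&](const char* tok) { return src.compare(i, 2, tok) == 0; };
  while (i < src.size()) {
    if (at("//") || at(";;")) {
      if (comm == 1) {
        // a line comment at the top level skips the rest of the line
        while (i < src.size() && src[i] != '\n') {
          ++i;
        }
      } else {
        // inside a block comment "//" is ordinary text: keep scanning, "*/" may still follow
        ++i;
      }
    } else if (at("/*") || at("{-")) {
      i += 2;
      comm = comm * 2 + 1;  // one more nesting level, opened by a two-char token
    } else if (comm > 1 && (at("*/") || at("-}"))) {
      i += 2;
      comm >>= 1;           // pop one nesting level
    } else {
      ++i;                  // ordinary character (at comm == 1 the real lexer would start a lexem here)
    }
  }
  return comm == 1;         // comm > 1 at end of input == "comment extends past end of file"
}

int main() {
  assert(comments_balanced("int x; // line comment\nreturn x;"));
  assert(comments_balanced("{- outer /* inner */ still outer -} int y;"));
  assert(!comments_balanced("/* never closed"));
  assert(!comments_balanced("{- // a line comment does not close this"));
  std::puts("all checks passed");
  return 0;
}

This model also shows why a "//" inside "/* ... */" must not swallow the rest of the line: at comm > 1 the closing "*/" may still appear later on the same line, which is exactly the case exercised by the "// ends */1 +" line in comments.fc.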