From 30572c77d62c2573d0bb556800c79fe4532503b5 Mon Sep 17 00:00:00 2001
From: Aleksandr Kirsanov
Date: Tue, 30 Apr 2024 20:32:07 +0300
Subject: [PATCH] [FunC] Support traditional // and /**/ comments

They work alongside the Lisp-style ;; and {- -}, without any #pragma.

Conceptually, new syntax should be disabled by default and activated by a
special compiler option. But right now we have no easy way to pass compiler
options through func-js, blueprint, etc.

Note that introducing a per-file #pragma is the wrong approach here: if we
want to fire a human-readable error on using '//' without the pragma, the
lexer would nevertheless have to work differently.
(This could be controlled by a launch option, but see above.)
---
 crypto/func/auto-tests/tests/comments.fc      | 39 ++++++++++++++
 .../func/auto-tests/tests/invalid-cmt-eof.fc  | 11 ++++
 crypto/func/parse-func.cpp                    |  9 +++-
 crypto/parser/lexer.cpp                       | 53 +++++++++++--------
 crypto/parser/lexer.h                         | 16 ++++--
 crypto/tl/tlbc.cpp                            |  4 +-
 6 files changed, 105 insertions(+), 27 deletions(-)
 create mode 100644 crypto/func/auto-tests/tests/comments.fc
 create mode 100644 crypto/func/auto-tests/tests/invalid-cmt-eof.fc

diff --git a/crypto/func/auto-tests/tests/comments.fc b/crypto/func/auto-tests/tests/comments.fc
new file mode 100644
index 00000000..24ff0102
--- /dev/null
+++ b/crypto/func/auto-tests/tests/comments.fc
@@ -0,0 +1,39 @@
+
+_ get10();
+
+int {-
+    block comment
+    /*
+        nested
+    */
+;;;; -} main()
+
+// inside a comment, {- doesn't start a new one
+{- but if ;; is inside, a comment may end at this line-} {
+    var cc = "a string may contain {- or // or /*, not parsed";
+    // return 1;
+    return get10() + /*
+        traditional comment /* may be also nested */
+        // line comment
+        // ends */1 +
+    1;
+    {- moreover, different comment styles
+       may be used for opening and closing
+    */
+}
+
+/*
+  first line
+//* nested
+  //two-lined*/
+*/
+
+int get10() method_id(10) {
+    return 10;
+}
+
+
+/*
+TESTCASE | 0 | | 12
+TESTCASE | 10 | | 10
+*/
diff --git a/crypto/func/auto-tests/tests/invalid-cmt-eof.fc b/crypto/func/auto-tests/tests/invalid-cmt-eof.fc
new file mode 100644
index 00000000..f9e79021
--- /dev/null
+++ b/crypto/func/auto-tests/tests/invalid-cmt-eof.fc
@@ -0,0 +1,11 @@
+int main() {
+    return 0;
+}
+
+{-
+int ...
+
+/*
+@compilation_should_fail
+@stderr comment extends past end of file
+*/
diff --git a/crypto/func/parse-func.cpp b/crypto/func/parse-func.cpp
index 80cd3568..abba6068 100644
--- a/crypto/func/parse-func.cpp
+++ b/crypto/func/parse-func.cpp
@@ -1822,7 +1822,14 @@ void parse_include(Lexer& lex, const src::FileDescr* fdescr) {
 
 bool parse_source(std::istream* is, src::FileDescr* fdescr) {
   src::SourceReader reader{is, fdescr};
-  Lexer lex{reader, true, ";,()[] ~."};
+  Lexer lex{reader, ";,()[] ~."};
+  // previously, FunC had only Lisp-style comments,
+  // but starting from v0.5.0, it supports traditional (slash) comments alongside them
+  // (in the IDE, the user has a setting for the preferred comment style);
+  // maybe, in some far future, we'll stop supporting Lisp-style comments
+  lex.set_comment_tokens(";;", "{-", "-}");
+  lex.set_comment2_tokens("//", "/*", "*/");
+  lex.start_parsing();
   while (lex.tp() != _Eof) {
     if (lex.tp() == _PragmaHashtag) {
       parse_pragma(lex);
diff --git a/crypto/parser/lexer.cpp b/crypto/parser/lexer.cpp
index 117f1df5..418860eb 100644
--- a/crypto/parser/lexer.cpp
+++ b/crypto/parser/lexer.cpp
@@ -124,8 +124,7 @@ int Lexem::set(std::string _str, const SrcLocation& _loc, int _tp, int _val) {
   return classify();
 }
 
-Lexer::Lexer(SourceReader& _src, bool init, std::string active_chars, std::string eol_cmts, std::string open_cmts,
-             std::string close_cmts, std::string quote_chars, std::string multiline_quote)
+Lexer::Lexer(SourceReader& _src, std::string active_chars, std::string quote_chars, std::string multiline_quote)
     : src(_src), eof(false), lexem("", src.here(), Lexem::Undefined), peek_lexem("", {}, Lexem::Undefined),
       multiline_quote(std::move(multiline_quote)) {
   std::memset(char_class, 0, sizeof(char_class));
@@ -139,17 +138,27 @@ Lexer::Lexer(SourceReader& _src, bool init, std::string active_chars, std::strin
       char_class[(unsigned)c] |= activity;
     }
   }
-  set_spec(eol_cmt, eol_cmts);
-  set_spec(cmt_op, open_cmts);
-  set_spec(cmt_cl, close_cmts);
   for (int c : quote_chars) {
     if (c > ' ' && c <= 0x7f) {
       char_class[(unsigned)c] |= cc::quote_char;
     }
   }
-  if (init) {
-    next();
-  }
+}
+
+void Lexer::set_comment_tokens(const std::string &eol_cmts, const std::string &open_cmts, const std::string &close_cmts) {
+  set_spec(eol_cmt, eol_cmts);
+  set_spec(cmt_op, open_cmts);
+  set_spec(cmt_cl, close_cmts);
+}
+
+void Lexer::set_comment2_tokens(const std::string &eol_cmts2, const std::string &open_cmts2, const std::string &close_cmts2) {
+  set_spec(eol_cmt2, eol_cmts2);
+  set_spec(cmt_op2, open_cmts2);
+  set_spec(cmt_cl2, close_cmts2);
+}
+
+void Lexer::start_parsing() {
+  next();
 }
 
 void Lexer::set_spec(std::array<int, 3>& arr, std::string setup) {
@@ -206,24 +215,30 @@ const Lexem& Lexer::next() {
   long long comm = 1;
   while (!src.seek_eof()) {
     int cc = src.cur_char(), nc = src.next_char();
-    if (cc == eol_cmt[0] || (cc == eol_cmt[1] && nc == eol_cmt[2])) {
-      src.load_line();
-    } else if (cc == cmt_op[1] && nc == cmt_op[2]) {
+    // note that in practice (both in FunC and tlbc), the [0]-th element is -256, so the conditions on it are always false
+    if (cc == eol_cmt[0] || (cc == eol_cmt[1] && nc == eol_cmt[2]) || cc == eol_cmt2[0] || (cc == eol_cmt2[1] && nc == eol_cmt2[2])) {
+      if (comm == 1) {  // just "//" - skip the whole line
+        src.load_line();
+      } else {  // if "//" is nested inside "/*", continue reading, since "*/" may still be met
+        src.advance(1);
+      }
+    } else if (cc == cmt_op[1] && nc == cmt_op[2] || cc == cmt_op2[1] && nc == cmt_op2[2]) {
       src.advance(2);
       comm = comm * 2 + 1;
-    } else if (cc == cmt_op[0]) {
+    } else if (cc == cmt_op[0] || cc == cmt_op2[0]) {  // always false
       src.advance(1);
       comm *= 2;
     } else if (comm == 1) {
-      break;
-    } else if (cc == cmt_cl[1] && nc == cmt_cl[2]) {
-      if (!(comm & 1)) {
+      break;  // means that we are not inside a comment
+    } else if (cc == cmt_cl[1] && nc == cmt_cl[2] || cc == cmt_cl2[1] && nc == cmt_cl2[2]) {
+      if (!(comm & 1)) {  // always false
         src.error(std::string{"a `"} + (char)cmt_op[0] + "` comment closed by `" + (char)cmt_cl[1] + (char)cmt_cl[2] +
                   "`");
       }
+      // note that in FunC, {- may be closed with */, but we assume that's ok (we'll get rid of {- in the future)
       comm >>= 1;
       src.advance(2);
-    } else if (cc == cmt_cl[0]) {
+    } else if (cc == cmt_cl[0] || cc == cmt_cl2[0]) {  // always false
       if (!(comm & 1)) {
         src.error(std::string{"a `"} + (char)cmt_op[1] + (char)cmt_op[2] + "` comment closed by `" + (char)cmt_cl[0] +
                   "`");
@@ -240,11 +255,7 @@ const Lexem& Lexer::next() {
   if (src.seek_eof()) {
     eof = true;
     if (comm > 1) {
-      if (comm & 1) {
-        src.error(std::string{"`"} + (char)cmt_op[1] + (char)cmt_op[2] + "` comment extends past end of file");
-      } else {
-        src.error(std::string{"`"} + (char)cmt_op[0] + "` comment extends past end of file");
-      }
+      src.error("comment extends past end of file");
     }
     return lexem.clear(src.here(), Lexem::Eof);
   }
diff --git a/crypto/parser/lexer.h b/crypto/parser/lexer.h
index 686d8eac..904d8b31 100644
--- a/crypto/parser/lexer.h
+++ b/crypto/parser/lexer.h
@@ -65,12 +65,16 @@ struct Lexem {
   static std::string lexem_name_str(int idx);
 };
 
+// todo: this class (like all sources in /ton/crypto/parser) is shared between FunC and tlbc;
+// this "sharedness" and "generalization" is weird and annoying rather than solving any problem;
+// later on, I'll get rid of this (parser/) folder, copying and adapting its sources to FunC and tlbc
 class Lexer {
   SourceReader& src;
   bool eof;
   Lexem lexem, peek_lexem;
   unsigned char char_class[128];
-  std::array<int, 3> eol_cmt, cmt_op, cmt_cl;
+  std::array<int, 3> eol_cmt, cmt_op, cmt_cl;      // for FunC < 0.5.0:  ;;  {-  -}
+  std::array<int, 3> eol_cmt2, cmt_op2, cmt_cl2;   // for FunC >= 0.5.0: //  /*  */
   std::string multiline_quote;
   enum cc { left_active = 2, right_active = 1, active = 3, allow_repeat = 4, quote_char = 8 };
 
@@ -78,9 +82,13 @@ class Lexer {
   bool eof_found() const {
     return eof;
   }
-  Lexer(SourceReader& _src, bool init = false, std::string active_chars = ";,() ~.", std::string eol_cmts = ";;",
-        std::string open_cmts = "{-", std::string close_cmts = "-}", std::string quote_chars = "\"",
-        std::string multiline_quote = "\"\"\"");
+  explicit Lexer(SourceReader& _src, std::string active_chars = ";,() ~.",
+                 std::string quote_chars = "\"", std::string multiline_quote = "\"\"\"");
+
+  void set_comment_tokens(const std::string &eol_cmts, const std::string &open_cmts, const std::string &close_cmts);
+  void set_comment2_tokens(const std::string &eol_cmts2, const std::string &open_cmts2, const std::string &close_cmts2);
+  void start_parsing();
+
   const Lexem& next();
   const Lexem& cur() const {
     return lexem;
diff --git a/crypto/tl/tlbc.cpp b/crypto/tl/tlbc.cpp
index b48bc472..9f8fdb0f 100644
--- a/crypto/tl/tlbc.cpp
+++ b/crypto/tl/tlbc.cpp
@@ -2421,7 +2421,9 @@ std::vector source_fdescr;
 
 bool parse_source(std::istream* is, src::FileDescr* fdescr) {
   src::SourceReader reader{is, fdescr};
-  src::Lexer lex{reader, true, "(){}:;? #$. ^~ #", "//", "/*", "*/", ""};
+  src::Lexer lex{reader, "(){}:;? #$. ^~ #", ""};
+  lex.set_comment_tokens("//", "/*", "*/");
+  lex.start_parsing();
   while (lex.tp() != src::_Eof) {
     parse_constructor_def(lex);
     // std::cerr << lex.cur().str << '\t' << lex.cur().name_str() << std::endl;
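
A note on the nesting logic touched by this patch, for readers new to the lexer: every branch in Lexer::next() manipulates a single counter, comm (1 means "outside any comment"; it grows when a block comment opens and shrinks when one closes, for either token set). The snippet below is a minimal standalone sketch of just that bookkeeping and is not code from this repository: comments_balanced is a made-up helper, there is no SourceReader, char_class table, string/quote handling or error reporting, and the single-character [0] branches (always false in practice, as noted in the patch) are omitted.

#include <cassert>
#include <cstdio>
#include <string>

// Returns true if all block comments in `src` are properly closed, using the
// same bookkeeping as Lexer::next(): comm == 1 means "outside any comment",
// opening a block comment does comm = comm * 2 + 1, closing does comm >>= 1.
// Both token sets are recognized, mirroring cmt_op/cmt_cl and cmt_op2/cmt_cl2.
bool comments_balanced(const std::string& src) {
  long long comm = 1;
  std::size_t i = 0;
  auto at = [&](const char* tok) { return src.compare(i, 2, tok) == 0; };
  while (i < src.size()) {
    if (at("//") || at(";;")) {
      if (comm == 1) {
        // a line comment at the top level skips the rest of the line
        while (i < src.size() && src[i] != '\n') {
          ++i;
        }
      } else {
        // inside a block comment "//" is ordinary text: keep scanning, "*/" may still follow
        ++i;
      }
    } else if (at("/*") || at("{-")) {
      i += 2;
      comm = comm * 2 + 1;  // one more nesting level, opened by a two-char token
    } else if (comm > 1 && (at("*/") || at("-}"))) {
      i += 2;
      comm >>= 1;           // pop one nesting level
    } else {
      ++i;                  // ordinary character (at comm == 1 the real lexer would start a lexem here)
    }
  }
  return comm == 1;         // comm > 1 at end of input == "comment extends past end of file"
}

int main() {
  assert(comments_balanced("int x; // line comment\nreturn x;"));
  assert(comments_balanced("{- outer /* inner */ still outer -} int y;"));
  assert(!comments_balanced("/* never closed"));
  assert(!comments_balanced("{- // a line comment does not close this"));
  std::puts("all checks passed");
  return 0;
}

This model also shows why a "//" inside "/* ... */" must not swallow the rest of the line: at comm > 1 the closing "*/" may still appear later on the same line, which is exactly the case exercised by the "// ends */1 +" line in comments.fc.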