mirror of
				https://github.com/ton-blockchain/ton
				synced 2025-03-09 15:40:10 +00:00 
			
		
		
		
	They work alongside Lisp-style ;; and {--}, without any #pragma.
Conceptually, a new syntax should be disabled by default
and activated using a special compiler option.
But now, we don't have an easy way to provide compiler options
in func-js, blueprint, etc.
Note that introducing a per-file #pragma is the wrong approach here,
since, if we want to report a human-readable error when '//' is used without the pragma,
the lexer must work differently anyway.
(this could be controlled by a launch option, but see above)
		
	
			
		
			
				
	
	
		
			348 lines
		
	
	
	
		
			9.4 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			348 lines
		
	
	
	
		
			9.4 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| /*
 | |
|     This file is part of TON Blockchain Library.
 | |
| 
 | |
|     TON Blockchain Library is free software: you can redistribute it and/or modify
 | |
|     it under the terms of the GNU Lesser General Public License as published by
 | |
|     the Free Software Foundation, either version 2 of the License, or
 | |
|     (at your option) any later version.
 | |
| 
 | |
|     TON Blockchain Library is distributed in the hope that it will be useful,
 | |
|     but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | |
|     GNU Lesser General Public License for more details.
 | |
| 
 | |
|     You should have received a copy of the GNU Lesser General Public License
 | |
|     along with TON Blockchain Library.  If not, see <http://www.gnu.org/licenses/>.
 | |
| 
 | |
|     Copyright 2017-2020 Telegram Systems LLP
 | |
| */
 | |
| #include "lexer.h"
 | |
| #include "symtable.h"
 | |
| #include <sstream>
 | |
| #include <cassert>
 | |
| 
 | |
| namespace src {
 | |
| 
 | |
| /*
 | |
|  *
 | |
|  *   LEXER
 | |
|  *
 | |
|  */
 | |
| 
 | |
| std::string Lexem::lexem_name_str(int idx) {
 | |
|   if (idx == Eof) {
 | |
|     return "end of file";
 | |
|   } else if (idx == Ident) {
 | |
|     return "identifier";
 | |
|   } else if (idx == Number) {
 | |
|     return "number";
 | |
|   } else if (idx == String) {
 | |
|     return "string";
 | |
|   } else if (idx == Special) {
 | |
|     return "special";
 | |
|   } else if (sym::symbols.get_keyword(idx)) {
 | |
|     return "`" + sym::symbols.get_keyword(idx)->str + "`";
 | |
|   } else {
 | |
|     std::ostringstream os{"<unknown lexem of type "};
 | |
|     os << idx << ">";
 | |
|     return os.str();
 | |
|   }
 | |
| }
 | |
| 
 | |
| std::string Lexem::name_str() const {
 | |
|   if (tp == Ident) {
 | |
|     return std::string{"identifier `"} + sym::symbols.get_name(val) + "`";
 | |
|   } else if (tp == String) {
 | |
|     return std::string{"string \""} + str + '"';
 | |
|   } else {
 | |
|     return lexem_name_str(tp);
 | |
|   }
 | |
| }
 | |
| 
 | |
// Tells whether `str` is an integer literal: an optional leading '-', then either
// one or more decimal digits, or "0x" followed by one or more hexadecimal digits
// (digits a-f are accepted in both cases; the "0x" prefix must be lower-case).
bool is_number(std::string str) {
  const std::size_t total = str.size();
  if (total == 0) {
    return false;
  }
  std::size_t pos = (str[0] == '-') ? 1 : 0;
  bool hex = false;
  if (pos + 1 < total && str[pos] == '0' && str[pos + 1] == 'x') {
    hex = true;
    pos += 2;
  }
  if (pos == total) {
    // nothing left after the sign and/or the "0x" prefix
    return false;
  }
  for (; pos < total; ++pos) {
    const int ch = str[pos];
    const bool is_decimal_digit = (ch >= '0' && ch <= '9');
    if (is_decimal_digit) {
      continue;
    }
    if (!hex) {
      return false;
    }
    // fold ASCII upper-case letters onto lower-case before the range check
    const int folded = ch | 0x20;
    if (folded < 'a' || folded > 'f') {
      return false;
    }
  }
  return true;
}
 | |
| 
 | |
| int Lexem::classify() {
 | |
|   if (tp != Unknown) {
 | |
|     return tp;
 | |
|   }
 | |
|   sym::sym_idx_t i = sym::symbols.lookup(str);
 | |
|   if (i) {
 | |
|     assert(str == sym::symbols[i]->str);
 | |
|     str = sym::symbols[i]->str;
 | |
|     sym::sym_idx_t idx = sym::symbols[i]->idx;
 | |
|     tp = (idx < 0 ? -idx : Ident);
 | |
|     val = i;
 | |
|   } else if (is_number(str)) {
 | |
|     tp = Number;
 | |
|   } else {
 | |
|     tp = lexem_is_special(str);
 | |
|   }
 | |
|   if (tp == Unknown) {
 | |
|     tp = Ident;
 | |
|     val = sym::symbols.lookup(str, 1);
 | |
|   }
 | |
|   return tp;
 | |
| }
 | |
| 
 | |
| int Lexem::set(std::string _str, const SrcLocation& _loc, int _tp, int _val) {
 | |
|   str = _str;
 | |
|   loc = _loc;
 | |
|   tp = _tp;
 | |
|   val = _val;
 | |
|   return classify();
 | |
| }
 | |
| 
 | |
| Lexer::Lexer(SourceReader& _src, std::string active_chars, std::string quote_chars, std::string multiline_quote)
 | |
|     : src(_src), eof(false), lexem("", src.here(), Lexem::Undefined), peek_lexem("", {}, Lexem::Undefined),
 | |
|       multiline_quote(std::move(multiline_quote)) {
 | |
|   std::memset(char_class, 0, sizeof(char_class));
 | |
|   unsigned char activity = cc::active;
 | |
|   for (char c : active_chars) {
 | |
|     if (c == ' ') {
 | |
|       if (!--activity) {
 | |
|         activity = cc::allow_repeat;
 | |
|       }
 | |
|     } else if ((unsigned)c < 0x80) {
 | |
|       char_class[(unsigned)c] |= activity;
 | |
|     }
 | |
|   }
 | |
|   for (int c : quote_chars) {
 | |
|     if (c > ' ' && c <= 0x7f) {
 | |
|       char_class[(unsigned)c] |= cc::quote_char;
 | |
|     }
 | |
|   }
 | |
| }
 | |
| 
 | |
| void Lexer::set_comment_tokens(const std::string &eol_cmts, const std::string &open_cmts, const std::string &close_cmts) {
 | |
|   set_spec(eol_cmt, eol_cmts);
 | |
|   set_spec(cmt_op, open_cmts);
 | |
|   set_spec(cmt_cl, close_cmts);
 | |
| }
 | |
| 
 | |
| void Lexer::set_comment2_tokens(const std::string &eol_cmts2, const std::string &open_cmts2, const std::string &close_cmts2) {
 | |
|   set_spec(eol_cmt2, eol_cmts2);
 | |
|   set_spec(cmt_op2, open_cmts2);
 | |
|   set_spec(cmt_cl2, close_cmts2);
 | |
| }
 | |
| 
 | |
| void Lexer::start_parsing() {
 | |
|   next();
 | |
| }
 | |
| 
 | |
| void Lexer::set_spec(std::array<int, 3>& arr, std::string setup) {
 | |
|   arr[0] = arr[1] = arr[2] = -0x100;
 | |
|   std::size_t n = setup.size(), i;
 | |
|   for (i = 0; i < n; i++) {
 | |
|     if (setup[i] == ' ') {
 | |
|       continue;
 | |
|     }
 | |
|     if (i == n - 1 || setup[i + 1] == ' ') {
 | |
|       arr[0] = setup[i];
 | |
|     } else if (i == n - 2 || (i < n - 2 && setup[i + 2] == ' ')) {
 | |
|       arr[1] = setup[i];
 | |
|       arr[2] = setup[++i];
 | |
|     } else {
 | |
|       while (i < n && setup[i] != ' ') {
 | |
|         i++;
 | |
|       }
 | |
|     }
 | |
|   }
 | |
| }
 | |
| 
 | |
| bool Lexer::is_multiline_quote(const char* begin, const char* end) {
 | |
|   if (multiline_quote.empty()) {
 | |
|     return false;
 | |
|   }
 | |
|   for (const char& c : multiline_quote) {
 | |
|     if (begin == end || *begin != c) {
 | |
|       return false;
 | |
|     }
 | |
|     ++begin;
 | |
|   }
 | |
|   return true;
 | |
| }
 | |
| 
 | |
| void Lexer::expect(int exp_tp, const char* msg) {
 | |
|   if (tp() != exp_tp) {
 | |
|     throw ParseError{lexem.loc, (msg ? std::string{msg} : Lexem::lexem_name_str(exp_tp)) + " expected instead of " +
 | |
|                                     cur().name_str()};
 | |
|   }
 | |
|   next();
 | |
| }
 | |
| 
 | |
| const Lexem& Lexer::next() {
 | |
|   if (peek_lexem.valid()) {
 | |
|     lexem = std::move(peek_lexem);
 | |
|     peek_lexem.clear({}, Lexem::Undefined);
 | |
|     eof = (lexem.tp == Lexem::Eof);
 | |
|     return lexem;
 | |
|   }
 | |
|   if (eof) {
 | |
|     return lexem.clear(src.here(), Lexem::Eof);
 | |
|   }
 | |
|   long long comm = 1;
 | |
|   while (!src.seek_eof()) {
 | |
|     int cc = src.cur_char(), nc = src.next_char();
 | |
|     // note, that in practice (both in FunC and tlbc), [0]-th element is -256, condition for [0]-th is always false
 | |
|     if (cc == eol_cmt[0] || (cc == eol_cmt[1] && nc == eol_cmt[2]) || cc == eol_cmt2[0] || (cc == eol_cmt2[1] && nc == eol_cmt2[2])) {
 | |
|       if (comm == 1) {    // just "//" — skip a whole line
 | |
|         src.load_line();
 | |
|       } else {            // if "//" is nested into "/*", continue reading, since "*/" may be met
 | |
|         src.advance(1);
 | |
|       }
 | |
|     } else if (cc == cmt_op[1] && nc == cmt_op[2] || cc == cmt_op2[1] && nc == cmt_op2[2]) {
 | |
|       src.advance(2);
 | |
|       comm = comm * 2 + 1;
 | |
|     } else if (cc == cmt_op[0] || cc == cmt_op2[0]) {  // always false
 | |
|       src.advance(1);
 | |
|       comm *= 2;
 | |
|     } else if (comm == 1) {
 | |
|       break; // means that we are not inside a comment
 | |
|     } else if (cc == cmt_cl[1] && nc == cmt_cl[2] || cc == cmt_cl2[1] && nc == cmt_cl2[2]) {
 | |
|       if (!(comm & 1)) { // always false
 | |
|         src.error(std::string{"a `"} + (char)cmt_op[0] + "` comment closed by `" + (char)cmt_cl[1] + (char)cmt_cl[2] +
 | |
|                   "`");
 | |
|       }
 | |
|       // note that in FunC, {- may be closed with */, but assume it's ok (we'll get rid of {- in the future)
 | |
|       comm >>= 1;
 | |
|       src.advance(2);
 | |
|     } else if (cc == cmt_cl[0] || cc == cmt_cl2[0]) { // always false
 | |
|       if (!(comm & 1)) {
 | |
|         src.error(std::string{"a `"} + (char)cmt_op[1] + (char)cmt_op[2] + "` comment closed by `" + (char)cmt_cl[0] +
 | |
|                   "`");
 | |
|       }
 | |
|       comm >>= 1;
 | |
|       src.advance(1);
 | |
|     } else {
 | |
|       src.advance(1);
 | |
|     }
 | |
|     if (comm < 0) {
 | |
|       src.error("too many nested comments");
 | |
|     }
 | |
|   }
 | |
|   if (src.seek_eof()) {
 | |
|     eof = true;
 | |
|     if (comm > 1) {
 | |
|       src.error("comment extends past end of file");
 | |
|     }
 | |
|     return lexem.clear(src.here(), Lexem::Eof);
 | |
|   }
 | |
|   if (is_multiline_quote(src.get_ptr(), src.get_end_ptr())) {
 | |
|     src.advance(multiline_quote.size());
 | |
|     const char* end = nullptr;
 | |
|     SrcLocation here = src.here();
 | |
|     std::string body;
 | |
|     while (!src.is_eof()) {
 | |
|       if (src.is_eoln()) {
 | |
|         body.push_back('\n');
 | |
|         src.load_line();
 | |
|         continue;
 | |
|       }
 | |
|       if (is_multiline_quote(src.get_ptr(), src.get_end_ptr())) {
 | |
|         end = src.get_ptr();
 | |
|         src.advance(multiline_quote.size());
 | |
|         break;
 | |
|       }
 | |
|       body.push_back(src.cur_char());
 | |
|       src.advance(1);
 | |
|     }
 | |
|     if (!end) {
 | |
|       src.error("string extends past end of file");
 | |
|     }
 | |
|     lexem.set(body, here, Lexem::String);
 | |
|     int c = src.cur_char();
 | |
|     if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
 | |
|       lexem.val = c;
 | |
|       src.advance(1);
 | |
|     }
 | |
|     return lexem;
 | |
|   }
 | |
|   int c = src.cur_char();
 | |
|   const char* end = src.get_ptr();
 | |
|   if (is_quote_char(c) || c == '`') {
 | |
|     int qc = c;
 | |
|     ++end;
 | |
|     while (end < src.get_end_ptr() && *end != qc) {
 | |
|       ++end;
 | |
|     }
 | |
|     if (*end != qc) {
 | |
|       src.error(qc == '`' ? "a `back-quoted` token extends past end of line" : "string extends past end of line");
 | |
|     }
 | |
|     lexem.set(std::string{src.get_ptr() + 1, end}, src.here(), qc == '`' ? Lexem::Unknown : Lexem::String);
 | |
|     src.set_ptr(end + 1);
 | |
|     c = src.cur_char();
 | |
|     if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
 | |
|       lexem.val = c;
 | |
|       src.set_ptr(end + 2);
 | |
|     }
 | |
|     // std::cerr << lexem.name_str() << ' ' << lexem.str << std::endl;
 | |
|     return lexem;
 | |
|   }
 | |
|   int len = 0, pc = -0x100;
 | |
|   while (end < src.get_end_ptr()) {
 | |
|     c = *end;
 | |
|     bool repeated = (c == pc && is_repeatable(c));
 | |
|     if (c == ' ' || c == 9 || (len && is_left_active(c) && !repeated)) {
 | |
|       break;
 | |
|     }
 | |
|     ++len;
 | |
|     ++end;
 | |
|     if (is_right_active(c) && !repeated) {
 | |
|       break;
 | |
|     }
 | |
|     pc = c;
 | |
|   }
 | |
|   lexem.set(std::string{src.get_ptr(), end}, src.here());
 | |
|   src.set_ptr(end);
 | |
|   // std::cerr << lexem.name_str() << ' ' << lexem.str << std::endl;
 | |
|   return lexem;
 | |
| }
 | |
| 
 | |
| const Lexem& Lexer::peek() {
 | |
|   if (peek_lexem.valid()) {
 | |
|     return peek_lexem;
 | |
|   }
 | |
|   if (eof) {
 | |
|     return lexem.clear(src.here(), Lexem::Eof);
 | |
|   }
 | |
|   Lexem keep = std::move(lexem);
 | |
|   next();
 | |
|   peek_lexem = std::move(lexem);
 | |
|   lexem = std::move(keep);
 | |
|   eof = false;
 | |
|   return peek_lexem;
 | |
| }
 | |
| 
 | |
| }  // namespace src
 |