mirror of
				https://github.com/ton-blockchain/ton
				synced 2025-03-09 15:40:10 +00:00 
			
		
		
		
	* Bugfixes in TVM and node * Upgrade to C++17 * Improve GitHub builds * Fix existing tests and partially integrate them into builds --------- Co-authored-by: neodiX42 <namlem@gmail.com> Co-authored-by: EmelyanenkoK <emelyanenko.kirill@gmail.com>
		
			
				
	
	
		
			337 lines
		
	
	
	
		
			8.6 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			337 lines
		
	
	
	
		
			8.6 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
/*
 | 
						|
    This file is part of TON Blockchain Library.
 | 
						|
 | 
						|
    TON Blockchain Library is free software: you can redistribute it and/or modify
 | 
						|
    it under the terms of the GNU Lesser General Public License as published by
 | 
						|
    the Free Software Foundation, either version 2 of the License, or
 | 
						|
    (at your option) any later version.
 | 
						|
 | 
						|
    TON Blockchain Library is distributed in the hope that it will be useful,
 | 
						|
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
						|
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
						|
    GNU Lesser General Public License for more details.
 | 
						|
 | 
						|
    You should have received a copy of the GNU Lesser General Public License
 | 
						|
    along with TON Blockchain Library.  If not, see <http://www.gnu.org/licenses/>.
 | 
						|
 | 
						|
    Copyright 2017-2020 Telegram Systems LLP
 | 
						|
*/
 | 
						|
#include "lexer.h"
 | 
						|
#include "symtable.h"
 | 
						|
#include <sstream>
 | 
						|
#include <cassert>
 | 
						|
 | 
						|
namespace src {
 | 
						|
 | 
						|
/*
 | 
						|
 *
 | 
						|
 *   LEXER
 | 
						|
 *
 | 
						|
 */
 | 
						|
 | 
						|
std::string Lexem::lexem_name_str(int idx) {
 | 
						|
  if (idx == Eof) {
 | 
						|
    return "end of file";
 | 
						|
  } else if (idx == Ident) {
 | 
						|
    return "identifier";
 | 
						|
  } else if (idx == Number) {
 | 
						|
    return "number";
 | 
						|
  } else if (idx == String) {
 | 
						|
    return "string";
 | 
						|
  } else if (idx == Special) {
 | 
						|
    return "special";
 | 
						|
  } else if (sym::symbols.get_keyword(idx)) {
 | 
						|
    return "`" + sym::symbols.get_keyword(idx)->str + "`";
 | 
						|
  } else {
 | 
						|
    std::ostringstream os{"<unknown lexem of type "};
 | 
						|
    os << idx << ">";
 | 
						|
    return os.str();
 | 
						|
  }
 | 
						|
}
 | 
						|
 | 
						|
std::string Lexem::name_str() const {
 | 
						|
  if (tp == Ident) {
 | 
						|
    return std::string{"identifier `"} + sym::symbols.get_name(val) + "`";
 | 
						|
  } else if (tp == String) {
 | 
						|
    return std::string{"string \""} + str + '"';
 | 
						|
  } else {
 | 
						|
    return lexem_name_str(tp);
 | 
						|
  }
 | 
						|
}
 | 
						|
 | 
						|
// Tells whether `str` is spelled like an integer literal:
//   an optional leading '-', then either one or more decimal digits,
//   or the lowercase prefix "0x" followed by one or more hex digits
//   (hex digits may be in either case).
// The empty string, a lone "-" and a bare "0x" are not numbers.
bool is_number(std::string str) {
  const std::size_t total = str.size();
  if (total == 0) {
    return false;
  }
  // Skip the optional sign.
  std::size_t pos = (str[0] == '-') ? 1 : 0;
  // Detect the "0x" prefix (only lowercase 'x' is recognised).
  const bool hex = (pos + 1 < total && str[pos] == '0' && str[pos + 1] == 'x');
  if (hex) {
    pos += 2;
  }
  // At least one digit must follow the sign / prefix.
  if (pos == total) {
    return false;
  }
  for (; pos < total; ++pos) {
    const int ch = str[pos];
    const bool decimal_digit = (ch >= '0' && ch <= '9');
    if (decimal_digit) {
      continue;
    }
    if (!hex) {
      return false;
    }
    // Setting bit 0x20 folds 'A'..'F' onto 'a'..'f'.
    const int folded = ch | 0x20;
    if (folded < 'a' || folded > 'f') {
      return false;
    }
  }
  return true;
}
 | 
						|
 | 
						|
int Lexem::classify() {
 | 
						|
  if (tp != Unknown) {
 | 
						|
    return tp;
 | 
						|
  }
 | 
						|
  sym::sym_idx_t i = sym::symbols.lookup(str);
 | 
						|
  if (i) {
 | 
						|
    assert(str == sym::symbols[i]->str);
 | 
						|
    str = sym::symbols[i]->str;
 | 
						|
    sym::sym_idx_t idx = sym::symbols[i]->idx;
 | 
						|
    tp = (idx < 0 ? -idx : Ident);
 | 
						|
    val = i;
 | 
						|
  } else if (is_number(str)) {
 | 
						|
    tp = Number;
 | 
						|
  } else {
 | 
						|
    tp = lexem_is_special(str);
 | 
						|
  }
 | 
						|
  if (tp == Unknown) {
 | 
						|
    tp = Ident;
 | 
						|
    val = sym::symbols.lookup(str, 1);
 | 
						|
  }
 | 
						|
  return tp;
 | 
						|
}
 | 
						|
 | 
						|
int Lexem::set(std::string _str, const SrcLocation& _loc, int _tp, int _val) {
 | 
						|
  str = _str;
 | 
						|
  loc = _loc;
 | 
						|
  tp = _tp;
 | 
						|
  val = _val;
 | 
						|
  return classify();
 | 
						|
}
 | 
						|
 | 
						|
// Builds a lexer reading from `_src`.
//
//  - active_chars: characters to mark in char_class. Spaces separate groups:
//    the first group is marked with cc::active, and every space decrements the
//    mark; when it reaches zero it restarts from cc::allow_repeat. Characters
//    outside the 7-bit range are ignored.
//  - eol_cmts / open_cmts / close_cmts: delimiter specifications handed to
//    set_spec() for end-of-line comments and block-comment openers/closers.
//  - quote_chars: characters above ' ' and up to 0x7f flagged cc::quote_char.
//  - multiline_quote: stored as-is in the member of the same name.
//  - init: when true, the first lexem is read immediately via next().
Lexer::Lexer(SourceReader& _src, bool init, std::string active_chars, std::string eol_cmts, std::string open_cmts,
             std::string close_cmts, std::string quote_chars, std::string multiline_quote)
    : src(_src), eof(false), lexem("", src.here(), Lexem::Undefined), peek_lexem("", {}, Lexem::Undefined),
      multiline_quote(std::move(multiline_quote)) {
  std::memset(char_class, 0, sizeof(char_class));

  unsigned char level = cc::active;
  for (std::size_t k = 0; k < active_chars.size(); ++k) {
    const char ch = active_chars[k];
    if (ch != ' ') {
      // A negative (non-ASCII) char converts to a huge unsigned value and is skipped.
      const unsigned code = static_cast<unsigned>(ch);
      if (code < 0x80) {
        char_class[code] |= level;
      }
      continue;
    }
    // A space switches to the next group of characters.
    --level;
    if (level == 0) {
      level = cc::allow_repeat;
    }
  }

  set_spec(eol_cmt, eol_cmts);
  set_spec(cmt_op, open_cmts);
  set_spec(cmt_cl, close_cmts);

  for (const char q : quote_chars) {
    const int code = q;
    if (code <= ' ' || code > 0x7f) {
      continue;
    }
    char_class[static_cast<unsigned>(code)] |= cc::quote_char;
  }

  if (init) {
    next();
  }
}
 | 
						|
 | 
						|
// Parses a space-separated delimiter specification into `arr`.
//
// All three slots start as -0x100 (a value no character compares equal to).
// A one-character token is stored in arr[0]; a two-character token is stored
// in arr[1] and arr[2]; longer tokens are ignored. When several tokens of the
// same length are present, the last one wins.
void Lexer::set_spec(std::array<int, 3>& arr, std::string setup) {
  arr.fill(-0x100);
  const std::size_t total = setup.size();
  std::size_t pos = 0;
  while (pos < total) {
    if (setup[pos] == ' ') {
      ++pos;
      continue;
    }
    // Locate the end of the token that starts at `pos`.
    std::size_t stop = pos;
    while (stop < total && setup[stop] != ' ') {
      ++stop;
    }
    const std::size_t length = stop - pos;
    if (length == 1) {
      arr[0] = setup[pos];
    } else if (length == 2) {
      arr[1] = setup[pos];
      arr[2] = setup[pos + 1];
    }
    pos = stop;
  }
}
 | 
						|
 | 
						|
bool Lexer::is_multiline_quote(const char* begin, const char* end) {
 | 
						|
  if (multiline_quote.empty()) {
 | 
						|
    return false;
 | 
						|
  }
 | 
						|
  for (const char& c : multiline_quote) {
 | 
						|
    if (begin == end || *begin != c) {
 | 
						|
      return false;
 | 
						|
    }
 | 
						|
    ++begin;
 | 
						|
  }
 | 
						|
  return true;
 | 
						|
}
 | 
						|
 | 
						|
void Lexer::expect(int exp_tp, const char* msg) {
 | 
						|
  if (tp() != exp_tp) {
 | 
						|
    throw ParseError{lexem.loc, (msg ? std::string{msg} : Lexem::lexem_name_str(exp_tp)) + " expected instead of " +
 | 
						|
                                    cur().name_str()};
 | 
						|
  }
 | 
						|
  next();
 | 
						|
}
 | 
						|
 | 
						|
// Advances to the next lexem and returns a reference to it.
//
// Steps, in order:
//   1. A lexem buffered by peek() becomes the current lexem.
//   2. If end of input was already reached, an Eof lexem is returned.
//   3. End-of-line and (nested) block comments are skipped.
//   4. One of the following is scanned: a multi-line quoted string, a quoted
//      string or back-quoted token, or a plain token delimited by blanks and
//      "active" characters.
// Malformed input (mismatched or unterminated comments, unterminated strings)
// is reported through src.error().
const Lexem& Lexer::next() {
  if (peek_lexem.valid()) {
    lexem = std::move(peek_lexem);
    peek_lexem.clear({}, Lexem::Undefined);
    eof = (lexem.tp == Lexem::Eof);
    return lexem;
  }
  if (eof) {
    return lexem.clear(src.here(), Lexem::Eof);
  }
  // `comm` is a bit stack of the currently open block comments, with a
  // sentinel 1 on top: comm == 1 means "not inside a comment". Opening a
  // comment shifts left and pushes 1 for the two-character opener or 0 for
  // the single-character opener; closing pops the lowest bit.
  long long comm = 1;
  while (!src.seek_eof()) {
    int cc = src.cur_char(), nc = src.next_char();
    if (cc == eol_cmt[0] || (cc == eol_cmt[1] && nc == eol_cmt[2])) {
      // End-of-line comment: drop the rest of the line.
      src.load_line();
    } else if (cc == cmt_op[1] && nc == cmt_op[2]) {
      src.advance(2);
      comm = comm * 2 + 1;
    } else if (cc == cmt_op[0]) {
      src.advance(1);
      comm *= 2;
    } else if (comm == 1) {
      // Not inside a comment and not at a comment start: a token begins here.
      break;
    } else if (cc == cmt_cl[1] && nc == cmt_cl[2]) {
      // Two-character closer: the innermost comment must have been opened by
      // the two-character opener (low bit set).
      if (!(comm & 1)) {
        src.error(std::string{"a `"} + static_cast<char>(cmt_op[0]) + "` comment closed by `" +
                  static_cast<char>(cmt_cl[1]) + static_cast<char>(cmt_cl[2]) + "`");
      }
      comm >>= 1;
      src.advance(2);
    } else if (cc == cmt_cl[0]) {
      // Single-character closer: it is a mismatch when the innermost comment
      // was opened by the two-character opener, i.e. when the low bit IS set.
      // (The previous condition was inverted: it rejected correctly matched
      // single-character comments and accepted mismatched ones.)
      if (comm & 1) {
        src.error(std::string{"a `"} + static_cast<char>(cmt_op[1]) + static_cast<char>(cmt_op[2]) +
                  "` comment closed by `" + static_cast<char>(cmt_cl[0]) + "`");
      }
      comm >>= 1;
      src.advance(1);
    } else {
      // Ordinary character inside a comment.
      src.advance(1);
    }
    if (comm < 0) {
      src.error("too many nested comments");
    }
  }
  if (src.seek_eof()) {
    eof = true;
    if (comm > 1) {
      // Still inside a comment: report which kind of opener is unterminated.
      if (comm & 1) {
        src.error(std::string{"`"} + static_cast<char>(cmt_op[1]) + static_cast<char>(cmt_op[2]) +
                  "` comment extends past end of file");
      } else {
        src.error(std::string{"`"} + static_cast<char>(cmt_op[0]) + "` comment extends past end of file");
      }
    }
    return lexem.clear(src.here(), Lexem::Eof);
  }
  if (is_multiline_quote(src.get_ptr(), src.get_end_ptr())) {
    // Multi-line string: collect everything up to the closing quote sequence,
    // inserting '\n' at every line boundary crossed.
    src.advance(multiline_quote.size());
    const char* end = nullptr;  // non-null once the closing sequence was found
    SrcLocation here = src.here();
    std::string body;
    while (!src.is_eof()) {
      if (src.is_eoln()) {
        body.push_back('\n');
        src.load_line();
        continue;
      }
      if (is_multiline_quote(src.get_ptr(), src.get_end_ptr())) {
        end = src.get_ptr();
        src.advance(multiline_quote.size());
        break;
      }
      body.push_back(src.cur_char());
      src.advance(1);
    }
    if (!end) {
      src.error("string extends past end of file");
    }
    lexem.set(body, here, Lexem::String);
    // A single ASCII letter directly after the closing quote is consumed and
    // stored in `val`.
    int c = src.cur_char();
    if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
      lexem.val = c;
      src.advance(1);
    }
    return lexem;
  }
  int c = src.cur_char();
  const char* end = src.get_ptr();
  if (is_quote_char(c) || c == '`') {
    // Single-line quoted string or back-quoted token: scan to the matching
    // quote character within the current buffer.
    int qc = c;
    ++end;
    while (end < src.get_end_ptr() && *end != qc) {
      ++end;
    }
    // NOTE(review): this reads *end even when end == src.get_end_ptr();
    // presumably the buffer is terminated there -- confirm against SourceReader.
    if (*end != qc) {
      src.error(qc == '`' ? "a `back-quoted` token extends past end of line" : "string extends past end of line");
    }
    // Back-quoted text is left Unknown so that Lexem::set() classifies it.
    lexem.set(std::string{src.get_ptr() + 1, end}, src.here(), qc == '`' ? Lexem::Unknown : Lexem::String);
    src.set_ptr(end + 1);
    // A single ASCII letter directly after the closing quote is consumed and
    // stored in `val`.
    c = src.cur_char();
    if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
      lexem.val = c;
      src.set_ptr(end + 2);
    }
    // std::cerr << lexem.name_str() << ' ' << lexem.str << std::endl;
    return lexem;
  }
  // Plain token: extend until a blank/tab, until a left-active character that
  // is not the first character of the token, or just after a right-active
  // character. A repeatable character equal to its predecessor does not split.
  int len = 0, pc = -0x100;
  while (end < src.get_end_ptr()) {
    c = *end;
    bool repeated = (c == pc && is_repeatable(c));
    if (c == ' ' || c == 9 || (len && is_left_active(c) && !repeated)) {
      break;
    }
    ++len;
    ++end;
    if (is_right_active(c) && !repeated) {
      break;
    }
    pc = c;
  }
  lexem.set(std::string{src.get_ptr(), end}, src.here());
  src.set_ptr(end);
  // std::cerr << lexem.name_str() << ' ' << lexem.str << std::endl;
  return lexem;
}
 | 
						|
 | 
						|
// Returns the lexem that follows the current one without consuming it.
//
// The look-ahead is cached in peek_lexem, so repeated calls are cheap and the
// following next() simply promotes the cached lexem. Once end of input has
// been reached, the current lexem is reset to Eof and returned instead.
const Lexem& Lexer::peek() {
  if (!peek_lexem.valid()) {
    if (eof) {
      return lexem.clear(src.here(), Lexem::Eof);
    }
    // Read one lexem ahead via next(), then put the current lexem back.
    Lexem saved = std::move(lexem);
    next();
    peek_lexem = std::move(lexem);
    lexem = std::move(saved);
    // next() may have raised the flag for the look-ahead lexem; the current
    // lexem is not at end of input yet.
    eof = false;
  }
  return peek_lexem;
}
 | 
						|
 | 
						|
}  // namespace src
 |