1
0
Fork 0
mirror of https://github.com/ton-blockchain/ton synced 2025-02-15 04:32:21 +00:00
ton/tolk/lexer.cpp
tolk-vm ebbab54cda
[Tolk] Tolk v0.5.0 as FunC v0.5.0 could have been like
All changes from PR "FunC v0.5.0":
https://github.com/ton-blockchain/ton/pull/1026

Instead of developing FunC, we decided to fork it.
BTW, the first Tolk release will be v0.6,
a metaphor of FunC v0.5 that missed a chance to occur.
2024-11-02 01:33:08 +04:00

350 lines
9.6 KiB
C++

/*
This file is part of TON Blockchain Library.
TON Blockchain Library is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 2 of the License, or
(at your option) any later version.
TON Blockchain Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with TON Blockchain Library. If not, see <http://www.gnu.org/licenses/>.
*/
#include "lexer.h"
#include "symtable.h"
#include <sstream>
#include <cassert>
namespace tolk {
/*
*
* LEXER
*
*/
std::string Lexem::lexem_name_str(int idx) {
if (idx == Eof) {
return "end of file";
} else if (idx == Ident) {
return "identifier";
} else if (idx == Number) {
return "number";
} else if (idx == String) {
return "string";
} else if (idx == Special) {
return "special";
} else if (symbols.get_keyword(idx)) {
return "`" + symbols.get_keyword(idx)->str + "`";
} else {
std::ostringstream os{"<unknown lexem of type "};
os << idx << ">";
return os.str();
}
}
std::string Lexem::name_str() const {
if (tp == Ident) {
return std::string{"identifier `"} + symbols.get_name(val) + "`";
} else if (tp == String) {
return std::string{"string \""} + str + '"';
} else {
return lexem_name_str(tp);
}
}
bool is_number(std::string str) {
auto st = str.begin(), en = str.end();
if (st == en) {
return false;
}
if (*st == '-') {
st++;
}
bool hex = false;
if (st + 1 < en && *st == '0' && st[1] == 'x') {
st += 2;
hex = true;
}
if (st == en) {
return false;
}
while (st < en) {
int c = *st;
if (c >= '0' && c <= '9') {
++st;
continue;
}
if (!hex) {
return false;
}
c |= 0x20;
if (c < 'a' || c > 'f') {
return false;
}
++st;
}
return true;
}
int Lexem::classify() {
if (tp != Unknown) {
return tp;
}
sym_idx_t i = symbols.lookup(str);
if (i) {
assert(str == symbols[i]->str);
str = symbols[i]->str;
sym_idx_t idx = symbols[i]->idx;
tp = (idx < 0 ? -idx : Ident);
val = i;
} else if (is_number(str)) {
tp = Number;
} else {
tp = 0;
}
if (tp == Unknown) {
tp = Ident;
val = symbols.lookup(str, 1);
}
return tp;
}
int Lexem::set(std::string _str, const SrcLocation& _loc, int _tp, int _val) {
str = _str;
loc = _loc;
tp = _tp;
val = _val;
return classify();
}
Lexer::Lexer(SourceReader& _src, std::string active_chars, std::string quote_chars, std::string multiline_quote)
: src(_src), eof(false), lexem("", src.here(), Lexem::Undefined), peek_lexem("", {}, Lexem::Undefined),
multiline_quote(std::move(multiline_quote)) {
std::memset(char_class, 0, sizeof(char_class));
unsigned char activity = cc::active;
for (char c : active_chars) {
if (c == ' ') {
if (!--activity) {
activity = cc::allow_repeat;
}
} else if ((unsigned)c < 0x80) {
char_class[(unsigned)c] |= activity;
}
}
for (int c : quote_chars) {
if (c > ' ' && c <= 0x7f) {
char_class[(unsigned)c] |= cc::quote_char;
}
}
}
void Lexer::set_comment_tokens(const std::string &eol_cmts, const std::string &open_cmts, const std::string &close_cmts) {
set_spec(eol_cmt, eol_cmts);
set_spec(cmt_op, open_cmts);
set_spec(cmt_cl, close_cmts);
}
void Lexer::set_comment2_tokens(const std::string &eol_cmts2, const std::string &open_cmts2, const std::string &close_cmts2) {
set_spec(eol_cmt2, eol_cmts2);
set_spec(cmt_op2, open_cmts2);
set_spec(cmt_cl2, close_cmts2);
}
void Lexer::start_parsing() {
next();
}
void Lexer::set_spec(std::array<int, 3>& arr, std::string setup) {
arr[0] = arr[1] = arr[2] = -0x100;
std::size_t n = setup.size(), i;
for (i = 0; i < n; i++) {
if (setup[i] == ' ') {
continue;
}
if (i == n - 1 || setup[i + 1] == ' ') {
arr[0] = setup[i];
} else if (i == n - 2 || (i < n - 2 && setup[i + 2] == ' ')) {
arr[1] = setup[i];
arr[2] = setup[++i];
} else {
while (i < n && setup[i] != ' ') {
i++;
}
}
}
}
bool Lexer::is_multiline_quote(const char* begin, const char* end) {
if (multiline_quote.empty()) {
return false;
}
for (const char& c : multiline_quote) {
if (begin == end || *begin != c) {
return false;
}
++begin;
}
return true;
}
void Lexer::expect(int exp_tp, const char* msg) {
if (tp() != exp_tp) {
throw ParseError{lexem.loc, (msg ? std::string{msg} : Lexem::lexem_name_str(exp_tp)) + " expected instead of " +
cur().name_str()};
}
next();
}
const Lexem& Lexer::next() {
if (peek_lexem.valid()) {
lexem = std::move(peek_lexem);
peek_lexem.clear({}, Lexem::Undefined);
eof = (lexem.tp == Lexem::Eof);
return lexem;
}
if (eof) {
return lexem.clear(src.here(), Lexem::Eof);
}
long long comm = 1;
// the code below is very complicated, because it tried to support one-symbol start/end and nesting
// in Tolk, we decided to stop supporting nesting (it was never used in practice and almost impossible for js highlighters)
// later on I'll simplify this code (more precisely, rewrite lexer from scratch)
while (!src.seek_eof()) {
int cc = src.cur_char(), nc = src.next_char();
// note, that in practice, [0]-th element is -256, condition for [0]-th is always false
// todo rewrite this all in the future
if (cc == eol_cmt[0] || (cc == eol_cmt[1] && nc == eol_cmt[2]) || cc == eol_cmt2[0] || (cc == eol_cmt2[1] && nc == eol_cmt2[2])) {
if (comm == 1) { // just "//" — skip a whole line
src.load_line();
} else { // if "//" is nested into "/*", continue reading, since "*/" may be met
src.advance(1);
}
} else if (cc == cmt_op[1] && nc == cmt_op[2] || cc == cmt_op2[1] && nc == cmt_op2[2]) {
src.advance(2);
comm = comm * 2 + 1;
} else if (cc == cmt_op[0] || cc == cmt_op2[0]) { // always false
src.advance(1);
comm *= 2;
} else if (comm == 1) {
break; // means that we are not inside a comment
} else if (cc == cmt_cl[1] && nc == cmt_cl[2] || cc == cmt_cl2[1] && nc == cmt_cl2[2]) {
if (!(comm & 1)) { // always false
src.error(std::string{"a `"} + (char)cmt_op[0] + "` comment closed by `" + (char)cmt_cl[1] + (char)cmt_cl[2] +
"`");
}
// note that {- may be closed with */, but assume it's ok (we'll get rid of {- in the future)
comm = 1;
src.advance(2);
} else if (cc == cmt_cl[0] || cc == cmt_cl2[0]) { // always false
if (!(comm & 1)) {
src.error(std::string{"a `"} + (char)cmt_op[1] + (char)cmt_op[2] + "` comment closed by `" + (char)cmt_cl[0] +
"`");
}
comm = 1;
src.advance(1);
} else {
src.advance(1);
}
if (comm < 0) {
src.error("too many nested comments");
}
}
if (src.seek_eof()) {
eof = true;
if (comm > 1) {
src.error("comment extends past end of file");
}
return lexem.clear(src.here(), Lexem::Eof);
}
if (is_multiline_quote(src.get_ptr(), src.get_end_ptr())) {
src.advance(multiline_quote.size());
const char* end = nullptr;
SrcLocation here = src.here();
std::string body;
while (!src.is_eof()) {
if (src.is_eoln()) {
body.push_back('\n');
src.load_line();
continue;
}
if (is_multiline_quote(src.get_ptr(), src.get_end_ptr())) {
end = src.get_ptr();
src.advance(multiline_quote.size());
break;
}
body.push_back(src.cur_char());
src.advance(1);
}
if (!end) {
src.error("string extends past end of file");
}
lexem.set(body, here, Lexem::String);
int c = src.cur_char();
if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
lexem.val = c;
src.advance(1);
}
return lexem;
}
int c = src.cur_char();
const char* end = src.get_ptr();
if (is_quote_char(c) || c == '`') {
int qc = c;
++end;
while (end < src.get_end_ptr() && *end != qc) {
++end;
}
if (*end != qc) {
src.error(qc == '`' ? "a `back-quoted` token extends past end of line" : "string extends past end of line");
}
lexem.set(std::string{src.get_ptr() + 1, end}, src.here(), qc == '`' ? Lexem::Unknown : Lexem::String);
src.set_ptr(end + 1);
c = src.cur_char();
if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
lexem.val = c;
src.set_ptr(end + 2);
}
// std::cerr << lexem.name_str() << ' ' << lexem.str << std::endl;
return lexem;
}
int len = 0, pc = -0x100;
while (end < src.get_end_ptr()) {
c = *end;
bool repeated = (c == pc && is_repeatable(c));
if (c == ' ' || c == 9 || (len && is_left_active(c) && !repeated)) {
break;
}
++len;
++end;
if (is_right_active(c) && !repeated) {
break;
}
pc = c;
}
lexem.set(std::string{src.get_ptr(), end}, src.here());
src.set_ptr(end);
// std::cerr << lexem.name_str() << ' ' << lexem.str << std::endl;
return lexem;
}
const Lexem& Lexer::peek() {
if (peek_lexem.valid()) {
return peek_lexem;
}
if (eof) {
return lexem.clear(src.here(), Lexem::Eof);
}
Lexem keep = std::move(lexem);
next();
peek_lexem = std::move(lexem);
lexem = std::move(keep);
eof = false;
return peek_lexem;
}
} // namespace tolk