mirror of
https://github.com/ton-blockchain/ton
synced 2025-03-09 15:40:10 +00:00
[Tolk] Initial commit of TOLK Language: fork all sources from FunC
The Tolk Language will be positioned as "next-generation FunC". It's literally a fork of a FunC compiler, introducing familiar syntax similar to TypeScript, but leaving all low-level optimizations untouched. Note, that FunC sources are partially stored in the parser/ folder (shared with TL/B). In Tolk, nothing is shared. Everything from parser/ is copied into tolk/ folder.
This commit is contained in:
parent
eed3153ace
commit
82648ebd6a
43 changed files with 13674 additions and 18 deletions
335
tolk/lexer.cpp
Normal file
335
tolk/lexer.cpp
Normal file
|
@ -0,0 +1,335 @@
|
|||
/*
|
||||
This file is part of TON Blockchain Library.
|
||||
|
||||
TON Blockchain Library is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Lesser General Public License as published by
|
||||
the Free Software Foundation, either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
TON Blockchain Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with TON Blockchain Library. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
#include "lexer.h"
|
||||
#include "symtable.h"
|
||||
#include <sstream>
|
||||
#include <cassert>
|
||||
|
||||
namespace tolk {
|
||||
|
||||
/*
|
||||
*
|
||||
* LEXER
|
||||
*
|
||||
*/
|
||||
|
||||
std::string Lexem::lexem_name_str(int idx) {
|
||||
if (idx == Eof) {
|
||||
return "end of file";
|
||||
} else if (idx == Ident) {
|
||||
return "identifier";
|
||||
} else if (idx == Number) {
|
||||
return "number";
|
||||
} else if (idx == String) {
|
||||
return "string";
|
||||
} else if (idx == Special) {
|
||||
return "special";
|
||||
} else if (symbols.get_keyword(idx)) {
|
||||
return "`" + symbols.get_keyword(idx)->str + "`";
|
||||
} else {
|
||||
std::ostringstream os{"<unknown lexem of type "};
|
||||
os << idx << ">";
|
||||
return os.str();
|
||||
}
|
||||
}
|
||||
|
||||
std::string Lexem::name_str() const {
|
||||
if (tp == Ident) {
|
||||
return std::string{"identifier `"} + symbols.get_name(val) + "`";
|
||||
} else if (tp == String) {
|
||||
return std::string{"string \""} + str + '"';
|
||||
} else {
|
||||
return lexem_name_str(tp);
|
||||
}
|
||||
}
|
||||
|
||||
bool is_number(std::string str) {
|
||||
auto st = str.begin(), en = str.end();
|
||||
if (st == en) {
|
||||
return false;
|
||||
}
|
||||
if (*st == '-') {
|
||||
st++;
|
||||
}
|
||||
bool hex = false;
|
||||
if (st + 1 < en && *st == '0' && st[1] == 'x') {
|
||||
st += 2;
|
||||
hex = true;
|
||||
}
|
||||
if (st == en) {
|
||||
return false;
|
||||
}
|
||||
while (st < en) {
|
||||
int c = *st;
|
||||
if (c >= '0' && c <= '9') {
|
||||
++st;
|
||||
continue;
|
||||
}
|
||||
if (!hex) {
|
||||
return false;
|
||||
}
|
||||
c |= 0x20;
|
||||
if (c < 'a' || c > 'f') {
|
||||
return false;
|
||||
}
|
||||
++st;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
int Lexem::classify() {
|
||||
if (tp != Unknown) {
|
||||
return tp;
|
||||
}
|
||||
sym_idx_t i = symbols.lookup(str);
|
||||
if (i) {
|
||||
assert(str == symbols[i]->str);
|
||||
str = symbols[i]->str;
|
||||
sym_idx_t idx = symbols[i]->idx;
|
||||
tp = (idx < 0 ? -idx : Ident);
|
||||
val = i;
|
||||
} else if (is_number(str)) {
|
||||
tp = Number;
|
||||
} else {
|
||||
tp = 0;
|
||||
}
|
||||
if (tp == Unknown) {
|
||||
tp = Ident;
|
||||
val = symbols.lookup(str, 1);
|
||||
}
|
||||
return tp;
|
||||
}
|
||||
|
||||
int Lexem::set(std::string _str, const SrcLocation& _loc, int _tp, int _val) {
|
||||
str = _str;
|
||||
loc = _loc;
|
||||
tp = _tp;
|
||||
val = _val;
|
||||
return classify();
|
||||
}
|
||||
|
||||
Lexer::Lexer(SourceReader& _src, bool init, std::string active_chars, std::string eol_cmts, std::string open_cmts,
|
||||
std::string close_cmts, std::string quote_chars, std::string multiline_quote)
|
||||
: src(_src), eof(false), lexem("", src.here(), Lexem::Undefined), peek_lexem("", {}, Lexem::Undefined),
|
||||
multiline_quote(std::move(multiline_quote)) {
|
||||
std::memset(char_class, 0, sizeof(char_class));
|
||||
unsigned char activity = cc::active;
|
||||
for (char c : active_chars) {
|
||||
if (c == ' ') {
|
||||
if (!--activity) {
|
||||
activity = cc::allow_repeat;
|
||||
}
|
||||
} else if ((unsigned)c < 0x80) {
|
||||
char_class[(unsigned)c] |= activity;
|
||||
}
|
||||
}
|
||||
set_spec(eol_cmt, eol_cmts);
|
||||
set_spec(cmt_op, open_cmts);
|
||||
set_spec(cmt_cl, close_cmts);
|
||||
for (int c : quote_chars) {
|
||||
if (c > ' ' && c <= 0x7f) {
|
||||
char_class[(unsigned)c] |= cc::quote_char;
|
||||
}
|
||||
}
|
||||
if (init) {
|
||||
next();
|
||||
}
|
||||
}
|
||||
|
||||
void Lexer::set_spec(std::array<int, 3>& arr, std::string setup) {
|
||||
arr[0] = arr[1] = arr[2] = -0x100;
|
||||
std::size_t n = setup.size(), i;
|
||||
for (i = 0; i < n; i++) {
|
||||
if (setup[i] == ' ') {
|
||||
continue;
|
||||
}
|
||||
if (i == n - 1 || setup[i + 1] == ' ') {
|
||||
arr[0] = setup[i];
|
||||
} else if (i == n - 2 || (i < n - 2 && setup[i + 2] == ' ')) {
|
||||
arr[1] = setup[i];
|
||||
arr[2] = setup[++i];
|
||||
} else {
|
||||
while (i < n && setup[i] != ' ') {
|
||||
i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool Lexer::is_multiline_quote(const char* begin, const char* end) {
|
||||
if (multiline_quote.empty()) {
|
||||
return false;
|
||||
}
|
||||
for (const char& c : multiline_quote) {
|
||||
if (begin == end || *begin != c) {
|
||||
return false;
|
||||
}
|
||||
++begin;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void Lexer::expect(int exp_tp, const char* msg) {
|
||||
if (tp() != exp_tp) {
|
||||
throw ParseError{lexem.loc, (msg ? std::string{msg} : Lexem::lexem_name_str(exp_tp)) + " expected instead of " +
|
||||
cur().name_str()};
|
||||
}
|
||||
next();
|
||||
}
|
||||
|
||||
const Lexem& Lexer::next() {
|
||||
if (peek_lexem.valid()) {
|
||||
lexem = std::move(peek_lexem);
|
||||
peek_lexem.clear({}, Lexem::Undefined);
|
||||
eof = (lexem.tp == Lexem::Eof);
|
||||
return lexem;
|
||||
}
|
||||
if (eof) {
|
||||
return lexem.clear(src.here(), Lexem::Eof);
|
||||
}
|
||||
long long comm = 1;
|
||||
while (!src.seek_eof()) {
|
||||
int cc = src.cur_char(), nc = src.next_char();
|
||||
if (cc == eol_cmt[0] || (cc == eol_cmt[1] && nc == eol_cmt[2])) {
|
||||
src.load_line();
|
||||
} else if (cc == cmt_op[1] && nc == cmt_op[2]) {
|
||||
src.advance(2);
|
||||
comm = comm * 2 + 1;
|
||||
} else if (cc == cmt_op[0]) {
|
||||
src.advance(1);
|
||||
comm *= 2;
|
||||
} else if (comm == 1) {
|
||||
break;
|
||||
} else if (cc == cmt_cl[1] && nc == cmt_cl[2]) {
|
||||
if (!(comm & 1)) {
|
||||
src.error(std::string{"a `"} + (char)cmt_op[0] + "` comment closed by `" + (char)cmt_cl[1] + (char)cmt_cl[2] +
|
||||
"`");
|
||||
}
|
||||
comm >>= 1;
|
||||
src.advance(2);
|
||||
} else if (cc == cmt_cl[0]) {
|
||||
if (!(comm & 1)) {
|
||||
src.error(std::string{"a `"} + (char)cmt_op[1] + (char)cmt_op[2] + "` comment closed by `" + (char)cmt_cl[0] +
|
||||
"`");
|
||||
}
|
||||
comm >>= 1;
|
||||
src.advance(1);
|
||||
} else {
|
||||
src.advance(1);
|
||||
}
|
||||
if (comm < 0) {
|
||||
src.error("too many nested comments");
|
||||
}
|
||||
}
|
||||
if (src.seek_eof()) {
|
||||
eof = true;
|
||||
if (comm > 1) {
|
||||
if (comm & 1) {
|
||||
src.error(std::string{"`"} + (char)cmt_op[1] + (char)cmt_op[2] + "` comment extends past end of file");
|
||||
} else {
|
||||
src.error(std::string{"`"} + (char)cmt_op[0] + "` comment extends past end of file");
|
||||
}
|
||||
}
|
||||
return lexem.clear(src.here(), Lexem::Eof);
|
||||
}
|
||||
if (is_multiline_quote(src.get_ptr(), src.get_end_ptr())) {
|
||||
src.advance(multiline_quote.size());
|
||||
const char* end = nullptr;
|
||||
SrcLocation here = src.here();
|
||||
std::string body;
|
||||
while (!src.is_eof()) {
|
||||
if (src.is_eoln()) {
|
||||
body.push_back('\n');
|
||||
src.load_line();
|
||||
continue;
|
||||
}
|
||||
if (is_multiline_quote(src.get_ptr(), src.get_end_ptr())) {
|
||||
end = src.get_ptr();
|
||||
src.advance(multiline_quote.size());
|
||||
break;
|
||||
}
|
||||
body.push_back(src.cur_char());
|
||||
src.advance(1);
|
||||
}
|
||||
if (!end) {
|
||||
src.error("string extends past end of file");
|
||||
}
|
||||
lexem.set(body, here, Lexem::String);
|
||||
int c = src.cur_char();
|
||||
if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
|
||||
lexem.val = c;
|
||||
src.advance(1);
|
||||
}
|
||||
return lexem;
|
||||
}
|
||||
int c = src.cur_char();
|
||||
const char* end = src.get_ptr();
|
||||
if (is_quote_char(c) || c == '`') {
|
||||
int qc = c;
|
||||
++end;
|
||||
while (end < src.get_end_ptr() && *end != qc) {
|
||||
++end;
|
||||
}
|
||||
if (*end != qc) {
|
||||
src.error(qc == '`' ? "a `back-quoted` token extends past end of line" : "string extends past end of line");
|
||||
}
|
||||
lexem.set(std::string{src.get_ptr() + 1, end}, src.here(), qc == '`' ? Lexem::Unknown : Lexem::String);
|
||||
src.set_ptr(end + 1);
|
||||
c = src.cur_char();
|
||||
if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
|
||||
lexem.val = c;
|
||||
src.set_ptr(end + 2);
|
||||
}
|
||||
// std::cerr << lexem.name_str() << ' ' << lexem.str << std::endl;
|
||||
return lexem;
|
||||
}
|
||||
int len = 0, pc = -0x100;
|
||||
while (end < src.get_end_ptr()) {
|
||||
c = *end;
|
||||
bool repeated = (c == pc && is_repeatable(c));
|
||||
if (c == ' ' || c == 9 || (len && is_left_active(c) && !repeated)) {
|
||||
break;
|
||||
}
|
||||
++len;
|
||||
++end;
|
||||
if (is_right_active(c) && !repeated) {
|
||||
break;
|
||||
}
|
||||
pc = c;
|
||||
}
|
||||
lexem.set(std::string{src.get_ptr(), end}, src.here());
|
||||
src.set_ptr(end);
|
||||
// std::cerr << lexem.name_str() << ' ' << lexem.str << std::endl;
|
||||
return lexem;
|
||||
}
|
||||
|
||||
const Lexem& Lexer::peek() {
|
||||
if (peek_lexem.valid()) {
|
||||
return peek_lexem;
|
||||
}
|
||||
if (eof) {
|
||||
return lexem.clear(src.here(), Lexem::Eof);
|
||||
}
|
||||
Lexem keep = std::move(lexem);
|
||||
next();
|
||||
peek_lexem = std::move(lexem);
|
||||
lexem = std::move(keep);
|
||||
eof = false;
|
||||
return peek_lexem;
|
||||
}
|
||||
|
||||
} // namespace tolk
|
Loading…
Add table
Add a link
Reference in a new issue