/*
This file is part of TON Blockchain Library.
TON Blockchain Library is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 2 of the License, or
(at your option) any later version.
TON Blockchain Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with TON Blockchain Library. If not, see <http://www.gnu.org/licenses/>.
Copyright 2017-2020 Telegram Systems LLP
*/
#include "lexer.h"
#include "symtable.h"
#include <cassert>
#include <cstring>
#include <sstream>
#include <utility>
namespace src {
/*
*
* LEXER
*
*/
// Returns a human-readable name for the lexem type `idx`, for use in
// diagnostics.
//
// The built-in types (Eof, Ident, Number, String, Special) map to fixed
// descriptions. Any other value is looked up as a keyword in the global symbol
// table and rendered as the back-quoted keyword text. If nothing matches, a
// placeholder containing the numeric type is returned.
std::string Lexem::lexem_name_str(int idx) {
  if (idx == Eof) {
    return "end of file";
  }
  if (idx == Ident) {
    return "identifier";
  }
  if (idx == Number) {
    return "number";
  }
  if (idx == String) {
    return "string";
  }
  if (idx == Special) {
    return "special";
  }
  const auto& keyword = sym::symbols.get_keyword(idx);
  if (keyword) {
    return "`" + keyword->str + "`";
  }
  // Stream the prefix in rather than passing it to the constructor: a stream
  // constructed from a string starts writing at position 0 and would overwrite
  // the prefix instead of appending to it.
  std::ostringstream os;
  os << "<unknown lexem of type " << idx << ">";
  return os.str();
}
// Describes this particular lexem for diagnostics. Identifiers are shown with
// their name from the symbol table, strings with their contents in double
// quotes; every other type falls back to the generic type name.
std::string Lexem::name_str() const {
  if (tp != Ident && tp != String) {
    return lexem_name_str(tp);
  }
  if (tp == Ident) {
    std::string described{"identifier `"};
    described += sym::symbols.get_name(val);
    described += "`";
    return described;
  }
  std::string described{"string \""};
  described += str;
  described += '"';
  return described;
}
// Returns true if `str` is an integer literal: an optional leading '-',
// followed by either one or more decimal digits, or the prefix "0x" and one or
// more hexadecimal digits (either case). The prefix itself must be lowercase
// "0x". Empty input, a lone "-", and a bare "0x" are rejected.
bool is_number(std::string str) {
  const std::size_t len = str.size();
  std::size_t pos = 0;
  if (pos < len && str[pos] == '-') {
    ++pos;
  }
  bool hex = false;
  // Compare the remaining length instead of advancing an iterator, so that no
  // position beyond the end of the string is ever formed (e.g. for "-").
  if (len - pos >= 2 && str[pos] == '0' && str[pos + 1] == 'x') {
    pos += 2;
    hex = true;
  }
  if (pos == len) {
    // No digits at all: "", "-", "0x" or "-0x".
    return false;
  }
  for (; pos < len; ++pos) {
    int c = str[pos];
    if (c >= '0' && c <= '9') {
      continue;
    }
    if (!hex) {
      return false;
    }
    c |= 0x20;  // fold ASCII upper-case letters to lower-case
    if (c < 'a' || c > 'f') {
      return false;
    }
  }
  return true;
}
int Lexem::classify() {
if (tp != Unknown) {
return tp;
}
sym::sym_idx_t i = sym::symbols.lookup(str);
if (i) {
assert(str == sym::symbols[i]->str);
str = sym::symbols[i]->str;
sym::sym_idx_t idx = sym::symbols[i]->idx;
tp = (idx < 0 ? -idx : Ident);
val = i;
} else if (is_number(str)) {
tp = Number;
} else {
tp = lexem_is_special(str);
}
if (tp == Unknown) {
tp = Ident;
val = sym::symbols.lookup(str, 1);
}
return tp;
}
// Overwrites text, location, type and value of this lexem, then runs
// classify() and returns the resulting type.
int Lexem::set(std::string _str, const SrcLocation& _loc, int _tp, int _val) {
  // `_str` is a by-value sink parameter: move it instead of copying it again.
  str = std::move(_str);
  loc = _loc;
  tp = _tp;
  val = _val;
  return classify();
}
// Builds a lexer reading from `_src`.
//
// `active_chars` is a space-separated list of character groups. The class mask
// applied to a group starts at cc::active and is decremented at every space;
// once it reaches zero it is replaced by cc::allow_repeat. Only characters
// below 0x80 are recorded.
// `eol_cmts`, `open_cmts` and `close_cmts` are handed to set_spec() to fill the
// end-of-line, comment-open and comment-close delimiter tables.
// Every character of `quote_chars` in the range (' ', 0x7f] is flagged as a
// quote character. `multiline_quote` is stored as given.
// When `init` is true the first lexem is read immediately.
Lexer::Lexer(SourceReader& _src, bool init, std::string active_chars, std::string eol_cmts, std::string open_cmts,
             std::string close_cmts, std::string quote_chars, std::string multiline_quote)
    : src(_src), eof(false), lexem("", src.here(), Lexem::Undefined), peek_lexem("", {}, Lexem::Undefined),
      multiline_quote(std::move(multiline_quote)) {
  std::memset(char_class, 0, sizeof(char_class));

  unsigned char group_mask = cc::active;
  for (const char ch : active_chars) {
    if (ch != ' ') {
      if ((unsigned)ch < 0x80) {
        char_class[(unsigned)ch] |= group_mask;
      }
      continue;
    }
    // A space ends the current group and selects the next class mask.
    --group_mask;
    if (group_mask == 0) {
      group_mask = cc::allow_repeat;
    }
  }

  set_spec(eol_cmt, eol_cmts);
  set_spec(cmt_op, open_cmts);
  set_spec(cmt_cl, close_cmts);

  for (const char q : quote_chars) {
    const int code = q;
    if (code <= ' ' || code > 0x7f) {
      continue;
    }
    char_class[(unsigned)code] |= cc::quote_char;
  }

  if (init) {
    next();
  }
}
// Fills a delimiter table from a space-separated specification.
//
// All three slots are first reset to -0x100, a value no character code
// compares equal to. Then, for each space-separated token of `setup`:
//   - a one-character token is stored in arr[0];
//   - a two-character token is stored in arr[1] and arr[2];
//   - longer tokens are ignored.
// If several tokens have the same length, the last one wins.
void Lexer::set_spec(std::array<int, 3>& arr, std::string setup) {
  arr[0] = arr[1] = arr[2] = -0x100;
  const std::size_t n = setup.size();
  std::size_t pos = 0;
  while (pos < n) {
    if (setup[pos] == ' ') {
      ++pos;
      continue;
    }
    // Locate the end of the token that starts at `pos`.
    std::size_t stop = pos;
    while (stop < n && setup[stop] != ' ') {
      ++stop;
    }
    const std::size_t token_len = stop - pos;
    if (token_len == 1) {
      arr[0] = setup[pos];
    } else if (token_len == 2) {
      arr[1] = setup[pos];
      arr[2] = setup[pos + 1];
    }
    pos = stop;
  }
}
// Returns true if the configured multi-line quote sequence is non-empty and
// the characters in [begin, end) start with it.
bool Lexer::is_multiline_quote(const char* begin, const char* end) {
  if (multiline_quote.empty()) {
    return false;
  }
  const char* cursor = begin;
  for (auto it = multiline_quote.begin(); it != multiline_quote.end(); ++it, ++cursor) {
    if (cursor == end) {
      // Input ran out before the whole sequence was matched.
      return false;
    }
    if (*cursor != *it) {
      return false;
    }
  }
  return true;
}
void Lexer::expect(int exp_tp, const char* msg) {
if (tp() != exp_tp) {
throw ParseError{lexem.loc, (msg ? std::string{msg} : Lexem::lexem_name_str(exp_tp)) + " expected instead of " +
cur().name_str()};
}
next();
}
// Advances to the next lexem and returns a reference to it.
//
// Order of work:
//   1. a lexem previously fetched by peek() is consumed first;
//   2. once end of input has been reached, an Eof lexem is returned;
//   3. end-of-line comments and (possibly nested) block comments are skipped;
//   4. a multi-line quoted string, a quoted string / back-quoted token, or an
//      ordinary token is scanned, in that order of precedence.
//
// Errors are reported through src.error().
// NOTE(review): the code after each src.error() call assumes it does not
// return (presumably it throws) -- confirm against SourceReader.
const Lexem& Lexer::next() {
  if (peek_lexem.valid()) {
    lexem = std::move(peek_lexem);
    peek_lexem.clear({}, Lexem::Undefined);
    eof = (lexem.tp == Lexem::Eof);
    return lexem;
  }
  if (eof) {
    return lexem.clear(src.here(), Lexem::Eof);
  }
  // `comm` is a bit stack of the currently open block comments. The lowest set
  // bit of the initial value 1 is a sentinel, so `comm == 1` means "not inside
  // a comment". Opening a two-character comment pushes a 1 bit, opening a
  // one-character comment pushes a 0 bit; closing pops one bit.
  long long comm = 1;
  while (!src.seek_eof()) {
    int cc = src.cur_char(), nc = src.next_char();
    if (cc == eol_cmt[0] || (cc == eol_cmt[1] && nc == eol_cmt[2])) {
      // End-of-line comment: drop the rest of the line.
      src.load_line();
    } else if (cc == cmt_op[1] && nc == cmt_op[2]) {
      src.advance(2);
      comm = comm * 2 + 1;
    } else if (cc == cmt_op[0]) {
      src.advance(1);
      comm *= 2;
    } else if (comm == 1) {
      // Outside any comment and not at a comment opener: a lexem starts here.
      break;
    } else if (cc == cmt_cl[1] && nc == cmt_cl[2]) {
      // Two-character closer: the innermost comment must be a two-character
      // one (top bit 1).
      if (!(comm & 1)) {
        src.error(std::string{"a `"} + (char)cmt_op[0] + "` comment closed by `" + (char)cmt_cl[1] + (char)cmt_cl[2] +
                  "`");
      }
      comm >>= 1;
      src.advance(2);
    } else if (cc == cmt_cl[0]) {
      // One-character closer: the innermost comment must be a one-character
      // one (top bit 0), so a set top bit is the mismatch to report.
      if (comm & 1) {
        src.error(std::string{"a `"} + (char)cmt_op[1] + (char)cmt_op[2] + "` comment closed by `" + (char)cmt_cl[0] +
                  "`");
      }
      comm >>= 1;
      src.advance(1);
    } else {
      // Ordinary character inside a comment.
      src.advance(1);
    }
    if (comm < 0) {
      src.error("too many nested comments");
    }
  }
  if (src.seek_eof()) {
    eof = true;
    if (comm > 1) {
      // Input ended inside a comment; the top bit tells which kind.
      if (comm & 1) {
        src.error(std::string{"`"} + (char)cmt_op[1] + (char)cmt_op[2] + "` comment extends past end of file");
      } else {
        src.error(std::string{"`"} + (char)cmt_op[0] + "` comment extends past end of file");
      }
    }
    return lexem.clear(src.here(), Lexem::Eof);
  }
  if (is_multiline_quote(src.get_ptr(), src.get_end_ptr())) {
    // Multi-line string: collect everything up to the next occurrence of the
    // quote sequence, inserting '\n' at every line break.
    src.advance(multiline_quote.size());
    const char* end = nullptr;  // stays null until the closing sequence is found
    SrcLocation here = src.here();
    std::string body;
    while (!src.is_eof()) {
      if (src.is_eoln()) {
        body.push_back('\n');
        src.load_line();
        continue;
      }
      if (is_multiline_quote(src.get_ptr(), src.get_end_ptr())) {
        end = src.get_ptr();
        src.advance(multiline_quote.size());
        break;
      }
      body.push_back(src.cur_char());
      src.advance(1);
    }
    if (!end) {
      src.error("string extends past end of file");
    }
    lexem.set(body, here, Lexem::String);
    // A single ASCII letter right after the closing sequence is consumed and
    // stored in `val`.
    int c = src.cur_char();
    if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
      lexem.val = c;
      src.advance(1);
    }
    return lexem;
  }
  int c = src.cur_char();
  const char* end = src.get_ptr();
  if (is_quote_char(c) || c == '`') {
    // Single-line string or back-quoted token: find the matching quote on the
    // current line.
    int qc = c;
    ++end;
    while (end < src.get_end_ptr() && *end != qc) {
      ++end;
    }
    // The loop stops either on the closing quote or at the end of the line;
    // test the pointer so the byte at the end position is never read.
    if (end >= src.get_end_ptr()) {
      src.error(qc == '`' ? "a `back-quoted` token extends past end of line" : "string extends past end of line");
    }
    // A back-quoted token is classified by its text (type Unknown); anything
    // else is a String.
    lexem.set(std::string{src.get_ptr() + 1, end}, src.here(), qc == '`' ? Lexem::Unknown : Lexem::String);
    src.set_ptr(end + 1);
    // A single ASCII letter right after the closing quote is consumed and
    // stored in `val`.
    c = src.cur_char();
    if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
      lexem.val = c;
      src.set_ptr(end + 2);
    }
    // std::cerr << lexem.name_str() << ' ' << lexem.str << std::endl;
    return lexem;
  }
  // Ordinary token. It ends at a space or tab, before a left-active character
  // (unless the token is still empty), or right after a right-active
  // character. A repeatable character equal to the previous one does not end
  // the token.
  int len = 0, pc = -0x100;
  while (end < src.get_end_ptr()) {
    c = *end;
    bool repeated = (c == pc && is_repeatable(c));
    if (c == ' ' || c == 9 || (len && is_left_active(c) && !repeated)) {
      break;
    }
    ++len;
    ++end;
    if (is_right_active(c) && !repeated) {
      break;
    }
    pc = c;
  }
  lexem.set(std::string{src.get_ptr(), end}, src.here());
  src.set_ptr(end);
  // std::cerr << lexem.name_str() << ' ' << lexem.str << std::endl;
  return lexem;
}
// Returns the lexem that follows the current one without consuming it.
// The result is cached in `peek_lexem`, so repeated calls do not read further
// input. At end of input the current lexem is reset to Eof and returned.
const Lexem& Lexer::peek() {
  if (!peek_lexem.valid()) {
    if (eof) {
      return lexem.clear(src.here(), Lexem::Eof);
    }
    // Set the current lexem aside, let next() scan into `lexem`, stash the
    // result as the peeked lexem, then put the current lexem back.
    Lexem current = std::move(lexem);
    next();
    peek_lexem = std::move(lexem);
    lexem = std::move(current);
    // next() may have set `eof`; the restored current lexem is not Eof.
    eof = false;
  }
  return peek_lexem;
}
} // namespace src