1
0
Fork 0
mirror of https://github.com/ton-blockchain/ton synced 2025-03-09 15:40:10 +00:00

[Tolk] AST-based semantic analysis, get rid of Expr

This is a huge refactoring focusing on untangling compiler internals
(previously forked from FunC).
The goal is to convert AST directly to Op (a kind of IR representation),
doing all code analysis at AST level.

Notable changes:
- AST-based semantic kernel includes: registering global symbols,
  scope handling and resolving local/global identifiers,
  lvalue/rvalue calc and check, implicit return detection,
  mutability analysis, pure/impure validity checks,
  simple constant folding
- values of `const` variables are calculated NOT based on CodeBlob,
  but via a newly-introduced AST-based constant evaluator
- AST vertices are now inherited from expression/statement/other;
  expression vertices have common properties (TypeExpr, lvalue/rvalue)
- symbol table is rewritten completely, SymDef/SymVal no longer exist,
  lexer now doesn't need to register identifiers
- AST vertices have references to symbols, filled at different
  stages of pipeline
- the remaining "FunC legacy part" is almost unchanged besides Expr
  which was fully dropped; AST is converted to Ops (IR) directly
This commit is contained in:
tolk-vm 2024-12-16 21:19:45 +03:00
parent ea0dc16163
commit 3540424aa1
No known key found for this signature in database
GPG key ID: 7905DD7FE0324B12
71 changed files with 4270 additions and 3060 deletions

View file

@ -13,65 +13,50 @@
You should have received a copy of the GNU General Public License
along with TON Blockchain. If not, see <http://www.gnu.org/licenses/>.
In addition, as a special exception, the copyright holders give permission
to link the code of portions of this program with the OpenSSL library.
You must obey the GNU General Public License in all respects for all
of the code used other than OpenSSL. If you modify file(s) with this
exception, you may extend this exception to your version of the file(s),
but you are not obligated to do so. If you do not wish to do so, delete this
exception statement from your version. If you delete this exception statement
from all source files in the program, then also delete it here.
*/
#include "tolk.h"
#include "platform-utils.h"
#include "src-file.h"
#include "ast.h"
#include "compiler-state.h"
#include "constant-evaluator.h"
#include "td/utils/crypto.h"
#include <unordered_set>
/*
* This pipe registers global symbols: functions, constants, global vars, etc.
* It happens just after all files have been parsed to AST.
*
* "Registering" means adding symbols to a global symbol table.
* After this pass, any global symbol can be looked up.
* Note that local variables are not analyzed here; that happens in a later step.
* Before digging into locals, the global symbol table must be filled, which is exactly what is done here.
*/
namespace tolk {
Expr* process_expr(AnyV v, CodeBlob& code);
// Reports a redefinition error at the identifier `v_ident`. The message is chosen by where the
// already-registered symbol `existing` comes from:
//   - existing->loc.is_stdlib()  -> "redefinition of a symbol from stdlib"
//   - existing->loc.is_defined() -> message that includes the previous location
//   - otherwise                  -> "redefinition of built-in symbol"
// Declared with GNU_ATTRIBUTE_NORETURN, so v_ident->error(...) is presumably non-returning — confirm.
// NOTE(review): takes a SymDef*, which the commit message above says no longer exists after this
// refactoring; this function likely belongs to the pre-refactoring side of the diff — confirm.
GNU_ATTRIBUTE_NORETURN GNU_ATTRIBUTE_COLD
static void fire_error_redefinition_of_symbol(V<ast_identifier> v_ident, SymDef* existing) {
if (existing->loc.is_stdlib()) {
v_ident->error("redefinition of a symbol from stdlib");
} else if (existing->loc.is_defined()) {
v_ident->error("redefinition of symbol, previous was at: " + existing->loc.to_string());
} else {
v_ident->error("redefinition of built-in symbol");
}
}
// Thin wrapper: returns whatever G.symbols.lookup_add() yields for `sym_name`.
// NOTE(review): presumably this interns the name and returns its index in a name table — confirm
// against the definition of G.symbols.
static int calc_sym_idx(std::string_view sym_name) {
return G.symbols.lookup_add(sym_name);
}
// Maps a reserved entrypoint name to its fixed method id:
//   "main" / "onInternalMessage" ->  0
//   "onExternalMessage"          -> -1
//   "onRunTickTock"              -> -2
//   "onSplitPrepare"             -> -3
//   "onSplitInstall"             -> -4
// Any other name reaches tolk_assert(false); no return follows it, so the assertion is
// presumably expected not to return — confirm.
// NOTE(review): this listing shows two versions of the signature and of every return statement
// back to back (td::RefInt256 via td::make_refint, and plain int). It looks like a rendered diff
// with the +/- markers stripped; only one line of each pair belongs to a given revision.
static td::RefInt256 calculate_method_id_for_entrypoint(std::string_view func_name) {
static int calculate_method_id_for_entrypoint(std::string_view func_name) {
if (func_name == "main" || func_name == "onInternalMessage") {
return td::make_refint(0);
return 0;
}
if (func_name == "onExternalMessage") {
return td::make_refint(-1);
return -1;
}
if (func_name == "onRunTickTock") {
return td::make_refint(-2);
return -2;
}
if (func_name == "onSplitPrepare") {
return td::make_refint(-3);
return -3;
}
if (func_name == "onSplitInstall") {
return td::make_refint(-4);
return -4;
}
tolk_assert(false);
}
// Derives a method id from a function name: takes td::crc16 of the name, keeps the low 16 bits
// and sets bit 0x10000, so the result lies in [0x10000, 0x1ffff].
// NOTE(review): two versions of the signature and of the return statement are shown back to back
// (td::RefInt256 and plain int); this looks like a rendered diff without +/- markers.
static td::RefInt256 calculate_method_id_by_func_name(std::string_view func_name) {
static int calculate_method_id_by_func_name(std::string_view func_name) {
// crc16 is called on a std::string copy of the string_view
unsigned int crc = td::crc16(static_cast<std::string>(func_name));
return td::make_refint((crc & 0xffff) | 0x10000);
return static_cast<int>(crc & 0xffff) | 0x10000;
}
static void calc_arg_ret_order_of_asm_function(V<ast_asm_body> v_body, V<ast_parameter_list> param_list, TypeExpr* ret_type,
@ -89,7 +74,7 @@ static void calc_arg_ret_order_of_asm_function(V<ast_asm_body> v_body, V<ast_par
int tot_width = 0;
for (int i = 0; i < cnt; ++i) {
V<ast_parameter> v_param = param_list->get_param(i);
int arg_width = v_param->param_type->get_width();
int arg_width = v_param->declared_type->get_width();
if (arg_width < 0 || arg_width > 16) {
v_param->error("parameters of an assembler built-in function must have a well-defined fixed width");
}
@ -130,102 +115,39 @@ static void calc_arg_ret_order_of_asm_function(V<ast_asm_body> v_body, V<ast_par
}
// Registers the global constant declared by `v`.
// NOTE(review): this listing interleaves two implementations line by line (it looks like a
// rendered diff without +/- markers), so it is not compilable as shown:
//   - a SymDef/SymValConst-based one, which evaluates the initializer through process_expr()
//     and a temporary CodeBlob;
//   - a GlobalConstData-based one, which evaluates the initializer with eval_const_init_value(),
//     registers the symbol via G.symtable.add_global_const() and stores a reference back into
//     the AST vertex with assign_const_ref().
// Both variants report "expression type does not match declared type" when a declared type is
// present and differs from the evaluated one, and both append the symbol to G.all_constants.
static void register_constant(V<ast_constant_declaration> v) {
AnyV init_value = v->get_init_value();
SymDef* sym_def = define_global_symbol(calc_sym_idx(v->get_identifier()->name), v->loc);
// a non-null value means a symbol with this name has already been registered
if (sym_def->value) {
fire_error_redefinition_of_symbol(v->get_identifier(), sym_def);
}
ConstantValue init_value = eval_const_init_value(v->get_init_value());
GlobalConstData* c_sym = new GlobalConstData(static_cast<std::string>(v->get_identifier()->name), v->loc, std::move(init_value));
// todo currently, constant value calculation is dirty and roughly: init_value is evaluated to fif code
// and waited to be a single expression
// although it works, of course it should be later rewritten using AST calculations, as well as lots of other parts
CodeBlob code("tmp", v->loc, nullptr, nullptr);
Expr* x = process_expr(init_value, code);
if (!x->is_rvalue()) {
v->get_init_value()->error("expression is not strictly Rvalue");
}
if (v->declared_type && !v->declared_type->equals_to(x->e_type)) {
if (v->declared_type && !v->declared_type->equals_to(c_sym->inferred_type)) {
v->error("expression type does not match declared type");
}
// CodeBlob-based variant: choose how to build the SymValConst from the class of expression `x`
SymValConst* sym_val = nullptr;
if (x->cls == Expr::_Const) { // Integer constant
sym_val = new SymValConst(static_cast<int>(G.all_constants.size()), x->intval);
} else if (x->cls == Expr::_SliceConst) { // Slice constant (string)
sym_val = new SymValConst(static_cast<int>(G.all_constants.size()), x->strval);
} else if (x->cls == Expr::_Apply) { // even "1 + 2" is Expr::_Apply (it applies `_+_`)
// compile the expression as a tiny function body: Import, Return <expr>, Nop
code.emplace_back(v->loc, Op::_Import, std::vector<var_idx_t>());
auto tmp_vars = x->pre_compile(code);
code.emplace_back(v->loc, Op::_Return, std::move(tmp_vars));
code.emplace_back(v->loc, Op::_Nop);
// It is REQUIRED to execute "optimizations" as in tolk.cpp
code.simplify_var_types();
code.prune_unreachable_code();
code.split_vars(true);
// fixed number (16) of analysis rounds
for (int i = 0; i < 16; i++) {
code.compute_used_code_vars();
code.fwd_analyze();
code.prune_unreachable_code();
}
code.mark_noreturn();
AsmOpList out_list(0, &code.vars);
code.generate_code(out_list);
// the generated code must be exactly one operation, and that operation must be a valid constant
if (out_list.list_.size() != 1) {
init_value->error("precompiled expression must result in single operation");
}
auto op = out_list.list_[0];
if (!op.is_const()) {
init_value->error("precompiled expression must result in compilation time constant");
}
if (op.origin.is_null() || !op.origin->is_valid()) {
init_value->error("precompiled expression did not result in a valid integer constant");
}
sym_val = new SymValConst(static_cast<int>(G.all_constants.size()), op.origin);
} else {
init_value->error("integer or slice literal or constant expected");
}
sym_def->value = sym_val;
#ifdef TOLK_DEBUG
sym_def->value->sym_name = v->get_identifier()->name;
#endif
G.all_constants.push_back(sym_def);
// GlobalConstData-based variant: register the symbol and link it to the AST vertex
G.symtable.add_global_const(c_sym);
G.all_constants.push_back(c_sym);
v->mutate()->assign_const_ref(c_sym);
}
// Registers the global variable declared by `v`.
// NOTE(review): two implementations are interleaved here (rendered diff without +/- markers):
//   - a SymDef/SymValGlobVar-based one, which fires a redefinition error when the symbol already
//     has a value and pushes the SymDef to G.all_global_vars;
//   - a GlobalVarData-based one, which creates the symbol from the name, location and declared
//     type, adds it via G.symtable.add_global_var(), pushes it to G.all_global_vars and stores
//     a reference back into the AST vertex with assign_var_ref().
static void register_global_var(V<ast_global_var_declaration> v) {
SymDef* sym_def = define_global_symbol(calc_sym_idx(v->get_identifier()->name), v->loc);
if (sym_def->value) {
fire_error_redefinition_of_symbol(v->get_identifier(), sym_def);
}
GlobalVarData* g_sym = new GlobalVarData(static_cast<std::string>(v->get_identifier()->name), v->loc, v->declared_type);
// the SymValGlobVar index is the current size of G.all_global_vars
sym_def->value = new SymValGlobVar(static_cast<int>(G.all_global_vars.size()), v->declared_type);
#ifdef TOLK_DEBUG
sym_def->value->sym_name = v->get_identifier()->name;
#endif
G.all_global_vars.push_back(sym_def);
G.symtable.add_global_var(g_sym);
G.all_global_vars.push_back(g_sym);
v->mutate()->assign_var_ref(g_sym);
}
// Builds the symbol for the function parameter `v` located at position `idx`.
// NOTE(review): two implementations are interleaved here (rendered diff without +/- markers):
//   - one returning SymDef* (with a SymValVariable value), which returns nullptr for an
//     underscore parameter;
//   - one returning LocalVarData by value, which for an underscore parameter appears to return
//     a symbol with an empty name (the `return {"", ...}` line) — confirm against the real file.
// Flags set in both variants:
//   - flagMutateParameter when the parameter is declared as `mutate`;
//   - flagImmutable when the parameter is the first one (idx == 0), is named "self" and is not
//     declared as `mutate`.
static SymDef* register_parameter(V<ast_parameter> v, int idx) {
static LocalVarData register_parameter(V<ast_parameter> v, int idx) {
if (v->is_underscore()) {
return nullptr;
}
SymDef* sym_def = define_parameter(calc_sym_idx(v->get_identifier()->name), v->loc);
if (sym_def->value) {
// todo always false now, how to detect similar parameter names? (remember about underscore)
v->error("redefined parameter");
return {"", v->loc, idx, v->declared_type};
}
SymValVariable* sym_val = new SymValVariable(idx, v->param_type);
LocalVarData p_sym(static_cast<std::string>(v->param_name), v->loc, idx, v->declared_type);
if (v->declared_as_mutate) {
sym_val->flags |= SymValVariable::flagMutateParameter;
p_sym.flags |= LocalVarData::flagMutateParameter;
}
if (!v->declared_as_mutate && idx == 0 && v->get_identifier()->name == "self") {
sym_val->flags |= SymValVariable::flagImmutable;
if (!v->declared_as_mutate && idx == 0 && v->param_name == "self") {
p_sym.flags |= LocalVarData::flagImmutable;
}
sym_def->value = sym_val;
#ifdef TOLK_DEBUG
sym_def->value->sym_name = v->get_identifier()->name;
#endif
return sym_def;
return p_sym;
}
static void register_function(V<ast_function_declaration> v) {
@ -235,16 +157,16 @@ static void register_function(V<ast_function_declaration> v) {
TypeExpr* params_tensor_type = nullptr;
int n_params = v->get_num_params();
int n_mutate_params = 0;
std::vector<SymDef*> parameters_syms;
std::vector<LocalVarData> parameters;
if (n_params) {
std::vector<TypeExpr*> param_tensor_items;
param_tensor_items.reserve(n_params);
parameters_syms.reserve(n_params);
parameters.reserve(n_params);
for (int i = 0; i < n_params; ++i) {
auto v_param = v->get_param(i);
n_mutate_params += static_cast<int>(v_param->declared_as_mutate);
param_tensor_items.emplace_back(v_param->param_type);
parameters_syms.emplace_back(register_parameter(v_param, i));
param_tensor_items.emplace_back(v_param->declared_type);
parameters.emplace_back(register_parameter(v_param, i));
}
params_tensor_type = TypeExpr::new_tensor(std::move(param_tensor_items));
} else {
@ -261,24 +183,20 @@ static void register_function(V<ast_function_declaration> v) {
function_type = TypeExpr::new_forall(std::move(type_vars), function_type);
}
if (v->marked_as_builtin) {
const SymDef* builtin_func = lookup_symbol(G.symbols.lookup(func_name));
const SymValFunc* func_val = builtin_func ? dynamic_cast<SymValFunc*>(builtin_func->value) : nullptr;
if (!func_val || !func_val->is_builtin()) {
const Symbol* builtin_func = lookup_global_symbol(func_name);
const FunctionData* func_val = builtin_func ? builtin_func->as<FunctionData>() : nullptr;
if (!func_val || !func_val->is_builtin_function()) {
v->error("`builtin` used for non-builtin function");
}
#ifdef TOLK_DEBUG
// in release, we don't need this check, since `builtin` is used only in stdlib, which is our responsibility
if (!func_val->sym_type->equals_to(function_type) || func_val->is_marked_as_pure() != v->marked_as_pure) {
if (!func_val->full_type->equals_to(function_type) || func_val->is_marked_as_pure() != v->marked_as_pure) {
v->error("declaration for `builtin` function doesn't match an actual one");
}
#endif
return;
}
SymDef* sym_def = define_global_symbol(calc_sym_idx(func_name), v->loc);
if (sym_def->value) {
fire_error_redefinition_of_symbol(v->get_identifier(), sym_def);
}
if (G.is_verbosity(1)) {
std::cerr << "fun " << func_name << " : " << function_type << std::endl;
}
@ -286,67 +204,61 @@ static void register_function(V<ast_function_declaration> v) {
v->error("a pure function should return something, otherwise it will be optimized out anyway");
}
SymValFunc* sym_val = nullptr;
if (const auto* v_seq = v->get_body()->try_as<ast_sequence>()) {
sym_val = new SymValCodeFunc(std::move(parameters_syms), static_cast<int>(G.all_code_functions.size()), function_type);
} else if (const auto* v_asm = v->get_body()->try_as<ast_asm_body>()) {
std::vector<int> arg_order, ret_order;
calc_arg_ret_order_of_asm_function(v_asm, v->get_param_list(), v->ret_type, arg_order, ret_order);
sym_val = new SymValAsmFunc(std::move(parameters_syms), function_type, std::move(arg_order), std::move(ret_order), 0);
} else {
v->error("Unexpected function body statement");
FunctionBody f_body = v->get_body()->type == ast_sequence ? static_cast<FunctionBody>(new FunctionBodyCode) : static_cast<FunctionBody>(new FunctionBodyAsm);
FunctionData* f_sym = new FunctionData(static_cast<std::string>(func_name), v->loc, function_type, std::move(parameters), 0, f_body);
if (const auto* v_asm = v->get_body()->try_as<ast_asm_body>()) {
calc_arg_ret_order_of_asm_function(v_asm, v->get_param_list(), v->ret_type, f_sym->arg_order, f_sym->ret_order);
}
if (v->method_id) {
sym_val->method_id = td::string_to_int256(static_cast<std::string>(v->method_id->int_val));
if (sym_val->method_id.is_null()) {
if (v->method_id->intval.is_null() || !v->method_id->intval->signed_fits_bits(32)) {
v->method_id->error("invalid integer constant");
}
f_sym->method_id = static_cast<int>(v->method_id->intval->to_long());
} else if (v->marked_as_get_method) {
sym_val->method_id = calculate_method_id_by_func_name(func_name);
for (const SymDef* other : G.all_get_methods) {
if (!td::cmp(dynamic_cast<const SymValFunc*>(other->value)->method_id, sym_val->method_id)) {
v->error(PSTRING() << "GET methods hash collision: `" << other->name() << "` and `" << static_cast<std::string>(func_name) << "` produce the same hash. Consider renaming one of these functions.");
f_sym->method_id = calculate_method_id_by_func_name(func_name);
for (const FunctionData* other : G.all_get_methods) {
if (other->method_id == f_sym->method_id) {
v->error(PSTRING() << "GET methods hash collision: `" << other->name << "` and `" << f_sym->name << "` produce the same hash. Consider renaming one of these functions.");
}
}
} else if (v->is_entrypoint) {
sym_val->method_id = calculate_method_id_for_entrypoint(func_name);
f_sym->method_id = calculate_method_id_for_entrypoint(func_name);
}
if (v->marked_as_pure) {
sym_val->flags |= SymValFunc::flagMarkedAsPure;
f_sym->flags |= FunctionData::flagMarkedAsPure;
}
if (v->marked_as_inline) {
sym_val->flags |= SymValFunc::flagInline;
f_sym->flags |= FunctionData::flagInline;
}
if (v->marked_as_inline_ref) {
sym_val->flags |= SymValFunc::flagInlineRef;
f_sym->flags |= FunctionData::flagInlineRef;
}
if (v->marked_as_get_method) {
sym_val->flags |= SymValFunc::flagGetMethod;
f_sym->flags |= FunctionData::flagGetMethod;
}
if (v->is_entrypoint) {
sym_val->flags |= SymValFunc::flagIsEntrypoint;
f_sym->flags |= FunctionData::flagIsEntrypoint;
}
if (n_mutate_params) {
sym_val->flags |= SymValFunc::flagHasMutateParams;
f_sym->flags |= FunctionData::flagHasMutateParams;
}
if (v->accepts_self) {
sym_val->flags |= SymValFunc::flagAcceptsSelf;
f_sym->flags |= FunctionData::flagAcceptsSelf;
}
if (v->returns_self) {
sym_val->flags |= SymValFunc::flagReturnsSelf;
f_sym->flags |= FunctionData::flagReturnsSelf;
}
sym_def->value = sym_val;
#ifdef TOLK_DEBUG
sym_def->value->sym_name = func_name;
#endif
if (dynamic_cast<SymValCodeFunc*>(sym_val)) {
G.all_code_functions.push_back(sym_def);
G.symtable.add_function(f_sym);
if (f_sym->is_regular_function()) {
G.all_code_functions.push_back(f_sym);
}
if (sym_val->is_get_method()) {
G.all_get_methods.push_back(sym_def);
if (f_sym->is_get_method()) {
G.all_get_methods.push_back(f_sym);
}
v->mutate()->assign_fun_ref(f_sym);
}
static void iterate_through_file_symbols(const SrcFile* file) {