diff --git a/crypto/fift/utils.cpp b/crypto/fift/utils.cpp index 01cf0eb5..6057b2dc 100644 --- a/crypto/fift/utils.cpp +++ b/crypto/fift/utils.cpp @@ -211,20 +211,39 @@ td::Result create_mem_source_lookup(std::string main, std::s fift_dir); } -td::Result> compile_asm(td::Slice asm_code, std::string fift_dir, bool is_raw) { +td::Result> compile_asm(td::Slice asm_code) { std::stringstream ss; std::string sb; sb.reserve(asm_code.size() + 100); - sb.append("\"Asm.fif\" include\n "); - sb.append(is_raw ? "<{" : ""); + sb.append("\"Asm.fif\" include\n <{\n"); sb.append(asm_code.data(), asm_code.size()); - sb.append(is_raw ? "}>c" : ""); - sb.append(" boc>B \"res\" B>file"); + sb.append("\n}>c boc>B \"res\" B>file"); - TRY_RESULT(source_lookup, create_source_lookup(std::move(sb), true, true, true, false, false, false, false, fift_dir)); + TRY_RESULT(source_lookup, create_source_lookup(std::move(sb), true, true, true, false, false, false, false)); TRY_RESULT(res, run_fift(std::move(source_lookup), &ss)); TRY_RESULT(boc, res.read_file("res")); return vm::std_boc_deserialize(std::move(boc.data)); } +td::Result compile_asm_program(std::string&& program_code, const std::string& fift_dir) { + std::string main_fif; + main_fif.reserve(program_code.size() + 100); + main_fif.append(program_code.data(), program_code.size()); + main_fif.append(R"( dup hashB B>X $>B "hex" B>file)"); // write codeHashHex to a file + main_fif.append(R"( boc>B B>base64 $>B "boc" B>file)"); // write codeBoc64 to a file + + std::stringstream fift_output_stream; + TRY_RESULT(source_lookup, create_source_lookup(std::move(main_fif), true, true, false, false, false, false, false, fift_dir)); + TRY_RESULT(res, run_fift(std::move(source_lookup), &fift_output_stream)); + + TRY_RESULT(boc, res.read_file("boc")); + TRY_RESULT(hex, res.read_file("hex")); + + return CompiledProgramOutput{ + std::move(program_code), + std::move(boc.data), + std::move(hex.data), + }; +} + } // namespace fift diff --git 
a/crypto/fift/utils.h b/crypto/fift/utils.h index dd434fe0..fab92c54 100644 --- a/crypto/fift/utils.h +++ b/crypto/fift/utils.h @@ -26,11 +26,21 @@ struct FiftOutput { SourceLookup source_lookup; std::string output; }; + +// given a valid Fift code PROGRAM{ ... }END>c, compile_asm_program() returns this output +// now it's used primarily for wasm output (see tolk-js, for example) +struct CompiledProgramOutput { + std::string fiftCode; + std::string codeBoc64; + std::string codeHashHex; +}; + td::Result create_mem_source_lookup(std::string main, std::string fift_dir = "", bool need_preamble = true, bool need_asm = true, bool need_ton_util = true, bool need_lisp = true, bool need_w3_code = true); td::Result mem_run_fift(std::string source, std::vector args = {}, std::string fift_dir = ""); td::Result mem_run_fift(SourceLookup source_lookup, std::vector args); -td::Result> compile_asm(td::Slice asm_code, std::string fift_dir = "", bool is_raw = true); +td::Result> compile_asm(td::Slice asm_code); +td::Result compile_asm_program(std::string&& program_code, const std::string& fift_dir); } // namespace fift diff --git a/crypto/funcfiftlib/funcfiftlib.cpp b/crypto/funcfiftlib/funcfiftlib.cpp index 0bef9eac..403c075d 100644 --- a/crypto/funcfiftlib/funcfiftlib.cpp +++ b/crypto/funcfiftlib/funcfiftlib.cpp @@ -37,10 +37,10 @@ td::Result compile_internal(char *config_json) { TRY_RESULT(input_json, td::json_decode(td::MutableSlice(config_json))) - auto &obj = input_json.get_object(); + td::JsonObject& config = input_json.get_object(); - TRY_RESULT(opt_level, td::get_json_object_int_field(obj, "optLevel", false)); - TRY_RESULT(sources_obj, td::get_json_object_field(obj, "sources", td::JsonValue::Type::Array, false)); + TRY_RESULT(opt_level, td::get_json_object_int_field(config, "optLevel", false)); + TRY_RESULT(sources_obj, td::get_json_object_field(config, "sources", td::JsonValue::Type::Array, false)); auto &sources_arr = sources_obj.get_array(); @@ -52,29 +52,25 @@ 
td::Result compile_internal(char *config_json) { funC::opt_level = std::max(0, opt_level); funC::program_envelope = true; + funC::asm_preamble = true; funC::verbosity = 0; funC::indent = 1; std::ostringstream outs, errs; - auto compile_res = funC::func_proceed(sources, outs, errs); - - if (compile_res != 0) { - return td::Status::Error(std::string("Func compilation error: ") + errs.str()); + int funC_res = funC::func_proceed(sources, outs, errs); + if (funC_res != 0) { + return td::Status::Error("FunC compilation error: " + errs.str()); } - TRY_RESULT(code_cell, fift::compile_asm(outs.str(), "/fiftlib/", false)); - TRY_RESULT(boc, vm::std_boc_serialize(code_cell)); + TRY_RESULT(fift_res, fift::compile_asm_program(outs.str(), "/fiftlib/")); td::JsonBuilder result_json; - auto result_obj = result_json.enter_object(); - result_obj("status", "ok"); - result_obj("codeBoc", td::base64_encode(boc)); - result_obj("fiftCode", outs.str()); - result_obj("codeHashHex", code_cell->get_hash().to_hex()); - result_obj.leave(); - - outs.clear(); - errs.clear(); + auto obj = result_json.enter_object(); + obj("status", "ok"); + obj("fiftCode", std::move(fift_res.fiftCode)); + obj("codeBoc", std::move(fift_res.codeBoc64)); + obj("codeHashHex", std::move(fift_res.codeHashHex)); + obj.leave(); return result_json.string_builder().as_cslice().str(); } diff --git a/crypto/smartcont/mathlib.tolk b/crypto/smartcont/mathlib.tolk index 6a5b2d1b..bb18f921 100644 --- a/crypto/smartcont/mathlib.tolk +++ b/crypto/smartcont/mathlib.tolk @@ -572,9 +572,9 @@ int atanh_f261(int x, int n) inline_ref { s -= 1; } x += t; - int 2x = 2 * x; - int y = lshift256divr(2x, (x >> 1) - t); - ;; y = 2x - (mulrshiftr256(2x, y) ~>> 2); ;; this line could improve precision on very rare occasions + int `2x` = 2 * x; + int y = lshift256divr(`2x`, (x >> 1) - t); + ;; y = `2x` - (mulrshiftr256(2x, y) ~>> 2); ;; this line could improve precision on very rare occasions return (atanh_f258(y, 36), s); } diff --git 
a/tolk/CMakeLists.txt b/tolk/CMakeLists.txt index 8c890859..82036704 100644 --- a/tolk/CMakeLists.txt +++ b/tolk/CMakeLists.txt @@ -1,10 +1,9 @@ cmake_minimum_required(VERSION 3.5 FATAL_ERROR) set(TOLK_SOURCE - srcread.cpp + src-file.cpp lexer.cpp symtable.cpp - keywords.cpp unify-types.cpp parse-tolk.cpp abscode.cpp diff --git a/tolk/abscode.cpp b/tolk/abscode.cpp index 5833c004..8cf1f597 100644 --- a/tolk/abscode.cpp +++ b/tolk/abscode.cpp @@ -24,29 +24,19 @@ namespace tolk { * */ -TmpVar::TmpVar(var_idx_t _idx, int _cls, TypeExpr* _type, SymDef* sym, const SrcLocation* loc) - : v_type(_type), idx(_idx), cls(_cls), coord(0) { +TmpVar::TmpVar(var_idx_t _idx, int _cls, TypeExpr* _type, SymDef* sym, SrcLocation loc) + : v_type(_type), idx(_idx), cls(_cls), coord(0), where(loc) { if (sym) { name = sym->sym_idx; sym->value->idx = _idx; } - if (loc) { - where = std::make_unique(*loc); - } if (!_type) { v_type = TypeExpr::new_hole(); } - if (cls == _Named) { - undefined = true; - } } -void TmpVar::set_location(const SrcLocation& loc) { - if (where) { - *where = loc; - } else { - where = std::make_unique(loc); - } +void TmpVar::set_location(SrcLocation loc) { + where = loc; } void TmpVar::dump(std::ostream& os) const { @@ -469,10 +459,10 @@ void CodeBlob::print(std::ostream& os, int flags) const { if ((flags & 8) != 0) { for (const auto& var : vars) { var.dump(os); - if (var.where && (flags & 1) != 0) { - var.where->show(os); + if (var.where.is_defined() && (flags & 1) != 0) { + var.where.show(os); os << " defined here:\n"; - var.where->show_context(os); + var.where.show_context(os); } } } @@ -483,7 +473,7 @@ void CodeBlob::print(std::ostream& os, int flags) const { os << "-------- END ---------\n\n"; } -var_idx_t CodeBlob::create_var(int cls, TypeExpr* var_type, SymDef* sym, const SrcLocation* location) { +var_idx_t CodeBlob::create_var(int cls, TypeExpr* var_type, SymDef* sym, SrcLocation location) { vars.emplace_back(var_cnt, cls, var_type, sym, location); if (sym) { 
sym->value->idx = var_cnt; @@ -501,7 +491,7 @@ bool CodeBlob::import_params(FormalArgList arg_list) { SymDef* arg_sym; SrcLocation arg_loc; std::tie(arg_type, arg_sym, arg_loc) = par; - list.push_back(create_var(arg_sym ? (TmpVar::_In | TmpVar::_Named) : TmpVar::_In, arg_type, arg_sym, &arg_loc)); + list.push_back(create_var(arg_sym ? (TmpVar::_In | TmpVar::_Named) : TmpVar::_In, arg_type, arg_sym, arg_loc)); } emplace_back(loc, Op::_Import, list); in_var_cnt = var_cnt; diff --git a/tolk/analyzer.cpp b/tolk/analyzer.cpp index ab55a2b6..e38ba1bb 100644 --- a/tolk/analyzer.cpp +++ b/tolk/analyzer.cpp @@ -36,7 +36,7 @@ int CodeBlob::split_vars(bool strict) { for (int j = 0; j < var_cnt; j++) { TmpVar& var = vars[j]; if (strict && var.v_type->minw != var.v_type->maxw) { - throw ParseError{var.where.get(), "variable does not have fixed width, cannot manipulate it"}; + throw ParseError{var.where, "variable does not have fixed width, cannot manipulate it"}; } std::vector comp_types; int k = var.v_type->extract_components(comp_types); @@ -45,7 +45,7 @@ int CodeBlob::split_vars(bool strict) { if (k != 1) { var.coord = ~((n << 8) + k); for (int i = 0; i < k; i++) { - auto v = create_var(vars[j].cls, comp_types[i], 0, vars[j].where.get()); + auto v = create_var(vars[j].cls, comp_types[i], 0, vars[j].where); tolk_assert(v == n + i); tolk_assert(vars[v].idx == v); vars[v].name = vars[j].name; @@ -54,7 +54,7 @@ int CodeBlob::split_vars(bool strict) { n += k; ++changes; } else if (strict && var.v_type->minw != 1) { - throw ParseError{var.where.get(), + throw ParseError{var.where, "cannot work with variable or variable component of width greater than one"}; } } diff --git a/tolk/builtins.cpp b/tolk/builtins.cpp index 6589b9fc..355c21df 100644 --- a/tolk/builtins.cpp +++ b/tolk/builtins.cpp @@ -95,7 +95,7 @@ SymDef* define_builtin_const(std::string name, TypeExpr* const_type, Args&&... 
a } bool SymValAsmFunc::compile(AsmOpList& dest, std::vector& out, std::vector& in, - const SrcLocation& where) const { + SrcLocation where) const { if (simple_compile) { return dest.append(simple_compile(out, in, where)); } else if (ext_compile) { @@ -186,7 +186,7 @@ int emulate_mul(int a, int b) { return r; } -int emulate_and(int a, int b) { +int emulate_bitwise_and(int a, int b) { int both = a & b, any = a | b; int r = VarDescr::_Int; if (any & VarDescr::_Nan) { @@ -204,7 +204,7 @@ int emulate_and(int a, int b) { return r; } -int emulate_or(int a, int b) { +int emulate_bitwise_or(int a, int b) { if (b & VarDescr::_Zero) { return a; } else if (a & VarDescr::_Zero) { @@ -222,7 +222,7 @@ int emulate_or(int a, int b) { return r; } -int emulate_xor(int a, int b) { +int emulate_bitwise_xor(int a, int b) { if (b & VarDescr::_Zero) { return a; } else if (a & VarDescr::_Zero) { @@ -241,7 +241,7 @@ int emulate_xor(int a, int b) { return r; } -int emulate_not(int a) { +int emulate_bitwise_not(int a) { if ((a & VarDescr::ConstZero) == VarDescr::ConstZero) { return VarDescr::ConstTrue; } @@ -436,7 +436,7 @@ AsmOp push_const(td::RefInt256 x) { return AsmOp::IntConst(std::move(x)); } -AsmOp compile_add(std::vector& res, std::vector& args, const SrcLocation& where) { +AsmOp compile_add(std::vector& res, std::vector& args, SrcLocation where) { tolk_assert(res.size() == 1 && args.size() == 2); VarDescr &r = res[0], &x = args[0], &y = args[1]; if (x.is_int_const() && y.is_int_const()) { @@ -478,7 +478,7 @@ AsmOp compile_add(std::vector& res, std::vector& args, const return exec_op("ADD", 2); } -AsmOp compile_sub(std::vector& res, std::vector& args, const SrcLocation& where) { +AsmOp compile_sub(std::vector& res, std::vector& args, SrcLocation where) { tolk_assert(res.size() == 1 && args.size() == 2); VarDescr &r = res[0], &x = args[0], &y = args[1]; if (x.is_int_const() && y.is_int_const()) { @@ -511,7 +511,7 @@ AsmOp compile_sub(std::vector& res, std::vector& args, const return 
exec_op("SUB", 2); } -AsmOp compile_negate(std::vector& res, std::vector& args, const SrcLocation& where) { +AsmOp compile_unary_minus(std::vector& res, std::vector& args, SrcLocation where) { tolk_assert(res.size() == 1 && args.size() == 1); VarDescr &r = res[0], &x = args[0]; if (x.is_int_const()) { @@ -526,7 +526,19 @@ AsmOp compile_negate(std::vector& res, std::vector& args, co return exec_op("NEGATE", 1); } -AsmOp compile_and(std::vector& res, std::vector& args, const SrcLocation& where) { +AsmOp compile_unary_plus(std::vector& res, std::vector& args, SrcLocation where) { + tolk_assert(res.size() == 1 && args.size() == 1); + VarDescr &r = res[0], &x = args[0]; + if (x.is_int_const()) { + r.set_const(x.int_const); + x.unused(); + return push_const(r.int_const); + } + r.val = x.val; + return AsmOp::Nop(); +} + +AsmOp compile_bitwise_and(std::vector& res, std::vector& args, SrcLocation where) { tolk_assert(res.size() == 1 && args.size() == 2); VarDescr &r = res[0], &x = args[0], &y = args[1]; if (x.is_int_const() && y.is_int_const()) { @@ -535,11 +547,11 @@ AsmOp compile_and(std::vector& res, std::vector& args, const y.unused(); return push_const(r.int_const); } - r.val = emulate_and(x.val, y.val); + r.val = emulate_bitwise_and(x.val, y.val); return exec_op("AND", 2); } -AsmOp compile_or(std::vector& res, std::vector& args, const SrcLocation& where) { +AsmOp compile_bitwise_or(std::vector& res, std::vector& args, SrcLocation where) { tolk_assert(res.size() == 1 && args.size() == 2); VarDescr &r = res[0], &x = args[0], &y = args[1]; if (x.is_int_const() && y.is_int_const()) { @@ -548,11 +560,11 @@ AsmOp compile_or(std::vector& res, std::vector& args, const y.unused(); return push_const(r.int_const); } - r.val = emulate_or(x.val, y.val); + r.val = emulate_bitwise_or(x.val, y.val); return exec_op("OR", 2); } -AsmOp compile_xor(std::vector& res, std::vector& args, const SrcLocation& where) { +AsmOp compile_bitwise_xor(std::vector& res, std::vector& args, SrcLocation 
where) { tolk_assert(res.size() == 1 && args.size() == 2); VarDescr &r = res[0], &x = args[0], &y = args[1]; if (x.is_int_const() && y.is_int_const()) { @@ -561,11 +573,11 @@ AsmOp compile_xor(std::vector& res, std::vector& args, const y.unused(); return push_const(r.int_const); } - r.val = emulate_xor(x.val, y.val); + r.val = emulate_bitwise_xor(x.val, y.val); return exec_op("XOR", 2); } -AsmOp compile_not(std::vector& res, std::vector& args, const SrcLocation& where) { +AsmOp compile_bitwise_not(std::vector& res, std::vector& args, SrcLocation where) { tolk_assert(res.size() == 1 && args.size() == 1); VarDescr &r = res[0], &x = args[0]; if (x.is_int_const()) { @@ -573,11 +585,11 @@ AsmOp compile_not(std::vector& res, std::vector& args, const x.unused(); return push_const(r.int_const); } - r.val = emulate_not(x.val); + r.val = emulate_bitwise_not(x.val); return exec_op("NOT", 1); } -AsmOp compile_mul_internal(VarDescr& r, VarDescr& x, VarDescr& y, const SrcLocation& where) { +AsmOp compile_mul_internal(VarDescr& r, VarDescr& x, VarDescr& y, SrcLocation where) { if (x.is_int_const() && y.is_int_const()) { r.set_const(x.int_const * y.int_const); if (!r.int_const->is_valid()) { @@ -645,12 +657,12 @@ AsmOp compile_mul_internal(VarDescr& r, VarDescr& x, VarDescr& y, const SrcLocat return exec_op("MUL", 2); } -AsmOp compile_mul(std::vector& res, std::vector& args, const SrcLocation& where) { +AsmOp compile_mul(std::vector& res, std::vector& args, SrcLocation where) { tolk_assert(res.size() == 1 && args.size() == 2); return compile_mul_internal(res[0], args[0], args[1], where); } -AsmOp compile_lshift(std::vector& res, std::vector& args, const SrcLocation& where) { +AsmOp compile_lshift(std::vector& res, std::vector& args, SrcLocation where) { tolk_assert(res.size() == 1 && args.size() == 2); VarDescr &r = res[0], &x = args[0], &y = args[1]; if (y.is_int_const()) { @@ -692,7 +704,7 @@ AsmOp compile_lshift(std::vector& res, std::vector& args, co return exec_op("LSHIFT", 
2); } -AsmOp compile_rshift(std::vector& res, std::vector& args, const SrcLocation& where, +AsmOp compile_rshift(std::vector& res, std::vector& args, SrcLocation where, int round_mode) { tolk_assert(res.size() == 1 && args.size() == 2); VarDescr &r = res[0], &x = args[0], &y = args[1]; @@ -722,7 +734,7 @@ AsmOp compile_rshift(std::vector& res, std::vector& args, co return exec_op(rshift, 2); } -AsmOp compile_div_internal(VarDescr& r, VarDescr& x, VarDescr& y, const SrcLocation& where, int round_mode) { +AsmOp compile_div_internal(VarDescr& r, VarDescr& x, VarDescr& y, SrcLocation where, int round_mode) { if (x.is_int_const() && y.is_int_const()) { r.set_const(div(x.int_const, y.int_const, round_mode)); if (!r.int_const->is_valid()) { @@ -762,12 +774,12 @@ AsmOp compile_div_internal(VarDescr& r, VarDescr& x, VarDescr& y, const SrcLocat return exec_op(op, 2); } -AsmOp compile_div(std::vector& res, std::vector& args, const SrcLocation& where, int round_mode) { +AsmOp compile_div(std::vector& res, std::vector& args, SrcLocation where, int round_mode) { tolk_assert(res.size() == 1 && args.size() == 2); return compile_div_internal(res[0], args[0], args[1], where, round_mode); } -AsmOp compile_mod(std::vector& res, std::vector& args, const SrcLocation& where, +AsmOp compile_mod(std::vector& res, std::vector& args, SrcLocation where, int round_mode) { tolk_assert(res.size() == 1 && args.size() == 2); VarDescr &r = res[0], &x = args[0], &y = args[1]; @@ -808,7 +820,7 @@ AsmOp compile_mod(std::vector& res, std::vector& args, const return exec_op(op, 2); } -AsmOp compile_muldiv(std::vector& res, std::vector& args, const SrcLocation& where, +AsmOp compile_muldiv(std::vector& res, std::vector& args, SrcLocation where, int round_mode) { tolk_assert(res.size() == 1 && args.size() == 3); VarDescr &r = res[0], &x = args[0], &y = args[1], &z = args[2]; @@ -978,7 +990,7 @@ AsmOp compile_cmp_int(std::vector& res, std::vector& args, i return exec_op(cmp_names[mode], 2); } -AsmOp 
compile_throw(std::vector& res, std::vector& args, const SrcLocation&) { +AsmOp compile_throw(std::vector& res, std::vector& args, SrcLocation) { tolk_assert(res.empty() && args.size() == 1); VarDescr& x = args[0]; if (x.is_int_const() && x.int_const->unsigned_fits_bits(11)) { @@ -1010,7 +1022,7 @@ AsmOp compile_cond_throw(std::vector& res, std::vector& args } } -AsmOp compile_throw_arg(std::vector& res, std::vector& args, const SrcLocation&) { +AsmOp compile_throw_arg(std::vector& res, std::vector& args, SrcLocation) { tolk_assert(res.empty() && args.size() == 2); VarDescr &x = args[1]; if (x.is_int_const() && x.int_const->unsigned_fits_bits(11)) { @@ -1101,7 +1113,7 @@ AsmOp compile_fetch_slice(std::vector& res, std::vector& arg } // _at(tuple t, int index) asm "INDEXVAR"; -AsmOp compile_tuple_at(std::vector& res, std::vector& args, const SrcLocation&) { +AsmOp compile_tuple_at(std::vector& res, std::vector& args, SrcLocation) { tolk_assert(args.size() == 2 && res.size() == 1); auto& y = args[1]; if (y.is_int_const() && y.int_const >= 0 && y.int_const < 16) { @@ -1112,7 +1124,7 @@ AsmOp compile_tuple_at(std::vector& res, std::vector& args, } // int null?(X arg) -AsmOp compile_is_null(std::vector& res, std::vector& args, const SrcLocation&) { +AsmOp compile_is_null(std::vector& res, std::vector& args, SrcLocation) { tolk_assert(args.size() == 1 && res.size() == 1); auto &x = args[0], &r = res[0]; if (x.always_null() || x.always_not_null()) { @@ -1128,12 +1140,12 @@ AsmOp compile_is_null(std::vector& res, std::vector& args, c void define_builtins() { using namespace std::placeholders; auto Unit = TypeExpr::new_unit(); - auto Int = TypeExpr::new_atomic(_Int); - auto Cell = TypeExpr::new_atomic(_Cell); - auto Slice = TypeExpr::new_atomic(_Slice); - auto Builder = TypeExpr::new_atomic(_Builder); - // auto Null = TypeExpr::new_atomic(_Null); - auto Tuple = TypeExpr::new_atomic(_Tuple); + auto Int = TypeExpr::new_atomic(TypeExpr::_Int); + auto Cell = 
TypeExpr::new_atomic(TypeExpr::_Cell); + auto Slice = TypeExpr::new_atomic(TypeExpr::_Slice); + auto Builder = TypeExpr::new_atomic(TypeExpr::_Builder); + // auto Null = TypeExpr::new_atomic(TypeExpr::_Null); + auto Tuple = TypeExpr::new_atomic(TypeExpr::_Tuple); auto Int2 = TypeExpr::new_tensor({Int, Int}); auto Int3 = TypeExpr::new_tensor({Int, Int, Int}); auto TupleInt = TypeExpr::new_tensor({Tuple, Int}); @@ -1156,9 +1168,16 @@ void define_builtins() { //auto arith_null_op = TypeExpr::new_map(TypeExpr::new_unit(), Int); auto throw_arg_op = TypeExpr::new_forall({X}, TypeExpr::new_map(TypeExpr::new_tensor({X, Int}), Unit)); auto cond_throw_arg_op = TypeExpr::new_forall({X}, TypeExpr::new_map(TypeExpr::new_tensor({X, Int, Int}), Unit)); + + // prevent unused vars warnings (there vars are created to acquire initial id of TypeExpr::value) + static_cast(Z); + static_cast(XY); + static_cast(Cell); + define_builtin_func("_+_", arith_bin_op, compile_add); define_builtin_func("_-_", arith_bin_op, compile_sub); - define_builtin_func("-_", arith_un_op, compile_negate); + define_builtin_func("-_", arith_un_op, compile_unary_minus); + define_builtin_func("+_", arith_un_op, compile_unary_plus); define_builtin_func("_*_", arith_bin_op, compile_mul); define_builtin_func("_/_", arith_bin_op, std::bind(compile_div, _1, _2, _3, -1)); define_builtin_func("_~/_", arith_bin_op, std::bind(compile_div, _1, _2, _3, 0)); @@ -1175,10 +1194,10 @@ void define_builtins() { define_builtin_func("_>>_", arith_bin_op, std::bind(compile_rshift, _1, _2, _3, -1)); define_builtin_func("_~>>_", arith_bin_op, std::bind(compile_rshift, _1, _2, _3, 0)); define_builtin_func("_^>>_", arith_bin_op, std::bind(compile_rshift, _1, _2, _3, 1)); - define_builtin_func("_&_", arith_bin_op, compile_and); - define_builtin_func("_|_", arith_bin_op, compile_or); - define_builtin_func("_^_", arith_bin_op, compile_xor); - define_builtin_func("~_", arith_un_op, compile_not); + define_builtin_func("_&_", arith_bin_op, 
compile_bitwise_and); + define_builtin_func("_|_", arith_bin_op, compile_bitwise_or); + define_builtin_func("_^_", arith_bin_op, compile_bitwise_xor); + define_builtin_func("~_", arith_un_op, compile_bitwise_not); define_builtin_func("^_+=_", arith_bin_op, compile_add); define_builtin_func("^_-=_", arith_bin_op, compile_sub); define_builtin_func("^_*=_", arith_bin_op, compile_mul); @@ -1192,9 +1211,9 @@ void define_builtins() { define_builtin_func("^_>>=_", arith_bin_op, std::bind(compile_rshift, _1, _2, _3, -1)); define_builtin_func("^_~>>=_", arith_bin_op, std::bind(compile_rshift, _1, _2, _3, 0)); define_builtin_func("^_^>>=_", arith_bin_op, std::bind(compile_rshift, _1, _2, _3, 1)); - define_builtin_func("^_&=_", arith_bin_op, compile_and); - define_builtin_func("^_|=_", arith_bin_op, compile_or); - define_builtin_func("^_^=_", arith_bin_op, compile_xor); + define_builtin_func("^_&=_", arith_bin_op, compile_bitwise_and); + define_builtin_func("^_|=_", arith_bin_op, compile_bitwise_or); + define_builtin_func("^_^=_", arith_bin_op, compile_bitwise_xor); define_builtin_func("muldiv", TypeExpr::new_map(Int3, Int), std::bind(compile_muldiv, _1, _2, _3, -1)); define_builtin_func("muldivr", TypeExpr::new_map(Int3, Int), std::bind(compile_muldiv, _1, _2, _3, 0)); define_builtin_func("muldivc", TypeExpr::new_map(Int3, Int), std::bind(compile_muldiv, _1, _2, _3, 1)); diff --git a/tolk/gen-abscode.cpp b/tolk/gen-abscode.cpp index a537d99c..1c4afa67 100644 --- a/tolk/gen-abscode.cpp +++ b/tolk/gen-abscode.cpp @@ -41,25 +41,19 @@ Expr::Expr(ExprCls c, sym_idx_t name_idx, std::initializer_list _arglist) } } -void Expr::chk_rvalue(const Lexem& lem) const { +void Expr::chk_rvalue(const Lexer& lex) const { if (!is_rvalue()) { - lem.error_at("rvalue expected before `", "`"); + lex.error_at("rvalue expected before `", "`"); } } -void Expr::chk_lvalue(const Lexem& lem) const { +void Expr::chk_lvalue(const Lexer& lex) const { if (!is_lvalue()) { - lem.error_at("lvalue expected 
before `", "`"); + lex.error_at("lvalue expected before `", "`"); } } -void Expr::chk_type(const Lexem& lem) const { - if (!is_type()) { - lem.error_at("type expression expected before `", "`"); - } -} - -bool Expr::deduce_type(const Lexem& lem) { +bool Expr::deduce_type(const Lexer& lex) { if (e_type) { return true; } @@ -83,7 +77,7 @@ bool Expr::deduce_type(const Lexem& lem) { std::ostringstream os; os << "cannot apply function " << sym->name() << " : " << sym_val->get_type() << " to arguments of type " << fun_type->args[0] << ": " << ue; - lem.error(os.str()); + lex.error(os.str()); } e_type = fun_type->args[1]; TypeExpr::remove_indirect(e_type); @@ -98,7 +92,7 @@ bool Expr::deduce_type(const Lexem& lem) { std::ostringstream os; os << "cannot apply expression of type " << args[0]->e_type << " to an expression of type " << args[1]->e_type << ": " << ue; - lem.error(os.str()); + lex.error(os.str()); } e_type = fun_type->args[1]; TypeExpr::remove_indirect(e_type); @@ -113,7 +107,7 @@ bool Expr::deduce_type(const Lexem& lem) { std::ostringstream os; os << "cannot assign an expression of type " << args[1]->e_type << " to a variable or pattern of type " << args[0]->e_type << ": " << ue; - lem.error(os.str()); + lex.error(os.str()); } e_type = args[0]->e_type; TypeExpr::remove_indirect(e_type); @@ -130,7 +124,7 @@ bool Expr::deduce_type(const Lexem& lem) { os << "cannot implicitly assign an expression of type " << args[1]->e_type << " to a variable or pattern of type " << rhs_type << " in modifying method `" << symbols.get_name(val) << "` : " << ue; - lem.error(os.str()); + lex.error(os.str()); } e_type = rhs_type->args[1]; TypeExpr::remove_indirect(e_type); @@ -139,13 +133,13 @@ bool Expr::deduce_type(const Lexem& lem) { } case _CondExpr: { tolk_assert(args.size() == 3); - auto flag_type = TypeExpr::new_atomic(_Int); + auto flag_type = TypeExpr::new_atomic(TypeExpr::_Int); try { unify(args[0]->e_type, flag_type); } catch (UnifyError& ue) { std::ostringstream os; os << 
"condition in a conditional expression has non-integer type " << args[0]->e_type << ": " << ue; - lem.error(os.str()); + lex.error(os.str()); } try { unify(args[1]->e_type, args[2]->e_type); @@ -153,7 +147,7 @@ bool Expr::deduce_type(const Lexem& lem) { std::ostringstream os; os << "the two variants in a conditional expression have different types " << args[1]->e_type << " and " << args[2]->e_type << " : " << ue; - lem.error(os.str()); + lex.error(os.str()); } e_type = args[1]->e_type; TypeExpr::remove_indirect(e_type); @@ -176,13 +170,13 @@ int Expr::define_new_vars(CodeBlob& code) { } case _Var: if (val < 0) { - val = code.create_var(TmpVar::_Named, e_type, sym, &here); + val = code.create_var(TmpVar::_Named, e_type, sym, here); return 1; } break; case _Hole: if (val < 0) { - val = code.create_var(TmpVar::_Tmp, e_type, nullptr, &here); + val = code.create_var(TmpVar::_Tmp, e_type, nullptr, here); } break; } @@ -202,7 +196,7 @@ int Expr::predefine_vars() { } case _Var: if (!sym) { - tolk_assert(val < 0 && here.defined()); + tolk_assert(val < 0 && here.is_defined()); if (prohibited_var_names.count(symbols.get_name(~val))) { throw ParseError{ here, PSTRING() << "symbol `" << symbols.get_name(~val) << "` cannot be redefined as a variable"}; @@ -212,7 +206,7 @@ int Expr::predefine_vars() { if (!sym) { throw ParseError{here, std::string{"redefined variable `"} + symbols.get_name(~val) + "`"}; } - sym->value = new SymVal{SymVal::_Var, -1, e_type}; + sym->value = new SymVal{SymValKind::_Var, -1, e_type}; return 1; } break; @@ -221,17 +215,17 @@ int Expr::predefine_vars() { } var_idx_t Expr::new_tmp(CodeBlob& code) const { - return code.create_tmp_var(e_type, &here); + return code.create_tmp_var(e_type, here); } -void add_set_globs(CodeBlob& code, std::vector>& globs, const SrcLocation& here) { +void add_set_globs(CodeBlob& code, std::vector>& globs, SrcLocation here) { for (const auto& p : globs) { auto& op = code.emplace_back(here, Op::_SetGlob, std::vector{}, 
std::vector{ p.second }, p.first); op.set_impure(code); } } -std::vector pre_compile_let(CodeBlob& code, Expr* lhs, Expr* rhs, const SrcLocation& here) { +std::vector pre_compile_let(CodeBlob& code, Expr* lhs, Expr* rhs, SrcLocation here) { while (lhs->is_type_apply()) { lhs = lhs->args.at(0); } @@ -245,7 +239,7 @@ std::vector pre_compile_let(CodeBlob& code, Expr* lhs, Expr* rhs, con auto right = rhs->pre_compile(code); TypeExpr::remove_indirect(rhs->e_type); auto unpacked_type = rhs->e_type->args.at(0); - std::vector tmp{code.create_tmp_var(unpacked_type, &rhs->here)}; + std::vector tmp{code.create_tmp_var(unpacked_type, rhs->here)}; code.emplace_back(lhs->here, Op::_UnTuple, tmp, std::move(right)); auto tvar = new Expr{Expr::_Var}; tvar->set_val(tmp[0]); @@ -286,14 +280,14 @@ std::vector pre_compile_tensor(const std::vector& args, CodeB for (size_t j = 0; j < res_lists[i].size(); ++j) { TmpVar& var = code.vars.at(res_lists[i][j]); if (!lval_globs && (var.cls & TmpVar::_Named)) { - var.on_modification.push_back([&modified_vars, i, j, cur_ops = code.cur_ops, done = false](const SrcLocation &here) mutable { + var.on_modification.push_back([&modified_vars, i, j, cur_ops = code.cur_ops, done = false](SrcLocation here) mutable { if (!done) { done = true; modified_vars.push_back({i, j, cur_ops}); } }); } else { - var.on_modification.push_back([](const SrcLocation &) { + var.on_modification.push_back([](SrcLocation) { }); } } @@ -307,8 +301,8 @@ std::vector pre_compile_tensor(const std::vector& args, CodeB for (size_t idx = modified_vars.size(); idx--; ) { const ModifiedVar &m = modified_vars[idx]; var_idx_t orig_v = res_lists[m.i][m.j]; - var_idx_t tmp_v = code.create_tmp_var(code.vars[orig_v].v_type, code.vars[orig_v].where.get()); - std::unique_ptr op = std::make_unique(*code.vars[orig_v].where, Op::_Let); + var_idx_t tmp_v = code.create_tmp_var(code.vars[orig_v].v_type, code.vars[orig_v].where); + std::unique_ptr op = std::make_unique(code.vars[orig_v].where, 
Op::_Let); op->left = {tmp_v}; op->right = {orig_v}; op->next = std::move((*m.cur_ops)); diff --git a/tolk/keywords.cpp b/tolk/keywords.cpp deleted file mode 100644 index 50d55c41..00000000 --- a/tolk/keywords.cpp +++ /dev/null @@ -1,129 +0,0 @@ -/* - This file is part of TON Blockchain Library. - - TON Blockchain Library is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. - - TON Blockchain Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public License - along with TON Blockchain Library. If not, see . -*/ -#include "tolk.h" - -namespace tolk { - -/* - * - * KEYWORD DEFINITION - * - */ - -void define_keywords() { - symbols.add_kw_char('+') - .add_kw_char('-') - .add_kw_char('*') - .add_kw_char('/') - .add_kw_char('%') - .add_kw_char('?') - .add_kw_char(':') - .add_kw_char(',') - .add_kw_char(';') - .add_kw_char('(') - .add_kw_char(')') - .add_kw_char('[') - .add_kw_char(']') - .add_kw_char('{') - .add_kw_char('}') - .add_kw_char('=') - .add_kw_char('_') - .add_kw_char('<') - .add_kw_char('>') - .add_kw_char('&') - .add_kw_char('|') - .add_kw_char('^') - .add_kw_char('~'); - - symbols.add_keyword("==", Keyword::_Eq) - .add_keyword("!=", Keyword::_Neq) - .add_keyword("<=", Keyword::_Leq) - .add_keyword(">=", Keyword::_Geq) - .add_keyword("<=>", Keyword::_Spaceship) - .add_keyword("<<", Keyword::_Lshift) - .add_keyword(">>", Keyword::_Rshift) - .add_keyword("~>>", Keyword::_RshiftR) - .add_keyword("^>>", Keyword::_RshiftC) - .add_keyword("~/", Keyword::_DivR) - .add_keyword("^/", Keyword::_DivC) - .add_keyword("~%", Keyword::_ModR) 
- .add_keyword("^%", Keyword::_ModC) - .add_keyword("/%", Keyword::_DivMod) - .add_keyword("+=", Keyword::_PlusLet) - .add_keyword("-=", Keyword::_MinusLet) - .add_keyword("*=", Keyword::_TimesLet) - .add_keyword("/=", Keyword::_DivLet) - .add_keyword("~/=", Keyword::_DivRLet) - .add_keyword("^/=", Keyword::_DivCLet) - .add_keyword("%=", Keyword::_ModLet) - .add_keyword("~%=", Keyword::_ModRLet) - .add_keyword("^%=", Keyword::_ModCLet) - .add_keyword("<<=", Keyword::_LshiftLet) - .add_keyword(">>=", Keyword::_RshiftLet) - .add_keyword("~>>=", Keyword::_RshiftRLet) - .add_keyword("^>>=", Keyword::_RshiftCLet) - .add_keyword("&=", Keyword::_AndLet) - .add_keyword("|=", Keyword::_OrLet) - .add_keyword("^=", Keyword::_XorLet); - - symbols.add_keyword("return", Keyword::_Return) - .add_keyword("var", Keyword::_Var) - .add_keyword("repeat", Keyword::_Repeat) - .add_keyword("do", Keyword::_Do) - .add_keyword("while", Keyword::_While) - .add_keyword("until", Keyword::_Until) - .add_keyword("try", Keyword::_Try) - .add_keyword("catch", Keyword::_Catch) - .add_keyword("if", Keyword::_If) - .add_keyword("ifnot", Keyword::_Ifnot) - .add_keyword("then", Keyword::_Then) - .add_keyword("else", Keyword::_Else) - .add_keyword("elseif", Keyword::_Elseif) - .add_keyword("elseifnot", Keyword::_Elseifnot); - - symbols.add_keyword("int", Keyword::_Int) - .add_keyword("cell", Keyword::_Cell) - .add_keyword("slice", Keyword::_Slice) - .add_keyword("builder", Keyword::_Builder) - .add_keyword("cont", Keyword::_Cont) - .add_keyword("tuple", Keyword::_Tuple) - .add_keyword("type", Keyword::_Type) - .add_keyword("->", Keyword::_Mapsto) - .add_keyword("forall", Keyword::_Forall); - - symbols.add_keyword("extern", Keyword::_Extern) - .add_keyword("global", Keyword::_Global) - .add_keyword("asm", Keyword::_Asm) - .add_keyword("impure", Keyword::_Impure) - .add_keyword("pure", Keyword::_Pure) - .add_keyword("inline", Keyword::_Inline) - .add_keyword("inline_ref", Keyword::_InlineRef) - 
.add_keyword("builtin", Keyword::_Builtin) - .add_keyword("auto_apply", Keyword::_AutoApply) - .add_keyword("method_id", Keyword::_MethodId) - .add_keyword("get", Keyword::_Get) - .add_keyword("operator", Keyword::_Operator) - .add_keyword("infix", Keyword::_Infix) - .add_keyword("infixl", Keyword::_Infixl) - .add_keyword("infixr", Keyword::_Infixr) - .add_keyword("const", Keyword::_Const); - - symbols.add_keyword("#pragma", Keyword::_PragmaHashtag) - .add_keyword("#include", Keyword::_IncludeHashtag); -} - -} // namespace tolk diff --git a/tolk/lexer.cpp b/tolk/lexer.cpp index e54c70e4..6d066d29 100644 --- a/tolk/lexer.cpp +++ b/tolk/lexer.cpp @@ -16,335 +16,632 @@ */ #include "lexer.h" #include "symtable.h" -#include #include namespace tolk { -/* - * - * LEXER - * - */ +// By 'chunk' in lexer I mean a token or a list of tokens parsed simultaneously. +// E.g., when we meet "str", ChunkString is called, it emits tok_string. +// E.g., when we meet "str"x, ChunkString emits not only tok_string, but tok_string_modifier. +// E.g., when we meet //, ChunkInlineComment is called, it emits nothing (just skips a line). +// We store all valid chunks lexers in a prefix tree (LexingTrie), see below. 
+struct ChunkLexerBase { + ChunkLexerBase(const ChunkLexerBase&) = delete; + ChunkLexerBase &operator=(const ChunkLexerBase&) = delete; + ChunkLexerBase() = default; -std::string Lexem::lexem_name_str(int idx) { - if (idx == Eof) { - return "end of file"; - } else if (idx == Ident) { - return "identifier"; - } else if (idx == Number) { - return "number"; - } else if (idx == String) { - return "string"; - } else if (idx == Special) { - return "special"; - } else if (symbols.get_keyword(idx)) { - return "`" + symbols.get_keyword(idx)->str + "`"; - } else { - std::ostringstream os{""; - return os.str(); - } + virtual bool parse(Lexer* lex) const = 0; + virtual ~ChunkLexerBase() = default; +}; + +template +static T* singleton() { + static T obj; + return &obj; } -std::string Lexem::name_str() const { - if (tp == Ident) { - return std::string{"identifier `"} + symbols.get_name(val) + "`"; - } else if (tp == String) { - return std::string{"string \""} + str + '"'; - } else { - return lexem_name_str(tp); - } -} +// LexingTrie is a prefix tree storing all available Tolk language constructs. +// It's effectively a map of a prefix to ChunkLexerBase. 
+class LexingTrie { + LexingTrie** next{nullptr}; // either nullptr or [256] + ChunkLexerBase* val{nullptr}; // non-null for leafs -bool is_number(std::string str) { - auto st = str.begin(), en = str.end(); - if (st == en) { - return false; - } - if (*st == '-') { - st++; - } - bool hex = false; - if (st + 1 < en && *st == '0' && st[1] == 'x') { - st += 2; - hex = true; - } - if (st == en) { - return false; - } - while (st < en) { - int c = *st; - if (c >= '0' && c <= '9') { - ++st; - continue; + GNU_ATTRIBUTE_ALWAYS_INLINE void ensure_next_allocated() { + if (next == nullptr) { + next = new LexingTrie*[256]; + std::memset(next, 0, 256 * sizeof(LexingTrie*)); } - if (!hex) { - return false; + } + + GNU_ATTRIBUTE_ALWAYS_INLINE void ensure_symbol_allocated(uint8_t symbol) const { + if (next[symbol] == nullptr) { + next[symbol] = new LexingTrie; } - c |= 0x20; - if (c < 'a' || c > 'f') { - return false; + } + +public: + // Maps a prefix onto a chunk lexer. + // E.g. " -> ChunkString + // E.g. """ -> ChunkMultilineString + void add_prefix(const char* s, ChunkLexerBase* val) { + LexingTrie* cur = this; + + for (; *s; ++s) { + uint8_t symbol = static_cast(*s); + cur->ensure_next_allocated(); + cur->ensure_symbol_allocated(symbol); + cur = cur->next[symbol]; } - ++st; - } - return true; -} -int Lexem::classify() { - if (tp != Unknown) { - return tp; +#ifdef TOLK_DEBUG + assert(!cur->val); +#endif + cur->val = val; } - sym_idx_t i = symbols.lookup(str); - if (i) { - assert(str == symbols[i]->str); - str = symbols[i]->str; - sym_idx_t idx = symbols[i]->idx; - tp = (idx < 0 ? -idx : Ident); - val = i; - } else if (is_number(str)) { - tp = Number; - } else { - tp = 0; - } - if (tp == Unknown) { - tp = Ident; - val = symbols.lookup(str, 1); - } - return tp; -} -int Lexem::set(std::string _str, const SrcLocation& _loc, int _tp, int _val) { - str = _str; - loc = _loc; - tp = _tp; - val = _val; - return classify(); -} + // Maps a pattern onto a chunk lexer. + // E.g. 
-[0-9] -> ChunkNegativeNumber + // Internally, it expands the pattern to all possible prefixes: -0, -1, etc. + // (for example, [0-9][a-z_$] gives 10*28=280 prefixes) + void add_pattern(const char* pattern, ChunkLexerBase* val) { + std::vector all_possible_trie{this}; -Lexer::Lexer(SourceReader& _src, std::string active_chars, std::string quote_chars, std::string multiline_quote) - : src(_src), eof(false), lexem("", src.here(), Lexem::Undefined), peek_lexem("", {}, Lexem::Undefined), - multiline_quote(std::move(multiline_quote)) { - std::memset(char_class, 0, sizeof(char_class)); - unsigned char activity = cc::active; - for (char c : active_chars) { - if (c == ' ') { - if (!--activity) { - activity = cc::allow_repeat; + for (const char* c = pattern; *c; ++c) { + std::string to_append; + if (*c == '[') { + c++; + while (*c != ']') { // assume that input is correct, no out-of-string checks + if (*(c + 1) == '-') { + char l = *c, r = *(c + 2); + for (char symbol = l; symbol <= r; ++symbol) { + to_append += symbol; + } + c += 3; + } else { + to_append += *c; + c++; + } + } + } else { + to_append += *c; } - } else if ((unsigned)c < 0x80) { - char_class[(unsigned)c] |= activity; + + std::vector next_all_possible_trie; + next_all_possible_trie.reserve(all_possible_trie.size() * to_append.size()); + for (LexingTrie* cur : all_possible_trie) { + cur->ensure_next_allocated(); + for (uint8_t symbol : to_append) { + cur->ensure_symbol_allocated(symbol); + next_all_possible_trie.emplace_back(cur->next[symbol]); + } + } + all_possible_trie = std::move(next_all_possible_trie); + } + + for (LexingTrie* trie : all_possible_trie) { + trie->val = val; } } - for (int c : quote_chars) { - if (c > ' ' && c <= 0x7f) { - char_class[(unsigned)c] |= cc::quote_char; - } - } -} -void Lexer::set_comment_tokens(const std::string &eol_cmts, const std::string &open_cmts, const std::string &close_cmts) { - set_spec(eol_cmt, eol_cmts); - set_spec(cmt_op, open_cmts); - set_spec(cmt_cl, close_cmts); 
-} + // Looks up a chunk lexer given a string (in practice, s points to cur position in the middle of the file). + // It returns the deepest case: pointing to ", it will return ChunkMultilineString if """, or ChunkString otherwise. + ChunkLexerBase* get_deepest(const char* s) const { + const LexingTrie* best = this; -void Lexer::set_comment2_tokens(const std::string &eol_cmts2, const std::string &open_cmts2, const std::string &close_cmts2) { - set_spec(eol_cmt2, eol_cmts2); - set_spec(cmt_op2, open_cmts2); - set_spec(cmt_cl2, close_cmts2); -} - -void Lexer::start_parsing() { - next(); -} - -void Lexer::set_spec(std::array& arr, std::string setup) { - arr[0] = arr[1] = arr[2] = -0x100; - std::size_t n = setup.size(), i; - for (i = 0; i < n; i++) { - if (setup[i] == ' ') { - continue; - } - if (i == n - 1 || setup[i + 1] == ' ') { - arr[0] = setup[i]; - } else if (i == n - 2 || (i < n - 2 && setup[i + 2] == ' ')) { - arr[1] = setup[i]; - arr[2] = setup[++i]; - } else { - while (i < n && setup[i] != ' ') { - i++; + for (const LexingTrie* cur = this; cur && cur->next; ++s) { + cur = cur->next[static_cast(*s)]; // if s reaches \0, cur will just become nullptr, and loop will end + if (cur && cur->val) { + best = cur; } } - } -} -bool Lexer::is_multiline_quote(const char* begin, const char* end) { - if (multiline_quote.empty()) { - return false; + return best->val; } - for (const char& c : multiline_quote) { - if (begin == end || *begin != c) { - return false; - } - ++begin; - } - return true; -} +}; -void Lexer::expect(int exp_tp, const char* msg) { - if (tp() != exp_tp) { - throw ParseError{lexem.loc, (msg ? std::string{msg} : Lexem::lexem_name_str(exp_tp)) + " expected instead of " + - cur().name_str()}; - } - next(); -} +// +// ---------------------------------------------------------------------- +// A list of valid parsed chunks. 
+// -const Lexem& Lexer::next() { - if (peek_lexem.valid()) { - lexem = std::move(peek_lexem); - peek_lexem.clear({}, Lexem::Undefined); - eof = (lexem.tp == Lexem::Eof); - return lexem; +// An inline comment, starting from '//' +struct ChunkInlineComment final : ChunkLexerBase { + bool parse(Lexer* lex) const override { + lex->skip_line(); + return true; } - if (eof) { - return lexem.clear(src.here(), Lexem::Eof); - } - long long comm = 1; - // the code below is very complicated, because it tried to support one-symbol start/end and nesting - // in Tolk, we decided to stop supporting nesting (it was never used in practice and almost impossible for js highlighters) - // later on I'll simplify this code (more precisely, rewrite lexer from scratch) - while (!src.seek_eof()) { - int cc = src.cur_char(), nc = src.next_char(); - // note, that in practice, [0]-th element is -256, condition for [0]-th is always false - // todo rewrite this all in the future - if (cc == eol_cmt[0] || (cc == eol_cmt[1] && nc == eol_cmt[2]) || cc == eol_cmt2[0] || (cc == eol_cmt2[1] && nc == eol_cmt2[2])) { - if (comm == 1) { // just "//" — skip a whole line - src.load_line(); - } else { // if "//" is nested into "/*", continue reading, since "*/" may be met - src.advance(1); +}; + +// A multiline comment, starting from '/*' +// Note, that nested comments are not supported. 
+struct ChunkMultilineComment final : ChunkLexerBase { + bool parse(Lexer* lex) const override { + while (!lex->is_eof()) { + // todo drop -} later + if ((lex->char_at() == '-' && lex->char_at(1) == '}') || (lex->char_at() == '*' && lex->char_at(1) == '/')) { + lex->skip_chars(2); + return true; } - } else if (cc == cmt_op[1] && nc == cmt_op[2] || cc == cmt_op2[1] && nc == cmt_op2[2]) { - src.advance(2); - comm = comm * 2 + 1; - } else if (cc == cmt_op[0] || cc == cmt_op2[0]) { // always false - src.advance(1); - comm *= 2; - } else if (comm == 1) { - break; // means that we are not inside a comment - } else if (cc == cmt_cl[1] && nc == cmt_cl[2] || cc == cmt_cl2[1] && nc == cmt_cl2[2]) { - if (!(comm & 1)) { // always false - src.error(std::string{"a `"} + (char)cmt_op[0] + "` comment closed by `" + (char)cmt_cl[1] + (char)cmt_cl[2] + - "`"); - } - // note that {- may be closed with */, but assume it's ok (we'll get rid of {- in the future) - comm = 1; - src.advance(2); - } else if (cc == cmt_cl[0] || cc == cmt_cl2[0]) { // always false - if (!(comm & 1)) { - src.error(std::string{"a `"} + (char)cmt_op[1] + (char)cmt_op[2] + "` comment closed by `" + (char)cmt_cl[0] + - "`"); - } - comm = 1; - src.advance(1); - } else { - src.advance(1); - } - if (comm < 0) { - src.error("too many nested comments"); + lex->skip_chars(1); } + return true; // it's okay if comment extends past end of file } - if (src.seek_eof()) { - eof = true; - if (comm > 1) { - src.error("comment extends past end of file"); +}; + +// A string, starting from " +// Note, that there are no escape symbols inside: the purpose of strings in Tolk just doesn't need it. +// After a closing quote, a string modifier may be present, like "Ef8zMzMzMzMzMzMzMzMzMzM0vF"a. +// If present, it emits a separate tok_string_modifier. 
+struct ChunkString final : ChunkLexerBase { + bool parse(Lexer* lex) const override { + const char* str_begin = lex->c_str(); + lex->skip_chars(1); + while (!lex->is_eof() && lex->char_at() != '"' && lex->char_at() != '\n') { + lex->skip_chars(1); } - return lexem.clear(src.here(), Lexem::Eof); + if (lex->char_at() != '"') { + lex->error("string extends past end of line"); + } + + std::string_view str_val(str_begin + 1, lex->c_str() - str_begin - 1); + lex->skip_chars(1); + lex->add_token(tok_string_const, str_val); + + if (std::isalpha(lex->char_at())) { + std::string_view modifier_val(lex->c_str(), 1); + lex->skip_chars(1); + lex->add_token(tok_string_modifier, modifier_val); + } + + return true; } - if (is_multiline_quote(src.get_ptr(), src.get_end_ptr())) { - src.advance(multiline_quote.size()); - const char* end = nullptr; - SrcLocation here = src.here(); - std::string body; - while (!src.is_eof()) { - if (src.is_eoln()) { - body.push_back('\n'); - src.load_line(); - continue; - } - if (is_multiline_quote(src.get_ptr(), src.get_end_ptr())) { - end = src.get_ptr(); - src.advance(multiline_quote.size()); +}; + +// A string starting from """ +// Used for multiline asm constructions. Can not have a postfix modifier. 
+struct ChunkMultilineString final : ChunkLexerBase { + bool parse(Lexer* lex) const override { + const char* str_begin = lex->c_str(); + lex->skip_chars(3); + while (!lex->is_eof()) { + if (lex->char_at() == '"' && lex->char_at(1) == '"' && lex->char_at(2) == '"') { break; } - body.push_back(src.cur_char()); - src.advance(1); + lex->skip_chars(1); } - if (!end) { - src.error("string extends past end of file"); + if (lex->is_eof()) { + lex->error("string extends past end of file"); } - lexem.set(body, here, Lexem::String); - int c = src.cur_char(); - if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) { - lexem.val = c; - src.advance(1); - } - return lexem; + + std::string_view str_val(str_begin + 3, lex->c_str() - str_begin - 3); + lex->skip_chars(3); + lex->add_token(tok_string_const, str_val); + return true; } - int c = src.cur_char(); - const char* end = src.get_ptr(); - if (is_quote_char(c) || c == '`') { - int qc = c; - ++end; - while (end < src.get_end_ptr() && *end != qc) { - ++end; +}; + +// A number, may be a hex one. +struct ChunkNumber final : ChunkLexerBase { + bool parse(Lexer* lex) const override { + const char* str_begin = lex->c_str(); + bool hex = false; + if (lex->char_at() == '0' && lex->char_at(1) == 'x') { + lex->skip_chars(2); + hex = true; } - if (*end != qc) { - src.error(qc == '`' ? "a `back-quoted` token extends past end of line" : "string extends past end of line"); + if (lex->is_eof()) { + return false; } - lexem.set(std::string{src.get_ptr() + 1, end}, src.here(), qc == '`' ? 
Lexem::Unknown : Lexem::String); - src.set_ptr(end + 1); - c = src.cur_char(); - if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) { - lexem.val = c; - src.set_ptr(end + 2); + while (!lex->is_eof()) { + char c = lex->char_at(); + if (c >= '0' && c <= '9') { + lex->skip_chars(1); + continue; + } + if (!hex) { + break; + } + c |= 0x20; + if (c < 'a' || c > 'f') { + break; + } + lex->skip_chars(1); } - // std::cerr << lexem.name_str() << ' ' << lexem.str << std::endl; - return lexem; + + std::string_view str_val(str_begin, lex->c_str() - str_begin); + lex->add_token(tok_int_const, str_val); + return true; } - int len = 0, pc = -0x100; - while (end < src.get_end_ptr()) { - c = *end; - bool repeated = (c == pc && is_repeatable(c)); - if (c == ' ' || c == 9 || (len && is_left_active(c) && !repeated)) { - break; +}; + +// Anything starting from # is a compiler directive. +// Technically, #include and #pragma can be mapped as separate chunks, +// but storing such long strings in a trie increases its memory usage. +struct ChunkCompilerDirective final : ChunkLexerBase { + bool parse(Lexer* lex) const override { + const char* str_begin = lex->c_str(); + + lex->skip_chars(1); + while (std::isalnum(lex->char_at())) { + lex->skip_chars(1); } - ++len; - ++end; - if (is_right_active(c) && !repeated) { - break; + + std::string_view str_val(str_begin, lex->c_str() - str_begin); + if (str_val == "#include") { + lex->add_token(tok_include, str_val); + return true; } - pc = c; + if (str_val == "#pragma") { + lex->add_token(tok_pragma, str_val); + return true; + } + + lex->error("unknown compiler directive"); } - lexem.set(std::string{src.get_ptr(), end}, src.here()); - src.set_ptr(end); - // std::cerr << lexem.name_str() << ' ' << lexem.str << std::endl; - return lexem; +}; + +// Tokens like !=, &, etc. emit just a simple TokenType. +// Since they are stored in trie, "parsing" them is just skipping len chars. 
+struct ChunkSimpleToken final : ChunkLexerBase { + TokenType tp; + int len; + + ChunkSimpleToken(TokenType tp, int len) : tp(tp), len(len) {} + + bool parse(Lexer* lex) const override { + std::string_view str_val(lex->c_str(), len); + lex->add_token(tp, str_val); + lex->skip_chars(len); + return true; + } +}; + +// Spaces and other space-like symbols are just skipped. +struct ChunkSkipWhitespace final : ChunkLexerBase { + bool parse(Lexer* lex) const override { + lex->skip_chars(1); + lex->skip_spaces(); + return true; + } +}; + +// Here we handle corner cases of grammar that are requested on demand. +// E.g., for 'pragma version >0.5.0', '0.5.0' should be parsed specially to emit tok_semver. +// See TolkLanguageGrammar::parse_next_chunk_special(). +struct ChunkSpecialParsing { + static bool parse_pragma_name(Lexer* lex) { + const char* str_begin = lex->c_str(); + while (std::isalnum(lex->char_at()) || lex->char_at() == '-') { + lex->skip_chars(1); + } + + std::string_view str_val(str_begin, lex->c_str() - str_begin); + if (str_val.empty()) { + return false; + } + lex->add_token(tok_pragma_name, str_val); + return true; + } + + static bool parse_semver(Lexer* lex) { + const char* str_begin = lex->c_str(); + while (std::isdigit(lex->char_at()) || lex->char_at() == '.') { + lex->skip_chars(1); + } + + std::string_view str_val(str_begin, lex->c_str() - str_begin); + if (str_val.empty()) { + return false; + } + lex->add_token(tok_semver, str_val); + return true; + } +}; + +// Anything starting from a valid identifier beginning symbol is parsed as an identifier. +// But if a resulting string is a keyword, a corresponding token is emitted instead of tok_identifier. +struct ChunkIdentifierOrKeyword final : ChunkLexerBase { + // having parsed str up to the valid end, look up whether it's a valid keyword + // in the future, this could be a bit more effective than just comparing strings (e.g. 
gperf), + // but nevertheless, performance of the naive code below is reasonably good + static TokenType maybe_keyword(std::string_view str) { + switch (str.size()) { + case 1: + if (str == "~") return tok_bitwise_not; // todo attention + if (str == "_") return tok_underscore; // todo attention + break; + case 2: + if (str == "do") return tok_do; + if (str == "if") return tok_if; + break; + case 3: + if (str == "int") return tok_int; + if (str == "var") return tok_var; + if (str == "asm") return tok_asm; + if (str == "get") return tok_get; + if (str == "try") return tok_try; + break; + case 4: + if (str == "else") return tok_else; + if (str == "pure") return tok_pure; + if (str == "then") return tok_then; + if (str == "cell") return tok_cell; + if (str == "cont") return tok_cont; + if (str == "type") return tok_type; // todo unused token? + break; + case 5: + if (str == "slice") return tok_slice; + if (str == "tuple") return tok_tuple; + if (str == "const") return tok_const; + if (str == "while") return tok_while; + if (str == "until") return tok_until; + if (str == "catch") return tok_catch; + if (str == "ifnot") return tok_ifnot; + break; + case 6: + if (str == "return") return tok_return; + if (str == "repeat") return tok_repeat; + if (str == "elseif") return tok_elseif; + if (str == "forall") return tok_forall; + if (str == "extern") return tok_extern; + if (str == "global") return tok_global; + if (str == "impure") return tok_impure; + if (str == "inline") return tok_inline; + break; + case 7: + if (str == "builder") return tok_builder; + if (str == "builtin") return tok_builtin; + break; + case 8: + if (str == "operator") return tok_operator; + break; + case 9: + if (str == "elseifnot") return tok_elseifnot; + if (str == "method_id") return tok_method_id; + break; + case 10: + if (str == "inline_ref") return tok_inlineref; + if (str == "auto_apply") return tok_autoapply; + break; + default: + break; + } + return tok_empty; + } + + bool parse(Lexer* lex) const 
override { + const char* sym_begin = lex->c_str(); + lex->skip_chars(1); + while (!lex->is_eof()) { + char c = lex->char_at(); + // the pattern of valid identifier first symbol is provided in trie, here we test for identifier middle + bool allowed_in_identifier = std::isalnum(c) || c == '_' || c == '$' || c == ':' || c == '?' || c == '!' || c == '\''; + if (!allowed_in_identifier) { + break; + } + lex->skip_chars(1); + } + + std::string_view str_val(sym_begin, lex->c_str() - sym_begin); + if (TokenType kw_tok = maybe_keyword(str_val)) { + lex->add_token(kw_tok, str_val); + } else { + symbols.lookup_add(static_cast(str_val)); + lex->add_token(tok_identifier, str_val); + } + return true; + } +}; + +// Like in Kotlin, `backticks` can be used to wrap identifiers (both in declarations/usage, both for vars/functions). +// E.g.: function `do`() { var `with spaces` = 1; } +// This could be useful to use reserved names as identifiers (in a probable codegen from TL, for example). +struct ChunkIdentifierInBackticks final : ChunkLexerBase { + bool parse(Lexer* lex) const override { + const char* str_begin = lex->c_str(); + lex->skip_chars(1); + while (!lex->is_eof() && lex->char_at() != '`' && lex->char_at() != '\n') { + if (std::isspace(lex->char_at())) { // probably, I'll remove this restriction after rewriting symtable and cur_sym_idx + lex->error("An identifier can't have a space in its name (even inside backticks)"); + } + lex->skip_chars(1); + } + if (lex->char_at() != '`') { + lex->error("Unclosed backtick `"); + } + + std::string_view str_val(str_begin + 1, lex->c_str() - str_begin - 1); + lex->skip_chars(1); + symbols.lookup_add(static_cast(str_val)); + lex->add_token(tok_identifier, str_val); + return true; + } +}; + +// +// ---------------------------------------------------------------------- +// Here we define a grammar of Tolk. +// All valid chunks prefixes are stored in trie. 
+// + +struct TolkLanguageGrammar { + static LexingTrie trie; + + static bool parse_next_chunk(Lexer* lex) { + const ChunkLexerBase* best = trie.get_deepest(lex->c_str()); + return best && best->parse(lex); + } + + static bool parse_next_chunk_special(Lexer* lex, TokenType parse_next_as) { + switch (parse_next_as) { + case tok_pragma_name: + return ChunkSpecialParsing::parse_pragma_name(lex); + case tok_semver: + return ChunkSpecialParsing::parse_semver(lex); + default: + assert(false); + return false; + } + } + + static void register_token(const char* str, int len, TokenType tp) { + trie.add_prefix(str, new ChunkSimpleToken(tp, len)); + } + + static void init() { + trie.add_prefix("//", singleton()); + trie.add_prefix(";;", singleton()); + trie.add_prefix("/*", singleton()); + trie.add_prefix("{-", singleton()); + trie.add_prefix(R"(")", singleton()); + trie.add_prefix(R"(""")", singleton()); + trie.add_prefix(" ", singleton()); + trie.add_prefix("\t", singleton()); + trie.add_prefix("\r", singleton()); + trie.add_prefix("\n", singleton()); + trie.add_prefix("#", singleton()); + + trie.add_pattern("[0-9]", singleton()); + // todo think of . 
~ + trie.add_pattern("[a-zA-Z_$.~]", singleton()); + trie.add_prefix("`", singleton()); + + register_token("+", 1, tok_plus); + register_token("-", 1, tok_minus); + register_token("*", 1, tok_mul); + register_token("/", 1, tok_div); + register_token("%", 1, tok_mod); + register_token("?", 1, tok_question); + register_token(":", 1, tok_colon); + register_token(",", 1, tok_comma); + register_token(";", 1, tok_semicolon); + register_token("(", 1, tok_oppar); + register_token(")", 1, tok_clpar); + register_token("[", 1, tok_opbracket); + register_token("]", 1, tok_clbracket); + register_token("{", 1, tok_opbrace); + register_token("}", 1, tok_clbrace); + register_token("=", 1, tok_assign); + register_token("<", 1, tok_lt); + register_token(">", 1, tok_gt); + register_token("&", 1, tok_bitwise_and); + register_token("|", 1, tok_bitwise_or); + register_token("^", 1, tok_bitwise_xor); + register_token("==", 2, tok_eq); + register_token("!=", 2, tok_neq); + register_token("<=", 2, tok_leq); + register_token(">=", 2, tok_geq); + register_token("<<", 2, tok_lshift); + register_token(">>", 2, tok_rshift); + register_token("~/", 2, tok_divR); + register_token("^/", 2, tok_divC); + register_token("~%", 2, tok_modR); + register_token("^%", 2, tok_modC); + register_token("/%", 2, tok_divmod); + register_token("+=", 2, tok_set_plus); + register_token("-=", 2, tok_set_minus); + register_token("*=", 2, tok_set_mul); + register_token("/=", 2, tok_set_div); + register_token("%=", 2, tok_set_mod); + register_token("&=", 2, tok_set_bitwise_and); + register_token("|=", 2, tok_set_bitwise_or); + register_token("^=", 2, tok_set_bitwise_xor); + register_token("->", 2, tok_mapsto); + register_token("<=>", 3, tok_spaceship); + register_token("~>>", 3, tok_rshiftR); + register_token("^>>", 3, tok_rshiftC); + register_token("~/=", 3, tok_set_divR); + register_token("^/=", 3, tok_set_divC); + register_token("~%=", 3, tok_set_modR); + register_token("^%=", 3, tok_set_modC); + 
register_token("<<=", 3, tok_set_lshift); + register_token(">>=", 3, tok_set_rshift); + register_token("~>>=", 4, tok_set_rshiftR); + register_token("^>>=", 4, tok_set_rshiftC); + } +}; + +LexingTrie TolkLanguageGrammar::trie; + +// +// ---------------------------------------------------------------------- +// The Lexer class is to be used outside (by parser, which constructs AST from tokens). +// It's streaming. It means, that `next()` parses a next token on demand +// (instead of parsing all file contents to vector and iterating over it). +// Parsing on demand uses effectively less memory. +// Note, that chunks, being parsed, call `add_token()`, and a chunk may add multiple tokens at once. +// That's why a small circular buffer for tokens is used. +// `last_token_idx` actually means a number of total tokens added. +// `cur_token_idx` is a number of tokens returned by `next()`. +// It's assumed that an input file has already been loaded, its contents are present and won't be deleted +// (`p_start`, `p_next` and `p_end`, as well as every Token str_val, point inside file->text). 
+// + +Lexer::Lexer(const SrcFile* file) + : file(file) + , p_start(file->text.data()) + , p_end(p_start + file->text.size()) + , p_next(p_start) + , location(file) { + next(); } -const Lexem& Lexer::peek() { - if (peek_lexem.valid()) { - return peek_lexem; +void Lexer::next() { + while (cur_token_idx == last_token_idx && !is_eof()) { + update_location(); + if (!TolkLanguageGrammar::parse_next_chunk(this)) { + error("Failed to parse"); + } } - if (eof) { - return lexem.clear(src.here(), Lexem::Eof); + if (is_eof()) { + add_token(tok_eof, file->text); + } + cur_token = tokens_circularbuf[++cur_token_idx & 7]; +} + +void Lexer::next_special(TokenType parse_next_as, const char* str_expected) { + assert(cur_token_idx == last_token_idx); + skip_spaces(); + update_location(); + if (!TolkLanguageGrammar::parse_next_chunk_special(this, parse_next_as)) { + error(std::string(str_expected) + " expected"); + } + cur_token = tokens_circularbuf[++cur_token_idx & 7]; +} + +int Lexer::cur_sym_idx() const { + assert(tok() == tok_identifier); + return symbols.lookup_add(cur_str_std_string()); +} + +void Lexer::error(const std::string& err_msg) const { + throw ParseError(cur_location(), err_msg); +} + +void Lexer::error_at(const std::string& prefix, const std::string& suffix) const { + throw ParseError(cur_location(), prefix + cur_str_std_string() + suffix); +} + +void Lexer::on_expect_call_failed(const char* str_expected) const { + throw ParseError(cur_location(), std::string(str_expected) + " expected instead of `" + cur_str_std_string() + "`"); +} + +void lexer_init() { + TolkLanguageGrammar::init(); +} + +// todo #ifdef TOLK_PROFILING +// As told above, `next()` produces tokens on demand, while AST is being generated. +// Hence, it's difficult to measure Lexer performance separately. +// This function can be called just to tick Lexer performance, it just scans all input files. +// There is no sense to use it in production, but when refactoring and optimizing Lexer, it's useful. 
+void lexer_measure_performance(const std::vector& files_to_just_parse) { + for (const SrcFile* file : files_to_just_parse) { + Lexer lex(file); + while (!lex.is_eof()) { + lex.next(); + } } - Lexem keep = std::move(lexem); - next(); - peek_lexem = std::move(lexem); - lexem = std::move(keep); - eof = false; - return peek_lexem; } } // namespace tolk diff --git a/tolk/lexer.h b/tolk/lexer.h index 816f7a82..e0fa7606 100644 --- a/tolk/lexer.h +++ b/tolk/lexer.h @@ -15,104 +15,225 @@ along with TON Blockchain Library. If not, see . */ #pragma once -#include "srcread.h" -#include -#include -#include + +#include "platform-utils.h" +#include "src-file.h" +#include namespace tolk { -/* - * - * LEXER - * - */ +enum TokenType { + tok_empty, -struct Lexem { - enum { Undefined = -2, Eof = -1, Unknown = 0, Ident = 0, Number = 1, Special = 2, String = 3 }; - int tp; - int val; - std::string str; - SrcLocation loc; - int classify(); - Lexem(std::string _str = "", const SrcLocation& _loc = {}, int _tp = Unknown, int _val = 0) - : tp(_tp), val(_val), str(_str), loc(_loc) { - classify(); - } - int set(std::string _str = "", const SrcLocation& _loc = {}, int _tp = Unknown, int _val = 0); - Lexem& clear(const SrcLocation& _loc = {}, int _tp = Unknown, int _val = 0) { - tp = _tp; - val = _val; - loc = _loc; - str = ""; - return *this; - } - bool valid() const { - return tp != Undefined; - } - std::string name_str() const; - void error(std::string _str) const { - throw ParseError{loc, _str}; - } - void error_at(std::string str1, std::string str2) const { - error(str1 + str + str2); - } + tok_int_const, + tok_string_const, + tok_string_modifier, - static std::string lexem_name_str(int idx); + tok_identifier, + + tok_plus, + tok_minus, + tok_mul, + tok_div, + tok_mod, + tok_question, + tok_colon, + tok_comma, + tok_semicolon, + tok_oppar, + tok_clpar, + tok_opbracket, + tok_clbracket, + tok_opbrace, + tok_clbrace, + tok_assign, + tok_underscore, + tok_lt, + tok_gt, + tok_bitwise_and, + 
tok_bitwise_or, + tok_bitwise_xor, + tok_bitwise_not, + tok_dot, + + tok_eq, + tok_neq, + tok_leq, + tok_geq, + tok_spaceship, + tok_lshift, + tok_rshift, + tok_rshiftR, + tok_rshiftC, + tok_divR, + tok_divC, + tok_modR, + tok_modC, + tok_divmod, + tok_set_plus, + tok_set_minus, + tok_set_mul, + tok_set_div, + tok_set_divR, + tok_set_divC, + tok_set_mod, + tok_set_modR, + tok_set_modC, + tok_set_lshift, + tok_set_rshift, + tok_set_rshiftR, + tok_set_rshiftC, + tok_set_bitwise_and, + tok_set_bitwise_or, + tok_set_bitwise_xor, + + tok_return, + tok_var, + tok_repeat, + tok_do, + tok_while, + tok_until, + tok_try, + tok_catch, + tok_if, + tok_ifnot, + tok_then, + tok_else, + tok_elseif, + tok_elseifnot, + + tok_int, + tok_cell, + tok_slice, + tok_builder, + tok_cont, + tok_tuple, + tok_type, + tok_mapsto, + tok_forall, + + tok_extern, + tok_global, + tok_asm, + tok_impure, + tok_pure, + tok_inline, + tok_inlineref, + tok_builtin, + tok_autoapply, + tok_method_id, + tok_get, + tok_operator, + tok_infix, + tok_infixl, + tok_infixr, + tok_const, + + tok_pragma, + tok_pragma_name, + tok_semver, + tok_include, + + tok_eof }; +// All tolk language is parsed into tokens. +// Lexer::next() returns a Token. +struct Token { + TokenType type = tok_empty; + std::string_view str_val; + + Token() = default; + Token(TokenType type, std::string_view str_val): type(type), str_val(str_val) {} +}; + +// Lexer::next() is a method to be used externally (while parsing tolk file to AST). +// It's streaming: `next()` parses a token on demand. +// For comments, see lexer.cpp, a comment above Lexer constructor. 
class Lexer { - SourceReader& src; - bool eof; - Lexem lexem, peek_lexem; - unsigned char char_class[128]; - std::array eol_cmt, cmt_op, cmt_cl; // for ;; {- -} - std::array eol_cmt2, cmt_op2, cmt_cl2; // for // /* */ - std::string multiline_quote; - enum cc { left_active = 2, right_active = 1, active = 3, allow_repeat = 4, quote_char = 8 }; + Token tokens_circularbuf[8]{}; + int last_token_idx = -1; + int cur_token_idx = -1; + Token cur_token; // = tokens_circularbuf[cur_token_idx & 7] - public: - bool eof_found() const { - return eof; - } - explicit Lexer(SourceReader& _src, std::string active_chars = ";,() ~.", - std::string quote_chars = "\"", std::string multiline_quote = "\"\"\""); + const SrcFile* file; + const char *p_start, *p_end, *p_next; + SrcLocation location; - void set_comment_tokens(const std::string &eol_cmts, const std::string &open_cmts, const std::string &close_cmts); - void set_comment2_tokens(const std::string &eol_cmts2, const std::string &open_cmts2, const std::string &close_cmts2); - void start_parsing(); - - const Lexem& next(); - const Lexem& cur() const { - return lexem; - } - const Lexem& peek(); - int tp() const { - return lexem.tp; - } - void expect(int exp_tp, const char* msg = 0); - int classify_char(unsigned c) const { - return c < 0x80 ? 
char_class[c] : 0; - } - bool is_active(int c) const { - return (classify_char(c) & cc::active) == cc::active; - } - bool is_left_active(int c) const { - return (classify_char(c) & cc::left_active); - } - bool is_right_active(int c) const { - return (classify_char(c) & cc::right_active); - } - bool is_repeatable(int c) const { - return (classify_char(c) & cc::allow_repeat); - } - bool is_quote_char(int c) const { - return (classify_char(c) & cc::quote_char); + void update_location() { + location.char_offset = static_cast(p_next - p_start); } - private: - void set_spec(std::array& arr, std::string setup); - bool is_multiline_quote(const char* begin, const char* end); + GNU_ATTRIBUTE_NORETURN GNU_ATTRIBUTE_COLD + void on_expect_call_failed(const char* str_expected) const; + +public: + + explicit Lexer(const SrcFile* file); + Lexer(const Lexer&) = delete; + Lexer &operator=(const Lexer&) = delete; + + void add_token(TokenType type, std::string_view str) { + tokens_circularbuf[++last_token_idx & 7] = Token(type, str); + } + + void skip_spaces() { + while (std::isspace(*p_next)) { + ++p_next; + } + } + + void skip_line() { + while (p_next < p_end && *p_next != '\n' && *p_next != '\r') { + ++p_next; + } + while (*p_next == '\n' || *p_next == '\r') { + ++p_next; + } + } + + void skip_chars(int n) { + p_next += n; + } + + bool is_eof() const { + return p_next >= p_end; + } + + char char_at() const { return *p_next; } + char char_at(int shift) const { return *(p_next + shift); } + const char* c_str() const { return p_next; } + + TokenType tok() const { return cur_token.type; } + std::string_view cur_str() const { return cur_token.str_val; } + std::string cur_str_std_string() const { return static_cast(cur_token.str_val); } + SrcLocation cur_location() const { return location; } + int cur_sym_idx() const; + + void next(); + void next_special(TokenType parse_next_as, const char* str_expected); + + void check(TokenType next_tok, const char* str_expected) const { + if 
(cur_token.type != next_tok) { + on_expect_call_failed(str_expected); // unlikely path, not inlined + } + } + void expect(TokenType next_tok, const char* str_expected) { + if (cur_token.type != next_tok) { + on_expect_call_failed(str_expected); + } + next(); + } + + GNU_ATTRIBUTE_NORETURN GNU_ATTRIBUTE_COLD + void error(const std::string& err_msg) const; + GNU_ATTRIBUTE_NORETURN GNU_ATTRIBUTE_COLD + void error_at(const std::string& prefix, const std::string& suffix) const; }; +void lexer_init(); + +// todo #ifdef TOLK_PROFILING +void lexer_measure_performance(const std::vector& files_to_just_parse); + } // namespace tolk diff --git a/tolk/optimize.cpp b/tolk/optimize.cpp index 64087032..cf7f460f 100644 --- a/tolk/optimize.cpp +++ b/tolk/optimize.cpp @@ -612,7 +612,7 @@ bool Optimizer::optimize() { } AsmOpConsList optimize_code_head(AsmOpConsList op_list, int mode) { - Optimizer opt(std::move(op_list), op_rewrite_comments, mode); + Optimizer opt(std::move(op_list), false, mode); opt.optimize(); return opt.extract_code(); } diff --git a/tolk/parse-tolk.cpp b/tolk/parse-tolk.cpp index 3cff0bb5..c28501d4 100644 --- a/tolk/parse-tolk.cpp +++ b/tolk/parse-tolk.cpp @@ -15,6 +15,7 @@ along with TON Blockchain Library. If not, see . 
*/ #include "tolk.h" +#include "platform-utils.h" #include "td/utils/crypto.h" #include "common/refint.h" #include "openssl/digest.hpp" @@ -24,28 +25,16 @@ namespace tolk { using namespace std::literals::string_literals; -int compute_symbol_subclass(std::string str) { - if (str.size() < 2) { - return IdSc::undef; - } else if (str[0] == '.') { - return IdSc::dotid; - } else if (str[0] == '~') { - return IdSc::tildeid; - } else { - return IdSc::undef; - } -} - inline bool is_dot_ident(sym_idx_t idx) { - return symbols.get_subclass(idx) == IdSc::dotid; + return symbols.get_subclass(idx) == SymbolSubclass::dot_identifier; } inline bool is_tilde_ident(sym_idx_t idx) { - return symbols.get_subclass(idx) == IdSc::tildeid; + return symbols.get_subclass(idx) == SymbolSubclass::tilde_identifier; } inline bool is_special_ident(sym_idx_t idx) { - return symbols.get_subclass(idx) != IdSc::undef; + return symbols.get_subclass(idx) != SymbolSubclass::undef; } // given Expr::_Apply (a function call / a variable call), determine whether it's <, or >, or similar @@ -97,7 +86,8 @@ static inline std::string get_builtin_operator_name(sym_idx_t sym_builtin) { // fire an error for a case "flags & 0xFF != 0" (equivalent to "flags & 1", probably unexpected) // it would better be a warning, but we decided to make it a strict error -[[gnu::cold]] static void fire_error_lower_precedence(const SrcLocation& loc, sym_idx_t op_lower, sym_idx_t op_higher) { +GNU_ATTRIBUTE_NORETURN GNU_ATTRIBUTE_COLD +static void fire_error_lower_precedence(SrcLocation loc, sym_idx_t op_lower, sym_idx_t op_higher) { std::string name_lower = get_builtin_operator_name(op_lower); std::string name_higher = get_builtin_operator_name(op_higher); throw ParseError(loc, name_lower + " has lower precedence than " + name_higher + @@ -106,7 +96,8 @@ static inline std::string get_builtin_operator_name(sym_idx_t sym_builtin) { } // fire an error for a case "arg1 & arg2 | arg3" -[[gnu::cold]] static void 
fire_error_mix_bitwise_and_or(const SrcLocation& loc, sym_idx_t op1, sym_idx_t op2) { +GNU_ATTRIBUTE_NORETURN GNU_ATTRIBUTE_COLD +static void fire_error_mix_bitwise_and_or(SrcLocation loc, sym_idx_t op1, sym_idx_t op2) { std::string name1 = get_builtin_operator_name(op1); std::string name2 = get_builtin_operator_name(op2); throw ParseError(loc, "mixing " + name1 + " with " + name2 + " without parenthesis" @@ -117,7 +108,7 @@ static inline std::string get_builtin_operator_name(sym_idx_t sym_builtin) { // diagnose when bitwise operators are used in a probably wrong way due to tricky precedence // example: "flags & 0xFF != 0" is equivalent to "flags & 1", most likely it's unexpected // the only way to suppress this error for the programmer is to use parenthesis -static void diagnose_bitwise_precedence(const SrcLocation& loc, sym_idx_t bitwise_sym, const Expr* lhs, const Expr* rhs) { +static void diagnose_bitwise_precedence(SrcLocation loc, sym_idx_t bitwise_sym, const Expr* lhs, const Expr* rhs) { // handle "0 != flags & 0xFF" (lhs = "0 != flags") if (!lhs->is_inside_parenthesis() && lhs->cls == Expr::_Apply && lhs->e_type->is_int() && // fast false if 100% not @@ -143,7 +134,7 @@ static void diagnose_bitwise_precedence(const SrcLocation& loc, sym_idx_t bitwis } // diagnose "a << 8 + 1" (equivalent to "a << 9", probably unexpected) -static void diagnose_addition_in_bitshift(const SrcLocation& loc, sym_idx_t bitshift_sym, const Expr* rhs) { +static void diagnose_addition_in_bitshift(SrcLocation loc, sym_idx_t bitshift_sym, const Expr* rhs) { if (!rhs->is_inside_parenthesis() && rhs->cls == Expr::_Apply && rhs->e_type->is_int() && is_add_or_sub_binary_op(rhs)) { @@ -152,9 +143,9 @@ static void diagnose_addition_in_bitshift(const SrcLocation& loc, sym_idx_t bits } /* - * + * * PARSE SOURCE - * + * */ // TE ::= TA | TA -> TE @@ -162,68 +153,70 @@ static void diagnose_addition_in_bitshift(const SrcLocation& loc, sym_idx_t bits TypeExpr* parse_type(Lexer& lex); TypeExpr* 
parse_type1(Lexer& lex) { - switch (lex.tp()) { - case _Int: + switch (lex.tok()) { + case tok_int: lex.next(); - return TypeExpr::new_atomic(_Int); - case _Cell: + return TypeExpr::new_atomic(TypeExpr::_Int); + case tok_cell: lex.next(); - return TypeExpr::new_atomic(_Cell); - case _Slice: + return TypeExpr::new_atomic(TypeExpr::_Cell); + case tok_slice: lex.next(); - return TypeExpr::new_atomic(_Slice); - case _Builder: + return TypeExpr::new_atomic(TypeExpr::_Slice); + case tok_builder: lex.next(); - return TypeExpr::new_atomic(_Builder); - case _Cont: + return TypeExpr::new_atomic(TypeExpr::_Builder); + case tok_cont: lex.next(); - return TypeExpr::new_atomic(_Cont); - case _Tuple: + return TypeExpr::new_atomic(TypeExpr::_Cont); + case tok_tuple: lex.next(); - return TypeExpr::new_atomic(_Tuple); - case _Var: - case '_': + return TypeExpr::new_atomic(TypeExpr::_Tuple); + case tok_var: + case tok_underscore: lex.next(); return TypeExpr::new_hole(); - case _Ident: { - auto sym = lookup_symbol(lex.cur().val); + case tok_identifier: { + auto sym = lookup_symbol(lex.cur_sym_idx()); if (sym && dynamic_cast(sym->value)) { auto val = dynamic_cast(sym->value); lex.next(); return val->get_type(); } - lex.cur().error_at("`", "` is not a type identifier"); + lex.error_at("`", "` is not a type identifier"); } + default: + break; } - int c; - if (lex.tp() == '[') { + TokenType c; + if (lex.tok() == tok_opbracket) { lex.next(); - c = ']'; + c = tok_clbracket; } else { - lex.expect('('); - c = ')'; + lex.expect(tok_oppar, ""); + c = tok_clpar; } - if (lex.tp() == c) { + if (lex.tok() == c) { lex.next(); - return c == ')' ? TypeExpr::new_unit() : TypeExpr::new_tuple({}); + return c == tok_clpar ? TypeExpr::new_unit() : TypeExpr::new_tuple({}); } auto t1 = parse_type(lex); - if (lex.tp() == ')') { - lex.expect(c); + if (lex.tok() == tok_clpar) { + lex.expect(c, c == tok_clpar ? 
"')'" : "']'"); return t1; } std::vector tlist{1, t1}; - while (lex.tp() == ',') { + while (lex.tok() == tok_comma) { lex.next(); tlist.push_back(parse_type(lex)); } - lex.expect(c); - return c == ')' ? TypeExpr::new_tensor(std::move(tlist)) : TypeExpr::new_tuple(std::move(tlist)); + lex.expect(c, c == tok_clpar ? "')'" : "']'"); + return c == tok_clpar ? TypeExpr::new_tensor(std::move(tlist)) : TypeExpr::new_tuple(std::move(tlist)); } TypeExpr* parse_type(Lexer& lex) { auto res = parse_type1(lex); - if (lex.tp() == _Mapsto) { + if (lex.tok() == tok_mapsto) { lex.next(); auto to = parse_type(lex); return TypeExpr::new_map(res, to); @@ -234,18 +227,18 @@ TypeExpr* parse_type(Lexer& lex) { FormalArg parse_formal_arg(Lexer& lex, int fa_idx) { TypeExpr* arg_type = 0; - SrcLocation loc = lex.cur().loc; - if (lex.tp() == '_') { + SrcLocation loc = lex.cur_location(); + if (lex.tok() == tok_underscore) { lex.next(); - if (lex.tp() == ',' || lex.tp() == ')') { + if (lex.tok() == tok_comma || lex.tok() == tok_clpar) { return std::make_tuple(TypeExpr::new_hole(), (SymDef*)nullptr, loc); } arg_type = TypeExpr::new_hole(); - loc = lex.cur().loc; - } else if (lex.tp() != _Ident) { + loc = lex.cur_location(); + } else if (lex.tok() != tok_identifier) { arg_type = parse_type(lex); } else { - auto sym = lookup_symbol(lex.cur().val); + auto sym = lookup_symbol(lex.cur_sym_idx()); if (sym && dynamic_cast(sym->value)) { auto val = dynamic_cast(sym->value); lex.next(); @@ -254,44 +247,42 @@ FormalArg parse_formal_arg(Lexer& lex, int fa_idx) { arg_type = TypeExpr::new_hole(); } } - if (lex.tp() == '_' || lex.tp() == ',' || lex.tp() == ')') { - if (lex.tp() == '_') { - loc = lex.cur().loc; + if (lex.tok() == tok_underscore || lex.tok() == tok_comma || lex.tok() == tok_clpar) { + if (lex.tok() == tok_underscore) { + loc = lex.cur_location(); lex.next(); } return std::make_tuple(arg_type, (SymDef*)nullptr, loc); } - if (lex.tp() != _Ident) { - lex.expect(_Ident, "formal parameter name"); 
- } - loc = lex.cur().loc; - if (prohibited_var_names.count(symbols.get_name(lex.cur().val))) { + lex.check(tok_identifier, "formal parameter name"); + loc = lex.cur_location(); + if (prohibited_var_names.count(symbols.get_name(lex.cur_sym_idx()))) { throw ParseError{ - loc, PSTRING() << "symbol `" << symbols.get_name(lex.cur().val) << "` cannot be redefined as a variable"}; + loc, PSTRING() << "symbol `" << symbols.get_name(lex.cur_sym_idx()) << "` cannot be redefined as a variable"}; } - SymDef* new_sym_def = define_symbol(lex.cur().val, true, loc); + SymDef* new_sym_def = define_symbol(lex.cur_sym_idx(), true, loc); if (!new_sym_def) { - lex.cur().error_at("cannot define symbol `", "`"); + lex.error_at("cannot define symbol `", "`"); } if (new_sym_def->value) { - lex.cur().error_at("redefined formal parameter `", "`"); + lex.error_at("redefined formal parameter `", "`"); } - new_sym_def->value = new SymVal{SymVal::_Param, fa_idx, arg_type}; + new_sym_def->value = new SymVal{SymValKind::_Param, fa_idx, arg_type}; lex.next(); return std::make_tuple(arg_type, new_sym_def, loc); } void parse_global_var_decl(Lexer& lex) { TypeExpr* var_type = 0; - SrcLocation loc = lex.cur().loc; - if (lex.tp() == '_') { + SrcLocation loc = lex.cur_location(); + if (lex.tok() == tok_underscore) { lex.next(); var_type = TypeExpr::new_hole(); - loc = lex.cur().loc; - } else if (lex.tp() != _Ident) { + loc = lex.cur_location(); + } else if (lex.tok() != tok_identifier) { var_type = parse_type(lex); } else { - auto sym = lookup_symbol(lex.cur().val); + auto sym = lookup_symbol(lex.cur_sym_idx()); if (sym && dynamic_cast(sym->value)) { auto val = dynamic_cast(sym->value); lex.next(); @@ -300,18 +291,16 @@ void parse_global_var_decl(Lexer& lex) { var_type = TypeExpr::new_hole(); } } - if (lex.tp() != _Ident) { - lex.expect(_Ident, "global variable name"); - } - loc = lex.cur().loc; - SymDef* sym_def = define_global_symbol(lex.cur().val, false, loc); + lex.check(tok_identifier, "global 
variable name"); + loc = lex.cur_location(); + SymDef* sym_def = define_global_symbol(lex.cur_sym_idx(), false, loc); if (!sym_def) { - lex.cur().error_at("cannot define global symbol `", "`"); + lex.error_at("cannot define global symbol `", "`"); } if (sym_def->value) { auto val = dynamic_cast(sym_def->value); if (!val) { - lex.cur().error_at("symbol `", "` cannot be redefined as a global variable"); + lex.error_at("symbol `", "` cannot be redefined as a global variable"); } try { unify(var_type, val->sym_type); @@ -319,12 +308,12 @@ void parse_global_var_decl(Lexer& lex) { std::ostringstream os; os << "cannot unify new type " << var_type << " of global variable `" << sym_def->name() << "` with its previous type " << val->sym_type << ": " << ue; - lex.cur().error(os.str()); + lex.error(os.str()); } } else { sym_def->value = new SymValGlobVar{glob_var_cnt++, var_type}; #ifdef TOLK_DEBUG - dynamic_cast(sym_def->value)->name = lex.cur().str; + dynamic_cast(sym_def->value)->name = lex.cur_str(); #endif glob_vars.push_back(sym_def); } @@ -335,39 +324,39 @@ extern int const_cnt; Expr* parse_expr(Lexer& lex, CodeBlob& code, bool nv = false); void parse_const_decl(Lexer& lex) { - SrcLocation loc = lex.cur().loc; + SrcLocation loc = lex.cur_location(); int wanted_type = Expr::_None; - if (lex.tp() == _Int) { + if (lex.tok() == tok_int) { wanted_type = Expr::_Const; lex.next(); - } else if (lex.tp() == _Slice) { + } else if (lex.tok() == tok_slice) { wanted_type = Expr::_SliceConst; lex.next(); } - if (lex.tp() != _Ident) { - lex.expect(_Ident, "constant name"); - } - loc = lex.cur().loc; - SymDef* sym_def = define_global_symbol(lex.cur().val, false, loc); + lex.check(tok_identifier, "constant name"); + loc = lex.cur_location(); + SymDef* sym_def = define_global_symbol(lex.cur_sym_idx(), false, loc); if (!sym_def) { - lex.cur().error_at("cannot define global symbol `", "`"); + lex.error_at("cannot define global symbol `", "`"); + } + if (sym_def->value) { // todo below it 
was a check (for duplicate include?) + lex.error_at("global symbol `", "` already exists"); } - Lexem ident = lex.cur(); lex.next(); - if (lex.tp() != '=') { - lex.cur().error_at("expected = instead of ", ""); + if (lex.tok() != tok_assign) { + lex.error_at("expected = instead of ", ""); } lex.next(); CodeBlob code; // Handles processing and resolution of literals and consts auto x = parse_expr(lex, code, false); // also does lex.next() ! if (!x->is_rvalue()) { - lex.cur().error("expression is not strictly Rvalue"); + lex.error("expression is not strictly Rvalue"); } if ((wanted_type == Expr::_Const) && (x->cls == Expr::_Apply)) wanted_type = Expr::_None; // Apply is additionally checked to result in an integer if ((wanted_type != Expr::_None) && (x->cls != wanted_type)) { - lex.cur().error("expression type does not match wanted type"); + lex.error("expression type does not match wanted type"); } SymValConst* new_value = nullptr; if (x->cls == Expr::_Const) { // Integer constant @@ -392,58 +381,49 @@ void parse_const_decl(Lexer& lex) { AsmOpList out_list(0, &code.vars); code.generate_code(out_list); if (out_list.list_.size() != 1) { - lex.cur().error("precompiled expression must result in single operation"); + lex.error("precompiled expression must result in single operation"); } auto op = out_list.list_[0]; if (!op.is_const()) { - lex.cur().error("precompiled expression must result in compilation time constant"); + lex.error("precompiled expression must result in compilation time constant"); } if (op.origin.is_null() || !op.origin->is_valid()) { - lex.cur().error("precompiled expression did not result in a valid integer constant"); + lex.error("precompiled expression did not result in a valid integer constant"); } new_value = new SymValConst{const_cnt++, op.origin}; } else { - lex.cur().error("integer or slice literal or constant expected"); - } - if (sym_def->value) { - SymValConst* old_value = dynamic_cast(sym_def->value); - Keyword new_type = 
new_value->get_type(); - if (!old_value || old_value->get_type() != new_type || - (new_type == _Int && *old_value->get_int_value() != *new_value->get_int_value()) || - (new_type == _Slice && old_value->get_str_value() != new_value->get_str_value())) { - ident.error_at("global symbol `", "` already exists"); - } + lex.error("integer or slice literal or constant expected"); } sym_def->value = new_value; } FormalArgList parse_formal_args(Lexer& lex) { FormalArgList args; - lex.expect('(', "formal argument list"); - if (lex.tp() == ')') { + lex.expect(tok_oppar, "formal argument list"); + if (lex.tok() == tok_clpar) { lex.next(); return args; } int fa_idx = 0; args.push_back(parse_formal_arg(lex, fa_idx++)); - while (lex.tp() == ',') { + while (lex.tok() == tok_comma) { lex.next(); args.push_back(parse_formal_arg(lex, fa_idx++)); } - lex.expect(')'); + lex.expect(tok_clpar, "')'"); return args; } void parse_const_decls(Lexer& lex) { - lex.expect(_Const); + lex.expect(tok_const, "'const'"); while (true) { parse_const_decl(lex); - if (lex.tp() != ',') { + if (lex.tok() != tok_comma) { break; } - lex.expect(','); + lex.expect(tok_comma, "','"); } - lex.expect(';'); + lex.expect(tok_semicolon, "';'"); } TypeExpr* extract_total_arg_type(const FormalArgList& arg_list) { @@ -461,15 +441,15 @@ TypeExpr* extract_total_arg_type(const FormalArgList& arg_list) { } void parse_global_var_decls(Lexer& lex) { - lex.expect(_Global); + lex.expect(tok_global, "'global'"); while (true) { parse_global_var_decl(lex); - if (lex.tp() != ',') { + if (lex.tok() != tok_comma) { break; } - lex.expect(','); + lex.expect(tok_comma, "','"); } - lex.expect(';'); + lex.expect(tok_semicolon, "';'"); } SymValCodeFunc* make_new_glob_func(SymDef* func_sym, TypeExpr* func_type, bool marked_as_pure) { @@ -483,18 +463,18 @@ SymValCodeFunc* make_new_glob_func(SymDef* func_sym, TypeExpr* func_type, bool m return res; } -bool check_global_func(const Lexem& cur, sym_idx_t func_name) { +bool 
check_global_func(const Lexer& lex, sym_idx_t func_name) { SymDef* def = lookup_symbol(func_name); if (!def) { - cur.error("undefined symbol `" + symbols.get_name(func_name) + "`"); + lex.error("undefined symbol `" + symbols.get_name(func_name) + "`"); return false; } SymVal* val = dynamic_cast(def->value); if (!val) { - cur.error(std::string{"symbol `"} + symbols.get_name(func_name) + "` has no value and no type"); + lex.error(std::string{"symbol `"} + symbols.get_name(func_name) + "` has no value and no type"); return false; } else if (!val->get_type()) { - cur.error(std::string{"symbol `"} + symbols.get_name(func_name) + "` has no type, possibly not a function"); + lex.error(std::string{"symbol `"} + symbols.get_name(func_name) + "` has no type, possibly not a function"); return false; } else { return true; @@ -519,12 +499,12 @@ Expr* make_func_apply(Expr* fun, Expr* x) { // parse ( E { , E } ) | () | [ E { , E } ] | [] | id | num | _ Expr* parse_expr100(Lexer& lex, CodeBlob& code, bool nv) { - if (lex.tp() == '(' || lex.tp() == '[') { - bool tf = (lex.tp() == '['); - int clbr = (tf ? ']' : ')'); - SrcLocation loc{lex.cur().loc}; + if (lex.tok() == tok_oppar || lex.tok() == tok_opbracket) { + bool tf = (lex.tok() == tok_opbracket); + TokenType clbr = (tf ? tok_clbracket : tok_clpar); + SrcLocation loc{lex.cur_location()}; lex.next(); - if (lex.tp() == clbr) { + if (lex.tok() == clbr) { lex.next(); Expr* res = new Expr{Expr::_Tensor, {}}; res->flags = Expr::_IsRvalue; @@ -539,21 +519,21 @@ Expr* parse_expr100(Lexer& lex, CodeBlob& code, bool nv) { return res; } Expr* res = parse_expr(lex, code, nv); - if (lex.tp() == ')') { + if (lex.tok() == tok_clpar) { + lex.expect(clbr, clbr == tok_clbracket ? 
"']'" : "')'"); res->flags |= Expr::_IsInsideParenthesis; - lex.expect(clbr); return res; } std::vector type_list; type_list.push_back(res->e_type); int f = res->flags; res = new Expr{Expr::_Tensor, {res}}; - while (lex.tp() == ',') { + while (lex.tok() == tok_comma) { lex.next(); auto x = parse_expr(lex, code, nv); res->pb_arg(x); if ((f ^ x->flags) & Expr::_IsType) { - lex.cur().error("mixing type and non-type expressions inside the same tuple"); + lex.error("mixing type and non-type expressions inside the same tuple"); } f &= x->flags; type_list.push_back(x->e_type); @@ -567,53 +547,49 @@ Expr* parse_expr100(Lexer& lex, CodeBlob& code, bool nv) { res->here = loc; res->e_type = TypeExpr::new_tuple(res->args.at(0)->e_type); } - lex.expect(clbr); + lex.expect(clbr, clbr == tok_clbracket ? "']'" : "')'"); return res; } - int t = lex.tp(); - if (t == Lexem::Number) { - Expr* res = new Expr{Expr::_Const, lex.cur().loc}; + TokenType t = lex.tok(); + if (t == tok_int_const) { + Expr* res = new Expr{Expr::_Const, lex.cur_location()}; res->flags = Expr::_IsRvalue; - res->intval = td::string_to_int256(lex.cur().str); + res->intval = td::string_to_int256(lex.cur_str_std_string()); if (res->intval.is_null() || !res->intval->signed_fits_bits(257)) { - lex.cur().error_at("invalid integer constant `", "`"); + lex.error_at("invalid integer constant `", "`"); } - res->e_type = TypeExpr::new_atomic(_Int); + res->e_type = TypeExpr::new_atomic(TypeExpr::_Int); lex.next(); return res; } - if (t == Lexem::String) { - std::string str = lex.cur().str; - int str_type = lex.cur().val; + if (t == tok_string_const) { + std::string str = lex.cur_str_std_string(); + lex.next(); + char modifier = 0; + if (lex.tok() == tok_string_modifier) { + modifier = lex.cur_str()[0]; + lex.next(); + } Expr* res; - switch (str_type) { + switch (modifier) { case 0: case 's': case 'a': - { - res = new Expr{Expr::_SliceConst, lex.cur().loc}; - res->e_type = TypeExpr::new_atomic(_Slice); + res = new 
Expr{Expr::_SliceConst, lex.cur_location()}; + res->e_type = TypeExpr::new_atomic(TypeExpr::_Slice); break; - } case 'u': case 'h': case 'H': case 'c': - { - res = new Expr{Expr::_Const, lex.cur().loc}; - res->e_type = TypeExpr::new_atomic(_Int); + res = new Expr{Expr::_Const, lex.cur_location()}; + res->e_type = TypeExpr::new_atomic(TypeExpr::_Int); break; - } default: - { - res = new Expr{Expr::_Const, lex.cur().loc}; - res->e_type = TypeExpr::new_atomic(_Int); - lex.cur().error("invalid string type `" + std::string(1, static_cast(str_type)) + "`"); - return res; - } + lex.error("invalid string type `" + std::string(1, modifier) + "`"); } res->flags = Expr::_IsRvalue; - switch (str_type) { + switch (modifier) { case 0: { res->strval = td::hex_encode(str); break; @@ -623,7 +599,7 @@ Expr* parse_expr100(Lexer& lex, CodeBlob& code, bool nv) { unsigned char buff[128]; int bits = (int)td::bitstring::parse_bitstring_hex_literal(buff, sizeof(buff), str.data(), str.data() + str.size()); if (bits < 0) { - lex.cur().error_at("Invalid hex bitstring constant `", "`"); + lex.error_at("Invalid hex bitstring constant `", "`"); } break; } @@ -633,64 +609,63 @@ Expr* parse_expr100(Lexer& lex, CodeBlob& code, bool nv) { if (a.parse_addr(str)) { res->strval = block::tlb::MsgAddressInt().pack_std_address(a)->as_bitslice().to_hex(); } else { - lex.cur().error_at("invalid standard address `", "`"); + lex.error_at("invalid standard address `", "`"); } break; } case 'u': { res->intval = td::hex_string_to_int256(td::hex_encode(str)); - if (!str.size()) { - lex.cur().error("empty integer ascii-constant"); + if (str.empty()) { + lex.error("empty integer ascii-constant"); } if (res->intval.is_null()) { - lex.cur().error_at("too long integer ascii-constant `", "`"); + lex.error_at("too long integer ascii-constant `", "`"); } break; } case 'h': - case 'H': - { + case 'H': { unsigned char hash[32]; digest::hash_str(hash, str.data(), str.size()); - res->intval = td::bits_to_refint(hash, 
(str_type == 'h') ? 32 : 256, false); + res->intval = td::bits_to_refint(hash, (modifier == 'h') ? 32 : 256, false); break; } - case 'c': - { + case 'c': { res->intval = td::make_refint(td::crc32(td::Slice{str})); break; } + default: + __builtin_unreachable(); } - lex.next(); return res; } - if (t == '_') { - Expr* res = new Expr{Expr::_Hole, lex.cur().loc}; + if (t == tok_underscore) { + Expr* res = new Expr{Expr::_Hole, lex.cur_location()}; res->val = -1; res->flags = Expr::_IsLvalue; res->e_type = TypeExpr::new_hole(); lex.next(); return res; } - if (t == _Var) { - Expr* res = new Expr{Expr::_Type, lex.cur().loc}; + if (t == tok_var) { + Expr* res = new Expr{Expr::_Type, lex.cur_location()}; res->flags = Expr::_IsType; res->e_type = TypeExpr::new_hole(); lex.next(); return res; } - if (t == _Int || t == _Cell || t == _Slice || t == _Builder || t == _Cont || t == _Type || t == _Tuple) { - Expr* res = new Expr{Expr::_Type, lex.cur().loc}; + if (t == tok_int || t == tok_cell || t == tok_slice || t == tok_builder || t == tok_cont || t == tok_type || t == tok_tuple) { + Expr* res = new Expr{Expr::_Type, lex.cur_location()}; res->flags = Expr::_IsType; res->e_type = TypeExpr::new_atomic(t); lex.next(); return res; } - if (t == _Ident) { - auto sym = lookup_symbol(lex.cur().val); + if (t == tok_identifier) { + auto sym = lookup_symbol(lex.cur_sym_idx()); if (sym && dynamic_cast(sym->value)) { auto val = dynamic_cast(sym->value); - Expr* res = new Expr{Expr::_Type, lex.cur().loc}; + Expr* res = new Expr{Expr::_Type, lex.cur_location()}; res->flags = Expr::_IsType; res->e_type = val->get_type(); lex.next(); @@ -698,7 +673,7 @@ Expr* parse_expr100(Lexer& lex, CodeBlob& code, bool nv) { } if (sym && dynamic_cast(sym->value)) { auto val = dynamic_cast(sym->value); - Expr* res = new Expr{Expr::_GlobVar, lex.cur().loc}; + Expr* res = new Expr{Expr::_GlobVar, lex.cur_location()}; res->e_type = val->get_type(); res->sym = sym; res->flags = Expr::_IsLvalue | Expr::_IsRvalue | 
Expr::_IsImpure; @@ -707,34 +682,35 @@ Expr* parse_expr100(Lexer& lex, CodeBlob& code, bool nv) { } if (sym && dynamic_cast(sym->value)) { auto val = dynamic_cast(sym->value); - Expr* res = new Expr{Expr::_None, lex.cur().loc}; + Expr* res = new Expr{Expr::_None, lex.cur_location()}; res->flags = Expr::_IsRvalue; - if (val->type == _Int) { + if (val->get_kind() == SymValConst::IntConst) { res->cls = Expr::_Const; res->intval = val->get_int_value(); + res->e_type = TypeExpr::new_atomic(tok_int); } - else if (val->type == _Slice) { + else if (val->get_kind() == SymValConst::SliceConst) { res->cls = Expr::_SliceConst; res->strval = val->get_str_value(); + res->e_type = TypeExpr::new_atomic(tok_slice); } else { - lex.cur().error("Invalid symbolic constant type"); + lex.error("Invalid symbolic constant type"); } - res->e_type = TypeExpr::new_atomic(val->type); lex.next(); return res; } bool auto_apply = false; - Expr* res = new Expr{Expr::_Var, lex.cur().loc}; + Expr* res = new Expr{Expr::_Var, lex.cur_location()}; if (nv) { - res->val = ~lex.cur().val; + res->val = ~lex.cur_sym_idx(); res->e_type = TypeExpr::new_hole(); res->flags = Expr::_IsLvalue; // std::cerr << "defined new variable " << lex.cur().str << " : " << res->e_type << std::endl; } else { if (!sym) { - check_global_func(lex.cur(), lex.cur().val); - sym = lookup_symbol(lex.cur().val); + check_global_func(lex, lex.cur_sym_idx()); + sym = lookup_symbol(lex.cur_sym_idx()); } res->sym = sym; SymVal* val = nullptr; @@ -743,14 +719,14 @@ Expr* parse_expr100(Lexer& lex, CodeBlob& code, bool nv) { val = dynamic_cast(sym->value); } if (!val) { - lex.cur().error_at("undefined identifier `", "`"); - } else if (val->type == SymVal::_Func) { + lex.error_at("undefined identifier `", "`"); + } else if (val->kind == SymValKind::_Func) { res->e_type = val->get_type(); res->cls = Expr::_GlobFunc; auto_apply = val->auto_apply; impure = !dynamic_cast(val)->is_marked_as_pure(); } else if (val->idx < 0) { - 
lex.cur().error_at("accessing variable `", "` being defined"); + lex.error_at("accessing variable `", "` being defined"); } else { res->val = val->idx; res->e_type = val->get_type(); @@ -765,41 +741,41 @@ Expr* parse_expr100(Lexer& lex, CodeBlob& code, bool nv) { res = new Expr{Expr::_Apply, sym, {}}; res->flags = Expr::_IsRvalue | impure; } - res->deduce_type(lex.cur()); + res->deduce_type(lex); lex.next(); return res; } - lex.expect(Lexem::Ident); + lex.expect(tok_identifier, "identifier"); return nullptr; } // parse E { E } Expr* parse_expr90(Lexer& lex, CodeBlob& code, bool nv) { Expr* res = parse_expr100(lex, code, nv); - while (lex.tp() == '(' || lex.tp() == '[' || (lex.tp() == _Ident && !is_special_ident(lex.cur().val))) { + while (lex.tok() == tok_oppar || lex.tok() == tok_opbracket || (lex.tok() == tok_identifier && !is_special_ident(lex.cur_sym_idx()))) { if (res->is_type()) { Expr* x = parse_expr100(lex, code, true); - x->chk_lvalue(lex.cur()); // chk_lrvalue() ? + x->chk_lvalue(lex); // chk_lrvalue() ? 
TypeExpr* tp = res->e_type; delete res; res = new Expr{Expr::_TypeApply, {x}}; res->e_type = tp; - res->here = lex.cur().loc; + res->here = lex.cur_location(); try { unify(res->e_type, x->e_type); } catch (UnifyError& ue) { std::ostringstream os; os << "cannot transform expression of type " << x->e_type << " to explicitly requested type " << res->e_type << ": " << ue; - lex.cur().error(os.str()); + lex.error(os.str()); } res->flags = x->flags; } else { Expr* x = parse_expr100(lex, code, false); - x->chk_rvalue(lex.cur()); + x->chk_rvalue(lex); res = make_func_apply(res, x); - res->here = lex.cur().loc; - res->deduce_type(lex.cur()); + res->here = lex.cur_location(); + res->deduce_type(lex); } } return res; @@ -808,19 +784,19 @@ Expr* parse_expr90(Lexer& lex, CodeBlob& code, bool nv) { // parse E { .method E | ~method E } Expr* parse_expr80(Lexer& lex, CodeBlob& code, bool nv) { Expr* res = parse_expr90(lex, code, nv); - while (lex.tp() == _Ident && is_special_ident(lex.cur().val)) { - auto modify = is_tilde_ident(lex.cur().val); + while (lex.tok() == tok_identifier && is_special_ident(lex.cur_sym_idx())) { + auto modify = is_tilde_ident(lex.cur_sym_idx()); auto obj = res; if (modify) { - obj->chk_lvalue(lex.cur()); + obj->chk_lvalue(lex); } else { - obj->chk_rvalue(lex.cur()); + obj->chk_rvalue(lex); } - auto loc = lex.cur().loc; - auto name = lex.cur().val; + SrcLocation loc = lex.cur_location(); + sym_idx_t name = lex.cur_sym_idx(); auto sym = lookup_symbol(name); if (!sym || !dynamic_cast(sym->value)) { - auto name1 = symbols.lookup(lex.cur().str.substr(1)); + auto name1 = symbols.lookup(lex.cur_str().substr(1)); if (name1) { auto sym1 = lookup_symbol(name1); if (sym1 && dynamic_cast(sym1->value)) { @@ -829,18 +805,18 @@ Expr* parse_expr80(Lexer& lex, CodeBlob& code, bool nv) { } } } - check_global_func(lex.cur(), name); + check_global_func(lex, name); if (verbosity >= 2) { - std::cerr << "using symbol `" << symbols.get_name(name) << "` for method call of " << 
lex.cur().str << std::endl; + std::cerr << "using symbol `" << symbols.get_name(name) << "` for method call of " << lex.cur_str() << std::endl; } sym = lookup_symbol(name); SymValFunc* val = sym ? dynamic_cast(sym->value) : nullptr; if (!val) { - lex.cur().error_at("undefined method identifier `", "`"); + lex.error_at("undefined method identifier `", "`"); } lex.next(); auto x = parse_expr100(lex, code, false); - x->chk_rvalue(lex.cur()); + x->chk_rvalue(lex); if (x->cls == Expr::_Tensor) { res = new Expr{Expr::_Apply, name, {obj}}; res->args.insert(res->args.end(), x->args.begin(), x->args.end()); @@ -849,33 +825,54 @@ Expr* parse_expr80(Lexer& lex, CodeBlob& code, bool nv) { } res->here = loc; res->flags = Expr::_IsRvalue | (val->is_marked_as_pure() ? 0 : Expr::_IsImpure); - res->deduce_type(lex.cur()); + res->deduce_type(lex); if (modify) { auto tmp = res; res = new Expr{Expr::_LetFirst, {obj->copy(), tmp}}; res->here = loc; res->flags = tmp->flags; res->set_val(name); - res->deduce_type(lex.cur()); + res->deduce_type(lex); } } return res; } -// parse [ ~ ] E +// parse [ ~ | - | + ] E Expr* parse_expr75(Lexer& lex, CodeBlob& code, bool nv) { - if (lex.tp() == '~') { - sym_idx_t name = symbols.lookup_add("~_"); - check_global_func(lex.cur(), name); - SrcLocation loc{lex.cur().loc}; + if (lex.tok() == tok_bitwise_not || lex.tok() == tok_minus || lex.tok() == tok_plus) { + TokenType t = lex.tok(); + sym_idx_t name = symbols.lookup_add(lex.cur_str_std_string() + "_"); + check_global_func(lex, name); + SrcLocation loc{lex.cur_location()}; lex.next(); - auto x = parse_expr80(lex, code, false); - x->chk_rvalue(lex.cur()); + auto x = parse_expr75(lex, code, false); + x->chk_rvalue(lex); + + // here's an optimization to convert "-1" (tok_minus tok_int_const) to a const -1, not to Expr::Apply(-,1) + // without this, everything still works, but Tolk loses some vars/stack knowledge for now (to be fixed later) + // in FunC, it was: + // `var fst = -1;` // is constantly 1 +
// `var snd = - 1;` // is Expr::Apply(-), a comment "snd=1" is lost in stack layout comments, and so on + // hence, when after grammar modification tok_minus became a true unary operator (not a part of a number), + // and thus to preserve existing behavior until compiler parts are completely rewritten, handle this case here + if (x->cls == Expr::_Const) { + if (t == tok_bitwise_not) { + x->intval = ~x->intval; + } else if (t == tok_minus) { + x->intval = -x->intval; + } + if (!x->intval->signed_fits_bits(257)) { + lex.error("integer overflow"); + } + return x; + } + auto res = new Expr{Expr::_Apply, name, {x}}; res->here = loc; - res->set_val('~'); + res->set_val(t); res->flags = Expr::_IsRvalue; - res->deduce_type(lex.cur()); + res->deduce_type(lex); return res; } else { return parse_expr80(lex, code, nv); @@ -885,58 +882,42 @@ Expr* parse_expr75(Lexer& lex, CodeBlob& code, bool nv) { // parse E { (* | / | % | /% | ^/ | ~/ | ^% | ~% ) E } Expr* parse_expr30(Lexer& lex, CodeBlob& code, bool nv) { Expr* res = parse_expr75(lex, code, nv); - while (lex.tp() == '*' || lex.tp() == '/' || lex.tp() == '%' || lex.tp() == _DivMod || lex.tp() == _DivC || - lex.tp() == _DivR || lex.tp() == _ModC || lex.tp() == _ModR) { - res->chk_rvalue(lex.cur()); - int t = lex.tp(); - sym_idx_t name = symbols.lookup_add(std::string{"_"} + lex.cur().str + "_"); - SrcLocation loc{lex.cur().loc}; - check_global_func(lex.cur(), name); + while (lex.tok() == tok_mul || lex.tok() == tok_div || lex.tok() == tok_mod || lex.tok() == tok_divmod || lex.tok() == tok_divC || + lex.tok() == tok_divR || lex.tok() == tok_modC || lex.tok() == tok_modR) { + res->chk_rvalue(lex); + TokenType t = lex.tok(); + sym_idx_t name = symbols.lookup_add(std::string{"_"} + lex.cur_str_std_string() + "_"); + SrcLocation loc{lex.cur_location()}; + check_global_func(lex, name); lex.next(); auto x = parse_expr75(lex, code, false); - x->chk_rvalue(lex.cur()); + x->chk_rvalue(lex); res = new Expr{Expr::_Apply, name, {res, x}}; 
res->here = loc; res->set_val(t); res->flags = Expr::_IsRvalue; - res->deduce_type(lex.cur()); + res->deduce_type(lex); } return res; } -// parse [-] E { (+ | -) E } +// parse E { (+ | -) E } Expr* parse_expr20(Lexer& lex, CodeBlob& code, bool nv) { - Expr* res; - int t = lex.tp(); - if (t == '-') { - sym_idx_t name = symbols.lookup_add("-_"); - check_global_func(lex.cur(), name); - SrcLocation loc{lex.cur().loc}; + Expr* res = parse_expr30(lex, code, nv); + while (lex.tok() == tok_minus || lex.tok() == tok_plus) { + res->chk_rvalue(lex); + TokenType t = lex.tok(); + sym_idx_t name = symbols.lookup_add(std::string{"_"} + lex.cur_str_std_string() + "_"); + check_global_func(lex, name); + SrcLocation loc{lex.cur_location()}; lex.next(); auto x = parse_expr30(lex, code, false); - x->chk_rvalue(lex.cur()); - res = new Expr{Expr::_Apply, name, {x}}; - res->here = loc; - res->set_val(t); - res->flags = Expr::_IsRvalue; - res->deduce_type(lex.cur()); - } else { - res = parse_expr30(lex, code, nv); - } - while (lex.tp() == '-' || lex.tp() == '+') { - res->chk_rvalue(lex.cur()); - t = lex.tp(); - sym_idx_t name = symbols.lookup_add(std::string{"_"} + lex.cur().str + "_"); - check_global_func(lex.cur(), name); - SrcLocation loc{lex.cur().loc}; - lex.next(); - auto x = parse_expr30(lex, code, false); - x->chk_rvalue(lex.cur()); + x->chk_rvalue(lex); res = new Expr{Expr::_Apply, name, {res, x}}; res->here = loc; res->set_val(t); res->flags = Expr::_IsRvalue; - res->deduce_type(lex.cur()); + res->deduce_type(lex); } return res; } @@ -944,21 +925,21 @@ Expr* parse_expr20(Lexer& lex, CodeBlob& code, bool nv) { // parse E { ( << | >> | ~>> | ^>> ) E } Expr* parse_expr17(Lexer& lex, CodeBlob& code, bool nv) { Expr* res = parse_expr20(lex, code, nv); - while (lex.tp() == _Lshift || lex.tp() == _Rshift || lex.tp() == _RshiftC || lex.tp() == _RshiftR) { - res->chk_rvalue(lex.cur()); - int t = lex.tp(); - sym_idx_t name = symbols.lookup_add(std::string{"_"} + lex.cur().str + "_"); - 
check_global_func(lex.cur(), name); - SrcLocation loc{lex.cur().loc}; + while (lex.tok() == tok_lshift || lex.tok() == tok_rshift || lex.tok() == tok_rshiftC || lex.tok() == tok_rshiftR) { + res->chk_rvalue(lex); + TokenType t = lex.tok(); + sym_idx_t name = symbols.lookup_add(std::string{"_"} + lex.cur_str_std_string() + "_"); + check_global_func(lex, name); + SrcLocation loc{lex.cur_location()}; lex.next(); auto x = parse_expr20(lex, code, false); - x->chk_rvalue(lex.cur()); + x->chk_rvalue(lex); diagnose_addition_in_bitshift(loc, name, x); res = new Expr{Expr::_Apply, name, {res, x}}; res->here = loc; res->set_val(t); res->flags = Expr::_IsRvalue; - res->deduce_type(lex.cur()); + res->deduce_type(lex); } return res; } @@ -966,21 +947,21 @@ Expr* parse_expr17(Lexer& lex, CodeBlob& code, bool nv) { // parse E [ (== | < | > | <= | >= | != | <=> ) E ] Expr* parse_expr15(Lexer& lex, CodeBlob& code, bool nv) { Expr* res = parse_expr17(lex, code, nv); - if (lex.tp() == _Eq || lex.tp() == '<' || lex.tp() == '>' || lex.tp() == _Leq || lex.tp() == _Geq || - lex.tp() == _Neq || lex.tp() == _Spaceship) { - res->chk_rvalue(lex.cur()); - int t = lex.tp(); - sym_idx_t name = symbols.lookup_add(std::string{"_"} + lex.cur().str + "_"); - check_global_func(lex.cur(), name); - SrcLocation loc{lex.cur().loc}; + if (lex.tok() == tok_eq || lex.tok() == tok_lt || lex.tok() == tok_gt || lex.tok() == tok_leq || lex.tok() == tok_geq || + lex.tok() == tok_neq || lex.tok() == tok_spaceship) { + res->chk_rvalue(lex); + TokenType t = lex.tok(); + sym_idx_t name = symbols.lookup_add(std::string{"_"} + lex.cur_str_std_string() + "_"); + check_global_func(lex, name); + SrcLocation loc{lex.cur_location()}; lex.next(); auto x = parse_expr17(lex, code, false); - x->chk_rvalue(lex.cur()); + x->chk_rvalue(lex); res = new Expr{Expr::_Apply, name, {res, x}}; res->here = loc; res->set_val(t); res->flags = Expr::_IsRvalue; - res->deduce_type(lex.cur()); + res->deduce_type(lex); } return res; } @@ 
-988,15 +969,15 @@ Expr* parse_expr15(Lexer& lex, CodeBlob& code, bool nv) { // parse E { ( & | `|` | ^ ) E } Expr* parse_expr14(Lexer& lex, CodeBlob& code, bool nv) { Expr* res = parse_expr15(lex, code, nv); - while (lex.tp() == '&' || lex.tp() == '|' || lex.tp() == '^') { - res->chk_rvalue(lex.cur()); - int t = lex.tp(); - sym_idx_t name = symbols.lookup_add(std::string{"_"} + lex.cur().str + "_"); - check_global_func(lex.cur(), name); - SrcLocation loc{lex.cur().loc}; + while (lex.tok() == tok_bitwise_and || lex.tok() == tok_bitwise_or || lex.tok() == tok_bitwise_xor) { + res->chk_rvalue(lex); + TokenType t = lex.tok(); + sym_idx_t name = symbols.lookup_add(std::string{"_"} + lex.cur_str_std_string() + "_"); + check_global_func(lex, name); + SrcLocation loc{lex.cur_location()}; lex.next(); auto x = parse_expr15(lex, code, false); - x->chk_rvalue(lex.cur()); + x->chk_rvalue(lex); // diagnose tricky bitwise precedence, like "flags & 0xFF != 0" (& has lower precedence) diagnose_bitwise_precedence(loc, name, res, x); @@ -1004,7 +985,7 @@ Expr* parse_expr14(Lexer& lex, CodeBlob& code, bool nv) { res->here = loc; res->set_val(t); res->flags = Expr::_IsRvalue; - res->deduce_type(lex.cur()); + res->deduce_type(lex); } return res; } @@ -1012,19 +993,19 @@ Expr* parse_expr14(Lexer& lex, CodeBlob& code, bool nv) { // parse E [ ? 
E : E ] Expr* parse_expr13(Lexer& lex, CodeBlob& code, bool nv) { Expr* res = parse_expr14(lex, code, nv); - if (lex.tp() == '?') { - res->chk_rvalue(lex.cur()); - SrcLocation loc{lex.cur().loc}; + if (lex.tok() == tok_question) { + res->chk_rvalue(lex); + SrcLocation loc{lex.cur_location()}; lex.next(); auto x = parse_expr(lex, code, false); - x->chk_rvalue(lex.cur()); - lex.expect(':'); + x->chk_rvalue(lex); + lex.expect(tok_colon, "':'"); auto y = parse_expr13(lex, code, false); - y->chk_rvalue(lex.cur()); + y->chk_rvalue(lex); res = new Expr{Expr::_CondExpr, {res, x, y}}; res->here = loc; res->flags = Expr::_IsRvalue; - res->deduce_type(lex.cur()); + res->deduce_type(lex); } return res; } @@ -1032,42 +1013,42 @@ Expr* parse_expr13(Lexer& lex, CodeBlob& code, bool nv) { // parse LE1 (= | += | -= | ... ) E2 Expr* parse_expr10(Lexer& lex, CodeBlob& code, bool nv) { auto x = parse_expr13(lex, code, nv); - int t = lex.tp(); - if (t == _PlusLet || t == _MinusLet || t == _TimesLet || t == _DivLet || t == _DivRLet || t == _DivCLet || - t == _ModLet || t == _ModCLet || t == _ModRLet || t == _LshiftLet || t == _RshiftLet || t == _RshiftCLet || - t == _RshiftRLet || t == _AndLet || t == _OrLet || t == _XorLet) { - x->chk_lvalue(lex.cur()); - x->chk_rvalue(lex.cur()); - sym_idx_t name = symbols.lookup_add(std::string{"^_"} + lex.cur().str + "_"); - check_global_func(lex.cur(), name); - SrcLocation loc{lex.cur().loc}; + TokenType t = lex.tok(); + if (t == tok_set_plus || t == tok_set_minus || t == tok_set_mul || t == tok_set_div || t == tok_set_divR || t == tok_set_divC || + t == tok_set_mod || t == tok_set_modC || t == tok_set_modR || t == tok_set_lshift || t == tok_set_rshift || t == tok_set_rshiftC || + t == tok_set_rshiftR || t == tok_set_bitwise_and || t == tok_set_bitwise_or || t == tok_set_bitwise_xor) { + x->chk_lvalue(lex); + x->chk_rvalue(lex); + sym_idx_t name = symbols.lookup_add(std::string{"^_"} + lex.cur_str_std_string() + "_"); + check_global_func(lex, 
name); + SrcLocation loc{lex.cur_location()}; lex.next(); auto y = parse_expr10(lex, code, false); - y->chk_rvalue(lex.cur()); + y->chk_rvalue(lex); Expr* z = new Expr{Expr::_Apply, name, {x, y}}; z->here = loc; z->set_val(t); z->flags = Expr::_IsRvalue; - z->deduce_type(lex.cur()); + z->deduce_type(lex); Expr* res = new Expr{Expr::_Letop, {x->copy(), z}}; res->here = loc; res->flags = (x->flags & ~Expr::_IsType) | Expr::_IsRvalue; res->set_val(t); - res->deduce_type(lex.cur()); + res->deduce_type(lex); return res; - } else if (t == '=') { - x->chk_lvalue(lex.cur()); - SrcLocation loc{lex.cur().loc}; + } else if (t == tok_assign) { + x->chk_lvalue(lex); + SrcLocation loc{lex.cur_location()}; lex.next(); auto y = parse_expr10(lex, code, false); - y->chk_rvalue(lex.cur()); + y->chk_rvalue(lex); x->predefine_vars(); x->define_new_vars(code); Expr* res = new Expr{Expr::_Letop, {x, y}}; res->here = loc; res->flags = (x->flags & ~Expr::_IsType) | Expr::_IsRvalue; res->set_val(t); - res->deduce_type(lex.cur()); + res->deduce_type(lex); return res; } else { return x; @@ -1094,7 +1075,7 @@ void combine_parallel(val& x, const val y) { blk_fl::val parse_return_stmt(Lexer& lex, CodeBlob& code) { auto expr = parse_expr(lex, code); - expr->chk_rvalue(lex.cur()); + expr->chk_rvalue(lex); try { // std::cerr << "in return: "; unify(expr->e_type, code.ret_type); @@ -1102,11 +1083,11 @@ blk_fl::val parse_return_stmt(Lexer& lex, CodeBlob& code) { std::ostringstream os; os << "previous function return type " << code.ret_type << " cannot be unified with return statement expression type " << expr->e_type << ": " << ue; - lex.cur().error(os.str()); + lex.error(os.str()); } std::vector tmp_vars = expr->pre_compile(code); - code.emplace_back(lex.cur().loc, Op::_Return, std::move(tmp_vars)); - lex.expect(';'); + code.emplace_back(lex.cur_location(), Op::_Return, std::move(tmp_vars)); + lex.expect(tok_semicolon, "';'"); return blk_fl::ret; } @@ -1119,177 +1100,177 @@ blk_fl::val 
parse_implicit_ret_stmt(Lexer& lex, CodeBlob& code) { std::ostringstream os; os << "previous function return type " << code.ret_type << " cannot be unified with implicit end-of-block return type " << ret_type << ": " << ue; - lex.cur().error(os.str()); + lex.error(os.str()); } - code.emplace_back(lex.cur().loc, Op::_Return); + code.emplace_back(lex.cur_location(), Op::_Return); return blk_fl::ret; } blk_fl::val parse_stmt(Lexer& lex, CodeBlob& code); blk_fl::val parse_block_stmt(Lexer& lex, CodeBlob& code, bool no_new_scope = false) { - lex.expect('{'); + lex.expect(tok_opbrace, "'{'"); if (!no_new_scope) { - open_scope(lex); + open_scope(lex.cur_location()); } blk_fl::val res = blk_fl::init; bool warned = false; - while (lex.tp() != '}') { + while (lex.tok() != tok_clbrace) { if (!(res & blk_fl::end) && !warned) { - lex.cur().loc.show_warning("unreachable code"); + lex.cur_location().show_warning("unreachable code"); warned = true; } blk_fl::combine(res, parse_stmt(lex, code)); } if (!no_new_scope) { - close_scope(lex); + close_scope(lex.cur_location()); } - lex.expect('}'); + lex.expect(tok_clbrace, "'}'"); return res; } blk_fl::val parse_repeat_stmt(Lexer& lex, CodeBlob& code) { - SrcLocation loc{lex.cur().loc}; - lex.expect(_Repeat); + SrcLocation loc{lex.cur_location()}; + lex.expect(tok_repeat, "'repeat'"); auto expr = parse_expr(lex, code); - expr->chk_rvalue(lex.cur()); - auto cnt_type = TypeExpr::new_atomic(_Int); + expr->chk_rvalue(lex); + auto cnt_type = TypeExpr::new_atomic(TypeExpr::_Int); try { unify(expr->e_type, cnt_type); } catch (UnifyError& ue) { std::ostringstream os; os << "repeat count value of type " << expr->e_type << " is not an integer: " << ue; - lex.cur().error(os.str()); + lex.error(os.str()); } std::vector tmp_vars = expr->pre_compile(code); if (tmp_vars.size() != 1) { - lex.cur().error("repeat count value is not a singleton"); + lex.error("repeat count value is not a singleton"); } Op& repeat_op = code.emplace_back(loc, Op::_Repeat, 
tmp_vars); code.push_set_cur(repeat_op.block0); blk_fl::val res = parse_block_stmt(lex, code); - code.close_pop_cur(lex.cur().loc); + code.close_pop_cur(lex.cur_location()); return res | blk_fl::end; } blk_fl::val parse_while_stmt(Lexer& lex, CodeBlob& code) { - SrcLocation loc{lex.cur().loc}; - lex.expect(_While); + SrcLocation loc{lex.cur_location()}; + lex.expect(tok_while, "'while'"); auto expr = parse_expr(lex, code); - expr->chk_rvalue(lex.cur()); - auto cnt_type = TypeExpr::new_atomic(_Int); + expr->chk_rvalue(lex); + auto cnt_type = TypeExpr::new_atomic(TypeExpr::_Int); try { unify(expr->e_type, cnt_type); } catch (UnifyError& ue) { std::ostringstream os; os << "while condition value of type " << expr->e_type << " is not an integer: " << ue; - lex.cur().error(os.str()); + lex.error(os.str()); } Op& while_op = code.emplace_back(loc, Op::_While); code.push_set_cur(while_op.block0); while_op.left = expr->pre_compile(code); - code.close_pop_cur(lex.cur().loc); + code.close_pop_cur(lex.cur_location()); if (while_op.left.size() != 1) { - lex.cur().error("while condition value is not a singleton"); + lex.error("while condition value is not a singleton"); } code.push_set_cur(while_op.block1); blk_fl::val res1 = parse_block_stmt(lex, code); - code.close_pop_cur(lex.cur().loc); + code.close_pop_cur(lex.cur_location()); return res1 | blk_fl::end; } blk_fl::val parse_do_stmt(Lexer& lex, CodeBlob& code) { - Op& while_op = code.emplace_back(lex.cur().loc, Op::_Until); - lex.expect(_Do); + Op& while_op = code.emplace_back(lex.cur_location(), Op::_Until); + lex.expect(tok_do, "'do'"); code.push_set_cur(while_op.block0); - open_scope(lex); + open_scope(lex.cur_location()); blk_fl::val res = parse_block_stmt(lex, code, true); - lex.expect(_Until); + lex.expect(tok_until, "'until'"); auto expr = parse_expr(lex, code); - expr->chk_rvalue(lex.cur()); - close_scope(lex); - auto cnt_type = TypeExpr::new_atomic(_Int); + expr->chk_rvalue(lex); + close_scope(lex.cur_location()); + 
auto cnt_type = TypeExpr::new_atomic(TypeExpr::_Int); try { unify(expr->e_type, cnt_type); } catch (UnifyError& ue) { std::ostringstream os; os << "`until` condition value of type " << expr->e_type << " is not an integer: " << ue; - lex.cur().error(os.str()); + lex.error(os.str()); } while_op.left = expr->pre_compile(code); - code.close_pop_cur(lex.cur().loc); + code.close_pop_cur(lex.cur_location()); if (while_op.left.size() != 1) { - lex.cur().error("`until` condition value is not a singleton"); + lex.error("`until` condition value is not a singleton"); } return res & ~blk_fl::empty; } blk_fl::val parse_try_catch_stmt(Lexer& lex, CodeBlob& code) { code.require_callxargs = true; - lex.expect(_Try); - Op& try_catch_op = code.emplace_back(lex.cur().loc, Op::_TryCatch); + lex.expect(tok_try, "'try'"); + Op& try_catch_op = code.emplace_back(lex.cur_location(), Op::_TryCatch); code.push_set_cur(try_catch_op.block0); blk_fl::val res0 = parse_block_stmt(lex, code); - code.close_pop_cur(lex.cur().loc); - lex.expect(_Catch); + code.close_pop_cur(lex.cur_location()); + lex.expect(tok_catch, "'catch'"); code.push_set_cur(try_catch_op.block1); - open_scope(lex); + open_scope(lex.cur_location()); Expr* expr = parse_expr(lex, code, true); - expr->chk_lvalue(lex.cur()); - TypeExpr* tvm_error_type = TypeExpr::new_tensor(TypeExpr::new_var(), TypeExpr::new_atomic(_Int)); + expr->chk_lvalue(lex); + TypeExpr* tvm_error_type = TypeExpr::new_tensor(TypeExpr::new_var(), TypeExpr::new_atomic(TypeExpr::_Int)); try { unify(expr->e_type, tvm_error_type); } catch (UnifyError& ue) { std::ostringstream os; os << "`catch` arguments have incorrect type " << expr->e_type << ": " << ue; - lex.cur().error(os.str()); + lex.error(os.str()); } expr->predefine_vars(); expr->define_new_vars(code); try_catch_op.left = expr->pre_compile(code); tolk_assert(try_catch_op.left.size() == 2 || try_catch_op.left.size() == 1); blk_fl::val res1 = parse_block_stmt(lex, code); - close_scope(lex); - 
code.close_pop_cur(lex.cur().loc); + close_scope(lex.cur_location()); + code.close_pop_cur(lex.cur_location()); blk_fl::combine_parallel(res0, res1); return res0; } -blk_fl::val parse_if_stmt(Lexer& lex, CodeBlob& code, int first_lex = _If) { - SrcLocation loc{lex.cur().loc}; - lex.expect(first_lex); +blk_fl::val parse_if_stmt(Lexer& lex, CodeBlob& code, TokenType first_lex = tok_if) { + SrcLocation loc{lex.cur_location()}; + lex.next(); auto expr = parse_expr(lex, code); - expr->chk_rvalue(lex.cur()); - auto flag_type = TypeExpr::new_atomic(_Int); + expr->chk_rvalue(lex); + auto flag_type = TypeExpr::new_atomic(TypeExpr::_Int); try { unify(expr->e_type, flag_type); } catch (UnifyError& ue) { std::ostringstream os; os << "`if` condition value of type " << expr->e_type << " is not an integer: " << ue; - lex.cur().error(os.str()); + lex.error(os.str()); } std::vector tmp_vars = expr->pre_compile(code); if (tmp_vars.size() != 1) { - lex.cur().error("condition value is not a singleton"); + lex.error("condition value is not a singleton"); } Op& if_op = code.emplace_back(loc, Op::_If, tmp_vars); code.push_set_cur(if_op.block0); blk_fl::val res1 = parse_block_stmt(lex, code); blk_fl::val res2 = blk_fl::init; - code.close_pop_cur(lex.cur().loc); - if (lex.tp() == _Else) { - lex.expect(_Else); + code.close_pop_cur(lex.cur_location()); + if (lex.tok() == tok_else) { + lex.expect(tok_else, "'else'"); code.push_set_cur(if_op.block1); res2 = parse_block_stmt(lex, code); - code.close_pop_cur(lex.cur().loc); - } else if (lex.tp() == _Elseif || lex.tp() == _Elseifnot) { + code.close_pop_cur(lex.cur_location()); + } else if (lex.tok() == tok_elseif || lex.tok() == tok_elseifnot) { code.push_set_cur(if_op.block1); - res2 = parse_if_stmt(lex, code, lex.tp()); - code.close_pop_cur(lex.cur().loc); + res2 = parse_if_stmt(lex, code, lex.tok()); + code.close_pop_cur(lex.cur_location()); } else { - if_op.block1 = std::make_unique(lex.cur().loc, Op::_Nop); + if_op.block1 = 
std::make_unique(lex.cur_location(), Op::_Nop); } - if (first_lex == _Ifnot || first_lex == _Elseifnot) { + if (first_lex == tok_ifnot || first_lex == tok_elseifnot) { std::swap(if_op.block0, if_op.block1); } blk_fl::combine_parallel(res1, res2); @@ -1297,41 +1278,41 @@ blk_fl::val parse_if_stmt(Lexer& lex, CodeBlob& code, int first_lex = _If) { } blk_fl::val parse_stmt(Lexer& lex, CodeBlob& code) { - switch (lex.tp()) { - case _Return: { + switch (lex.tok()) { + case tok_return: { lex.next(); return parse_return_stmt(lex, code); } - case '{': { + case tok_opbrace: { return parse_block_stmt(lex, code); } - case ';': { + case tok_semicolon: { lex.next(); return blk_fl::init; } - case _Repeat: + case tok_repeat: return parse_repeat_stmt(lex, code); - case _If: - case _Ifnot: - return parse_if_stmt(lex, code, lex.tp()); - case _Do: + case tok_if: + case tok_ifnot: + return parse_if_stmt(lex, code, lex.tok()); + case tok_do: return parse_do_stmt(lex, code); - case _While: + case tok_while: return parse_while_stmt(lex, code); - case _Try: + case tok_try: return parse_try_catch_stmt(lex, code); default: { auto expr = parse_expr(lex, code); - expr->chk_rvalue(lex.cur()); + expr->chk_rvalue(lex); expr->pre_compile(code); - lex.expect(';'); + lex.expect(tok_semicolon, "';'"); return blk_fl::end; } } } CodeBlob* parse_func_body(Lexer& lex, FormalArgList arg_list, TypeExpr* ret_type, bool marked_as_pure) { - lex.expect('{'); + lex.expect(tok_opbrace, "'{'"); CodeBlob* blob = new CodeBlob{ret_type}; if (marked_as_pure) { blob->flags |= CodeBlob::_ForbidImpure; @@ -1339,9 +1320,9 @@ CodeBlob* parse_func_body(Lexer& lex, FormalArgList arg_list, TypeExpr* ret_type blob->import_params(std::move(arg_list)); blk_fl::val res = blk_fl::init; bool warned = false; - while (lex.tp() != '}') { + while (lex.tok() != tok_clbrace) { if (!(res & blk_fl::end) && !warned) { - lex.cur().loc.show_warning("unreachable code"); + lex.cur_location().show_warning("unreachable code"); warned = true; } 
blk_fl::combine(res, parse_stmt(lex, *blob)); @@ -1349,15 +1330,15 @@ CodeBlob* parse_func_body(Lexer& lex, FormalArgList arg_list, TypeExpr* ret_type if (res & blk_fl::end) { parse_implicit_ret_stmt(lex, *blob); } - blob->close_blk(lex.cur().loc); - lex.expect('}'); + blob->close_blk(lex.cur_location()); + lex.expect(tok_clbrace, "'}'"); return blob; } SymValAsmFunc* parse_asm_func_body(Lexer& lex, TypeExpr* func_type, const FormalArgList& arg_list, TypeExpr* ret_type, bool marked_as_pure) { - auto loc = lex.cur().loc; - lex.expect(_Asm); + SrcLocation loc = lex.cur_location(); + lex.expect(tok_asm, "'asm'"); int cnt = (int)arg_list.size(); int width = ret_type->get_width(); if (width < 0 || width > 16) { @@ -1379,15 +1360,13 @@ SymValAsmFunc* parse_asm_func_body(Lexer& lex, TypeExpr* func_type, const Formal } std::vector asm_ops; std::vector arg_order, ret_order; - if (lex.tp() == '(') { - lex.expect('('); - if (lex.tp() != _Mapsto) { + if (lex.tok() == tok_oppar) { + lex.next(); + if (lex.tok() != tok_mapsto) { std::vector visited(cnt, false); for (int i = 0; i < cnt; i++) { - if (lex.tp() != _Ident) { - lex.expect(_Ident); - } - auto sym = lookup_symbol(lex.cur().val); + lex.check(tok_identifier, "identifier"); + auto sym = lookup_symbol(lex.cur_sym_idx()); int j; for (j = 0; j < cnt; j++) { if (std::get(arg_list[j]) == sym) { @@ -1395,10 +1374,10 @@ SymValAsmFunc* parse_asm_func_body(Lexer& lex, TypeExpr* func_type, const Formal } } if (j == cnt) { - lex.cur().error("formal argument name expected"); + lex.error("formal argument name expected"); } if (visited[j]) { - lex.cur().error("formal argument listed twice"); + lex.error("formal argument listed twice"); } visited[j] = true; int c1 = cum_arg_width[j], c2 = cum_arg_width[j + 1]; @@ -1409,29 +1388,29 @@ SymValAsmFunc* parse_asm_func_body(Lexer& lex, TypeExpr* func_type, const Formal } tolk_assert(arg_order.size() == (unsigned)tot_width); } - if (lex.tp() == _Mapsto) { - lex.expect(_Mapsto); + if (lex.tok() 
== tok_mapsto) { + lex.next(); std::vector visited(width, false); for (int i = 0; i < width; i++) { - if (lex.tp() != Lexem::Number || lex.cur().str.size() > 3) { - lex.expect(Lexem::Number); + if (lex.tok() != tok_int_const || lex.cur_str().size() > 3) { + lex.expect(tok_int_const, "number"); } - int j = atoi(lex.cur().str.c_str()); + int j = atoi(lex.cur_str_std_string().c_str()); if (j < 0 || j >= width || visited[j]) { - lex.cur().error("expected integer return value index 0 .. width-1"); + lex.error("expected integer return value index 0 .. width-1"); } visited[j] = true; ret_order.push_back(j); lex.next(); } } - lex.expect(')'); + lex.expect(tok_clpar, "')'"); } - while (lex.tp() == _String) { - std::string ops = lex.cur().str; // \n\n... + while (lex.tok() == tok_string_const) { + std::string ops = lex.cur_str_std_string(); // \n\n... std::string op; for (const char& c : ops) { - if (c == '\n') { + if (c == '\n' || c == '\r') { if (!op.empty()) { asm_ops.push_back(AsmOp::Parse(op, cnt, width)); if (asm_ops.back().is_custom()) { @@ -1452,9 +1431,9 @@ SymValAsmFunc* parse_asm_func_body(Lexer& lex, TypeExpr* func_type, const Formal lex.next(); } if (asm_ops.empty()) { - throw ParseError{lex.cur().loc, "string with assembler instruction expected"}; + lex.error("string with assembler instruction expected"); } - lex.expect(';'); + lex.expect(tok_semicolon, "';'"); std::string crc_s; for (const AsmOp& asm_op : asm_ops) { crc_s += asm_op.op; @@ -1475,34 +1454,34 @@ SymValAsmFunc* parse_asm_func_body(Lexer& lex, TypeExpr* func_type, const Formal std::vector parse_type_var_list(Lexer& lex) { std::vector res; - lex.expect(_Forall); + lex.expect(tok_forall, "'forall'"); int idx = 0; while (true) { - if (lex.tp() == _Type) { + if (lex.tok() == tok_type) { lex.next(); } - if (lex.tp() != _Ident) { - throw ParseError{lex.cur().loc, "free type identifier expected"}; + if (lex.tok() != tok_identifier) { + lex.error("free type identifier expected"); } - auto loc = 
lex.cur().loc; - if (prohibited_var_names.count(symbols.get_name(lex.cur().val))) { - throw ParseError{loc, PSTRING() << "symbol `" << symbols.get_name(lex.cur().val) + SrcLocation loc = lex.cur_location(); + if (prohibited_var_names.count(symbols.get_name(lex.cur_sym_idx()))) { + throw ParseError{loc, PSTRING() << "symbol `" << symbols.get_name(lex.cur_sym_idx()) << "` cannot be redefined as a variable"}; } - SymDef* new_sym_def = define_symbol(lex.cur().val, true, loc); + SymDef* new_sym_def = define_symbol(lex.cur_sym_idx(), true, loc); if (!new_sym_def || new_sym_def->value) { - lex.cur().error_at("redefined type variable `", "`"); + lex.error_at("redefined type variable `", "`"); } auto var = TypeExpr::new_var(idx); - new_sym_def->value = new SymValType{SymVal::_Typename, idx++, var}; + new_sym_def->value = new SymValType{SymValKind::_Typename, idx++, var}; res.push_back(var); lex.next(); - if (lex.tp() != ',') { + if (lex.tok() != tok_comma) { break; } lex.next(); } - lex.expect(_Mapsto); + lex.expect(tok_mapsto, "'->'"); return res; } @@ -1608,168 +1587,169 @@ void detect_if_function_just_wraps_another(SymValCodeFunc* v_current, const td:: } } -static td::RefInt256 calculate_method_id_by_func_name(const std::string &func_name) { - unsigned int crc = td::crc16(func_name); +static td::RefInt256 calculate_method_id_by_func_name(std::string_view func_name) { + unsigned int crc = td::crc16(static_cast(func_name)); return td::make_refint((crc & 0xffff) | 0x10000); } // todo rewrite function declaration parsing completely, it's weird void parse_func_def(Lexer& lex) { - SrcLocation loc{lex.cur().loc}; - open_scope(lex); + SrcLocation loc = lex.cur_location(); + open_scope(loc); std::vector type_vars; bool is_get_method = false; - if (lex.tp() == _Forall) { + if (lex.tok() == tok_forall) { type_vars = parse_type_var_list(lex); - } else if (lex.tp() == _Get) { + } else if (lex.tok() == tok_get) { is_get_method = true; lex.next(); } auto ret_type = parse_type(lex); - 
if (lex.tp() != _Ident) { - throw ParseError{lex.cur().loc, "function name identifier expected"}; + if (lex.tok() != tok_identifier) { + lex.error("function name identifier expected"); } - Lexem func_name = lex.cur(); + std::string func_name = lex.cur_str_std_string(); + int func_sym_idx = lex.cur_sym_idx(); lex.next(); FormalArgList arg_list = parse_formal_args(lex); bool marked_as_pure = false; - if (lex.tp() == _Impure) { + if (lex.tok() == tok_impure) { static bool warning_shown = false; if (!warning_shown) { - lex.cur().loc.show_warning("`impure` specifier is deprecated. All functions are impure by default, use `pure` to mark a function as pure"); + lex.cur_location().show_warning("`impure` specifier is deprecated. All functions are impure by default, use `pure` to mark a function as pure"); warning_shown = true; } lex.next(); - } else if (lex.tp() == _Pure) { + } else if (lex.tok() == tok_pure) { marked_as_pure = true; lex.next(); } int flags_inline = 0; - if (lex.tp() == _Inline) { + if (lex.tok() == tok_inline) { flags_inline = SymValFunc::flagInline; lex.next(); - } else if (lex.tp() == _InlineRef) { + } else if (lex.tok() == tok_inlineref) { flags_inline = SymValFunc::flagInlineRef; lex.next(); } td::RefInt256 method_id; - if (lex.tp() == _MethodId) { + if (lex.tok() == tok_method_id) { if (is_get_method) { - lex.cur().error("both `get` and `method_id` are not allowed"); + lex.error("both `get` and `method_id` are not allowed"); } lex.next(); - if (lex.tp() == '(') { // method_id(N) - lex.expect('('); - method_id = td::string_to_int256(lex.cur().str); - lex.expect(Lexem::Number); + if (lex.tok() == tok_oppar) { // method_id(N) + lex.next(); + method_id = td::string_to_int256(lex.cur_str_std_string()); + lex.expect(tok_int_const, "number"); if (method_id.is_null()) { - lex.cur().error_at("invalid integer constant `", "`"); + lex.error_at("invalid integer constant `", "`"); } - lex.expect(')'); + lex.expect(tok_clpar, "')'"); } else { static bool 
warning_shown = false; if (!warning_shown) { - lex.cur().loc.show_warning("`method_id` specifier is deprecated, use `get` keyword.\nExample: `get int seqno() { ... }`"); + lex.cur_location().show_warning("`method_id` specifier is deprecated, use `get` keyword.\nExample: `get int seqno() { ... }`"); warning_shown = true; } - method_id = calculate_method_id_by_func_name(func_name.str); + method_id = calculate_method_id_by_func_name(func_name); } } if (is_get_method) { tolk_assert(method_id.is_null()); - method_id = calculate_method_id_by_func_name(func_name.str); + method_id = calculate_method_id_by_func_name(func_name); for (const SymDef* other : glob_get_methods) { if (!td::cmp(dynamic_cast(other->value)->method_id, method_id)) { - lex.cur().error(PSTRING() << "GET methods hash collision: `" << other->name() << "` and `" + func_name.str + "` produce the same hash. Consider renaming one of these functions."); + lex.error(PSTRING() << "GET methods hash collision: `" << other->name() << "` and `" + func_name + "` produce the same hash. Consider renaming one of these functions."); } } } TypeExpr* func_type = TypeExpr::new_map(extract_total_arg_type(arg_list), ret_type); func_type = compute_type_closure(func_type, type_vars); - if (lex.tp() == _Builtin) { - const SymDef* builtin_func = lookup_symbol(func_name.str); + if (lex.tok() == tok_builtin) { + const SymDef* builtin_func = lookup_symbol(symbols.lookup(func_name)); const SymValFunc* func_val = builtin_func ? 
dynamic_cast(builtin_func->value) : nullptr; if (!func_val || !func_val->is_builtin()) { - lex.cur().error("`builtin` used for non-builtin function"); + lex.error("`builtin` used for non-builtin function"); } #ifdef TOLK_DEBUG // in release, we don't need this check, since `builtin` is used only in stdlib.tolk, which is our responsibility if (!func_val->sym_type->equals_to(func_type) || func_val->is_marked_as_pure() != marked_as_pure) { - lex.cur().error("declaration for `builtin` function doesn't match an actual one"); + lex.error("declaration for `builtin` function doesn't match an actual one"); } #endif lex.next(); - lex.expect(';'); - close_scope(lex); + lex.expect(tok_semicolon, "';'"); + close_scope(lex.cur_location()); return; } - if (lex.tp() != ';' && lex.tp() != '{' && lex.tp() != _Asm) { - lex.expect('{', "function body block"); + if (lex.tok() != tok_semicolon && lex.tok() != tok_opbrace && lex.tok() != tok_asm) { + lex.expect(tok_opbrace, "function body block"); } if (verbosity >= 1) { - std::cerr << "function " << func_name.str << " : " << func_type << std::endl; + std::cerr << "function " << func_name << " : " << func_type << std::endl; } - SymDef* func_sym = define_global_symbol(func_name.val, 0, loc); + SymDef* func_sym = define_global_symbol(func_sym_idx, 0, loc); tolk_assert(func_sym); SymValFunc* func_sym_val = dynamic_cast(func_sym->value); if (func_sym->value) { - if (func_sym->value->type != SymVal::_Func || !func_sym_val) { - lex.cur().error("was not defined as a function before"); + if (func_sym->value->kind != SymValKind::_Func || !func_sym_val) { + lex.error("was not defined as a function before"); } try { unify(func_sym_val->sym_type, func_type); } catch (UnifyError& ue) { std::ostringstream os; - os << "previous type of function " << func_name.str << " : " << func_sym_val->sym_type + os << "previous type of function " << func_name << " : " << func_sym_val->sym_type << " cannot be unified with new type " << func_type << ": " << ue; - 
lex.cur().error(os.str()); + lex.error(os.str()); } } - if (lex.tp() == ';') { + if (lex.tok() == tok_semicolon) { make_new_glob_func(func_sym, func_type, marked_as_pure); lex.next(); - } else if (lex.tp() == '{') { + } else if (lex.tok() == tok_opbrace) { if (dynamic_cast(func_sym_val)) { - lex.cur().error("function `"s + func_name.str + "` has been already defined as an assembler built-in"); + lex.error("function `" + func_name + "` has been already defined as an assembler built-in"); } SymValCodeFunc* func_sym_code; if (func_sym_val) { func_sym_code = dynamic_cast(func_sym_val); if (!func_sym_code) { - lex.cur().error("function `"s + func_name.str + "` has been already defined in an yet-unknown way"); + lex.error("function `" + func_name + "` has been already defined in an yet-unknown way"); } } else { func_sym_code = make_new_glob_func(func_sym, func_type, marked_as_pure); } if (func_sym_code->code) { - lex.cur().error("redefinition of function `"s + func_name.str + "`"); + lex.error("redefinition of function `"s + func_name + "`"); } if (marked_as_pure && ret_type->get_width() == 0) { - lex.cur().error("a pure function should return something, otherwise it will be optimized out anyway"); + lex.error("a pure function should return something, otherwise it will be optimized out anyway"); } CodeBlob* code = parse_func_body(lex, arg_list, ret_type, marked_as_pure); - code->name = func_name.str; + code->name = func_name; code->loc = loc; // code->print(std::cerr); // !!!DEBUG!!! 
func_sym_code->code = code; detect_if_function_just_wraps_another(func_sym_code, method_id); } else { - Lexem asm_lexem = lex.cur(); + SrcLocation asm_location = lex.cur_location(); SymValAsmFunc* asm_func = parse_asm_func_body(lex, func_type, arg_list, ret_type, marked_as_pure); #ifdef TOLK_DEBUG - asm_func->name = func_name.str; + asm_func->name = func_name; #endif if (func_sym_val) { if (dynamic_cast(func_sym_val)) { - asm_lexem.error("function `"s + func_name.str + "` was already declared as an ordinary function"); + throw ParseError(asm_location, "function `" + func_name + "` was already declared as an ordinary function"); } SymValAsmFunc* asm_func_old = dynamic_cast(func_sym_val); if (asm_func_old) { if (asm_func->crc != asm_func_old->crc) { - asm_lexem.error("redefinition of built-in assembler function `"s + func_name.str + "`"); + throw ParseError(asm_location, "redefinition of built-in assembler function `" + func_name + "`"); } } else { - asm_lexem.error("redefinition of previously (somehow) defined function `"s + func_name.str + "`"); + throw ParseError(asm_location, "redefinition of previously (somehow) defined function `" + func_name + "`"); } } func_sym->value = asm_func; @@ -1777,126 +1757,87 @@ void parse_func_def(Lexer& lex) { if (method_id.not_null()) { auto val = dynamic_cast(func_sym->value); if (!val) { - lex.cur().error("cannot set method id for unknown function `"s + func_name.str + "`"); + lex.error("cannot set method id for unknown function `" + func_name + "`"); } if (val->method_id.is_null()) { val->method_id = std::move(method_id); } else if (td::cmp(val->method_id, method_id) != 0) { - lex.cur().error("integer method identifier for `"s + func_name.str + "` changed from " + + lex.error("integer method identifier for `" + func_name + "` changed from " + val->method_id->to_dec_string() + " to a different value " + method_id->to_dec_string()); } } if (flags_inline) { auto val = dynamic_cast(func_sym->value); if (!val) { - 
lex.cur().error("cannot set unknown function `"s + func_name.str + "` as an inline"); + lex.error("cannot set unknown function `" + func_name + "` as an inline"); } if (!val->is_inline() && !val->is_inline_ref()) { val->flags |= flags_inline; } else if ((val->flags & (SymValFunc::flagInline | SymValFunc::flagInlineRef)) != flags_inline) { - lex.cur().error("inline mode for `"s + func_name.str + "` changed with respect to a previous declaration"); + lex.error("inline mode for `" + func_name + "` changed with respect to a previous declaration"); } } if (is_get_method) { auto val = dynamic_cast(func_sym->value); if (!val) { - lex.cur().error("cannot set unknown function `"s + func_name.str + "` as a get method"); + lex.error("cannot set unknown function `" + func_name + "` as a get method"); } val->flags |= SymValFunc::flagGetMethod; glob_get_methods.push_back(func_sym); } if (verbosity >= 1) { - std::cerr << "new type of function " << func_name.str << " : " << func_type << std::endl; + std::cerr << "new type of function " << func_name << " : " << func_type << std::endl; } - close_scope(lex); + close_scope(lex.cur_location()); } -std::string tolk_ver_test = tolk_version; - void parse_pragma(Lexer& lex) { - auto pragma = lex.cur(); - lex.next(); - if (lex.tp() != _Ident) { - lex.expect(_Ident, "pragma name expected"); - } - auto pragma_name = lex.cur().str; - lex.next(); - if (!pragma_name.compare("version") || !pragma_name.compare("not-version")) { - bool negate = !pragma_name.compare("not-version"); + SrcLocation loc = lex.cur_location(); + lex.next_special(tok_pragma_name, "pragma name"); + std::string_view pragma_name = lex.cur_str(); + if (pragma_name == "version") { + lex.next(); + TokenType cmp_tok = lex.tok(); char op = '='; bool eq = false; + if (cmp_tok == tok_gt || cmp_tok == tok_geq) { + op = '>'; + eq = cmp_tok == tok_geq; + } else if (cmp_tok == tok_lt || cmp_tok == tok_leq) { + op = '<'; + eq = cmp_tok == tok_leq; + } else if (cmp_tok == tok_eq) { + op = 
'='; + } else if (cmp_tok == tok_bitwise_xor) { + op = '^'; + } else { + lex.error("invalid comparison operator"); + } + lex.next_special(tok_semver, "semver"); + std::string_view pragma_value = lex.cur_str(); int sem_ver[3] = {0, 0, 0}; char segs = 1; - auto stoi = [&](const std::string& s) { - auto R = td::to_integer_safe(s); + auto stoi = [&](std::string_view s) { + auto R = td::to_integer_safe(static_cast(s)); if (R.is_error()) { - lex.cur().error("invalid semver format"); + lex.error("invalid semver format"); } return R.move_as_ok(); }; - if (lex.tp() == _Number) { - sem_ver[0] = stoi(lex.cur().str); - } else if (lex.tp() == _Ident) { - auto id1 = lex.cur().str; - char ch1 = id1[0]; - if ((ch1 == '>') || (ch1 == '<') || (ch1 == '=') || (ch1 == '^')) { - op = ch1; - } else { - lex.cur().error("unexpected comparator operation"); - } - if (id1.length() < 2) { - lex.cur().error("expected number after comparator"); - } - if (id1[1] == '=') { - eq = true; - if (id1.length() < 3) { - lex.cur().error("expected number after comparator"); - } - sem_ver[0] = stoi(id1.substr(2)); - } else { - sem_ver[0] = stoi(id1.substr(1)); - } - } else { - lex.cur().error("expected semver with optional comparator"); - } - lex.next(); - if (lex.tp() != ';') { - if (lex.tp() != _Ident || lex.cur().str[0] != '.') { - lex.cur().error("invalid semver format"); - } - sem_ver[1] = stoi(lex.cur().str.substr(1)); - segs = 2; - lex.next(); - } - if (lex.tp() != ';') { - if (lex.tp() != _Ident || lex.cur().str[0] != '.') { - lex.cur().error("invalid semver format"); - } - sem_ver[2] = stoi(lex.cur().str.substr(1)); - segs = 3; - lex.next(); + std::istringstream iss_value(static_cast(pragma_value)); + for (int idx = 0; idx < 3; idx++) { + std::string s{"0"}; + std::getline(iss_value, s, '.'); + sem_ver[idx] = stoi(s); } // End reading semver from source code int tolk_ver[3] = {0, 0, 0}; - std::istringstream iss(tolk_ver_test); - std::string s; + std::istringstream iss(tolk_version); for (int idx = 
0; idx < 3; idx++) { + std::string s; std::getline(iss, s, '.'); tolk_ver[idx] = stoi(s); } // End parsing embedded semver - std::string semver_expr; - if (negate) { - semver_expr += '!'; - } - semver_expr += op; - if (eq) { - semver_expr += '='; - } - for (int idx = 0; idx < 3; idx++) { - semver_expr += std::to_string(sem_ver[idx]); - if (idx < 2) - semver_expr += '.'; - } bool match = true; switch (op) { case '=': @@ -1929,134 +1870,94 @@ void parse_pragma(Lexer& lex) { match = false; } break; + default: + __builtin_unreachable(); } - if ((match && negate) || (!match && !negate)) { - pragma.error(std::string("Tolk version ") + tolk_ver_test + " does not satisfy condition " + semver_expr); + if (!match) { + throw ParseError(loc, std::string("Tolk version ") + tolk_version + " does not satisfy this condition"); } - } else if (!pragma_name.compare("test-version-set")) { - if (lex.tp() != _String) { - lex.cur().error("version string expected"); - } - tolk_ver_test = lex.cur().str; - lex.next(); } else if (pragma_name == pragma_allow_post_modification.name()) { - pragma_allow_post_modification.enable(lex.cur().loc); + pragma_allow_post_modification.enable(loc); } else if (pragma_name == pragma_compute_asm_ltr.name()) { - pragma_compute_asm_ltr.enable(lex.cur().loc); + pragma_compute_asm_ltr.enable(loc); } else if (pragma_name == pragma_remove_unused_functions.name()) { - pragma_remove_unused_functions.enable(lex.cur().loc); + pragma_remove_unused_functions.enable(loc); } else { - lex.cur().error(std::string{"unknown pragma `"} + pragma_name + "`"); - } - lex.expect(';'); -} - -std::vector source_fdescr; - -std::map source_files; -std::stack inclusion_locations; - -void parse_include(Lexer& lex, const FileDescr* fdescr) { - auto include = lex.cur(); - lex.expect(_IncludeHashtag); - if (lex.tp() != _String) { - lex.expect(_String, "source file name"); - } - std::string val = lex.cur().str; - std::string parent_dir = fdescr->filename; - if (parent_dir.rfind('/') != 
std::string::npos) { - val = parent_dir.substr(0, parent_dir.rfind('/') + 1) + val; + lex.error("unknown pragma name"); } lex.next(); - lex.expect(';'); - if (!parse_source_file(val.c_str(), include, false)) { - include.error(std::string{"failed parsing included file `"} + val + "`"); + lex.expect(tok_semicolon, "';'"); +} + +AllRegisteredSrcFiles all_src_files; +std::string stdlib_filename; + +void parse_include(Lexer& lex, const SrcFile* parent_file) { + SrcLocation loc = lex.cur_location(); + lex.expect(tok_include, "#include"); + if (lex.tok() != tok_string_const) { + lex.expect(tok_string_const, "source file name"); + } + std::string val = static_cast(lex.cur_str()); + std::string parent_dir = parent_file->rel_filename; + if (size_t rc = parent_dir.rfind('/'); rc != std::string::npos) { + val = parent_dir.substr(0, rc + 1) + val; + } + lex.next(); + lex.expect(tok_semicolon, "';'"); + if (!parse_source_file(val.c_str(), loc)) { + lex.error(std::string{"failed parsing included file `"} + val + "`"); } } -bool parse_source(std::istream* is, FileDescr* fdescr) { - SourceReader reader{is, fdescr}; - Lexer lex{reader, ";,()[] ~."}; - // previously, FunC had lisp-style comments, - // but Tolk supports traditional (slash) comments alongside (lisp-style will be deleted soon) - lex.set_comment_tokens(";;", "{-", "-}"); - lex.set_comment2_tokens("//", "/*", "*/"); - lex.start_parsing(); - while (lex.tp() != _Eof) { - if (lex.tp() == _PragmaHashtag) { +void parse_source(const SrcFile* file) { + Lexer lex(file); + while (!lex.is_eof()) { + if (lex.tok() == tok_pragma) { parse_pragma(lex); - } else if (lex.tp() == _IncludeHashtag) { - parse_include(lex, fdescr); - } else if (lex.tp() == _Global) { + } else if (lex.tok() == tok_include) { + parse_include(lex, file); + } else if (lex.tok() == tok_global) { parse_global_var_decls(lex); - } else if (lex.tp() == _Const) { + } else if (lex.tok() == tok_const) { parse_const_decls(lex); } else { parse_func_def(lex); } } - return 
true; } -bool parse_source_file(const char* filename, Lexem lex, bool is_main) { +bool parse_source_file(const char* filename, SrcLocation loc_included_from) { + const SrcFile* included_from = loc_included_from.get_src_file(); if (!filename || !*filename) { - auto msg = "source file name is an empty string"; - if (lex.tp) { - lex.error(msg); - } else { - throw Fatal{msg}; - } + throw ParseError(loc_included_from, "source file name is an empty string"); } auto path_res = read_callback(ReadCallback::Kind::Realpath, filename); if (path_res.is_error()) { auto error = path_res.move_as_error(); - lex.error(error.message().c_str()); + throw ParseError(loc_included_from, error.message().c_str()); return false; } - std::string real_filename = path_res.move_as_ok(); - auto it = source_files.find(real_filename); - if (it != source_files.end()) { - it->second->is_main |= is_main; + std::string abs_filename = path_res.move_as_ok(); + const SrcFile* file = all_src_files.find_file(abs_filename); + if (file != nullptr) { if (verbosity >= 2) { - if (lex.tp) { - lex.loc.show_warning(std::string{"skipping file "} + real_filename + " because it was already included"); - } else { - std::cerr << "warning: skipping file " << real_filename << " because it was already included" << std::endl; - } + std::cerr << "skipping file " << abs_filename << " because it was already parsed" << '\n'; } return true; } - if (lex.tp) { // included + if (included_from) { generated_from += std::string{"incl:"}; } generated_from += std::string{"`"} + filename + "` "; - FileDescr* cur_source = new FileDescr{filename}; - source_files[real_filename] = cur_source; - cur_source->is_main = is_main; - source_fdescr.push_back(cur_source); - auto file_res = read_callback(ReadCallback::Kind::ReadFile, filename); - if (file_res.is_error()) { - auto msg = file_res.move_as_error().message().str(); - if (lex.tp) { - lex.error(msg); - } else { - throw Fatal{msg}; - } + td::Result text = 
read_callback(ReadCallback::Kind::ReadFile, abs_filename.c_str()); + if (text.is_error()) { + throw ParseError(loc_included_from, text.move_as_error().message().str()); } - auto file_str = file_res.move_as_ok(); - std::stringstream ss{file_str}; - inclusion_locations.push(lex.loc); - bool res = parse_source(&ss, cur_source); - inclusion_locations.pop(); - return res; -} - -bool parse_source_stdin() { - FileDescr* cur_source = new FileDescr{"stdin", true}; - cur_source->is_main = true; - source_fdescr.push_back(cur_source); - return parse_source(&std::cin, cur_source); + file = all_src_files.register_file(filename, abs_filename, text.move_as_ok(), included_from); + parse_source(file); + return true; } } // namespace tolk diff --git a/tolk/platform-utils.h b/tolk/platform-utils.h new file mode 100644 index 00000000..7b16226e --- /dev/null +++ b/tolk/platform-utils.h @@ -0,0 +1,44 @@ +/* + This file is part of TON Blockchain source code. + + TON Blockchain is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + TON Blockchain is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with TON Blockchain. If not, see . + + In addition, as a special exception, the copyright holders give permission + to link the code of portions of this program with the OpenSSL library. + You must obey the GNU General Public License in all respects for all + of the code used other than OpenSSL. If you modify file(s) with this + exception, you may extend this exception to your version of the file(s), + but you are not obligated to do so. 
If you do not wish to do so, delete this + exception statement from your version. If you delete this exception statement + from all source files in the program, then also delete it here. +*/ +#pragma once + +#if __GNUC__ +#define GNU_ATTRIBUTE_COLD [[gnu::cold]] +#define GNU_ATTRIBUTE_NORETURN [[gnu::noreturn]] +#define GNU_ATTRIBUTE_ALWAYS_INLINE [[gnu::always_inline]] +#else +#define GNU_ATTRIBUTE_COLD +#define GNU_ATTRIBUTE_NORETURN [[noreturn]] +#define GNU_ATTRIBUTE_ALWAYS_INLINE +#endif + +#if defined(__GNUC__) +#define LIKELY(x) __builtin_expect(x, true) +#define UNLIKELY(x) __builtin_expect(x, false) +#else +#define LIKELY(x) (x) +#define UNLIKELY(x) (x) +#endif diff --git a/tolk/src-file.cpp b/tolk/src-file.cpp new file mode 100644 index 00000000..93a92e60 --- /dev/null +++ b/tolk/src-file.cpp @@ -0,0 +1,164 @@ +/* + This file is part of TON Blockchain Library. + + TON Blockchain Library is free software: you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + TON Blockchain Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with TON Blockchain Library. If not, see . 
+*/ +#include "src-file.h" +#include + +namespace tolk { + +extern AllRegisteredSrcFiles all_src_files; +extern std::string stdlib_filename; + +static_assert(sizeof(SrcLocation) == 8); + +const SrcFile* AllRegisteredSrcFiles::find_file(int file_id) const { + for (const SrcFile* file : all_src_files) { + if (file->file_id == file_id) { + return file; + } + } + return nullptr; +} + +const SrcFile* AllRegisteredSrcFiles::find_file(const std::string& abs_filename) const { + for (const SrcFile* file : all_src_files) { + if (file->abs_filename == abs_filename) { + return file; + } + } + return nullptr; +} + +const SrcFile* AllRegisteredSrcFiles::register_file(const std::string& rel_filename, const std::string& abs_filename, std::string&& text, const SrcFile* included_from) { + SrcFile* created = new SrcFile(++last_file_id, rel_filename, abs_filename, std::move(text), included_from); + all_src_files.push_back(created); + return created; +} + + +bool SrcFile::is_entrypoint_file() const { + return file_id == (stdlib_filename.empty() ? 0 : 1); +} + +bool SrcFile::is_offset_valid(int offset) const { + return offset >= 0 && offset < static_cast(text.size()); +} + +SrcFile::SrcPosition SrcFile::convert_offset(int offset) const { + if (!is_offset_valid(offset)) { + return SrcPosition{offset, -1, -1, "invalid offset"}; + } + + int line_idx = 0; + int char_idx = 0; + int line_offset = 0; + for (int i = 0; i < offset; ++i) { + char c = text[i]; + if (c == '\n') { + line_idx++; + char_idx = 0; + line_offset = i + 1; + } else { + char_idx++; + } + } + + size_t line_len = text.size() - line_offset; + for (int i = line_offset; i < static_cast(text.size()); ++i) { + if (text[i] == '\n') { + line_len = i - line_offset; + break; + } + } + + std::string_view line_str(text.data() + line_offset, line_len); + return SrcPosition{offset, line_idx + 1, char_idx + 1, line_str}; +} + + +std::ostream& operator<<(std::ostream& os, const SrcFile* src_file) { + return os << (src_file ? 
src_file->rel_filename : "unknown-location"); +} + +std::ostream& operator<<(std::ostream& os, const Fatal& fatal) { + return os << fatal.what(); +} + +const SrcFile* SrcLocation::get_src_file() const { + return all_src_files.find_file(file_id); +} + +void SrcLocation::show(std::ostream& os) const { + const SrcFile* src_file = get_src_file(); + os << src_file; + if (src_file && src_file->is_offset_valid(char_offset)) { + SrcFile::SrcPosition pos = src_file->convert_offset(char_offset); + os << ':' << pos.line_no << ':' << pos.char_no; + } +} + +void SrcLocation::show_context(std::ostream& os) const { + const SrcFile* src_file = get_src_file(); + if (!src_file || !src_file->is_offset_valid(char_offset)) { + return; + } + SrcFile::SrcPosition pos = src_file->convert_offset(char_offset); + os << " " << pos.line_str << "\n"; + + os << " "; + for (int i = 1; i < pos.char_no; ++i) { + os << ' '; + } + os << '^' << "\n"; +} + +std::ostream& operator<<(std::ostream& os, SrcLocation loc) { + loc.show(os); + return os; +} + +void SrcLocation::show_general_error(std::ostream& os, const std::string& message, const std::string& err_type) const { + show(os); + if (!err_type.empty()) { + os << ": " << err_type; + } + os << ": " << message << std::endl; + show_context(os); +} + +void SrcLocation::show_note(const std::string& err_msg) const { + show_general_error(std::cerr, err_msg, "note"); +} + +void SrcLocation::show_warning(const std::string& err_msg) const { + show_general_error(std::cerr, err_msg, "warning"); +} + +void SrcLocation::show_error(const std::string& err_msg) const { + show_general_error(std::cerr, err_msg, "error"); +} + +std::ostream& operator<<(std::ostream& os, const ParseError& error) { + error.show(os); + return os; +} + +void ParseError::show(std::ostream& os) const { + os << where << ": error: " << message << std::endl; + where.show_context(os); +} + +} // namespace tolk diff --git a/tolk/src-file.h b/tolk/src-file.h new file mode 100644 index 
00000000..0f76d787 --- /dev/null +++ b/tolk/src-file.h @@ -0,0 +1,120 @@ +/* + This file is part of TON Blockchain Library. + + TON Blockchain Library is free software: you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + TON Blockchain Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with TON Blockchain Library. If not, see . +*/ +#pragma once + +#include +#include + +namespace tolk { + +struct SrcFile { + struct SrcPosition { + int offset; + int line_no; + int char_no; + std::string_view line_str; + }; + + int file_id; + std::string rel_filename; + std::string abs_filename; + std::string text; + const SrcFile* included_from{nullptr}; + + SrcFile(int file_id, std::string rel_filename, std::string abs_filename, std::string&& text, const SrcFile* included_from) + : file_id(file_id) + , rel_filename(std::move(rel_filename)) + , abs_filename(std::move(abs_filename)) + , text(std::move(text)) + , included_from(included_from) { } + + SrcFile(const SrcFile& other) = delete; + SrcFile &operator=(const SrcFile&) = delete; + + bool is_entrypoint_file() const; + bool is_offset_valid(int offset) const; + SrcPosition convert_offset(int offset) const; +}; + +class AllRegisteredSrcFiles { + std::vector all_src_files; + int last_file_id = -1; + +public: + const SrcFile *find_file(int file_id) const; + const SrcFile* find_file(const std::string& abs_filename) const; + const SrcFile* register_file(const std::string& rel_filename, const std::string& abs_filename, std::string&& text, const SrcFile* included_from); + const std::vector& 
get_all_files() const { return all_src_files; } +}; + +struct Fatal final : std::exception { + std::string message; + + explicit Fatal(std::string _msg) : message(std::move(_msg)) { + } + const char* what() const noexcept override { + return message.c_str(); + } +}; + +std::ostream& operator<<(std::ostream& os, const Fatal& fatal); + +// SrcLocation points to a location (line, column) in some loaded .tolk source SrcFile. +// Note, that instead of storing src_file, line_no, etc., only 2 ints are stored. +// The purpose is: sizeof(SrcLocation) == 8, so it's just passed/stored without pointers/refs, just like int64_t. +// When decoding SrcLocation into human-readable format, it's converted to SrcFile::SrcPosition via offset. +class SrcLocation { + friend class Lexer; + + int file_id = -1; // file_id from AllRegisteredSrcFiles + int char_offset = -1; // offset from SrcFile::text + +public: + + SrcLocation() = default; + explicit SrcLocation(const SrcFile* src_file) : file_id(src_file->file_id) { + } + + bool is_defined() const { return file_id != -1; } + const SrcFile* get_src_file() const; + + void show(std::ostream& os) const; + void show_context(std::ostream& os) const; + + void show_general_error(std::ostream& os, const std::string& message, const std::string& err_type) const; + void show_note(const std::string& err_msg) const; + void show_warning(const std::string& err_msg) const; + void show_error(const std::string& err_msg) const; +}; + +std::ostream& operator<<(std::ostream& os, SrcLocation loc); + +struct ParseError : std::exception { + SrcLocation where; + std::string message; + ParseError(SrcLocation _where, std::string _msg) : where(_where), message(std::move(_msg)) { + } + + const char* what() const noexcept override { + return message.c_str(); + } + void show(std::ostream& os) const; +}; + +std::ostream& operator<<(std::ostream& os, const ParseError& error); + +} // namespace tolk diff --git a/tolk/srcread.cpp b/tolk/srcread.cpp deleted file mode 100644 
index c71f498d..00000000 --- a/tolk/srcread.cpp +++ /dev/null @@ -1,228 +0,0 @@ -/* - This file is part of TON Blockchain Library. - - TON Blockchain Library is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. - - TON Blockchain Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public License - along with TON Blockchain Library. If not, see . -*/ -#include "srcread.h" -#include - -namespace tolk { - -/* - * - * SOURCE FILE READER - * - */ - -std::ostream& operator<<(std::ostream& os, const FileDescr* fdescr) { - return os << (fdescr ? (fdescr->is_stdin ? 
"stdin" : fdescr->filename) : "unknown-location"); -} - -std::ostream& operator<<(std::ostream& os, const Fatal& fatal) { - return os << fatal.get_msg(); -} - -const char* FileDescr::convert_offset(long offset, long* line_no, long* line_pos, long* line_size) const { - long lno = 0, lpos = -1, lsize = 0; - const char* lstart = nullptr; - if (offset >= 0 && offset < (long)text.size()) { - auto it = std::upper_bound(line_offs.begin(), line_offs.end(), offset); - lno = it - line_offs.begin(); - if (lno && it != line_offs.end()) { - lsize = it[0] - it[-1]; - lpos = offset - it[-1]; - lstart = text.data() + it[-1]; - } - } else { - lno = (long)line_offs.size(); - } - if (line_no) { - *line_no = lno; - } - if (line_pos) { - *line_pos = lpos; - } - if (line_size) { - *line_size = lsize; - } - return lstart; -} - -const char* FileDescr::push_line(std::string new_line) { - if (line_offs.empty()) { - line_offs.push_back(0); - } - std::size_t cur_size = text.size(); - text += new_line; - text += '\0'; - line_offs.push_back((long)text.size()); - return text.data() + cur_size; -} - -void SrcLocation::show(std::ostream& os) const { - os << fdescr; - long line_no, line_pos; - if (fdescr && convert_pos(&line_no, &line_pos)) { - os << ':' << line_no; - if (line_pos >= 0) { - os << ':' << (line_pos + 1); - } - } -} - -bool SrcLocation::show_context(std::ostream& os) const { - long line_no, line_pos, line_size; - if (!fdescr || !convert_pos(&line_no, &line_pos, &line_size)) { - return false; - } - bool skip_left = (line_pos > 200), skip_right = (line_pos + 200u < line_size); - const char* here = fdescr->text.data() + char_offs; - const char* base = here - line_pos; - const char* start = skip_left ? here - 100 : base; - const char* end = skip_right ? here + 100 : base + line_size; - os << " "; - if (skip_left) { - os << "... 
"; - } - for (const char* ptr = start; ptr < end; ptr++) { - os << (char)*ptr; - } - if (skip_right) { - os << " ..."; - } - os << std::endl; - os << " "; - if (skip_left) { - os << "... "; - } - for (const char* ptr = start; ptr < here; ptr++) { - char c = *ptr; - os << (c == 9 || c == 10 ? c : ' '); - } - os << '^' << std::endl; - return true; -} - -std::ostream& operator<<(std::ostream& os, const SrcLocation& loc) { - loc.show(os); - return os; -} - -void SrcLocation::show_gen_error(std::ostream& os, std::string message, std::string err_type) const { - show(os); - if (!err_type.empty()) { - os << ": " << err_type; - } - os << ": " << message << std::endl; - show_context(os); -} - -std::ostream& operator<<(std::ostream& os, const Error& error) { - error.show(os); - return os; -} - -void ParseError::show(std::ostream& os) const { - os << where << ": error: " << message << std::endl; - where.show_context(os); -} - -SourceReader::SourceReader(std::istream* _is, FileDescr* _fdescr) - : ifs(_is), fdescr(_fdescr), loc(_fdescr), eof(false), cur_line_len(0), start(0), cur(0), end(0) { - load_line(); -} - -void SourceReader::set_eof() { - if (!eof) { - eof = true; - start = cur = end = 0; - } -} - -int SourceReader::skip_spc() { - if (!cur) { - return 0; - } - const char* ptr = cur; - int res = 0; - while (*ptr == ' ' || *ptr == 9) { - ++ptr; - ++res; - } - set_ptr(ptr); - return res; -} - -bool SourceReader::seek_eof() { - while (seek_eoln()) { - if (!load_line()) { - return true; - } - } - return false; -} - -const char* SourceReader::set_ptr(const char* ptr) { - if (ptr != cur) { - if (ptr < cur || ptr > end) { - error("parsing position went outside of line"); - } - loc.char_offs += ptr - cur; - cur = ptr; - } - return ptr; -} - -bool SourceReader::load_line() { - if (eof) { - return false; - } - loc.set_eof(); - if (ifs->eof()) { - set_eof(); - return false; - } - std::getline(*ifs, cur_line); - if (ifs->fail()) { - set_eof(); - if (!ifs->eof()) { - error("cannot read 
line from source stream"); - } - return false; - } - std::size_t len = cur_line.size(); - if (len > 0xffffff) { - set_eof(); - error("line too long"); - return false; - } - if (len && cur_line.back() == '\r') { - // CP/M line breaks support - cur_line.pop_back(); - --len; - } - cur_line_len = (int)len; - if (fdescr) { - cur = start = fdescr->push_line(std::move(cur_line)); - end = start + len; - loc.char_offs = (std::size_t)(cur - fdescr->text.data()); - cur_line.clear(); - } else { - cur = start = cur_line.c_str(); - end = start + cur_line_len; - } - return true; -} - -} // namespace tolk diff --git a/tolk/srcread.h b/tolk/srcread.h deleted file mode 100644 index 3731a5ca..00000000 --- a/tolk/srcread.h +++ /dev/null @@ -1,162 +0,0 @@ -/* - This file is part of TON Blockchain Library. - - TON Blockchain Library is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. - - TON Blockchain Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public License - along with TON Blockchain Library. If not, see . 
-*/ -#pragma once - -#include -#include -#include - -namespace tolk { - -/* - * - * SOURCE FILE READER - * - */ - -struct FileDescr { - std::string filename; - std::string text; - std::vector line_offs; - bool is_stdin; - bool is_main = false; - FileDescr(std::string _fname, bool _stdin = false) : filename(std::move(_fname)), is_stdin(_stdin) { - } - const char* push_line(std::string new_line); - const char* convert_offset(long offset, long* line_no, long* line_pos, long* line_size = nullptr) const; -}; - -struct Fatal { - std::string message; - Fatal(std::string _msg) : message(std::move(_msg)) { - } - std::string get_msg() const { - return message; - } -}; - -std::ostream& operator<<(std::ostream& os, const Fatal& fatal); - -struct SrcLocation { - const FileDescr* fdescr; - long char_offs; - SrcLocation() : fdescr(nullptr), char_offs(-1) { - } - SrcLocation(const FileDescr* _fdescr, long offs = -1) : fdescr(_fdescr), char_offs(-1) { - } - bool defined() const { - return fdescr; - } - bool eof() const { - return char_offs == -1; - } - void set_eof() { - char_offs = -1; - } - const char* convert_pos(long* line_no, long* line_pos, long* line_size = nullptr) const { - return defined() ? 
fdescr->convert_offset(char_offs, line_no, line_pos, line_size) : nullptr; - } - void show(std::ostream& os) const; - bool show_context(std::ostream& os) const; - void show_gen_error(std::ostream& os, std::string message, std::string err_type = "") const; - void show_note(std::string err_msg) const { - show_gen_error(std::cerr, err_msg, "note"); - } - void show_warning(std::string err_msg) const { - show_gen_error(std::cerr, err_msg, "warning"); - } - void show_error(std::string err_msg) const { - show_gen_error(std::cerr, err_msg, "error"); - } -}; - -std::ostream& operator<<(std::ostream& os, const SrcLocation& loc); - -struct Error { - virtual ~Error() = default; - virtual void show(std::ostream& os) const = 0; -}; - -std::ostream& operator<<(std::ostream& os, const Error& error); - -struct ParseError : Error { - SrcLocation where; - std::string message; - ParseError(const SrcLocation& _where, std::string _msg) : where(_where), message(_msg) { - } - ParseError(const SrcLocation* _where, std::string _msg) : message(_msg) { - if (_where) { - where = *_where; - } - } - ~ParseError() override = default; - void show(std::ostream& os) const override; -}; - -class SourceReader { - std::istream* ifs; - FileDescr* fdescr; - SrcLocation loc; - bool eof; - std::string cur_line; - int cur_line_len; - void set_eof(); - const char *start, *cur, *end; - - public: - SourceReader(std::istream* _is, FileDescr* _fdescr); - bool load_line(); - bool is_eof() const { - return eof; - } - int is_eoln() const { - return cur == end; - } - int skip_spc(); - bool seek_eoln() { - skip_spc(); - return is_eoln(); - } - bool seek_eof(); - const char* cur_line_cstr() const { - return cur_line.c_str(); - } - const SrcLocation& here() const { - return loc; - } - char cur_char() const { - return *cur; - } - char next_char() const { - return cur[1]; - } - const char* get_ptr() const { - return cur; - } - const char* get_end_ptr() const { - return end; - } - const char* set_ptr(const char* ptr); - 
void advance(int n) { - set_ptr(get_ptr() + n); - } - void error(std::string err_msg) { - throw ParseError{loc, err_msg}; - } -}; - -} // namespace tolk diff --git a/tolk/symtable.cpp b/tolk/symtable.cpp index ee6d6aca..ec409ab2 100644 --- a/tolk/symtable.cpp +++ b/tolk/symtable.cpp @@ -28,13 +28,19 @@ namespace tolk { int scope_level; -SymTable<100003> symbols; +SymTable symbols; -SymDef* sym_def[symbols.hprime + 1]; -SymDef* global_sym_def[symbols.hprime + 1]; +SymDef* sym_def[symbols.SIZE_PRIME + 1]; +SymDef* global_sym_def[symbols.SIZE_PRIME + 1]; std::vector> symbol_stack; std::vector scope_opened_at; +Symbol::Symbol(std::string str, sym_idx_t idx) : str(std::move(str)), idx(idx) { + subclass = this->str[0] == '.' ? SymbolSubclass::dot_identifier + : this->str[0] == '~' ? SymbolSubclass::tilde_identifier + : SymbolSubclass::undef; +} + std::string Symbol::unknown_symbol_name(sym_idx_t i) { if (!i) { return "_"; @@ -45,57 +51,43 @@ std::string Symbol::unknown_symbol_name(sym_idx_t i) { } } -sym_idx_t SymTableBase::gen_lookup(std::string str, int mode, sym_idx_t idx) { +sym_idx_t SymTable::gen_lookup(std::string_view str, int mode, sym_idx_t idx) { unsigned long long h1 = 1, h2 = 1; for (char c : str) { - h1 = ((h1 * 239) + (unsigned char)(c)) % p; - h2 = ((h2 * 17) + (unsigned char)(c)) % (p - 1); + h1 = ((h1 * 239) + (unsigned char)(c)) % SIZE_PRIME; + h2 = ((h2 * 17) + (unsigned char)(c)) % (SIZE_PRIME - 1); } ++h2; ++h1; while (true) { - if (sym_table[h1]) { - if (sym_table[h1]->str == str) { + if (sym[h1]) { + if (sym[h1]->str == str) { return (mode & 2) ? not_found : sym_idx_t(h1); } h1 += h2; - if (h1 > p) { - h1 -= p; + if (h1 > SIZE_PRIME) { + h1 -= SIZE_PRIME; } } else { if (!(mode & 1)) { return not_found; } - if (def_sym >= ((long long)p * 3) / 4) { + if (def_sym >= ((long long)SIZE_PRIME * 3) / 4) { throw SymTableOverflow{def_sym}; } - sym_table[h1] = std::make_unique(str, idx <= 0 ? 
sym_idx_t(h1) : -idx); + sym[h1] = std::make_unique(static_cast(str), idx <= 0 ? sym_idx_t(h1) : -idx); ++def_sym; return sym_idx_t(h1); } } } -SymTableBase& SymTableBase::add_keyword(std::string str, sym_idx_t idx) { - if (idx <= 0) { - idx = ++def_kw; - } - sym_idx_t res = gen_lookup(str, -1, idx); - if (!res) { - throw SymTableKwRedef{str}; - } - if (idx < max_kw_idx) { - keywords[idx] = res; - } - return *this; -} - -void open_scope(Lexer& lex) { +void open_scope(SrcLocation loc) { ++scope_level; - scope_opened_at.push_back(lex.cur().loc); + scope_opened_at.push_back(loc); } -void close_scope(Lexer& lex) { +void close_scope(SrcLocation loc) { if (!scope_level) { throw Fatal{"cannot close the outer scope"}; } @@ -124,24 +116,20 @@ void close_scope(Lexer& lex) { scope_opened_at.pop_back(); } -SymDef* lookup_symbol(sym_idx_t idx, int flags) { +SymDef* lookup_symbol(sym_idx_t idx) { if (!idx) { return nullptr; } - if ((flags & 1) && sym_def[idx]) { + if (sym_def[idx]) { return sym_def[idx]; } - if ((flags & 2) && global_sym_def[idx]) { + if (global_sym_def[idx]) { return global_sym_def[idx]; } return nullptr; } -SymDef* lookup_symbol(std::string name, int flags) { - return lookup_symbol(symbols.lookup(name), flags); -} - -SymDef* define_global_symbol(sym_idx_t name_idx, bool force_new, const SrcLocation& loc) { +SymDef* define_global_symbol(sym_idx_t name_idx, bool force_new, SrcLocation loc) { if (!name_idx) { return nullptr; } @@ -156,7 +144,7 @@ SymDef* define_global_symbol(sym_idx_t name_idx, bool force_new, const SrcLocati return found; } -SymDef* define_symbol(sym_idx_t name_idx, bool force_new, const SrcLocation& loc) { +SymDef* define_symbol(sym_idx_t name_idx, bool force_new, SrcLocation loc) { if (!name_idx) { return nullptr; } @@ -176,7 +164,7 @@ SymDef* define_symbol(sym_idx_t name_idx, bool force_new, const SrcLocation& loc return found; } found = sym_def[name_idx] = new SymDef(scope_level, name_idx, loc); - 
symbol_stack.push_back(std::make_pair(scope_level, SymDef{0, name_idx})); + symbol_stack.push_back(std::make_pair(scope_level, SymDef{0, name_idx, loc})); #ifdef TOLK_DEBUG found->sym_name = found->name(); symbol_stack.back().second.sym_name = found->name(); diff --git a/tolk/symtable.h b/tolk/symtable.h index 68a4a1da..67f949a1 100644 --- a/tolk/symtable.h +++ b/tolk/symtable.h @@ -15,8 +15,9 @@ along with TON Blockchain Library. If not, see . */ #pragma once -#include "srcread.h" -#include "lexer.h" +#include "src-file.h" +#include +#include #include namespace tolk { @@ -29,11 +30,12 @@ namespace tolk { typedef int var_idx_t; +enum class SymValKind { _Param, _Var, _Func, _Typename, _GlobVar, _Const }; + struct SymValBase { - enum { _Param, _Var, _Func, _Typename, _GlobVar, _Const }; - int type; + SymValKind kind; int idx; - SymValBase(int _type, int _idx) : type(_type), idx(_idx) { + SymValBase(SymValKind kind, int idx) : kind(kind), idx(idx) { } virtual ~SymValBase() = default; }; @@ -44,92 +46,69 @@ struct SymValBase { * */ -// defined outside this module (by the end user) -int compute_symbol_subclass(std::string str); // return 0 if unneeded +enum class SymbolSubclass { + undef = 0, + dot_identifier = 1, // begins with . 
(a const method) + tilde_identifier = 2 // begins with ~ (a non-const method) +}; typedef int sym_idx_t; struct Symbol { std::string str; sym_idx_t idx; - int subclass; - Symbol(std::string _str, sym_idx_t _idx, int _sc) : str(_str), idx(_idx), subclass(_sc) { - } - Symbol(std::string _str, sym_idx_t _idx) : str(_str), idx(_idx) { - subclass = compute_symbol_subclass(std::move(_str)); - } + SymbolSubclass subclass; + + Symbol(std::string str, sym_idx_t idx); + static std::string unknown_symbol_name(sym_idx_t i); }; -class SymTableBase { - unsigned p; - std::unique_ptr* sym_table; - sym_idx_t def_kw, def_sym; +class SymTable { +public: + static constexpr int SIZE_PRIME = 100003; + +private: + sym_idx_t def_sym{0}; + std::unique_ptr sym[SIZE_PRIME + 1]; + sym_idx_t gen_lookup(std::string_view str, int mode = 0, sym_idx_t idx = 0); + static constexpr int max_kw_idx = 10000; sym_idx_t keywords[max_kw_idx]; - public: - SymTableBase(unsigned p_, std::unique_ptr* sym_table_) - : p(p_), sym_table(sym_table_), def_kw(0x100), def_sym(0) { - std::memset(keywords, 0, sizeof(keywords)); - } +public: + static constexpr sym_idx_t not_found = 0; - SymTableBase& add_keyword(std::string str, sym_idx_t idx = 0); - SymTableBase& add_kw_char(char c) { - return add_keyword(std::string{c}, c); - } - sym_idx_t lookup(std::string str, int mode = 0) { + sym_idx_t lookup(const std::string_view& str, int mode = 0) { return gen_lookup(str, mode); } - sym_idx_t lookup_add(std::string str) { + sym_idx_t lookup_add(const std::string& str) { return gen_lookup(str, 1); } Symbol* operator[](sym_idx_t i) const { - return sym_table[i].get(); + return sym[i].get(); } bool is_keyword(sym_idx_t i) const { - return sym_table[i] && sym_table[i]->idx < 0; + return sym[i] && sym[i]->idx < 0; } std::string get_name(sym_idx_t i) const { - return sym_table[i] ? sym_table[i]->str : Symbol::unknown_symbol_name(i); + return sym[i] ? 
sym[i]->str : Symbol::unknown_symbol_name(i); } - int get_subclass(sym_idx_t i) const { - return sym_table[i] ? sym_table[i]->subclass : 0; + SymbolSubclass get_subclass(sym_idx_t i) const { + return sym[i] ? sym[i]->subclass : SymbolSubclass::undef; } Symbol* get_keyword(int i) const { - return ((unsigned)i < (unsigned)max_kw_idx) ? sym_table[keywords[i]].get() : nullptr; + return ((unsigned)i < (unsigned)max_kw_idx) ? sym[keywords[i]].get() : nullptr; } - protected: - sym_idx_t gen_lookup(std::string str, int mode = 0, sym_idx_t idx = 0); -}; - -template -class SymTable : public SymTableBase { - public: - static constexpr int hprime = pp; - static int size() { - return pp + 1; - } - - private: - std::unique_ptr sym[pp + 1]; - - public: - SymTable() : SymTableBase(pp, sym) { - } - SymTable& add_keyword(std::string str, sym_idx_t idx = 0) { - SymTableBase::add_keyword(str, idx); - return *this; - } - SymTable& add_kw_char(char c) { - return add_keyword(std::string{c}, c); + SymTable() { + std::memset(keywords, 0, sizeof(keywords)); } }; struct SymTableOverflow { int sym_def; - SymTableOverflow(int x) : sym_def(x) { + explicit SymTableOverflow(int x) : sym_def(x) { } }; @@ -139,7 +118,7 @@ struct SymTableKwRedef { } }; -extern SymTable<100003> symbols; +extern SymTable symbols; extern int scope_level; @@ -151,7 +130,7 @@ struct SymDef { #ifdef TOLK_DEBUG std::string sym_name; #endif - SymDef(int lvl, sym_idx_t idx, const SrcLocation& _loc = {}, SymValBase* val = 0) + SymDef(int lvl, sym_idx_t idx, SrcLocation _loc, SymValBase* val = nullptr) : level(lvl), sym_idx(idx), value(val), loc(_loc) { } bool has_name() const { @@ -162,17 +141,16 @@ struct SymDef { } }; -extern SymDef* sym_def[symbols.hprime + 1]; -extern SymDef* global_sym_def[symbols.hprime + 1]; +extern SymDef* sym_def[symbols.SIZE_PRIME + 1]; +extern SymDef* global_sym_def[symbols.SIZE_PRIME + 1]; extern std::vector> symbol_stack; extern std::vector scope_opened_at; -void open_scope(Lexer& lex); -void 
close_scope(Lexer& lex); -SymDef* lookup_symbol(sym_idx_t idx, int flags = 3); -SymDef* lookup_symbol(std::string name, int flags = 3); +void open_scope(SrcLocation loc); +void close_scope(SrcLocation loc); +SymDef* lookup_symbol(sym_idx_t idx); -SymDef* define_global_symbol(sym_idx_t name_idx, bool force_new = false, const SrcLocation& loc = {}); -SymDef* define_symbol(sym_idx_t name_idx, bool force_new = false, const SrcLocation& loc = {}); +SymDef* define_global_symbol(sym_idx_t name_idx, bool force_new = false, SrcLocation loc = {}); +SymDef* define_symbol(sym_idx_t name_idx, bool force_new, SrcLocation loc); } // namespace tolk diff --git a/tolk/tolk-main.cpp b/tolk/tolk-main.cpp index e4b6ebdb..ce08a552 100644 --- a/tolk/tolk-main.cpp +++ b/tolk/tolk-main.cpp @@ -30,67 +30,41 @@ void usage(const char* progname) { std::cerr - << "usage: " << progname - << " [-vIAPSR][-O][-i][-o][-W] { ...}\n" - "\tGenerates Fift TVM assembler code from a Tolk source\n" - "-I\tEnables interactive mode (parse stdin)\n" - "-o\tWrites generated code into specified file instead of stdout\n" - "-v\tIncreases verbosity level (extra information output into stderr)\n" - "-i\tSets indentation for the output code (in two-space units)\n" - "-A\tPrefix code with `\"Asm.fif\" include` preamble\n" + << "usage: " << progname << " [options] \n" + "\tGenerates Fift TVM assembler code from a .tolk file\n" + "-o\tWrites generated code into specified .fif file instead of stdout\n" + "-b\tGenerate Fift instructions to save TVM bytecode into .boc file\n" "-O\tSets optimization level (2 by default)\n" - "-P\tEnvelope code into PROGRAM{ ... }END>c\n" - "-S\tInclude stack layout comments in the output code\n" - "-R\tInclude operation rewrite comments in the output code\n" - "-W\tInclude Fift code to serialize and save generated code into specified BoC file. 
Enables " - "-A and -P.\n" - "\t-s\tOutput semantic version of Tolk and exit\n" - "\t-V\tShow Tolk build information\n"; + "-S\tDon't include stack layout comments into Fift output\n" + "-e\tIncreases verbosity level (extra output into stderr)\n" + "-v\tOutput version of Tolk and exit\n"; std::exit(2); } int main(int argc, char* const argv[]) { int i; std::string output_filename; - while ((i = getopt(argc, argv, "Ahi:Io:O:PRsSvW:V")) != -1) { + while ((i = getopt(argc, argv, "o:b:O:Sevh")) != -1) { switch (i) { - case 'A': - tolk::asm_preamble = true; - break; - case 'I': - tolk::interactive = true; - break; - case 'i': - tolk::indent = std::max(0, atoi(optarg)); - break; case 'o': output_filename = optarg; break; + case 'b': + tolk::boc_output_filename = optarg; + break; case 'O': tolk::opt_level = std::max(0, atoi(optarg)); break; - case 'P': - tolk::program_envelope = true; - break; - case 'R': - tolk::op_rewrite_comments = true; - break; case 'S': - tolk::stack_layout_comments = true; + tolk::stack_layout_comments = false; break; - case 'v': + case 'e': ++tolk::verbosity; break; - case 'W': - tolk::boc_output_filename = optarg; - tolk::asm_preamble = tolk::program_envelope = true; - break; - case 's': - std::cout << tolk::tolk_version << "\n"; - std::exit(0); - case 'V': - std::cout << "Tolk semantic version: v" << tolk::tolk_version << "\n"; - std::cout << "Build information: [ Commit: " << GitMetadata::CommitSHA1() << ", Date: " << GitMetadata::CommitDate() << "]\n"; + case 'v': + std::cout << "Tolk compiler v" << tolk::tolk_version << "\n"; + std::cout << "Build commit: " << GitMetadata::CommitSHA1() << "\n"; + std::cout << "Build date: " << GitMetadata::CommitDate() << "\n"; std::exit(0); case 'h': default: @@ -110,13 +84,14 @@ int main(int argc, char* const argv[]) { outs = fs.get(); } - std::vector sources; - - while (optind < argc) { - sources.push_back(std::string(argv[optind++])); + if (optind != argc - 1) { + std::cerr << "invalid usage: should 
specify exactly one input file.tolk"; + return 2; } + std::string entrypoint_file_name = argv[optind]; + tolk::read_callback = tolk::fs_read_callback; - return tolk::tolk_proceed(sources, *outs, std::cerr); + return tolk::tolk_proceed(entrypoint_file_name, *outs, std::cerr); } diff --git a/tolk/tolk-wasm.cpp b/tolk/tolk-wasm.cpp index 6ffc798e..7cf28ba3 100644 --- a/tolk/tolk-wasm.cpp +++ b/tolk/tolk-wasm.cpp @@ -31,81 +31,58 @@ #include "td/utils/Status.h" #include #include -#include "vm/boc.h" td::Result compile_internal(char *config_json) { TRY_RESULT(input_json, td::json_decode(td::MutableSlice(config_json))) - auto &obj = input_json.get_object(); + td::JsonObject& config = input_json.get_object(); - TRY_RESULT(opt_level, td::get_json_object_int_field(obj, "optLevel", false)); - TRY_RESULT(sources_obj, td::get_json_object_field(obj, "sources", td::JsonValue::Type::Array, false)); - - auto &sources_arr = sources_obj.get_array(); - - std::vector sources; - - for (auto &item : sources_arr) { - sources.push_back(item.get_string().str()); - } + TRY_RESULT(opt_level, td::get_json_object_int_field(config, "optimizationLevel", true, 2)); + TRY_RESULT(stack_comments, td::get_json_object_bool_field(config, "withStackComments", true, false)); + TRY_RESULT(entrypoint_file_name, td::get_json_object_string_field(config, "entrypointFileName", false)); tolk::opt_level = std::max(0, opt_level); - tolk::program_envelope = true; tolk::verbosity = 0; - tolk::indent = 1; + tolk::stack_layout_comments = stack_comments; std::ostringstream outs, errs; - auto compile_res = tolk::tolk_proceed(sources, outs, errs); - - if (compile_res != 0) { - return td::Status::Error(std::string("Tolk compilation error: ") + errs.str()); + int tolk_res = tolk::tolk_proceed(entrypoint_file_name, outs, errs); + if (tolk_res != 0) { + return td::Status::Error("Tolk compilation error: " + errs.str()); } - TRY_RESULT(code_cell, fift::compile_asm(outs.str(), "/fiftlib/", false)); - TRY_RESULT(boc, 
vm::std_boc_serialize(code_cell)); + TRY_RESULT(fift_res, fift::compile_asm_program(outs.str(), "/fiftlib/")); td::JsonBuilder result_json; - auto result_obj = result_json.enter_object(); - result_obj("status", "ok"); - result_obj("codeBoc", td::base64_encode(boc)); - result_obj("fiftCode", outs.str()); - result_obj("codeHashHex", code_cell->get_hash().to_hex()); - result_obj.leave(); - - outs.clear(); - errs.clear(); + auto obj = result_json.enter_object(); + obj("status", "ok"); + obj("fiftCode", fift_res.fiftCode); + obj("codeBoc64", fift_res.codeBoc64); + obj("codeHashHex", fift_res.codeHashHex); + obj.leave(); return result_json.string_builder().as_cslice().str(); } -/// Callback used to retrieve additional source files or data. -/// -/// @param _kind The kind of callback (a string). -/// @param _data The data for the callback (a string). -/// @param o_contents A pointer to the contents of the file, if found. Allocated via malloc(). -/// @param o_error A pointer to an error message, if there is one. Allocated via malloc(). -/// -/// The callback implementor must use malloc() to allocate storage for -/// contents or error. The callback implementor must use free() to free -/// said storage after tolk_compile returns. -/// -/// If the callback is not supported, *o_contents and *o_error must be set to NULL. -typedef void (*CStyleReadFileCallback)(char const* _kind, char const* _data, char** o_contents, char** o_error); +/// Callback used to retrieve file contents from a "not file system". See tolk-js for implementation. +/// The callback must fill either destContents or destError. +/// The implementor must use malloc() for them and use free() after tolk_compile returns. 
+typedef void (*CStyleReadFileCallback)(int kind, char const* data, char** destContents, char** destError); tolk::ReadCallback::Callback wrapReadCallback(CStyleReadFileCallback _readCallback) { tolk::ReadCallback::Callback readCallback; if (_readCallback) { - readCallback = [=](tolk::ReadCallback::Kind _kind, char const* _data) -> td::Result { - char* contents_c = nullptr; - char* error_c = nullptr; - _readCallback(tolk::ReadCallback::kindString(_kind).data(), _data, &contents_c, &error_c); - if (!contents_c && !error_c) { + readCallback = [=](tolk::ReadCallback::Kind kind, char const* data) -> td::Result { + char* destContents = nullptr; + char* destError = nullptr; + _readCallback(static_cast(kind), data, &destContents, &destError); + if (!destContents && !destError) { return td::Status::Error("Callback not supported"); } - if (contents_c) { - return contents_c; + if (destContents) { + return destContents; } - return td::Status::Error(std::string(error_c)); + return td::Status::Error(std::string(destError)); }; } return readCallback; diff --git a/tolk/tolk.cpp b/tolk/tolk.cpp index 1b8a17a1..1fce3ebf 100644 --- a/tolk/tolk.cpp +++ b/tolk/tolk.cpp @@ -24,18 +24,17 @@ from all source files in the program, then also delete it here. */ #include "tolk.h" -#include "srcread.h" #include "lexer.h" #include #include "git.h" #include #include "td/utils/port/path.h" +#include namespace tolk { -int verbosity, indent, opt_level = 2; -bool stack_layout_comments, op_rewrite_comments, program_envelope, asm_preamble; -bool interactive = false; +int verbosity = 0, opt_level = 2; +bool stack_layout_comments = true; GlobalPragma pragma_allow_post_modification{"allow-post-modification"}; GlobalPragma pragma_compute_asm_ltr{"compute-asm-ltr"}; GlobalPragma pragma_remove_unused_functions{"remove-unused-functions"}; @@ -82,23 +81,13 @@ void GlobalPragma::enable(SrcLocation loc) { ". 
Please, remove this line from your code."); return; } + if (!loc.get_src_file()->is_entrypoint_file()) { + // todo generally it's not true; rework pragmas completely + loc.show_warning(PSTRING() << "#pragma " << name_ << + " should be used in the main file only."); + } enabled_ = true; - locs_.push_back(std::move(loc)); -} - -void GlobalPragma::check_enable_in_libs() { - if (locs_.empty()) { - return; - } - for (const SrcLocation& loc : locs_) { - if (loc.fdescr->is_main) { - return; - } - } - locs_[0].show_warning(PSTRING() << "#pragma " << name_ - << " is enabled in included libraries, it may change the behavior of your code. " - << "Add this #pragma to the main source file to suppress this warning."); } void GlobalPragma::always_on_and_deprecated(const char *deprecated_from_v) { @@ -109,14 +98,19 @@ void GlobalPragma::always_on_and_deprecated(const char *deprecated_from_v) { td::Result fs_read_callback(ReadCallback::Kind kind, const char* query) { switch (kind) { case ReadCallback::Kind::ReadFile: { - std::ifstream ifs{query}; - if (ifs.fail()) { - auto msg = std::string{"cannot open source file `"} + query + "`"; - return td::Status::Error(msg); + struct stat f_stat; + int res = stat(query, &f_stat); + if (res != 0) { + return td::Status::Error(std::string{"cannot open source file: "} + query); } - std::stringstream ss; - ss << ifs.rdbuf(); - return ss.str(); + + size_t file_size = static_cast(f_stat.st_size); + std::string str; + str.resize(file_size); + FILE* f = fopen(query, "r"); + fread(str.data(), file_size, 1, f); + fclose(f); + return std::move(str); } case ReadCallback::Kind::Realpath: { return td::realpath(td::CSlice(query)); @@ -241,7 +235,7 @@ void generate_output_func(SymDef* func_sym, std::ostream &outs, std::ostream &er } else if (func_val->is_inline_ref()) { modifier = "REF"; } - outs << std::string(indent * 2, ' ') << name << " PROC" << modifier << ":<{\n"; + outs << std::string(2, ' ') << name << " PROC" << modifier << ":<{\n"; int mode = 0; 
if (stack_layout_comments) { mode |= Stack::_StkCmt | Stack::_CptStkCmt; @@ -255,8 +249,8 @@ void generate_output_func(SymDef* func_sym, std::ostream &outs, std::ostream &er if (func_val->is_inline() || func_val->is_inline_ref()) { mode |= Stack::_InlineAny; } - code.generate_code(outs, mode, indent + 1); - outs << std::string(indent * 2, ' ') << "}>\n"; + code.generate_code(outs, mode, 2); + outs << std::string(2, ' ') << "}>\n"; if (verbosity >= 2) { errs << "--------------\n"; } @@ -264,13 +258,9 @@ void generate_output_func(SymDef* func_sym, std::ostream &outs, std::ostream &er } int generate_output(std::ostream &outs, std::ostream &errs) { - if (asm_preamble) { - outs << "\"Asm.fif\" include\n"; - } + outs << "\"Asm.fif\" include\n"; outs << "// automatically generated from " << generated_from << std::endl; - if (program_envelope) { - outs << "PROGRAM{\n"; - } + outs << "PROGRAM{\n"; mark_used_symbols(); for (SymDef* func_sym : glob_func) { SymValCodeFunc* func_val = dynamic_cast(func_sym->value); @@ -283,7 +273,7 @@ int generate_output(std::ostream &outs, std::ostream &errs) { } std::string name = symbols.get_name(func_sym->sym_idx); - outs << std::string(indent * 2, ' '); + outs << std::string(2, ' '); if (func_val->method_id.is_null()) { outs << "DECLPROC " << name << "\n"; } else { @@ -300,7 +290,7 @@ int generate_output(std::ostream &outs, std::ostream &errs) { continue; } std::string name = symbols.get_name(gvar_sym->sym_idx); - outs << std::string(indent * 2, ' ') << "DECLGLOBVAR " << name << "\n"; + outs << std::string(2, ' ') << "DECLGLOBVAR " << name << "\n"; } int errors = 0; for (SymDef* func_sym : glob_func) { @@ -310,76 +300,46 @@ int generate_output(std::ostream &outs, std::ostream &errs) { } try { generate_output_func(func_sym, outs, errs); - } catch (Error& err) { + } catch (ParseError& err) { errs << "cannot generate code for function `" << symbols.get_name(func_sym->sym_idx) << "`:\n" << err << std::endl; ++errors; } } - if 
(program_envelope) { - outs << "}END>c\n"; - } + outs << "}END>c\n"; if (!boc_output_filename.empty()) { - outs << "2 boc+>B \"" << boc_output_filename << "\" B>file\n"; + outs << "boc>B \"" << boc_output_filename << "\" B>file\n"; } return errors; } -void output_inclusion_stack(std::ostream &errs) { - while (!inclusion_locations.empty()) { - SrcLocation loc = inclusion_locations.top(); - inclusion_locations.pop(); - if (loc.fdescr) { - errs << "note: included from "; - loc.show(errs); - errs << std::endl; - } - } -} - -int tolk_proceed(const std::vector &sources, std::ostream &outs, std::ostream &errs) { - if (program_envelope && !indent) { - indent = 1; - } - - define_keywords(); +int tolk_proceed(const std::string &entrypoint_file_name, std::ostream &outs, std::ostream &errs) { define_builtins(); + lexer_init(); pragma_allow_post_modification.always_on_and_deprecated("0.5.0"); pragma_compute_asm_ltr.always_on_and_deprecated("0.5.0"); - int ok = 0, proc = 0; try { - for (auto src : sources) { - ok += parse_source_file(src.c_str(), {}, true); - proc++; - } - if (interactive) { - generated_from += "stdin "; - ok += parse_source_stdin(); - proc++; - } - if (ok < proc) { + bool ok = parse_source_file(entrypoint_file_name.c_str(), {}); + if (!ok) { throw Fatal{"output code generation omitted because of errors"}; } - if (!proc) { - throw Fatal{"no source files, no output"}; - } - pragma_remove_unused_functions.check_enable_in_libs(); + + // todo #ifdef TOLK_PROFILING + comment + // lexer_measure_performance(all_src_files.get_all_files()); + return generate_output(outs, errs); } catch (Fatal& fatal) { errs << "fatal: " << fatal << std::endl; - output_inclusion_stack(errs); return 2; - } catch (Error& error) { + } catch (ParseError& error) { errs << error << std::endl; - output_inclusion_stack(errs); return 2; } catch (UnifyError& unif_err) { errs << "fatal: "; unif_err.print_message(errs); errs << std::endl; - output_inclusion_stack(errs); return 2; } diff --git 
a/tolk/tolk.h b/tolk/tolk.h index 9086620b..27e26f05 100644 --- a/tolk/tolk.h +++ b/tolk/tolk.h @@ -15,6 +15,7 @@ along with TON Blockchain Library. If not, see . */ #pragma once +#include #include #include #include @@ -26,7 +27,7 @@ #include "common/refcnt.hpp" #include "common/bigint.hpp" #include "common/refint.h" -#include "srcread.h" +#include "src-file.h" #include "lexer.h" #include "symtable.h" #include "td/utils/Status.h" @@ -45,104 +46,6 @@ constexpr int optimize_depth = 20; const std::string tolk_version{"0.4.5"}; -enum Keyword { - _Eof = -1, - _Ident = 0, - _Number, - _Special, - _String, - _Return = 0x80, - _Var, - _Repeat, - _Do, - _While, - _Until, - _Try, - _Catch, - _If, - _Ifnot, - _Then, - _Else, - _Elseif, - _Elseifnot, - _Eq, - _Neq, - _Leq, - _Geq, - _Spaceship, - _Lshift, - _Rshift, - _RshiftR, - _RshiftC, - _DivR, - _DivC, - _ModR, - _ModC, - _DivMod, - _PlusLet, - _MinusLet, - _TimesLet, - _DivLet, - _DivRLet, - _DivCLet, - _ModLet, - _ModRLet, - _ModCLet, - _LshiftLet, - _RshiftLet, - _RshiftRLet, - _RshiftCLet, - _AndLet, - _OrLet, - _XorLet, - _Int, - _Cell, - _Slice, - _Builder, - _Cont, - _Tuple, - _Type, - _Mapsto, - _Forall, - _Asm, - _Impure, - _Pure, - _Global, - _Extern, - _Inline, - _InlineRef, - _Builtin, - _AutoApply, - _MethodId, - _Get, - _Operator, - _Infix, - _Infixl, - _Infixr, - _Const, - _PragmaHashtag, - _IncludeHashtag -}; - -void define_keywords(); - -class IdSc { - int cls; - - public: - enum { undef = 0, dotid = 1, tildeid = 2 }; - IdSc(int _cls = undef) : cls(_cls) { - } - operator int() { - return cls; - } -}; - -// symbol subclass: -// 1 = begins with . 
(a const method) -// 2 = begins with ~ (a non-const method) -// 0 = else - /* * * TYPE EXPRESSIONS @@ -152,13 +55,13 @@ class IdSc { struct TypeExpr { enum te_type { te_Unknown, te_Var, te_Indirect, te_Atomic, te_Tensor, te_Tuple, te_Map, te_ForAll } constr; enum AtomicType { - _Int = Keyword::_Int, - _Cell = Keyword::_Cell, - _Slice = Keyword::_Slice, - _Builder = Keyword::_Builder, - _Cont = Keyword::_Cont, - _Tuple = Keyword::_Tuple, - _Type = Keyword::_Type + _Int = tok_int, + _Cell = tok_cell, + _Slice = tok_slice, + _Builder = tok_builder, + _Cont = tok_cont, + _Tuple = tok_tuple, + _Type = tok_type }; int value; int minw, maxw; @@ -279,14 +182,18 @@ struct TypeExpr { std::ostream& operator<<(std::ostream& os, TypeExpr* type_expr); -struct UnifyError { +struct UnifyError : std::exception { TypeExpr* te1; TypeExpr* te2; std::string msg; - UnifyError(TypeExpr* _te1, TypeExpr* _te2, std::string _msg = "") : te1(_te1), te2(_te2), msg(_msg) { + + UnifyError(TypeExpr* _te1, TypeExpr* _te2, std::string _msg = "") : te1(_te1), te2(_te2), msg(std::move(_msg)) { } + void print_message(std::ostream& os) const; - std::string message() const; + const char* what() const noexcept override { + return msg.c_str(); + } }; std::ostream& operator<<(std::ostream& os, const UnifyError& ue); @@ -310,18 +217,13 @@ struct TmpVar { int cls; sym_idx_t name; int coord; - std::unique_ptr where; - std::vector> on_modification; - bool undefined = false; - TmpVar(var_idx_t _idx, int _cls, TypeExpr* _type = 0, SymDef* sym = 0, const SrcLocation* loc = 0); + SrcLocation where; + std::vector> on_modification; + + TmpVar(var_idx_t _idx, int _cls, TypeExpr* _type, SymDef* sym, SrcLocation loc); void show(std::ostream& os, int omit_idx = 0) const; void dump(std::ostream& os) const; - void set_location(const SrcLocation& loc); - std::string to_string() const { - std::ostringstream s; - show(s, 2); - return s.str(); - } + void set_location(SrcLocation loc); }; struct VarDescr { @@ -566,25 +468,25 
@@ struct Op { std::unique_ptr block0, block1; td::RefInt256 int_const; std::string str_const; - Op(const SrcLocation& _where = {}, OpKind _cl = _Undef) : cl(_cl), flags(0), fun_ref(nullptr), where(_where) { + Op(SrcLocation _where = {}, OpKind _cl = _Undef) : cl(_cl), flags(0), fun_ref(nullptr), where(_where) { } - Op(const SrcLocation& _where, OpKind _cl, const std::vector& _left) + Op(SrcLocation _where, OpKind _cl, const std::vector& _left) : cl(_cl), flags(0), fun_ref(nullptr), where(_where), left(_left) { } - Op(const SrcLocation& _where, OpKind _cl, std::vector&& _left) + Op(SrcLocation _where, OpKind _cl, std::vector&& _left) : cl(_cl), flags(0), fun_ref(nullptr), where(_where), left(std::move(_left)) { } - Op(const SrcLocation& _where, OpKind _cl, const std::vector& _left, td::RefInt256 _const) + Op(SrcLocation _where, OpKind _cl, const std::vector& _left, td::RefInt256 _const) : cl(_cl), flags(0), fun_ref(nullptr), where(_where), left(_left), int_const(_const) { } - Op(const SrcLocation& _where, OpKind _cl, const std::vector& _left, std::string _const) + Op(SrcLocation _where, OpKind _cl, const std::vector& _left, std::string _const) : cl(_cl), flags(0), fun_ref(nullptr), where(_where), left(_left), str_const(_const) { } - Op(const SrcLocation& _where, OpKind _cl, const std::vector& _left, const std::vector& _right, + Op(SrcLocation _where, OpKind _cl, const std::vector& _left, const std::vector& _right, SymDef* _fun = nullptr) : cl(_cl), flags(0), fun_ref(_fun), where(_where), left(_left), right(_right) { } - Op(const SrcLocation& _where, OpKind _cl, std::vector&& _left, std::vector&& _right, + Op(SrcLocation _where, OpKind _cl, std::vector&& _left, std::vector&& _right, SymDef* _fun = nullptr) : cl(_cl), flags(0), fun_ref(_fun), where(_where), left(std::move(_left)), right(std::move(_right)) { } @@ -700,8 +602,8 @@ struct CodeBlob { return res; } bool import_params(FormalArgList arg_list); - var_idx_t create_var(int cls, TypeExpr* var_type = 0, SymDef* 
sym = 0, const SrcLocation* loc = 0); - var_idx_t create_tmp_var(TypeExpr* var_type = 0, const SrcLocation* loc = 0) { + var_idx_t create_var(int cls, TypeExpr* var_type, SymDef* sym, SrcLocation loc); + var_idx_t create_tmp_var(TypeExpr* var_type, SrcLocation loc) { return create_var(TmpVar::_Tmp, var_type, nullptr, loc); } int split_vars(bool strict = false); @@ -712,14 +614,14 @@ struct CodeBlob { cur_ops_stack.push(cur_ops); cur_ops = &new_cur_ops; } - void close_blk(const SrcLocation& location) { + void close_blk(SrcLocation location) { *cur_ops = std::make_unique(location, Op::_Nop); } void pop_cur() { cur_ops = cur_ops_stack.top(); cur_ops_stack.pop(); } - void close_pop_cur(const SrcLocation& location) { + void close_pop_cur(SrcLocation location) { close_blk(location); pop_cur(); } @@ -730,7 +632,7 @@ struct CodeBlob { void generate_code(AsmOpList& out_list, int mode = 0); void generate_code(std::ostream& os, int mode = 0, int indent = 0); - void on_var_modification(var_idx_t idx, const SrcLocation& here) const { + void on_var_modification(var_idx_t idx, SrcLocation here) const { for (auto& f : vars.at(idx).on_modification) { f(here); } @@ -746,8 +648,8 @@ struct CodeBlob { struct SymVal : SymValBase { TypeExpr* sym_type; bool auto_apply{false}; - SymVal(int _type, int _idx, TypeExpr* _stype = nullptr) - : SymValBase(_type, _idx), sym_type(_stype) { + SymVal(SymValKind kind, int idx, TypeExpr* sym_type = nullptr) + : SymValBase(kind, idx), sym_type(sym_type) { } ~SymVal() override = default; TypeExpr* get_type() const { @@ -774,9 +676,9 @@ struct SymValFunc : SymVal { #endif ~SymValFunc() override = default; SymValFunc(int val, TypeExpr* _ft, bool marked_as_pure) - : SymVal(_Func, val, _ft), flags(marked_as_pure ? flagMarkedAsPure : 0) {} + : SymVal(SymValKind::_Func, val, _ft), flags(marked_as_pure ? 
flagMarkedAsPure : 0) {} SymValFunc(int val, TypeExpr* _ft, std::initializer_list _arg_order, std::initializer_list _ret_order, bool marked_as_pure) - : SymVal(_Func, val, _ft), flags(marked_as_pure ? flagMarkedAsPure : 0), arg_order(_arg_order), ret_order(_ret_order) { + : SymVal(SymValKind::_Func, val, _ft), flags(marked_as_pure ? flagMarkedAsPure : 0), arg_order(_arg_order), ret_order(_ret_order) { } const std::vector* get_arg_order() const { @@ -818,7 +720,7 @@ struct SymValCodeFunc : SymValFunc { struct SymValType : SymValBase { TypeExpr* sym_type; - SymValType(int _type, int _idx, TypeExpr* _stype = nullptr) : SymValBase(_type, _idx), sym_type(_stype) { + SymValType(SymValKind kind, int idx, TypeExpr* _stype = nullptr) : SymValBase(kind, idx), sym_type(_stype) { } ~SymValType() override = default; TypeExpr* get_type() const { @@ -834,7 +736,7 @@ struct SymValGlobVar : SymValBase { std::string name; // seeing variable name in debugger makes it much easier to delve into Tolk sources #endif SymValGlobVar(int val, TypeExpr* gvtype, int oidx = 0) - : SymValBase(_GlobVar, val), sym_type(gvtype), out_idx(oidx) { + : SymValBase(SymValKind::_GlobVar, val), sym_type(gvtype), out_idx(oidx) { } ~SymValGlobVar() override = default; TypeExpr* get_type() const { @@ -843,16 +745,16 @@ struct SymValGlobVar : SymValBase { }; struct SymValConst : SymValBase { + enum ConstKind { IntConst, SliceConst }; + td::RefInt256 intval; std::string strval; - Keyword type; + ConstKind kind; SymValConst(int idx, td::RefInt256 value) - : SymValBase(_Const, idx), intval(value) { - type = _Int; + : SymValBase(SymValKind::_Const, idx), intval(value), kind(IntConst) { } SymValConst(int idx, std::string value) - : SymValBase(_Const, idx), strval(value) { - type = _Slice; + : SymValBase(SymValKind::_Const, idx), strval(value), kind(SliceConst) { } ~SymValConst() override = default; td::RefInt256 get_int_value() const { @@ -861,8 +763,8 @@ struct SymValConst : SymValBase { std::string 
get_str_value() const { return strval; } - Keyword get_type() const { - return type; + ConstKind get_kind() const { + return kind; } }; @@ -882,35 +784,21 @@ public: ReadCallback(ReadCallback const&) = delete; ReadCallback& operator=(ReadCallback const&) = delete; - enum class Kind - { + enum class Kind { + Realpath, ReadFile, - Realpath }; - static std::string kindString(Kind _kind) - { - switch (_kind) - { - case Kind::ReadFile: - return "source"; - case Kind::Realpath: - return "realpath"; - default: - throw ""; // todo ? - } - } - /// File reading or generic query callback. - using Callback = std::function(ReadCallback::Kind, const char*)>; + using Callback = std::function(Kind, const char*)>; }; // defined in parse-tolk.cpp -bool parse_source(std::istream* is, const FileDescr* fdescr); -bool parse_source_file(const char* filename, Lexem lex = {}, bool is_main = false); -bool parse_source_stdin(); +void parse_source(const SrcFile* file); +bool parse_source_file(const char* filename, SrcLocation loc_included_from); extern std::stack inclusion_locations; +extern AllRegisteredSrcFiles all_src_files; /* * @@ -949,7 +837,7 @@ struct Expr { std::vector args; explicit Expr(ExprCls c = _None) : cls(c) { } - Expr(ExprCls c, const SrcLocation& loc) : cls(c), here(loc) { + Expr(ExprCls c, SrcLocation loc) : cls(c), here(loc) { } Expr(ExprCls c, std::vector _args) : cls(c), args(std::move(_args)) { } @@ -990,14 +878,13 @@ struct Expr { bool is_mktuple() const { return cls == _MkTuple; } - void chk_rvalue(const Lexem& lem) const; - void chk_lvalue(const Lexem& lem) const; - void chk_type(const Lexem& lem) const; - bool deduce_type(const Lexem& lem); - void set_location(const SrcLocation& loc) { + void chk_rvalue(const Lexer& lex) const; // todo here and below: strange to pass Lexer + void chk_lvalue(const Lexer& lex) const; + bool deduce_type(const Lexer& lex); + void set_location(SrcLocation loc) { here = loc; } - const SrcLocation& get_location() const { + SrcLocation 
get_location() const { return here; } int define_new_vars(CodeBlob& code); @@ -1699,11 +1586,11 @@ struct Stack { * */ -typedef std::function&, std::vector&, const SrcLocation)> simple_compile_func_t; +typedef std::function&, std::vector&, SrcLocation)> simple_compile_func_t; typedef std::function&, std::vector&)> compile_func_t; inline simple_compile_func_t make_simple_compile(AsmOp op) { - return [op](std::vector& out, std::vector& in, const SrcLocation&) -> AsmOp { return op; }; + return [op](std::vector& out, std::vector& in, SrcLocation) -> AsmOp { return op; }; } inline compile_func_t make_ext_compile(std::vector&& ops) { @@ -1739,7 +1626,7 @@ struct SymValAsmFunc : SymValFunc { std::initializer_list ret_order = {}, bool marked_as_pure = false) : SymValFunc(-1, ft, arg_order, ret_order, marked_as_pure), ext_compile(std::move(_compile)) { } - bool compile(AsmOpList& dest, std::vector& out, std::vector& in, const SrcLocation& where) const; + bool compile(AsmOpList& dest, std::vector& out, std::vector& in, SrcLocation where) const; }; // defined in builtins.cpp @@ -1753,8 +1640,8 @@ AsmOp push_const(td::RefInt256 x); void define_builtins(); -extern int verbosity, indent, opt_level; -extern bool stack_layout_comments, op_rewrite_comments, program_envelope, asm_preamble, interactive; +extern int verbosity, opt_level; +extern bool stack_layout_comments; extern std::string generated_from, boc_output_filename; extern ReadCallback::Callback read_callback; @@ -1764,6 +1651,7 @@ class GlobalPragma { public: explicit GlobalPragma(std::string name) : name_(std::move(name)) { } + const std::string& name() const { return name_; } @@ -1771,14 +1659,12 @@ class GlobalPragma { return enabled_; } void enable(SrcLocation loc); - void check_enable_in_libs(); void always_on_and_deprecated(const char *deprecated_from_v); private: std::string name_; bool enabled_ = false; const char *deprecated_from_v_ = nullptr; - std::vector locs_; }; extern GlobalPragma 
pragma_allow_post_modification, pragma_compute_asm_ltr, pragma_remove_unused_functions; @@ -1788,7 +1674,7 @@ extern GlobalPragma pragma_allow_post_modification, pragma_compute_asm_ltr, prag * */ -int tolk_proceed(const std::vector &sources, std::ostream &outs, std::ostream &errs); +int tolk_proceed(const std::string &entrypoint_file_name, std::ostream &outs, std::ostream &errs); } // namespace tolk diff --git a/tolk/unify-types.cpp b/tolk/unify-types.cpp index 848e454a..04de323d 100644 --- a/tolk/unify-types.cpp +++ b/tolk/unify-types.cpp @@ -354,12 +354,6 @@ std::ostream& operator<<(std::ostream& os, const UnifyError& ue) { return os; } -std::string UnifyError::message() const { - std::ostringstream os; - print_message(os); - return os.str(); -} - void check_width_compat(TypeExpr* te1, TypeExpr* te2) { if (te1->minw > te2->maxw || te2->minw > te1->maxw) { std::ostringstream os{"cannot unify types of widths ", std::ios_base::ate};