Improve handling outbound message queues (#825)

* Improve handling outbound message queues

* Cleanup queue faster
* Calculate queue sizes in background
* Force or limit split/merge depending on queue size

* Increase validate_ref limit for transaction

* Add all changes of public libraries to block size estimation

* Don't crash on timeout in GC

* Don't import external messages when queue is too big

---------

Co-authored-by: SpyCheese <mikle98@yandex.ru>
EmelyanenkoK 2023-12-13 12:57:34 +03:00 committed by GitHub
parent 3a595ce849
commit 5e6b67ae96
16 changed files with 727 additions and 111 deletions

@@ -44,6 +44,12 @@ namespace validator {
using td::Ref;
using namespace std::literals::string_literals;
// Don't increase MERGE_MAX_QUEUE_SIZE too much: merging requires cleaning the whole queue in out_msg_queue_cleanup
static const td::uint32 FORCE_SPLIT_QUEUE_SIZE = 4096;
static const td::uint32 SPLIT_MAX_QUEUE_SIZE = 100000;
static const td::uint32 MERGE_MAX_QUEUE_SIZE = 2047;
static const td::uint32 SKIP_EXTERNALS_QUEUE_SIZE = 8000;
#define DBG(__n) dbg(__n)&&
#define DSTART int __dcnt = 0;
#define DEB DBG(++__dcnt)
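
For orientation, the four new thresholds order as MERGE_MAX_QUEUE_SIZE < FORCE_SPLIT_QUEUE_SIZE < SKIP_EXTERNALS_QUEUE_SIZE < SPLIT_MAX_QUEUE_SIZE. The sketch below is not part of the diff (the real decisions are spread across check_block_overload and process_inbound_external_messages later in this file); it only summarizes, under that assumption, how the collator reads the queue size against them:

#include <cstdint>

// Values copied from the constants above.
static const uint32_t FORCE_SPLIT_QUEUE_SIZE = 4096;
static const uint32_t SPLIT_MAX_QUEUE_SIZE = 100000;
static const uint32_t MERGE_MAX_QUEUE_SIZE = 2047;
static const uint32_t SKIP_EXTERNALS_QUEUE_SIZE = 8000;

// Hypothetical one-stop summary of the per-block decisions; the real code
// also folds in the block load classification (underload/overload).
struct QueuePolicy {
  bool skip_externals;  // stop importing external messages
  bool allow_merge;     // merging requires cleaning the whole queue, so cap it
  bool allow_split;     // don't push for a split when the queue is already huge
  bool force_split;     // set the overload bit to push towards a split
};

static QueuePolicy classify_queue(uint32_t queue_size) {
  QueuePolicy p;
  p.skip_externals = queue_size > SKIP_EXTERNALS_QUEUE_SIZE;
  p.allow_merge = queue_size <= MERGE_MAX_QUEUE_SIZE;
  p.allow_split = queue_size <= SPLIT_MAX_QUEUE_SIZE;
  p.force_split = queue_size >= FORCE_SPLIT_QUEUE_SIZE && p.allow_split;
  return p;
}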
@@ -790,6 +796,26 @@ bool Collator::request_neighbor_msg_queues() {
return true;
}
/**
* Requests the size of the outbound message queue from the previous state(s).
*
* @returns True if the request was successful, false otherwise.
*/
bool Collator::request_out_msg_queue_size() {
if (after_split_) {
// If block is after split, the size is calculated during split (see Collator::split_last_state)
return true;
}
for (size_t i = 0; i < prev_blocks.size(); ++i) {
++pending;
send_closure_later(manager, &ValidatorManager::get_out_msg_queue_size, prev_blocks[i],
[self = get_self(), i](td::Result<td::uint32> res) {
td::actor::send_closure(std::move(self), &Collator::got_out_queue_size, i, std::move(res));
});
}
return true;
}
/**
* Handles the result of obtaining the outbound queue for a neighbor.
*
@@ -854,6 +880,27 @@ void Collator::got_neighbor_out_queue(int i, td::Result<Ref<MessageQueue>> res)
check_pending();
}
/**
* Handles the result of obtaining the size of the outbound message queue.
*
* If the block is after merge then the two sizes are added.
*
* @param i The index of the previous block (0 or 1).
* @param res The result object containing the size of the queue.
*/
void Collator::got_out_queue_size(size_t i, td::Result<td::uint32> res) {
--pending;
if (res.is_error()) {
fatal_error(
res.move_as_error_prefix(PSTRING() << "failed to get message queue size from prev block #" << i << ": "));
return;
}
td::uint32 size = res.move_as_ok();
LOG(DEBUG) << "got outbound queue size from prev block #" << i << ": " << size;
out_msg_queue_size_ += size;
check_pending();
}
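
request_out_msg_queue_size and got_out_queue_size above form a simple fan-in: one request is sent per previous block (two after a merge), pending counts the outstanding replies, and each reply adds its size into out_msg_queue_size_. A minimal standalone sketch of the same pattern, with hypothetical types in place of the actor machinery:

#include <cstdint>
#include <functional>

// Hypothetical fan-in accumulator mirroring pending / out_msg_queue_size_.
class QueueSizeFanIn {
 public:
  QueueSizeFanIn(int n_prev_blocks, std::function<void(uint32_t)> on_done)
      : pending_(n_prev_blocks), on_done_(std::move(on_done)) {}

  // Called once per previous block, in any order (replies are asynchronous).
  void got_size(uint32_t size) {
    total_ += size;         // after a merge, the two halves' sizes add up
    if (--pending_ == 0) {  // analogous to check_pending() firing
      on_done_(total_);
    }
  }

 private:
  int pending_;
  uint32_t total_ = 0;
  std::function<void(uint32_t)> on_done_;
};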
/**
* Unpacks and merges the states of two previous blocks.
* Used if the block is after_merge.
@@ -972,7 +1019,7 @@ bool Collator::split_last_state(block::ShardState& ss) {
return fatal_error(res2.move_as_error());
}
sibling_processed_upto_ = res2.move_as_ok();
auto res3 = ss.split(shard_);
auto res3 = ss.split(shard_, &out_msg_queue_size_);
if (res3.is_error()) {
return fatal_error(std::move(res3));
}
@@ -1449,6 +1496,9 @@ bool Collator::do_preinit() {
if (!request_neighbor_msg_queues()) {
return false;
}
if (!request_out_msg_queue_size()) {
return false;
}
return true;
}
@@ -1824,7 +1874,6 @@ bool Collator::init_utime() {
// Extend collator timeout if previous block is too old
td::Timestamp new_timeout = td::Timestamp::in(std::min(30.0, (td::Clocks::system() - (double)prev_now_) / 2));
if (timeout < new_timeout) {
double add = new_timeout.at() - timeout.at();
timeout = new_timeout;
alarm_timestamp() = timeout;
}
@@ -2174,95 +2223,144 @@ bool Collator::out_msg_queue_cleanup() {
block::gen::t_OutMsgQueue.print(std::cerr, *rt);
rt->print_rec(std::cerr);
}
for (const auto& nb : neighbors_) {
if (!nb.is_disabled() && (!nb.processed_upto || !nb.processed_upto->can_check_processed())) {
return fatal_error(-667, PSTRING() << "internal error: no info for checking processed messages from neighbor "
<< nb.blk_.to_str());
}
}
auto queue_root = out_msg_queue_->get_root_cell();
if (queue_root.is_null()) {
LOG(DEBUG) << "out_msg_queue is empty";
return true;
}
auto old_out_msg_queue = std::make_unique<vm::AugmentedDictionary>(queue_root, 352, block::tlb::aug_OutMsgQueue);
if (after_merge_) {
// We need to clean the whole queue after merge
// Queue is not too big, see const MERGE_MAX_QUEUE_SIZE
for (const auto& nb : neighbors_) {
if (!nb.is_disabled() && (!nb.processed_upto || !nb.processed_upto->can_check_processed())) {
return fatal_error(-667, PSTRING() << "internal error: no info for checking processed messages from neighbor "
<< nb.blk_.to_str());
}
}
td::uint32 deleted = 0;
auto res = out_msg_queue_->filter([&](vm::CellSlice& cs, td::ConstBitPtr key, int n) -> int {
assert(n == 352);
block::EnqueuedMsgDescr enq_msg_descr;
unsigned long long created_lt;
if (!(cs.fetch_ulong_bool(64, created_lt) // augmentation
&& enq_msg_descr.unpack(cs) // unpack EnqueuedMsg
&& enq_msg_descr.check_key(key) // check key
&& enq_msg_descr.lt_ == created_lt)) {
LOG(ERROR) << "cannot unpack EnqueuedMsg with key " << key.to_hex(n);
return -1;
}
LOG(DEBUG) << "scanning outbound message with (lt,hash)=(" << enq_msg_descr.lt_ << ","
<< enq_msg_descr.hash_.to_hex() << ") enqueued_lt=" << enq_msg_descr.enqueued_lt_;
bool delivered = false;
ton::LogicalTime deliver_lt = 0;
for (const auto& neighbor : neighbors_) {
// could look up neighbor with shard containing enq_msg_descr.next_prefix more efficiently
// (instead of checking all neighbors)
if (!neighbor.is_disabled() && neighbor.processed_upto->already_processed(enq_msg_descr)) {
delivered = true;
deliver_lt = neighbor.end_lt();
break;
}
}
if (delivered) {
++deleted;
CHECK(out_msg_queue_size_ > 0);
--out_msg_queue_size_;
LOG(DEBUG) << "outbound message with (lt,hash)=(" << enq_msg_descr.lt_ << "," << enq_msg_descr.hash_.to_hex()
<< ") enqueued_lt=" << enq_msg_descr.enqueued_lt_ << " has been already delivered, dequeueing";
if (!dequeue_message(std::move(enq_msg_descr.msg_env_), deliver_lt)) {
fatal_error(PSTRING() << "cannot dequeue outbound message with (lt,hash)=(" << enq_msg_descr.lt_ << ","
<< enq_msg_descr.hash_.to_hex() << ") by inserting a msg_export_deq record");
return -1;
}
register_out_msg_queue_op();
if (!block_limit_status_->fits(block::ParamLimits::cl_normal)) {
block_full_ = true;
}
}
return !delivered;
});
LOG(INFO) << "deleted " << deleted << " messages from out_msg_queue after merge, remaining queue size is "
<< out_msg_queue_size_;
if (res < 0) {
return fatal_error("error scanning/updating OutMsgQueue");
}
} else {
std::vector<std::pair<block::OutputQueueMerger, const block::McShardDescr*>> queue_parts;
int deleted = 0;
int total = 0;
bool fail = false;
old_out_msg_queue->check_for_each([&](Ref<vm::CellSlice> value, td::ConstBitPtr key, int n) -> bool {
++total;
assert(n == 352);
vm::CellSlice& cs = value.write();
// LOG(DEBUG) << "key is " << key.to_hex(n);
if (queue_cleanup_timeout_.is_in_past(td::Timestamp::now())) {
LOG(WARNING) << "cleaning up outbound queue takes too long, ending";
outq_cleanup_partial_ = true;
return false; // retain all remaining outbound queue entries including this one without processing
block::OutputQueueMerger::Neighbor this_queue{BlockIdExt{new_id} /* block id is only used for logs */,
out_msg_queue_->get_root_cell()};
for (const auto& nb : neighbors_) {
if (nb.is_disabled()) {
continue;
}
if (!nb.processed_upto || !nb.processed_upto->can_check_processed()) {
return fatal_error(-667, PSTRING() << "internal error: no info for checking processed messages from neighbor "
<< nb.blk_.to_str());
}
queue_parts.emplace_back(block::OutputQueueMerger{nb.shard(), {this_queue}}, &nb);
}
if (block_full_) {
LOG(WARNING) << "BLOCK FULL while cleaning up outbound queue, cleanup completed only partially";
outq_cleanup_partial_ = true;
return false; // retain all remaining outbound queue entries including this one without processing
}
block::EnqueuedMsgDescr enq_msg_descr;
unsigned long long created_lt;
if (!(cs.fetch_ulong_bool(64, created_lt) // augmentation
&& enq_msg_descr.unpack(cs) // unpack EnqueuedMsg
&& enq_msg_descr.check_key(key) // check key
&& enq_msg_descr.lt_ == created_lt)) {
LOG(ERROR) << "cannot unpack EnqueuedMsg with key " << key.to_hex(n);
fail = true;
return false;
}
LOG(DEBUG) << "scanning outbound message with (lt,hash)=(" << enq_msg_descr.lt_ << ","
<< enq_msg_descr.hash_.to_hex() << ") enqueued_lt=" << enq_msg_descr.enqueued_lt_;
bool delivered = false;
ton::LogicalTime deliver_lt = 0;
for (const auto& neighbor : neighbors_) {
// could look up neighbor with shard containing enq_msg_descr.next_prefix more efficiently
// (instead of checking all neighbors)
if (!neighbor.is_disabled() && neighbor.processed_upto->already_processed(enq_msg_descr)) {
delivered = true;
deliver_lt = neighbor.end_lt();
size_t i = 0;
td::uint32 deleted = 0;
while (!queue_parts.empty()) {
if (block_full_) {
LOG(WARNING) << "BLOCK FULL while cleaning up outbound queue, cleanup completed only partially";
break;
}
}
if (delivered) {
LOG(DEBUG) << "outbound message with (lt,hash)=(" << enq_msg_descr.lt_ << "," << enq_msg_descr.hash_.to_hex()
<< ") enqueued_lt=" << enq_msg_descr.enqueued_lt_ << " has been already delivered, dequeueing";
++deleted;
out_msg_queue_->lookup_delete_with_extra(key, n);
if (!dequeue_message(std::move(enq_msg_descr.msg_env_), deliver_lt)) {
fatal_error(PSTRING() << "cannot dequeue outbound message with (lt,hash)=(" << enq_msg_descr.lt_ << ","
<< enq_msg_descr.hash_.to_hex() << ") by inserting a msg_export_deq record");
fail = true;
return false;
if (queue_cleanup_timeout_.is_in_past(td::Timestamp::now())) {
LOG(WARNING) << "cleaning up outbound queue takes too long, ending";
break;
}
register_out_msg_queue_op();
if (!block_limit_status_->fits(block::ParamLimits::cl_normal)) {
block_full_ = true;
if (i == queue_parts.size()) {
i = 0;
}
auto& queue = queue_parts.at(i).first;
auto nb = queue_parts.at(i).second;
auto kv = queue.extract_cur();
if (kv) {
block::EnqueuedMsgDescr enq_msg_descr;
if (!(enq_msg_descr.unpack(kv->msg.write()) // unpack EnqueuedMsg
&& enq_msg_descr.check_key(kv->key.cbits()) // check key
)) {
return fatal_error(PSTRING() << "error scanning/updating OutMsgQueue: cannot unpack EnqueuedMsg with key "
<< kv->key.to_hex());
}
if (nb->processed_upto->already_processed(enq_msg_descr)) {
LOG(DEBUG) << "scanning outbound message with (lt,hash)=(" << enq_msg_descr.lt_ << ","
<< enq_msg_descr.hash_.to_hex() << ") enqueued_lt=" << enq_msg_descr.enqueued_lt_
<< ": message has been already delivered, dequeueing";
++deleted;
CHECK(out_msg_queue_size_ > 0);
--out_msg_queue_size_;
out_msg_queue_->lookup_delete_with_extra(kv->key.cbits(), kv->key_len);
if (!dequeue_message(std::move(enq_msg_descr.msg_env_), nb->end_lt())) {
return fatal_error(PSTRING() << "cannot dequeue outbound message with (lt,hash)=(" << enq_msg_descr.lt_
<< "," << enq_msg_descr.hash_.to_hex()
<< ") by inserting a msg_export_deq record");
}
register_out_msg_queue_op();
if (!block_limit_status_->fits(block::ParamLimits::cl_normal)) {
block_full_ = true;
}
queue.next();
++i;
continue;
} else {
LOG(DEBUG) << "scanning outbound message with (lt,hash)=(" << enq_msg_descr.lt_ << ","
<< enq_msg_descr.hash_.to_hex() << ") enqueued_lt=" << enq_msg_descr.enqueued_lt_
<< ": message has not been delivered";
}
}
LOG(DEBUG) << "no more unprocessed messages to shard " << nb->shard().to_str();
std::swap(queue_parts[i], queue_parts.back());
queue_parts.pop_back();
}
return true;
}, false, true /* random order */);
LOG(INFO) << "deleted " << deleted << " messages from out_msg_queue, processed " << total << " messages in total";
if (fail) {
return fatal_error("error scanning/updating OutMsgQueue");
LOG(INFO) << "deleted " << deleted << " messages from out_msg_queue, remaining queue size is "
<< out_msg_queue_size_;
}
if (outq_cleanup_partial_ || total > 8000) {
LOG(INFO) << "out_msg_queue too big, skipping importing external messages";
skip_extmsg_ = true;
queue_too_big_ = true;
}
auto rt = out_msg_queue_->get_root();
if (verbosity >= 2) {
auto rt = out_msg_queue_->get_root();
std::cerr << "new out_msg_queue is ";
block::gen::t_OutMsgQueue.print(std::cerr, *rt);
rt->print_rec(std::cerr);
}
// CHECK(block::gen::t_OutMsgQueue.validate_upto(100000, *rt)); // DEBUG, comment later if SLOW
return register_out_msg_queue_op(true);
}
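
The non-merge branch above is where "cleanup queue faster" happens: instead of walking the whole queue dictionary, the collator builds one OutputQueueMerger cursor per neighbor (messages destined to that neighbor, in increasing lt order) and drains the cursors round-robin. Because each cursor is lt-ordered, the first undelivered message proves everything after it is undelivered too, so the whole cursor can be retired at that point. A simplified standalone sketch of that loop shape, with hypothetical stand-ins for the TON types:

#include <cstdint>
#include <deque>
#include <functional>
#include <utility>
#include <vector>

struct Msg {
  uint64_t lt;  // logical time; cursors yield messages in increasing lt
};

// Stand-in for one (OutputQueueMerger, neighbor) pair from queue_parts.
struct NeighborCursor {
  std::deque<Msg> msgs;            // unprocessed messages headed to this neighbor
  uint64_t processed_upto_lt = 0;  // stand-in for nb->processed_upto
  bool delivered(const Msg& m) const { return m.lt <= processed_upto_lt; }
};

// Round-robin cleanup under an external budget (block limits / timeout).
uint32_t cleanup(std::vector<NeighborCursor>& parts, uint32_t& queue_size,
                 const std::function<bool()>& budget_left) {
  uint32_t deleted = 0;
  size_t i = 0;
  while (!parts.empty() && budget_left()) {
    if (i == parts.size()) {
      i = 0;
    }
    NeighborCursor& part = parts[i];
    if (!part.msgs.empty() && part.delivered(part.msgs.front())) {
      part.msgs.pop_front();  // ~ lookup_delete_with_extra + dequeue_message
      --queue_size;           // ~ --out_msg_queue_size_
      ++deleted;
      ++i;                    // round-robin: give the next neighbor a turn
      continue;
    }
    // Head undelivered (so is everything after it) or cursor exhausted:
    // this neighbor is done, retire its cursor.
    std::swap(parts[i], parts.back());
    parts.pop_back();
  }
  return deleted;
}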
@@ -3047,6 +3145,7 @@ bool Collator::enqueue_transit_message(Ref<vm::Cell> msg, Ref<vm::Cell> old_msg_
try {
LOG(DEBUG) << "inserting into outbound queue message with (lt,key)=(" << start_lt << "," << key.to_hex() << ")";
ok = out_msg_queue_->set_builder(key.bits(), 352, cb, vm::Dictionary::SetMode::Add);
++out_msg_queue_size_;
} catch (vm::VmError) {
ok = false;
}
@@ -3069,6 +3168,8 @@ bool Collator::delete_out_msg_queue_msg(td::ConstBitPtr key) {
try {
LOG(DEBUG) << "deleting from outbound queue message with key=" << key.to_hex(352);
queue_rec = out_msg_queue_->lookup_delete(key, 352);
CHECK(out_msg_queue_size_ > 0);
--out_msg_queue_size_;
} catch (vm::VmError err) {
LOG(ERROR) << "error deleting from out_msg_queue dictionary: " << err.get_msg();
}
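
This hunk and the two enqueue hunks keep out_msg_queue_size_ in lock-step with the queue dictionary: every set_builder is paired with an increment, every lookup_delete with a CHECKed decrement. A hedged illustration of the invariant as a wrapper (hypothetical; the real code mutates the counter inline next to each dictionary operation):

#include <cassert>
#include <cstdint>
#include <map>
#include <string>

// Toy dictionary with a size counter maintained the same way the diff
// maintains out_msg_queue_ / out_msg_queue_size_.
class CountedQueue {
 public:
  bool add(const std::string& key, int value) {
    bool inserted = dict_.emplace(key, value).second;  // ~ SetMode::Add
    if (inserted) {
      ++size_;  // ~ ++out_msg_queue_size_
    }
    return inserted;
  }
  bool erase(const std::string& key) {
    if (dict_.erase(key) == 0) {
      return false;
    }
    assert(size_ > 0);  // ~ CHECK(out_msg_queue_size_ > 0)
    --size_;
    return true;
  }
  uint32_t size() const { return size_; }

 private:
  std::map<std::string, int> dict_;
  uint32_t size_ = 0;
};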
@@ -3309,8 +3410,9 @@ bool Collator::process_inbound_external_messages() {
LOG(INFO) << "skipping processing of inbound external messages";
return true;
}
if (out_msg_queue_->get_root_cell().not_null() && out_msg_queue_->get_root_cell()->get_depth() > 12) {
LOG(INFO) << "skipping processing of inbound external messages: out msg queue is too big";
if (out_msg_queue_size_ > SKIP_EXTERNALS_QUEUE_SIZE) {
LOG(INFO) << "skipping processing of inbound external messages because out_msg_queue is too big ("
<< out_msg_queue_size_ << " > " << SKIP_EXTERNALS_QUEUE_SIZE << ")";
return true;
}
bool full = !block_limit_status_->fits(block::ParamLimits::cl_soft);
@@ -3550,6 +3652,7 @@ bool Collator::enqueue_message(block::NewOutMsg msg, td::RefInt256 fwd_fees_rema
LOG(DEBUG) << "inserting into outbound queue a new message with (lt,key)=(" << start_lt << "," << key.to_hex()
<< ")";
ok = out_msg_queue_->set_builder(key.bits(), 352, cb, vm::Dictionary::SetMode::Add);
++out_msg_queue_size_;
} catch (vm::VmError) {
ok = false;
}
@@ -4158,6 +4261,7 @@ static int history_weight(td::uint64 history) {
* @returns True if the check is successful.
*/
bool Collator::check_block_overload() {
LOG(INFO) << "final out_msg_queue size is " << out_msg_queue_size_;
overload_history_ <<= 1;
underload_history_ <<= 1;
block_size_estimate_ = block_limit_status_->estimate_block_size();
@@ -4166,18 +4270,32 @@ bool Collator::check_block_overload() {
<< " size_estimate=" << block_size_estimate_;
auto cl = block_limit_status_->classify();
if (cl <= block::ParamLimits::cl_underload) {
if (queue_too_big_) {
LOG(INFO) << "block is underloaded, but don't set underload history because out msg queue is big";
if (out_msg_queue_size_ > MERGE_MAX_QUEUE_SIZE) {
LOG(INFO)
<< "block is underloaded, but don't set underload history because out_msg_queue size is too big to merge ("
<< out_msg_queue_size_ << " > " << MERGE_MAX_QUEUE_SIZE << ")";
} else {
underload_history_ |= 1;
LOG(INFO) << "block is underloaded";
}
} else if (cl >= block::ParamLimits::cl_soft) {
overload_history_ |= 1;
LOG(INFO) << "block is overloaded (category " << cl << ")";
if (out_msg_queue_size_ > SPLIT_MAX_QUEUE_SIZE) {
LOG(INFO) << "block is overloaded (category " << cl
<< "), but don't set overload history because out_msg_queue size is too big to split ("
<< out_msg_queue_size_ << " > " << SPLIT_MAX_QUEUE_SIZE << ")";
} else {
overload_history_ |= 1;
LOG(INFO) << "block is overloaded (category " << cl << ")";
}
} else {
LOG(INFO) << "block is loaded normally";
}
if (!(overload_history_ & 1) && out_msg_queue_size_ >= FORCE_SPLIT_QUEUE_SIZE &&
out_msg_queue_size_ <= SPLIT_MAX_QUEUE_SIZE) {
overload_history_ |= 1;
LOG(INFO) << "setting overload history because out_msg_queue reached force split limit (" << out_msg_queue_size_
<< " >= " << FORCE_SPLIT_QUEUE_SIZE << ")";
}
if (collator_settings & 1) {
LOG(INFO) << "want_split manually set";
want_split_ = true;
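
The histories updated above are per-collator shift registers: each collated block shifts the register left and conditionally sets bit 0 ("this block was underloaded/overloaded"), and history_weight (named in the hunk header above, its body not shown in this diff) turns the recent bits into a score that feeds the want_split_/want_merge_ decision. A toy version of that mechanism, with an illustrative weight only:

#include <cstdint>

// One bit per collated block, bit 0 = the most recent block.
struct LoadHistory {
  uint64_t bits = 0;
  void record(bool flag) {
    bits = (bits << 1) | (flag ? 1u : 0u);
  }
};

// Illustrative score: count hits among the last 16 blocks. The actual
// history_weight in collator.cpp weights several recent windows differently.
static int toy_weight(uint64_t history) {
  int count = 0;
  for (int i = 0; i < 16; ++i) {
    count += static_cast<int>((history >> i) & 1);
  }
  return count;
}

// Usage sketch: after check_block_overload() records the bit, a high enough
// overload score would suggest want_split_, a high underload score want_merge_.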