Mirror of https://github.com/ton-blockchain/ton (synced 2025-03-09 15:40:10 +00:00)
Improve handling outbound message queues (#825)
* Improve handling outbound message queues
* Cleanup queue faster
* Calculate queue sizes in background
* Force or limit split/merge depending on queue size
* Increase validate_ref limit for transaction
* Add all changes of public libraries to block size estimation
* Don't crash on timeout in GC
* Don't import external messages when queue is too big

Co-authored-by: SpyCheese <mikle98@yandex.ru>
Parent: 3a595ce849
Commit: 5e6b67ae96
16 changed files with 727 additions and 111 deletions
--- a/validator/impl/collator.cpp
+++ b/validator/impl/collator.cpp
@@ -44,6 +44,12 @@ namespace validator {
 using td::Ref;
 using namespace std::literals::string_literals;
 
+// Don't increase MERGE_MAX_QUEUE_SIZE too much: merging requires cleaning the whole queue in out_msg_queue_cleanup
+static const td::uint32 FORCE_SPLIT_QUEUE_SIZE = 4096;
+static const td::uint32 SPLIT_MAX_QUEUE_SIZE = 100000;
+static const td::uint32 MERGE_MAX_QUEUE_SIZE = 2047;
+static const td::uint32 SKIP_EXTERNALS_QUEUE_SIZE = 8000;
+
 #define DBG(__n) dbg(__n)&&
 #define DSTART int __dcnt = 0;
 #define DEB DBG(++__dcnt)
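These four thresholds drive every behavior change in this commit: forcing a split, capping when overload/underload history may accumulate, and skipping external messages. A minimal standalone sketch of the resulting policy (hypothetical helper, not code from this commit; the real checks are spread across the Collator methods in the hunks below):

    #include <cstdint>

    // Sketch only: summarizes how queue size gates collator decisions.
    struct QueuePolicy {
      bool force_split;     // set overload bit even for a normally loaded block
      bool allow_split;     // overload history may accumulate at all
      bool allow_merge;     // underload history may accumulate at all
      bool skip_externals;  // do not import external messages this block
    };

    inline QueuePolicy classify_queue(std::uint32_t queue_size) {
      QueuePolicy p{};
      p.force_split = queue_size >= 4096;    // FORCE_SPLIT_QUEUE_SIZE
      p.allow_split = queue_size <= 100000;  // SPLIT_MAX_QUEUE_SIZE
      p.allow_merge = queue_size <= 2047;    // MERGE_MAX_QUEUE_SIZE
      p.skip_externals = queue_size > 8000;  // SKIP_EXTERNALS_QUEUE_SIZE
      return p;
    }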
@@ -790,6 +796,26 @@ bool Collator::request_neighbor_msg_queues() {
   return true;
 }
 
+/**
+ * Requests the size of the outbound message queue from the previous state(s).
+ *
+ * @returns True if the request was successful, false otherwise.
+ */
+bool Collator::request_out_msg_queue_size() {
+  if (after_split_) {
+    // If block is after split, the size is calculated during split (see Collator::split_last_state)
+    return true;
+  }
+  for (size_t i = 0; i < prev_blocks.size(); ++i) {
+    ++pending;
+    send_closure_later(manager, &ValidatorManager::get_out_msg_queue_size, prev_blocks[i],
+                       [self = get_self(), i](td::Result<td::uint32> res) {
+                         td::actor::send_closure(std::move(self), &Collator::got_out_queue_size, i, std::move(res));
+                       });
+  }
+  return true;
+}
+
 /**
  * Handles the result of obtaining the outbound queue for a neighbor.
  *
@@ -854,6 +880,27 @@ void Collator::got_neighbor_out_queue(int i, td::Result<Ref<MessageQueue>> res)
   check_pending();
 }
 
+/**
+ * Handles the result of obtaining the size of the outbound message queue.
+ *
+ * If the block is after merge then the two sizes are added.
+ *
+ * @param i The index of the previous block (0 or 1).
+ * @param res The result object containing the size of the queue.
+ */
+void Collator::got_out_queue_size(size_t i, td::Result<td::uint32> res) {
+  --pending;
+  if (res.is_error()) {
+    fatal_error(
+        res.move_as_error_prefix(PSTRING() << "failed to get message queue size from prev block #" << i << ": "));
+    return;
+  }
+  td::uint32 size = res.move_as_ok();
+  LOG(DEBUG) << "got outbound queue size from prev block #" << i << ": " << size;
+  out_msg_queue_size_ += size;
+  check_pending();
+}
+
 /**
  * Unpacks and merges the states of two previous blocks.
  * Used if the block is after_merge.
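request_out_msg_queue_size and got_out_queue_size form a simple fan-in over actor messages: each outstanding request increments pending, each reply adds its size into out_msg_queue_size_ and decrements pending, and check_pending() lets collation proceed once every reply has arrived (after a merge there are two previous blocks, so the two sizes add up). A plain-callback sketch of the same counting pattern (illustrative only, not the td::actor API):

    #include <cstddef>
    #include <cstdint>
    #include <functional>

    // Sketch of the fan-in used by request_out_msg_queue_size /
    // got_out_queue_size, with plain callbacks instead of actor closures.
    struct SizeAggregator {
      int pending = 0;
      std::uint64_t total_size = 0;
      std::function<void()> on_all_replies;  // plays the role of check_pending()

      void request(std::size_t n_prev_blocks) {  // one request per prev block
        pending += static_cast<int>(n_prev_blocks);
      }
      void reply(std::uint32_t size) {  // success path of got_out_queue_size
        total_size += size;             // after merge, both prev sizes are added
        if (--pending == 0 && on_all_replies) {
          on_all_replies();
        }
      }
    };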
@@ -972,7 +1019,7 @@ bool Collator::split_last_state(block::ShardState& ss) {
     return fatal_error(res2.move_as_error());
   }
   sibling_processed_upto_ = res2.move_as_ok();
-  auto res3 = ss.split(shard_);
+  auto res3 = ss.split(shard_, &out_msg_queue_size_);
   if (res3.is_error()) {
     return fatal_error(std::move(res3));
   }
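The extra output parameter is how the after_split_ case in request_out_msg_queue_size is satisfied: while the state split partitions the outbound queue between the two child shards, it can count the entries that remain, so no separate background request is needed. A toy illustration of that idea, under the assumption that split(shard, &size) counts retained queue entries (ordinary map keyed by address prefix; the real queue is an augmented dictionary with a different key layout):

    #include <cstdint>
    #include <map>

    // Illustrative only: partition a queue by shard prefix and count what stays.
    inline std::uint32_t split_and_count(std::map<std::uint64_t, int>& queue,
                                         std::uint64_t shard_prefix,
                                         std::uint64_t prefix_mask) {
      std::uint32_t kept = 0;
      for (auto it = queue.begin(); it != queue.end();) {
        if ((it->first & prefix_mask) == (shard_prefix & prefix_mask)) {
          ++kept;  // destination stays in this half of the split shard
          ++it;
        } else {
          it = queue.erase(it);  // belongs to the sibling shard's queue
        }
      }
      return kept;  // becomes out_msg_queue_size_ for the after-split block
    }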
@@ -1449,6 +1496,9 @@ bool Collator::do_preinit() {
   if (!request_neighbor_msg_queues()) {
     return false;
   }
+  if (!request_out_msg_queue_size()) {
+    return false;
+  }
   return true;
 }
 
@@ -1824,7 +1874,6 @@ bool Collator::init_utime() {
   // Extend collator timeout if previous block is too old
   td::Timestamp new_timeout = td::Timestamp::in(std::min(30.0, (td::Clocks::system() - (double)prev_now_) / 2));
   if (timeout < new_timeout) {
-    double add = new_timeout.at() - timeout.at();
     timeout = new_timeout;
     alarm_timestamp() = timeout;
   }
@@ -2174,95 +2223,144 @@ bool Collator::out_msg_queue_cleanup() {
     block::gen::t_OutMsgQueue.print(std::cerr, *rt);
     rt->print_rec(std::cerr);
   }
-  for (const auto& nb : neighbors_) {
-    if (!nb.is_disabled() && (!nb.processed_upto || !nb.processed_upto->can_check_processed())) {
-      return fatal_error(-667, PSTRING() << "internal error: no info for checking processed messages from neighbor "
-                                         << nb.blk_.to_str());
-    }
-  }
-
-  auto queue_root = out_msg_queue_->get_root_cell();
-  if (queue_root.is_null()) {
-    LOG(DEBUG) << "out_msg_queue is empty";
-    return true;
-  }
-  auto old_out_msg_queue = std::make_unique<vm::AugmentedDictionary>(queue_root, 352, block::tlb::aug_OutMsgQueue);
-  int deleted = 0;
-  int total = 0;
-  bool fail = false;
-  old_out_msg_queue->check_for_each([&](Ref<vm::CellSlice> value, td::ConstBitPtr key, int n) -> bool {
-    ++total;
-    assert(n == 352);
-    vm::CellSlice& cs = value.write();
-    // LOG(DEBUG) << "key is " << key.to_hex(n);
-    if (queue_cleanup_timeout_.is_in_past(td::Timestamp::now())) {
-      LOG(WARNING) << "cleaning up outbound queue takes too long, ending";
-      outq_cleanup_partial_ = true;
-      return false;  // retain all remaining outbound queue entries including this one without processing
-    }
-    if (block_full_) {
-      LOG(WARNING) << "BLOCK FULL while cleaning up outbound queue, cleanup completed only partially";
-      outq_cleanup_partial_ = true;
-      return false;  // retain all remaining outbound queue entries including this one without processing
-    }
-    block::EnqueuedMsgDescr enq_msg_descr;
-    unsigned long long created_lt;
-    if (!(cs.fetch_ulong_bool(64, created_lt)  // augmentation
-          && enq_msg_descr.unpack(cs)          // unpack EnqueuedMsg
-          && enq_msg_descr.check_key(key)      // check key
-          && enq_msg_descr.lt_ == created_lt)) {
-      LOG(ERROR) << "cannot unpack EnqueuedMsg with key " << key.to_hex(n);
-      fail = true;
-      return false;
-    }
-    LOG(DEBUG) << "scanning outbound message with (lt,hash)=(" << enq_msg_descr.lt_ << ","
-               << enq_msg_descr.hash_.to_hex() << ") enqueued_lt=" << enq_msg_descr.enqueued_lt_;
-    bool delivered = false;
-    ton::LogicalTime deliver_lt = 0;
-    for (const auto& neighbor : neighbors_) {
-      // could look up neighbor with shard containing enq_msg_descr.next_prefix more efficiently
-      // (instead of checking all neighbors)
-      if (!neighbor.is_disabled() && neighbor.processed_upto->already_processed(enq_msg_descr)) {
-        delivered = true;
-        deliver_lt = neighbor.end_lt();
-        break;
-      }
-    }
-    if (delivered) {
-      LOG(DEBUG) << "outbound message with (lt,hash)=(" << enq_msg_descr.lt_ << "," << enq_msg_descr.hash_.to_hex()
-                 << ") enqueued_lt=" << enq_msg_descr.enqueued_lt_ << " has been already delivered, dequeueing";
-      ++deleted;
-      out_msg_queue_->lookup_delete_with_extra(key, n);
-      if (!dequeue_message(std::move(enq_msg_descr.msg_env_), deliver_lt)) {
-        fatal_error(PSTRING() << "cannot dequeue outbound message with (lt,hash)=(" << enq_msg_descr.lt_ << ","
-                              << enq_msg_descr.hash_.to_hex() << ") by inserting a msg_export_deq record");
-        fail = true;
-        return false;
-      }
-      register_out_msg_queue_op();
-      if (!block_limit_status_->fits(block::ParamLimits::cl_normal)) {
-        block_full_ = true;
-      }
-    }
-    return true;
-  }, false, true /* random order */);
-  LOG(INFO) << "deleted " << deleted << " messages from out_msg_queue, processed " << total << " messages in total";
-  if (fail) {
-    return fatal_error("error scanning/updating OutMsgQueue");
-  }
-  if (outq_cleanup_partial_ || total > 8000) {
-    LOG(INFO) << "out_msg_queue too big, skipping importing external messages";
-    skip_extmsg_ = true;
-    queue_too_big_ = true;
-  }
-  auto rt = out_msg_queue_->get_root();
+  if (after_merge_) {
+    // We need to clean the whole queue after merge
+    // Queue is not too big, see const MERGE_MAX_QUEUE_SIZE
+    for (const auto& nb : neighbors_) {
+      if (!nb.is_disabled() && (!nb.processed_upto || !nb.processed_upto->can_check_processed())) {
+        return fatal_error(-667, PSTRING() << "internal error: no info for checking processed messages from neighbor "
+                                           << nb.blk_.to_str());
+      }
+    }
+    td::uint32 deleted = 0;
+    auto res = out_msg_queue_->filter([&](vm::CellSlice& cs, td::ConstBitPtr key, int n) -> int {
+      assert(n == 352);
+      block::EnqueuedMsgDescr enq_msg_descr;
+      unsigned long long created_lt;
+      if (!(cs.fetch_ulong_bool(64, created_lt)  // augmentation
+            && enq_msg_descr.unpack(cs)          // unpack EnqueuedMsg
+            && enq_msg_descr.check_key(key)      // check key
+            && enq_msg_descr.lt_ == created_lt)) {
+        LOG(ERROR) << "cannot unpack EnqueuedMsg with key " << key.to_hex(n);
+        return -1;
+      }
+      LOG(DEBUG) << "scanning outbound message with (lt,hash)=(" << enq_msg_descr.lt_ << ","
+                 << enq_msg_descr.hash_.to_hex() << ") enqueued_lt=" << enq_msg_descr.enqueued_lt_;
+      bool delivered = false;
+      ton::LogicalTime deliver_lt = 0;
+      for (const auto& neighbor : neighbors_) {
+        // could look up neighbor with shard containing enq_msg_descr.next_prefix more efficiently
+        // (instead of checking all neighbors)
+        if (!neighbor.is_disabled() && neighbor.processed_upto->already_processed(enq_msg_descr)) {
+          delivered = true;
+          deliver_lt = neighbor.end_lt();
+          break;
+        }
+      }
+      if (delivered) {
+        ++deleted;
+        CHECK(out_msg_queue_size_ > 0);
+        --out_msg_queue_size_;
+        LOG(DEBUG) << "outbound message with (lt,hash)=(" << enq_msg_descr.lt_ << "," << enq_msg_descr.hash_.to_hex()
+                   << ") enqueued_lt=" << enq_msg_descr.enqueued_lt_ << " has been already delivered, dequeueing";
+        if (!dequeue_message(std::move(enq_msg_descr.msg_env_), deliver_lt)) {
+          fatal_error(PSTRING() << "cannot dequeue outbound message with (lt,hash)=(" << enq_msg_descr.lt_ << ","
+                                << enq_msg_descr.hash_.to_hex() << ") by inserting a msg_export_deq record");
+          return -1;
+        }
+        register_out_msg_queue_op();
+        if (!block_limit_status_->fits(block::ParamLimits::cl_normal)) {
+          block_full_ = true;
+        }
+      }
+      return !delivered;
+    });
+    LOG(INFO) << "deleted " << deleted << " messages from out_msg_queue after merge, remaining queue size is "
+              << out_msg_queue_size_;
+    if (res < 0) {
+      return fatal_error("error scanning/updating OutMsgQueue");
+    }
+  } else {
+    std::vector<std::pair<block::OutputQueueMerger, const block::McShardDescr*>> queue_parts;
+    block::OutputQueueMerger::Neighbor this_queue{BlockIdExt{new_id} /* block id is only used for logs */,
+                                                  out_msg_queue_->get_root_cell()};
+    for (const auto& nb : neighbors_) {
+      if (nb.is_disabled()) {
+        continue;
+      }
+      if (!nb.processed_upto || !nb.processed_upto->can_check_processed()) {
+        return fatal_error(-667, PSTRING() << "internal error: no info for checking processed messages from neighbor "
+                                           << nb.blk_.to_str());
+      }
+      queue_parts.emplace_back(block::OutputQueueMerger{nb.shard(), {this_queue}}, &nb);
+    }
+    size_t i = 0;
+    td::uint32 deleted = 0;
+    while (!queue_parts.empty()) {
+      if (block_full_) {
+        LOG(WARNING) << "BLOCK FULL while cleaning up outbound queue, cleanup completed only partially";
+        break;
+      }
+      if (queue_cleanup_timeout_.is_in_past(td::Timestamp::now())) {
+        LOG(WARNING) << "cleaning up outbound queue takes too long, ending";
+        break;
+      }
+      if (i == queue_parts.size()) {
+        i = 0;
+      }
+      auto& queue = queue_parts.at(i).first;
+      auto nb = queue_parts.at(i).second;
+      auto kv = queue.extract_cur();
+      if (kv) {
+        block::EnqueuedMsgDescr enq_msg_descr;
+        if (!(enq_msg_descr.unpack(kv->msg.write())        // unpack EnqueuedMsg
+              && enq_msg_descr.check_key(kv->key.cbits())  // check key
+              )) {
+          return fatal_error(PSTRING() << "error scanning/updating OutMsgQueue: cannot unpack EnqueuedMsg with key "
+                                       << kv->key.to_hex());
+        }
+        if (nb->processed_upto->already_processed(enq_msg_descr)) {
+          LOG(DEBUG) << "scanning outbound message with (lt,hash)=(" << enq_msg_descr.lt_ << ","
+                     << enq_msg_descr.hash_.to_hex() << ") enqueued_lt=" << enq_msg_descr.enqueued_lt_
+                     << ": message has been already delivered, dequeueing";
+          ++deleted;
+          CHECK(out_msg_queue_size_ > 0);
+          --out_msg_queue_size_;
+          out_msg_queue_->lookup_delete_with_extra(kv->key.cbits(), kv->key_len);
+          if (!dequeue_message(std::move(enq_msg_descr.msg_env_), nb->end_lt())) {
+            return fatal_error(PSTRING() << "cannot dequeue outbound message with (lt,hash)=(" << enq_msg_descr.lt_
                                         << "," << enq_msg_descr.hash_.to_hex()
                                         << ") by inserting a msg_export_deq record");
+          }
+          register_out_msg_queue_op();
+          if (!block_limit_status_->fits(block::ParamLimits::cl_normal)) {
+            block_full_ = true;
+          }
+          queue.next();
+          ++i;
+          continue;
+        } else {
+          LOG(DEBUG) << "scanning outbound message with (lt,hash)=(" << enq_msg_descr.lt_ << ","
+                     << enq_msg_descr.hash_.to_hex() << ") enqueued_lt=" << enq_msg_descr.enqueued_lt_
+                     << ": message has not been delivered";
+        }
+      }
+      LOG(DEBUG) << "no more unprocessed messages to shard " << nb->shard().to_str();
+      std::swap(queue_parts[i], queue_parts.back());
+      queue_parts.pop_back();
+    }
+    LOG(INFO) << "deleted " << deleted << " messages from out_msg_queue, remaining queue size is "
+              << out_msg_queue_size_;
+  }
   if (verbosity >= 2) {
+    auto rt = out_msg_queue_->get_root();
     std::cerr << "new out_msg_queue is ";
     block::gen::t_OutMsgQueue.print(std::cerr, *rt);
     rt->print_rec(std::cerr);
   }
-  // CHECK(block::gen::t_OutMsgQueue.validate_upto(100000, *rt));  // DEBUG, comment later if SLOW
   return register_out_msg_queue_op(true);
 }
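The non-merge branch above is the "Cleanup queue faster" item: instead of walking the whole queue dictionary in random order (the deleted check_for_each pass), the collator builds one lt-ordered iterator per neighbor via block::OutputQueueMerger and round-robins across them so one busy destination cannot starve the rest. Because each neighbor's processed_upto watermark is monotonic in logical time, the first undelivered message in a part proves every later message in that part is undelivered too, and the whole part can be dropped at once. A self-contained sketch of that loop shape (simplified types; plain lts stand in for (lt,hash) keys and a single watermark stands in for processed_upto):

    #include <cstddef>
    #include <cstdint>
    #include <deque>
    #include <utility>
    #include <vector>

    // One lt-ordered part per neighbor; stop a part at the first undelivered
    // message (monotonicity makes everything after it undelivered too).
    struct Part {
      std::deque<std::uint64_t> lts;  // message lts for one destination, ascending
      std::uint64_t processed_upto;   // neighbor's processed-up-to watermark
    };

    inline std::size_t drain(std::vector<Part>& parts, std::size_t budget) {
      std::size_t deleted = 0, i = 0;
      while (!parts.empty() && budget-- > 0) {  // budget ~ block limits/timeout
        if (i == parts.size()) {
          i = 0;  // wrap around, like the i == queue_parts.size() reset
        }
        Part& p = parts[i];
        if (!p.lts.empty() && p.lts.front() <= p.processed_upto) {
          p.lts.pop_front();  // delivered: dequeue it, move to the next part
          ++deleted;
          ++i;
          continue;
        }
        // Head not delivered (or part exhausted): drop the whole part.
        std::swap(parts[i], parts.back());
        parts.pop_back();
      }
      return deleted;
    }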
@@ -3047,6 +3145,7 @@ bool Collator::enqueue_transit_message(Ref<vm::Cell> msg, Ref<vm::Cell> old_msg_
   try {
     LOG(DEBUG) << "inserting into outbound queue message with (lt,key)=(" << start_lt << "," << key.to_hex() << ")";
     ok = out_msg_queue_->set_builder(key.bits(), 352, cb, vm::Dictionary::SetMode::Add);
+    ++out_msg_queue_size_;
   } catch (vm::VmError) {
     ok = false;
   }
@@ -3069,6 +3168,8 @@ bool Collator::delete_out_msg_queue_msg(td::ConstBitPtr key) {
   try {
     LOG(DEBUG) << "deleting from outbound queue message with key=" << key.to_hex(352);
     queue_rec = out_msg_queue_->lookup_delete(key, 352);
+    CHECK(out_msg_queue_size_ > 0);
+    --out_msg_queue_size_;
   } catch (vm::VmError err) {
     LOG(ERROR) << "error deleting from out_msg_queue dictionary: " << err.get_msg();
   }
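Together with the increments in enqueue_transit_message above (and enqueue_message below), this keeps out_msg_queue_size_ in lockstep with every mutation of the queue dictionary, so the size is always available without rescanning the augmented dictionary. A sketch of the invariant with an ordinary map (illustrative wrapper, not the vm::AugmentedDictionary API):

    #include <cassert>
    #include <cstdint>
    #include <map>
    #include <string>

    // Keep an element count in lockstep with dictionary mutations, mirroring
    // how the collator maintains out_msg_queue_size_.
    class CountedQueue {
     public:
      bool add(const std::string& key, std::uint64_t value) {
        bool ok = dict_.emplace(key, value).second;  // like SetMode::Add
        if (ok) {
          ++size_;
        }
        return ok;
      }
      bool erase(const std::string& key) {
        if (dict_.erase(key) == 0) {
          return false;
        }
        assert(size_ > 0);  // mirrors CHECK(out_msg_queue_size_ > 0)
        --size_;
        return true;
      }
      std::uint32_t size() const { return size_; }

     private:
      std::map<std::string, std::uint64_t> dict_;
      std::uint32_t size_ = 0;
    };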
@@ -3309,8 +3410,9 @@ bool Collator::process_inbound_external_messages() {
     LOG(INFO) << "skipping processing of inbound external messages";
     return true;
   }
-  if (out_msg_queue_->get_root_cell().not_null() && out_msg_queue_->get_root_cell()->get_depth() > 12) {
-    LOG(INFO) << "skipping processing of inbound external messages: out msg queue is too big";
+  if (out_msg_queue_size_ > SKIP_EXTERNALS_QUEUE_SIZE) {
+    LOG(INFO) << "skipping processing of inbound external messages because out_msg_queue is too big ("
+              << out_msg_queue_size_ << " > " << SKIP_EXTERNALS_QUEUE_SIZE << ")";
     return true;
   }
   bool full = !block_limit_status_->fits(block::ParamLimits::cl_soft);
@@ -3550,6 +3652,7 @@ bool Collator::enqueue_message(block::NewOutMsg msg, td::RefInt256 fwd_fees_rema
     LOG(DEBUG) << "inserting into outbound queue a new message with (lt,key)=(" << start_lt << "," << key.to_hex()
                << ")";
     ok = out_msg_queue_->set_builder(key.bits(), 352, cb, vm::Dictionary::SetMode::Add);
+    ++out_msg_queue_size_;
   } catch (vm::VmError) {
     ok = false;
   }
@@ -4158,6 +4261,7 @@ static int history_weight(td::uint64 history) {
  * @returns True if the check is successful.
  */
 bool Collator::check_block_overload() {
+  LOG(INFO) << "final out_msg_queue size is " << out_msg_queue_size_;
   overload_history_ <<= 1;
   underload_history_ <<= 1;
   block_size_estimate_ = block_limit_status_->estimate_block_size();
@@ -4166,18 +4270,32 @@ bool Collator::check_block_overload() {
             << " size_estimate=" << block_size_estimate_;
   auto cl = block_limit_status_->classify();
   if (cl <= block::ParamLimits::cl_underload) {
-    if (queue_too_big_) {
-      LOG(INFO) << "block is underloaded, but don't set underload history because out msg queue is big";
+    if (out_msg_queue_size_ > MERGE_MAX_QUEUE_SIZE) {
+      LOG(INFO)
+          << "block is underloaded, but don't set underload history because out_msg_queue size is too big to merge ("
+          << out_msg_queue_size_ << " > " << MERGE_MAX_QUEUE_SIZE << ")";
     } else {
       underload_history_ |= 1;
       LOG(INFO) << "block is underloaded";
     }
   } else if (cl >= block::ParamLimits::cl_soft) {
-    overload_history_ |= 1;
-    LOG(INFO) << "block is overloaded (category " << cl << ")";
+    if (out_msg_queue_size_ > SPLIT_MAX_QUEUE_SIZE) {
+      LOG(INFO) << "block is overloaded (category " << cl
+                << "), but don't set overload history because out_msg_queue size is too big to split ("
+                << out_msg_queue_size_ << " > " << SPLIT_MAX_QUEUE_SIZE << ")";
+    } else {
+      overload_history_ |= 1;
+      LOG(INFO) << "block is overloaded (category " << cl << ")";
+    }
   } else {
     LOG(INFO) << "block is loaded normally";
   }
+  if (!(overload_history_ & 1) && out_msg_queue_size_ >= FORCE_SPLIT_QUEUE_SIZE &&
+      out_msg_queue_size_ <= SPLIT_MAX_QUEUE_SIZE) {
+    overload_history_ |= 1;
+    LOG(INFO) << "setting overload history because out_msg_queue reached force split limit (" << out_msg_queue_size_
+              << " >= " << FORCE_SPLIT_QUEUE_SIZE << ")";
+  }
   if (collator_settings & 1) {
     LOG(INFO) << "want_split manually set";
     want_split_ = true;
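check_block_overload records one bit per block: both histories are shifted left at the top of the function, and the low bit is set only when the block's load category and the queue-size guards above allow it; history_weight (the context function of the previous hunk) then turns the recent bits into want_split_/want_merge_ decisions. A sketch of the bit-history pattern (the weighting window and thresholds here are assumptions for illustration, not the values used by history_weight):

    #include <cstdint>

    // One bit per block, newest in bit 0. Weighting below is illustrative only.
    struct LoadHistory {
      std::uint64_t overload = 0, underload = 0;

      void record(bool over, bool under) {
        overload = (overload << 1) | (over ? 1 : 0);
        underload = (underload << 1) | (under ? 1 : 0);
      }
      // Count set bits among the last n blocks (assumed weighting scheme).
      static int weight(std::uint64_t history, int n) {
        int w = 0;
        for (int j = 0; j < n; ++j) {
          w += static_cast<int>((history >> j) & 1);
        }
        return w;
      }
      bool want_split() const { return weight(overload, 8) > 4; }   // assumption
      bool want_merge() const { return weight(underload, 8) > 4; }  // assumption
    };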