From 8148c658cfaf58f926a8fd2e0350b7f4a5e4baaa Mon Sep 17 00:00:00 2001 From: Joseph Henry Date: Wed, 2 Mar 2022 09:55:23 -0800 Subject: [PATCH] Remove bonds for peers that have fully expired. Remove notion of bond health --- include/ZeroTierOne.h | 5 --- node/Bond.cpp | 77 +++++++++++++++++++++--------------------- node/Bond.hpp | 20 +++++------ node/Node.cpp | 1 - node/Peer.cpp | 2 +- node/Peer.hpp | 5 ++- one.cpp | 33 +++--------------- service/OneService.cpp | 2 -- 8 files changed, 57 insertions(+), 88 deletions(-) diff --git a/include/ZeroTierOne.h b/include/ZeroTierOne.h index e88fbda1..e16d23f8 100644 --- a/include/ZeroTierOne.h +++ b/include/ZeroTierOne.h @@ -1404,11 +1404,6 @@ typedef struct */ int bondingPolicy; - /** - * The health status of the bond to this peer - */ - bool isHealthy; - /** * The number of links that comprise the bond to this peer that are considered alive */ diff --git a/node/Bond.cpp b/node/Bond.cpp index 3e3cd5b2..f1488a8e 100644 --- a/node/Bond.cpp +++ b/node/Bond.cpp @@ -90,7 +90,7 @@ SharedPtr Bond::getBondByPeerId(int64_t identity) return _bonds.count(identity) ? _bonds[identity] : SharedPtr(); } -SharedPtr Bond::createTransportTriggeredBond(const RuntimeEnvironment* renv, const SharedPtr& peer) +SharedPtr Bond::createBond(const RuntimeEnvironment* renv, const SharedPtr& peer) { Mutex::Lock _l(_bonds_m); int64_t identity = peer->identity().address().toInt(); @@ -145,6 +145,12 @@ SharedPtr Bond::createTransportTriggeredBond(const RuntimeEnvironment* ren return SharedPtr(); } +void Bond::destroyBond(uint64_t peerId) +{ + Mutex::Lock _l(_bonds_m); + _bonds.erase(peerId); +} + SharedPtr Bond::getLinkBySocket(const std::string& policyAlias, uint64_t localSocket) { Mutex::Lock _l(_links_m); @@ -816,6 +822,17 @@ void Bond::curateBond(int64_t now, bool rebuildBond) if (! _paths[i].p) { continue; } + + /** + * Remove expired links from bond + */ + if ((now - _paths[i].p->_lastIn) > (ZT_PEER_EXPIRED_PATH_TRIAL_PERIOD)) { + log("link %s has expired, removing from bond", pathToStr(_paths[i].p).c_str()); + _paths[i] = NominatedPath(); + _paths[i].p = SharedPtr(); + continue; + } + tmpNumTotalLinks++; if (_paths[i].eligible) { tmpNumAliveLinks++; @@ -876,42 +893,18 @@ void Bond::curateBond(int64_t now, bool rebuildBond) } /** - * Determine health status to report to user + * Trigger status report if number of links change */ _numAliveLinks = tmpNumAliveLinks; _numTotalLinks = tmpNumTotalLinks; - bool tmpHealthStatus = true; - - if (_policy == ZT_BOND_POLICY_BROADCAST) { - if (_numAliveLinks < 1) { - // Considered healthy if we're able to send frames at all - tmpHealthStatus = false; - } - } - if ((_policy == ZT_BOND_POLICY_BALANCE_RR) || (_policy == ZT_BOND_POLICY_BALANCE_XOR) || (_policy == ZT_BOND_POLICY_BALANCE_AWARE || (_policy == ZT_BOND_POLICY_ACTIVE_BACKUP))) { - if (_numAliveLinks < _numTotalLinks) { - tmpHealthStatus = false; - } - } - if (tmpHealthStatus != _isHealthy) { - std::string healthStatusStr; - if (tmpHealthStatus == true) { - healthStatusStr = "HEALTHY"; - } - else { - healthStatusStr = "DEGRADED"; - } - log("bond is %s (%d/%d links)", healthStatusStr.c_str(), _numAliveLinks, _numTotalLinks); + if ((_numAliveLinks != tmpNumAliveLinks) || (_numTotalLinks != tmpNumTotalLinks)) { dumpInfo(now, true); } - _isHealthy = tmpHealthStatus; - /** * Curate the set of paths that are part of the bond proper. Select a set of paths * per logical link according to eligibility and user-specified constraints. */ - if ((_policy == ZT_BOND_POLICY_BALANCE_RR) || (_policy == ZT_BOND_POLICY_BALANCE_XOR) || (_policy == ZT_BOND_POLICY_BALANCE_AWARE)) { if (! _numBondedPaths) { rebuildBond = true; @@ -1009,14 +1002,14 @@ void Bond::estimatePathQuality(int64_t now) uint32_t totUserSpecifiedLinkSpeed = 0; if (_numBondedPaths) { // Compute relative user-specified speeds of links for (unsigned int i = 0; i < _numBondedPaths; ++i) { - SharedPtr link = RR->bc->getLinkBySocket(_policyAlias, _paths[i].p->localSocket()); if (_paths[i].p && _paths[i].allowed()) { + SharedPtr link = RR->bc->getLinkBySocket(_policyAlias, _paths[i].p->localSocket()); totUserSpecifiedLinkSpeed += link->speed(); } } for (unsigned int i = 0; i < _numBondedPaths; ++i) { - SharedPtr link = RR->bc->getLinkBySocket(_policyAlias, _paths[i].p->localSocket()); if (_paths[i].p && _paths[i].allowed()) { + SharedPtr link = RR->bc->getLinkBySocket(_policyAlias, _paths[i].p->localSocket()); link->setRelativeSpeed((uint8_t)round(((float)link->speed() / (float)totUserSpecifiedLinkSpeed) * 255)); } } @@ -1213,6 +1206,11 @@ void Bond::processActiveBackupTasks(void* tPtr, int64_t now) int nonPreferredPathIdx; bool bFoundPrimaryLink = false; + if (_abPathIdx != ZT_MAX_PEER_NETWORK_PATHS && !_paths[_abPathIdx].p) { + _abPathIdx = ZT_MAX_PEER_NETWORK_PATHS; + log("main active-backup path has been removed"); + } + /** * Generate periodic status report */ @@ -1242,7 +1240,6 @@ void Bond::processActiveBackupTasks(void* tPtr, int64_t now) * simply find the next eligible path. */ if (! userHasSpecifiedLinks()) { - debug("no user-specified links"); for (int i = 0; i < ZT_MAX_PEER_NETWORK_PATHS; ++i) { if (_paths[i].p && _paths[i].eligible) { SharedPtr link = RR->bc->getLinkBySocket(_policyAlias, _paths[i].p->localSocket()); @@ -1575,7 +1572,6 @@ void Bond::setBondParameters(int policy, SharedPtr templateBond, bool useT // Bond status - _isHealthy = false; _numAliveLinks = 0; _numTotalLinks = 0; _numBondedPaths = 0; @@ -1685,11 +1681,14 @@ SharedPtr Bond::getLink(const SharedPtr& path) std::string Bond::pathToStr(const SharedPtr& path) { #ifdef ZT_TRACE - char pathStr[64] = { 0 }; - char fullPathStr[384] = { 0 }; - path->address().toString(pathStr); - snprintf(fullPathStr, 384, "%.16llx-%s/%s", (unsigned long long)(path->localSocket()), getLink(path)->ifname().c_str(), pathStr); - return std::string(fullPathStr); + if (path) { + char pathStr[64] = { 0 }; + char fullPathStr[384] = { 0 }; + path->address().toString(pathStr); + snprintf(fullPathStr, 384, "%.16llx-%s/%s", (unsigned long long)(path->localSocket()), getLink(path)->ifname().c_str(), pathStr); + return std::string(fullPathStr); + } + return ""; #else return ""; #endif @@ -1728,7 +1727,7 @@ void Bond::dumpInfo(int64_t now, bool force) _lastSummaryDump = now; float overhead = (_overheadBytes / (timeSinceLastDump / 1000.0f) / 1000.0f); _overheadBytes = 0; - log("bond: bp=%d, fi=%d, mi=%d, ud=%d, dd=%d, flows=%lu, leaf=%d, overhead=%f KB/s", + log("bond: bp=%d, fi=%d, mi=%d, ud=%d, dd=%d, flows=%lu, leaf=%d, overhead=%f KB/s, links=(%d/%d)", _policy, _failoverInterval, _monitorInterval, @@ -1736,7 +1735,9 @@ void Bond::dumpInfo(int64_t now, bool force) _downDelay, (unsigned long)_flows.size(), _isLeaf, - overhead); + overhead, + _numAliveLinks, + _numTotalLinks); for (int i = 0; i < ZT_MAX_PEER_NETWORK_PATHS; ++i) { if (_paths[i].p) { dumpPathStatus(now, i); diff --git a/node/Bond.hpp b/node/Bond.hpp index b30d3a90..3465db51 100644 --- a/node/Bond.hpp +++ b/node/Bond.hpp @@ -436,7 +436,14 @@ class Bond { * @param peer Remote peer that this bond services * @return A pointer to the newly created Bond */ - static SharedPtr createTransportTriggeredBond(const RuntimeEnvironment* renv, const SharedPtr& peer); + static SharedPtr createBond(const RuntimeEnvironment* renv, const SharedPtr& peer); + + /** + * Remove a bond from the bond controller. + * + * @param peerId Remote peer that this bond services + */ + static void destroyBond(uint64_t peerId); /** * Periodically perform maintenance tasks for the bonding layer. @@ -1020,14 +1027,6 @@ class Bond { return _policy; } - /** - * @return the health status of the bond - */ - inline bool isHealthy() - { - return _isHealthy; - } - /** * @return the number of links comprising this bond which are considered alive */ @@ -1344,7 +1343,7 @@ class Bond { int packetsIn; int packetsOut; - AtomicCounter __refCount; + //AtomicCounter __refCount; SharedPtr p; void set(uint64_t now, const SharedPtr& path) @@ -1490,7 +1489,6 @@ class Bond { /** * Link state reporting */ - bool _isHealthy; uint8_t _numAliveLinks; uint8_t _numTotalLinks; diff --git a/node/Node.cpp b/node/Node.cpp index a0dd03fc..3533a723 100644 --- a/node/Node.cpp +++ b/node/Node.cpp @@ -509,7 +509,6 @@ ZT_PeerList *Node::peers() const if (pi->second->bond()) { p->isBonded = pi->second->bond(); p->bondingPolicy = pi->second->bond()->policy(); - p->isHealthy = pi->second->bond()->isHealthy(); p->numAliveLinks = pi->second->bond()->getNumAliveLinks(); p->numTotalLinks = pi->second->bond()->getNumTotalLinks(); } diff --git a/node/Peer.cpp b/node/Peer.cpp index 5a76eda3..008ece17 100644 --- a/node/Peer.cpp +++ b/node/Peer.cpp @@ -503,7 +503,7 @@ void Peer::performMultipathStateCheck(void *tPtr, int64_t now) _localMultipathSupported = ((numAlivePaths >= 1) && (RR->bc->inUse()) && (ZT_PROTO_VERSION > 9)); if (_localMultipathSupported && !_bond) { if (RR->bc) { - _bond = RR->bc->createTransportTriggeredBond(RR, this); + _bond = RR->bc->createBond(RR, this); /** * Allow new bond to retroactively learn all paths known to this peer */ diff --git a/node/Peer.hpp b/node/Peer.hpp index 77e35b78..ad267acc 100644 --- a/node/Peer.hpp +++ b/node/Peer.hpp @@ -53,7 +53,10 @@ private: Peer() {} // disabled to prevent bugs -- should not be constructed uninitialized public: - ~Peer() { Utils::burn(_key,sizeof(_key)); } + ~Peer() { + Utils::burn(_key,sizeof(_key)); + RR->bc->destroyBond(_id.address().toInt()); + } /** * Construct a new peer diff --git a/one.cpp b/one.cpp index cfd94fd5..524b1ff8 100644 --- a/one.cpp +++ b/one.cpp @@ -523,31 +523,23 @@ static int cli(int argc,char **argv) printf("%s" ZT_EOL_S,OSUtils::jsonDump(j).c_str()); } else { bool bFoundBond = false; - printf(" " ZT_EOL_S); + printf(" " ZT_EOL_S); if (j.is_array()) { for(unsigned long k=0;k= ZT_BOND_POLICY_NONE && bondingPolicy <= ZT_BOND_POLICY_BALANCE_AWARE) { policyStr = Bond::getPolicyStrByCode(bondingPolicy); } - printf("%10s %32s %8s %d/%d" ZT_EOL_S, + printf("%10s %32s %d/%d" ZT_EOL_S, OSUtils::jsonString(p ["address"],"-").c_str(), policyStr.c_str(), - healthStr.c_str(), numAliveLinks, numTotalLinks); } @@ -617,12 +609,6 @@ static int cli(int argc,char **argv) if (json) { printf("%s" ZT_EOL_S,OSUtils::jsonDump(j).c_str()); } else { - std::string healthStr; - if (OSUtils::jsonInt(j["isHealthy"],0)) { - healthStr = "Healthy"; - } else { - healthStr = "Degraded"; - } int numAliveLinks = OSUtils::jsonInt(j["numAliveLinks"],0); int numTotalLinks = OSUtils::jsonInt(j["numTotalLinks"],0); printf("Peer : %s\n", arg1.c_str()); @@ -630,7 +616,6 @@ static int cli(int argc,char **argv) //if (bondingPolicy == ZT_BOND_POLICY_ACTIVE_BACKUP) { printf("Link Select Method : %d\n", (int)OSUtils::jsonInt(j["linkSelectMethod"],0)); //} - printf("Status : %s\n", healthStr.c_str()); printf("Links : %d/%d\n", numAliveLinks, numTotalLinks); printf("Failover Interval : %d (ms)\n", (int)OSUtils::jsonInt(j["failoverInterval"],0)); printf("Up Delay : %d (ms)\n", (int)OSUtils::jsonInt(j["upDelay"],0)); @@ -705,33 +690,23 @@ static int cli(int argc,char **argv) printf("%s" ZT_EOL_S,OSUtils::jsonDump(j).c_str()); } else { bool bFoundBond = false; - printf(" " ZT_EOL_S); + printf(" " ZT_EOL_S); if (j.is_array()) { for(unsigned long k=0;k= ZT_BOND_POLICY_NONE && bondingPolicy <= ZT_BOND_POLICY_BALANCE_AWARE) { policyStr = Bond::getPolicyStrByCode(bondingPolicy); } - - printf("%10s %32s %8s %d/%d" ZT_EOL_S, + printf("%10s %32s %d/%d" ZT_EOL_S, OSUtils::jsonString(p["address"],"-").c_str(), policyStr.c_str(), - healthStr.c_str(), numAliveLinks, numTotalLinks); } diff --git a/service/OneService.cpp b/service/OneService.cpp index 73b3a9d5..1ae4ace9 100644 --- a/service/OneService.cpp +++ b/service/OneService.cpp @@ -510,7 +510,6 @@ static void _peerToJson(nlohmann::json &pj,const ZT_Peer *peer) pj["isBonded"] = peer->isBonded; if (peer->isBonded) { pj["bondingPolicy"] = peer->bondingPolicy; - pj["isHealthy"] = peer->isHealthy; pj["numAliveLinks"] = peer->numAliveLinks; pj["numTotalLinks"] = peer->numTotalLinks; } @@ -542,7 +541,6 @@ static void _bondToJson(nlohmann::json &pj, SharedPtr &bond) return; } - pj["isHealthy"] = bond->isHealthy(); pj["numAliveLinks"] = bond->getNumAliveLinks(); pj["numTotalLinks"] = bond->getNumTotalLinks(); pj["failoverInterval"] = bond->getFailoverInterval();