Add prometheus metrics for Central controllers (#1969)

* add header-only prometheus lib to ext

* rename folder

* Undo rename directory

* prometheus simpleapi included on mac & linux

* wip

* wire up some controller stats

* Get windows building with prometheus

* bsd build flags for prometheus

* Fix multiple network join from environment entrypoint.sh.release (#1961)

* _bond_m guards _bond, not _paths_m (#1965)

* Fix: warning: mutex '_aqm_m' is not held on every path through here [-Wthread-safety-analysis] (#1964)

* Serve prom metrics from /metrics endpoint

* Add prom metrics for Central controller specific things

* reorganize metric initialization

* testing out a labled gauge on Networks

* increment error counter on throw

* Consolidate metrics definitions

Put all metric definitions into node/Metrics.hpp.  Accessed as needed
from there.

* Revert "testing out a labled gauge on Networks"

This reverts commit 499ed6d95e11452019cdf48e32ed4cd878c2705b.

* still blows up but adding to the record for completeness right now

* Fix runtime issues with metrics

* Add metrics files to visual studio project

* Missed an "extern"

* add copyright headers to new files

* Add metrics for sent/received bytes (total)

* put /metrics endpoint behind auth

* sendto returns int on Win32

---------

Co-authored-by: Leonardo Amaral <leleobhz@users.noreply.github.com>
Co-authored-by: Brenton Bostick <bostick@gmail.com>
This commit is contained in:
Grant Limberg 2023-04-21 12:12:43 -07:00 committed by GitHub
parent 0b03ad9a21
commit 8e6e4ede6d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
62 changed files with 4023 additions and 25 deletions

View file

@ -87,6 +87,8 @@
#include "../ext/http-parser/http_parser.h"
#endif
#include "../node/Metrics.hpp"
#if ZT_VAULT_SUPPORT
extern "C" {
#include <curl/curl.h>
@ -846,6 +848,10 @@ public:
_ports[1] = 0;
_ports[2] = 0;
prometheus::simpleapi::saver.set_registry(prometheus::simpleapi::registry_ptr);
prometheus::simpleapi::saver.set_delay(std::chrono::seconds(5));
prometheus::simpleapi::saver.set_out_file(_homePath + ZT_PATH_SEPARATOR + "metrics.prom");
#if ZT_VAULT_SUPPORT
curl_global_init(CURL_GLOBAL_DEFAULT);
#endif
@ -1667,7 +1673,15 @@ public:
} else scode = 404;
_node->freeQueryResult((void *)pl);
} else scode = 500;
} else scode = 500;\
} else if (ps[0] == "metrics") {
std::string statspath = _homePath + ZT_PATH_SEPARATOR + "metrics.prom";
if (!OSUtils::readFile(statspath.c_str(), responseBody)) {
scode = 500;
} else {
scode = 200;
responseContentType = "text/plain";
}
} else {
if (_controller) {
scode = _controller->handleControlPlaneHttpGET(std::vector<std::string>(ps.begin()+1,ps.end()),urlArgs,headers,body,responseBody,responseContentType);
@ -2456,9 +2470,11 @@ public:
if (_forceTcpRelay) {
return;
}
Metrics::udp_recv += len;
const uint64_t now = OSUtils::now();
if ((len >= 16)&&(reinterpret_cast<const InetAddress *>(from)->ipScope() == InetAddress::IP_SCOPE_GLOBAL))
if ((len >= 16)&&(reinterpret_cast<const InetAddress *>(from)->ipScope() == InetAddress::IP_SCOPE_GLOBAL)) {
_lastDirectReceiveFromGlobal = now;
}
const ZT_ResultCode rc = _node->processWirePacket(nullptr,now,reinterpret_cast<int64_t>(sock),reinterpret_cast<const struct sockaddr_storage *>(from),data,len,&_nextBackgroundTaskDeadline);
if (ZT_ResultCode_isFatal(rc)) {
char tmp[256];
@ -2545,6 +2561,7 @@ public:
{
try {
if (!len) return; // sanity check, should never happen
Metrics::tcp_recv += len;
TcpConnection *tc = reinterpret_cast<TcpConnection *>(*uptr);
tc->lastReceive = OSUtils::now();
switch(tc->type) {
@ -2683,6 +2700,7 @@ public:
Mutex::Lock _l(tc->writeq_m);
if (tc->writeq.length() > 0) {
long sent = (long)_phy.streamSend(sock,tc->writeq.data(),(unsigned long)tc->writeq.length(),true);
Metrics::tcp_send += sent;
if (sent > 0) {
if ((unsigned long)sent >= (unsigned long)tc->writeq.length()) {
tc->writeq.clear();
@ -3221,9 +3239,13 @@ public:
// working we can instantly "fail forward" to it and stop using TCP
// proxy fallback, which is slow.
if ((localSocket != -1)&&(localSocket != 0)&&(_binder.isUdpSocketValid((PhySocket *)((uintptr_t)localSocket)))) {
if ((ttl)&&(addr->ss_family == AF_INET)) _phy.setIp4UdpTtl((PhySocket *)((uintptr_t)localSocket),ttl);
if ((ttl)&&(addr->ss_family == AF_INET)) {
_phy.setIp4UdpTtl((PhySocket *)((uintptr_t)localSocket),ttl);
}
const bool r = _phy.udpSend((PhySocket *)((uintptr_t)localSocket),(const struct sockaddr *)addr,data,len);
if ((ttl)&&(addr->ss_family == AF_INET)) _phy.setIp4UdpTtl((PhySocket *)((uintptr_t)localSocket),255);
if ((ttl)&&(addr->ss_family == AF_INET)) {
_phy.setIp4UdpTtl((PhySocket *)((uintptr_t)localSocket),255);
}
return ((r) ? 0 : -1);
} else {
return ((_binder.udpSendAll(_phy,addr,data,len,ttl)) ? 0 : -1);