netbird/management/server/metrics/selfhosted.go
Maycon Santos 85e991ff78
Fix issue with canceled context before pushing metrics and decreasing pushing interval (#2235)
Fix a bug where the post context was canceled before sending metrics to the server.

The interval time was decreased, and an optional environment variable NETBIRD_METRICS_INTERVAL_IN_SECONDS was added to control the interval time.

* update doc URL
2024-07-04 19:15:59 +02:00

436 lines
12 KiB
Go

// Package metrics gather anonymous information about the usage of NetBird management
package metrics
import (
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"os"
"sort"
"strings"
"time"
"github.com/hashicorp/go-version"
log "github.com/sirupsen/logrus"
"github.com/netbirdio/netbird/management/server"
nbversion "github.com/netbirdio/netbird/version"
)
const (
// PayloadEvent identifies an event type
PayloadEvent = "self-hosted stats"
// payloadEndpoint metrics defaultEndpoint to send anonymous data
payloadEndpoint = "https://metrics.netbird.io"
// defaultPushInterval default interval to push metrics
defaultPushInterval = 12 * time.Hour
// requestTimeout http request timeout
requestTimeout = 45 * time.Second
)
type getTokenResponse struct {
PublicAPIToken string `json:"public_api_token"`
}
type pushPayload struct {
APIKey string `json:"api_key"`
DistinctID string `json:"distinct_id"`
Event string `json:"event"`
Properties properties `json:"properties"`
Timestamp time.Time `json:"timestamp"`
}
// properties metrics to push
type properties map[string]interface{}
// DataSource metric data source
type DataSource interface {
GetAllAccounts(ctx context.Context) []*server.Account
GetStoreEngine() server.StoreEngine
}
// ConnManager peer connection manager that holds state for current active connections
type ConnManager interface {
GetAllConnectedPeers() map[string]struct{}
}
// Worker metrics collector and pusher
type Worker struct {
ctx context.Context
id string
idpManager string
dataSource DataSource
connManager ConnManager
startupTime time.Time
lastRun time.Time
}
// NewWorker returns a metrics worker
func NewWorker(ctx context.Context, id string, dataSource DataSource, connManager ConnManager, idpManager string) *Worker {
currentTime := time.Now()
return &Worker{
ctx: ctx,
id: id,
idpManager: idpManager,
dataSource: dataSource,
connManager: connManager,
startupTime: currentTime,
lastRun: currentTime,
}
}
// Run runs the metrics worker
func (w *Worker) Run(ctx context.Context) {
interval := getMetricsInterval(ctx)
pushTicker := time.NewTicker(interval)
for {
select {
case <-w.ctx.Done():
return
case <-pushTicker.C:
err := w.sendMetrics(ctx)
if err != nil {
log.WithContext(ctx).Error(err)
}
w.lastRun = time.Now()
}
}
}
func getMetricsInterval(ctx context.Context) time.Duration {
interval := defaultPushInterval
if os.Getenv("NETBIRD_METRICS_INTERVAL_IN_SECONDS") != "" {
newInterval, err := time.ParseDuration(os.Getenv("NETBIRD_METRICS_INTERVAL_IN_SECONDS") + "s")
if err != nil {
log.WithContext(ctx).Errorf("unable to parse NETBIRD_METRICS_INTERVAL_IN_SECONDS, using default interval %v. Error: %v", defaultPushInterval, err)
} else {
log.WithContext(ctx).Infof("using NETBIRD_METRICS_INTERVAL_IN_SECONDS %s", newInterval)
interval = newInterval
}
}
return interval
}
func (w *Worker) sendMetrics(ctx context.Context) error {
apiKey, err := getAPIKey(w.ctx)
if err != nil {
return err
}
payload := w.generatePayload(ctx, apiKey)
payloadString, err := buildMetricsPayload(payload)
if err != nil {
return err
}
httpClient := http.Client{}
exportJobReq, cancelCTX, err := createPostRequest(w.ctx, payloadEndpoint+"/capture/", payloadString)
if err != nil {
return fmt.Errorf("unable to create metrics post request %v", err)
}
defer cancelCTX()
jobResp, err := httpClient.Do(exportJobReq)
if err != nil {
return fmt.Errorf("unable to push metrics %v", err)
}
defer func() {
err = jobResp.Body.Close()
if err != nil {
log.WithContext(ctx).Errorf("error while closing update metrics response body: %v", err)
}
}()
if jobResp.StatusCode != 200 {
return fmt.Errorf("unable to push anonymous metrics, got statusCode %d", jobResp.StatusCode)
}
log.WithContext(ctx).Infof("sent anonymous metrics, next push will happen in %s. "+
"You can disable these metrics by running with flag --disable-anonymous-metrics,"+
" see more information at https://docs.netbird.io/about-netbird/faq#why-and-what-are-the-anonymous-usage-metrics", getMetricsInterval(ctx))
return nil
}
func (w *Worker) generatePayload(ctx context.Context, apiKey string) pushPayload {
properties := w.generateProperties(ctx)
return pushPayload{
APIKey: apiKey,
DistinctID: w.id,
Event: PayloadEvent,
Properties: properties,
Timestamp: time.Now(),
}
}
func (w *Worker) generateProperties(ctx context.Context) properties {
var (
uptime float64
accounts int
expirationEnabled int
users int
serviceUsers int
pats int
peers int
peersSSHEnabled int
setupKeysUsage int
ephemeralPeersSKs int
ephemeralPeersSKUsage int
activePeersLastDay int
osPeers map[string]int
userPeers int
rules int
rulesProtocol map[string]int
rulesDirection map[string]int
rulesWithSrcPostureChecks int
postureChecks int
groups int
routes int
routesWithRGGroups int
nameservers int
uiClient int
version string
peerActiveVersions []string
osUIClients map[string]int
)
start := time.Now()
metricsProperties := make(properties)
osPeers = make(map[string]int)
osUIClients = make(map[string]int)
rulesProtocol = make(map[string]int)
rulesDirection = make(map[string]int)
uptime = time.Since(w.startupTime).Seconds()
connections := w.connManager.GetAllConnectedPeers()
version = nbversion.NetbirdVersion()
for _, account := range w.dataSource.GetAllAccounts(ctx) {
accounts++
if account.Settings.PeerLoginExpirationEnabled {
expirationEnabled++
}
groups += len(account.Groups)
routes += len(account.Routes)
for _, route := range account.Routes {
if len(route.PeerGroups) > 0 {
routesWithRGGroups++
}
}
nameservers += len(account.NameServerGroups)
for _, policy := range account.Policies {
for _, rule := range policy.Rules {
rules++
rulesProtocol[string(rule.Protocol)]++
if rule.Bidirectional {
rulesDirection["bidirectional"]++
} else {
rulesDirection["oneway"]++
}
}
if len(policy.SourcePostureChecks) > 0 {
rulesWithSrcPostureChecks++
}
}
postureChecks += len(account.PostureChecks)
for _, user := range account.Users {
if user.IsServiceUser {
serviceUsers++
} else {
users++
}
pats += len(user.PATs)
}
for _, key := range account.SetupKeys {
setupKeysUsage += key.UsedTimes
if key.Ephemeral {
ephemeralPeersSKs++
ephemeralPeersSKUsage += key.UsedTimes
}
}
for _, peer := range account.Peers {
peers++
if peer.SSHEnabled {
peersSSHEnabled++
}
if peer.SetupKey == "" {
userPeers++
}
osKey := strings.ToLower(fmt.Sprintf("peer_os_%s", peer.Meta.GoOS))
osCount := osPeers[osKey]
osPeers[osKey] = osCount + 1
if peer.Meta.UIVersion != "" {
uiClient++
uiOSKey := strings.ToLower(fmt.Sprintf("ui_client_os_%s", peer.Meta.GoOS))
osUICount := osUIClients[uiOSKey]
osUIClients[uiOSKey] = osUICount + 1
}
_, connected := connections[peer.ID]
if connected || peer.Status.LastSeen.After(w.lastRun) {
activePeersLastDay++
osActiveKey := osKey + "_active"
osActiveCount := osPeers[osActiveKey]
osPeers[osActiveKey] = osActiveCount + 1
peerActiveVersions = append(peerActiveVersions, peer.Meta.WtVersion)
}
}
}
minActivePeerVersion, maxActivePeerVersion := getMinMaxVersion(peerActiveVersions)
metricsProperties["uptime"] = uptime
metricsProperties["accounts"] = accounts
metricsProperties["users"] = users
metricsProperties["service_users"] = serviceUsers
metricsProperties["pats"] = pats
metricsProperties["peers"] = peers
metricsProperties["peers_ssh_enabled"] = peersSSHEnabled
metricsProperties["peers_login_expiration_enabled"] = expirationEnabled
metricsProperties["setup_keys_usage"] = setupKeysUsage
metricsProperties["ephemeral_peers_setup_keys"] = ephemeralPeersSKs
metricsProperties["ephemeral_peers_setup_keys_usage"] = ephemeralPeersSKUsage
metricsProperties["active_peers_last_day"] = activePeersLastDay
metricsProperties["user_peers"] = userPeers
metricsProperties["rules"] = rules
metricsProperties["rules_with_src_posture_checks"] = rulesWithSrcPostureChecks
metricsProperties["posture_checks"] = postureChecks
metricsProperties["groups"] = groups
metricsProperties["routes"] = routes
metricsProperties["routes_with_routing_groups"] = routesWithRGGroups
metricsProperties["nameservers"] = nameservers
metricsProperties["version"] = version
metricsProperties["min_active_peer_version"] = minActivePeerVersion
metricsProperties["max_active_peer_version"] = maxActivePeerVersion
metricsProperties["ui_clients"] = uiClient
metricsProperties["idp_manager"] = w.idpManager
metricsProperties["store_engine"] = w.dataSource.GetStoreEngine()
for protocol, count := range rulesProtocol {
metricsProperties["rules_protocol_"+protocol] = count
}
for direction, count := range rulesDirection {
metricsProperties["rules_direction_"+direction] = count
}
for os, count := range osPeers {
metricsProperties[os] = count
}
for os, count := range osUIClients {
metricsProperties[os] = count
}
metricsProperties["metric_generation_time"] = time.Since(start).Milliseconds()
return metricsProperties
}
func getAPIKey(ctx context.Context) (string, error) {
ctx, cancel := context.WithTimeout(ctx, requestTimeout)
defer cancel()
httpClient := http.Client{}
req, err := http.NewRequestWithContext(ctx, http.MethodGet, payloadEndpoint+"/GetToken", nil)
if err != nil {
return "", fmt.Errorf("unable to create request for metrics public api token %v", err)
}
response, err := httpClient.Do(req)
if err != nil {
return "", fmt.Errorf("unable to request metrics public api token %v", err)
}
defer func() {
err = response.Body.Close()
if err != nil {
log.WithContext(ctx).Errorf("error while closing metrics token response body: %v", err)
}
}()
if response.StatusCode != 200 {
return "", fmt.Errorf("unable to retrieve metrics token, statusCode %d", response.StatusCode)
}
body, err := io.ReadAll(response.Body)
if err != nil {
return "", fmt.Errorf("coudln't get metrics token response; %v", err)
}
var tokenResponse getTokenResponse
err = json.Unmarshal(body, &tokenResponse)
if err != nil {
return "", fmt.Errorf("coudln't parse metrics public api token; %v", err)
}
return tokenResponse.PublicAPIToken, nil
}
func buildMetricsPayload(payload pushPayload) (string, error) {
str, err := json.Marshal(payload)
if err != nil {
return "", fmt.Errorf("unable to marshal metrics payload, got err: %v", err)
}
return string(str), nil
}
func createPostRequest(ctx context.Context, endpoint string, payloadStr string) (*http.Request, context.CancelFunc, error) {
ctx, cancel := context.WithTimeout(ctx, requestTimeout)
reqURL := endpoint
payload := strings.NewReader(payloadStr)
req, err := http.NewRequestWithContext(ctx, "POST", reqURL, payload)
if err != nil {
cancel()
return nil, nil, err
}
req.Header.Add("content-type", "application/json")
return req, cancel, nil
}
func getMinMaxVersion(inputList []string) (string, string) {
versions := make([]*version.Version, 0)
for _, raw := range inputList {
if raw != "" && nbversion.SemverRegexp.MatchString(raw) {
v, err := version.NewVersion(raw)
if err == nil {
versions = append(versions, v)
}
}
}
targetIndex := 1
l := len(versions)
switch l {
case 0:
return "", ""
case targetIndex:
v := versions[targetIndex-1].String()
return v, v
default:
sort.Sort(version.Collection(versions))
return versions[targetIndex-1].String(), versions[l-1].String()
}
}