[management/signal/relay] add metrics descriptions (#3233)

This commit is contained in:
Pascal Fischer 2025-01-24 14:17:30 +01:00 committed by GitHub
parent 2605948e01
commit b6abd4b4da
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 213 additions and 66 deletions

View File

@ -1130,11 +1130,6 @@ func (am *DefaultAccountManager) UpdateAccountPeers(ctx context.Context, account
}
start := time.Now()
defer func() {
if am.metrics != nil {
am.metrics.AccountManagerMetrics().CountUpdateAccountPeersDuration(time.Since(start))
}
}()
approvedPeersMap, err := am.integratedPeerValidator.GetValidatedPeers(account.Id, maps.Values(account.Groups), maps.Values(account.Peers), account.Settings.Extra)
if err != nil {
@ -1175,6 +1170,9 @@ func (am *DefaultAccountManager) UpdateAccountPeers(ctx context.Context, account
}
wg.Wait()
if am.metrics != nil {
am.metrics.AccountManagerMetrics().CountUpdateAccountPeersDuration(time.Since(start))
}
}
// UpdateAccountPeer updates a single peer that belongs to an account.

View File

@ -22,7 +22,8 @@ func NewAccountManagerMetrics(ctx context.Context, meter metric.Meter) (*Account
metric.WithUnit("milliseconds"),
metric.WithExplicitBucketBoundaries(
0.5, 1, 2.5, 5, 10, 25, 50, 100, 250, 500, 1000, 2500, 5000, 10000, 30000,
))
),
metric.WithDescription("Duration of triggering the account peers update and preparing the required data for the network map being sent to the clients"))
if err != nil {
return nil, err
}
@ -31,7 +32,8 @@ func NewAccountManagerMetrics(ctx context.Context, meter metric.Meter) (*Account
metric.WithUnit("milliseconds"),
metric.WithExplicitBucketBoundaries(
0.1, 0.5, 1, 2.5, 5, 10, 25, 50, 100, 250, 500, 1000,
))
),
metric.WithDescription("Duration of calculating the peer network map that is sent to the clients"))
if err != nil {
return nil, err
}
@ -40,12 +42,15 @@ func NewAccountManagerMetrics(ctx context.Context, meter metric.Meter) (*Account
metric.WithUnit("objects"),
metric.WithExplicitBucketBoundaries(
50, 100, 200, 500, 1000, 2500, 5000, 10000,
))
),
metric.WithDescription("Number of objects in the network map like peers, routes, firewall rules, etc. that are sent to the clients"))
if err != nil {
return nil, err
}
peerMetaUpdateCount, err := meter.Int64Counter("management.account.peer.meta.update.counter", metric.WithUnit("1"))
peerMetaUpdateCount, err := meter.Int64Counter("management.account.peer.meta.update.counter",
metric.WithUnit("1"),
metric.WithDescription("Number of updates with new meta data from the peers"))
if err != nil {
return nil, err
}

View File

@ -22,32 +22,50 @@ type GRPCMetrics struct {
// NewGRPCMetrics creates new GRPCMetrics struct and registers common metrics of the gRPC server
func NewGRPCMetrics(ctx context.Context, meter metric.Meter) (*GRPCMetrics, error) {
syncRequestsCounter, err := meter.Int64Counter("management.grpc.sync.request.counter", metric.WithUnit("1"))
syncRequestsCounter, err := meter.Int64Counter("management.grpc.sync.request.counter",
metric.WithUnit("1"),
metric.WithDescription("Number of sync gRPC requests from the peers to establish a connection and receive network map updates (update channel)"),
)
if err != nil {
return nil, err
}
loginRequestsCounter, err := meter.Int64Counter("management.grpc.login.request.counter", metric.WithUnit("1"))
loginRequestsCounter, err := meter.Int64Counter("management.grpc.login.request.counter",
metric.WithUnit("1"),
metric.WithDescription("Number of login gRPC requests from the peers to authenticate and receive initial configuration and relay credentials"),
)
if err != nil {
return nil, err
}
getKeyRequestsCounter, err := meter.Int64Counter("management.grpc.key.request.counter", metric.WithUnit("1"))
getKeyRequestsCounter, err := meter.Int64Counter("management.grpc.key.request.counter",
metric.WithUnit("1"),
metric.WithDescription("Number of key gRPC requests from the peers to get the server's public WireGuard key"),
)
if err != nil {
return nil, err
}
activeStreamsGauge, err := meter.Int64ObservableGauge("management.grpc.connected.streams", metric.WithUnit("1"))
activeStreamsGauge, err := meter.Int64ObservableGauge("management.grpc.connected.streams",
metric.WithUnit("1"),
metric.WithDescription("Number of active peer streams connected to the gRPC server"),
)
if err != nil {
return nil, err
}
syncRequestDuration, err := meter.Int64Histogram("management.grpc.sync.request.duration.ms", metric.WithUnit("milliseconds"))
syncRequestDuration, err := meter.Int64Histogram("management.grpc.sync.request.duration.ms",
metric.WithUnit("milliseconds"),
metric.WithDescription("Duration of the sync gRPC requests from the peers to establish a connection and receive network map updates (update channel)"),
)
if err != nil {
return nil, err
}
loginRequestDuration, err := meter.Int64Histogram("management.grpc.login.request.duration.ms", metric.WithUnit("milliseconds"))
loginRequestDuration, err := meter.Int64Histogram("management.grpc.login.request.duration.ms",
metric.WithUnit("milliseconds"),
metric.WithDescription("Duration of the login gRPC requests from the peers to authenticate and receive initial configuration and relay credentials"),
)
if err != nil {
return nil, err
}
@ -57,7 +75,7 @@ func NewGRPCMetrics(ctx context.Context, meter metric.Meter) (*GRPCMetrics, erro
// TODO(yury): This needs custom bucketing as we are interested in the values from 0 to server.channelBufferSize (100)
channelQueue, err := meter.Int64Histogram(
"management.grpc.updatechannel.queue",
metric.WithDescription("Number of update messages in the channel queue"),
metric.WithDescription("Number of update messages piling up in the update channel queue"),
metric.WithUnit("length"),
)
if err != nil {

View File

@ -74,37 +74,58 @@ type HTTPMiddleware struct {
// NewMetricsMiddleware creates a new HTTPMiddleware
func NewMetricsMiddleware(ctx context.Context, meter metric.Meter) (*HTTPMiddleware, error) {
httpRequestCounter, err := meter.Int64Counter(httpRequestCounterPrefix, metric.WithUnit("1"))
httpRequestCounter, err := meter.Int64Counter(httpRequestCounterPrefix,
metric.WithUnit("1"),
metric.WithDescription("Number of incoming HTTP requests by endpoint and method"),
)
if err != nil {
return nil, err
}
httpResponseCounter, err := meter.Int64Counter(httpResponseCounterPrefix, metric.WithUnit("1"))
httpResponseCounter, err := meter.Int64Counter(httpResponseCounterPrefix,
metric.WithUnit("1"),
metric.WithDescription("Number of outgoing HTTP responses by endpoint, method and returned status code"),
)
if err != nil {
return nil, err
}
totalHTTPRequestsCounter, err := meter.Int64Counter(fmt.Sprintf("%s.total", httpRequestCounterPrefix), metric.WithUnit("1"))
totalHTTPRequestsCounter, err := meter.Int64Counter(fmt.Sprintf("%s.total", httpRequestCounterPrefix),
metric.WithUnit("1"),
metric.WithDescription("Number of incoming HTTP requests"),
)
if err != nil {
return nil, err
}
totalHTTPResponseCounter, err := meter.Int64Counter(fmt.Sprintf("%s.total", httpResponseCounterPrefix), metric.WithUnit("1"))
totalHTTPResponseCounter, err := meter.Int64Counter(fmt.Sprintf("%s.total", httpResponseCounterPrefix),
metric.WithUnit("1"),
metric.WithDescription("Number of outgoing HTTP responses"),
)
if err != nil {
return nil, err
}
totalHTTPResponseCodeCounter, err := meter.Int64Counter(fmt.Sprintf("%s.code.total", httpResponseCounterPrefix), metric.WithUnit("1"))
totalHTTPResponseCodeCounter, err := meter.Int64Counter(fmt.Sprintf("%s.code.total", httpResponseCounterPrefix),
metric.WithUnit("1"),
metric.WithDescription("Number of outgoing HTTP responses by status code"),
)
if err != nil {
return nil, err
}
httpRequestDuration, err := meter.Int64Histogram(httpRequestDurationPrefix, metric.WithUnit("milliseconds"))
httpRequestDuration, err := meter.Int64Histogram(httpRequestDurationPrefix,
metric.WithUnit("milliseconds"),
metric.WithDescription("Duration of incoming HTTP requests by endpoint and method"),
)
if err != nil {
return nil, err
}
totalHTTPRequestDuration, err := meter.Int64Histogram(fmt.Sprintf("%s.total", httpRequestDurationPrefix), metric.WithUnit("milliseconds"))
totalHTTPRequestDuration, err := meter.Int64Histogram(fmt.Sprintf("%s.total", httpRequestDurationPrefix),
metric.WithUnit("milliseconds"),
metric.WithDescription("Duration of incoming HTTP requests"),
)
if err != nil {
return nil, err
}

View File

@ -23,43 +23,73 @@ type IDPMetrics struct {
// NewIDPMetrics creates new IDPMetrics struct and registers common
func NewIDPMetrics(ctx context.Context, meter metric.Meter) (*IDPMetrics, error) {
metaUpdateCounter, err := meter.Int64Counter("management.idp.update.user.meta.counter", metric.WithUnit("1"))
metaUpdateCounter, err := meter.Int64Counter("management.idp.update.user.meta.counter",
metric.WithUnit("1"),
metric.WithDescription("Number of updates of user metadata sent to the configured identity provider"),
)
if err != nil {
return nil, err
}
getUserByEmailCounter, err := meter.Int64Counter("management.idp.get.user.by.email.counter", metric.WithUnit("1"))
getUserByEmailCounter, err := meter.Int64Counter("management.idp.get.user.by.email.counter",
metric.WithUnit("1"),
metric.WithDescription("Number of requests to get a user by email from the configured identity provider"),
)
if err != nil {
return nil, err
}
getAllAccountsCounter, err := meter.Int64Counter("management.idp.get.accounts.counter", metric.WithUnit("1"))
getAllAccountsCounter, err := meter.Int64Counter("management.idp.get.accounts.counter",
metric.WithUnit("1"),
metric.WithDescription("Number of requests to get all accounts from the configured identity provider"),
)
if err != nil {
return nil, err
}
createUserCounter, err := meter.Int64Counter("management.idp.create.user.counter", metric.WithUnit("1"))
createUserCounter, err := meter.Int64Counter("management.idp.create.user.counter",
metric.WithUnit("1"),
metric.WithDescription("Number of requests to create a new user in the configured identity provider"),
)
if err != nil {
return nil, err
}
deleteUserCounter, err := meter.Int64Counter("management.idp.delete.user.counter", metric.WithUnit("1"))
deleteUserCounter, err := meter.Int64Counter("management.idp.delete.user.counter",
metric.WithUnit("1"),
metric.WithDescription("Number of requests to delete a user from the configured identity provider"),
)
if err != nil {
return nil, err
}
getAccountCounter, err := meter.Int64Counter("management.idp.get.account.counter", metric.WithUnit("1"))
getAccountCounter, err := meter.Int64Counter("management.idp.get.account.counter",
metric.WithUnit("1"),
metric.WithDescription("Number of requests to get all users in an account from the configured identity provider"),
)
if err != nil {
return nil, err
}
getUserByIDCounter, err := meter.Int64Counter("management.idp.get.user.by.id.counter", metric.WithUnit("1"))
getUserByIDCounter, err := meter.Int64Counter("management.idp.get.user.by.id.counter",
metric.WithUnit("1"),
metric.WithDescription("Number of requests to get a user by ID from the configured identity provider"),
)
if err != nil {
return nil, err
}
authenticateRequestCounter, err := meter.Int64Counter("management.idp.authenticate.request.counter", metric.WithUnit("1"))
authenticateRequestCounter, err := meter.Int64Counter("management.idp.authenticate.request.counter",
metric.WithUnit("1"),
metric.WithDescription("Number of requests to authenticate the server with the configured identity provider"),
)
if err != nil {
return nil, err
}
requestErrorCounter, err := meter.Int64Counter("management.idp.request.error.counter", metric.WithUnit("1"))
requestErrorCounter, err := meter.Int64Counter("management.idp.request.error.counter",
metric.WithUnit("1"),
metric.WithDescription("Number of errors that happened when doing http request to the configured identity provider"),
)
if err != nil {
return nil, err
}
requestStatusErrorCounter, err := meter.Int64Counter("management.idp.request.status.error.counter", metric.WithUnit("1"))
requestStatusErrorCounter, err := meter.Int64Counter("management.idp.request.status.error.counter",
metric.WithUnit("1"),
metric.WithDescription("Number of responses that came from the configured identity provider with non success status code"),
)
if err != nil {
return nil, err
}

View File

@ -20,28 +20,41 @@ type StoreMetrics struct {
// NewStoreMetrics creates an instance of StoreMetrics
func NewStoreMetrics(ctx context.Context, meter metric.Meter) (*StoreMetrics, error) {
globalLockAcquisitionDurationMicro, err := meter.Int64Histogram("management.store.global.lock.acquisition.duration.micro",
metric.WithUnit("microseconds"))
metric.WithUnit("microseconds"),
metric.WithDescription("Duration of how long it takes to acquire the global lock in the store to block all other requests to the store"),
)
if err != nil {
return nil, err
}
globalLockAcquisitionDurationMs, err := meter.Int64Histogram("management.store.global.lock.acquisition.duration.ms")
globalLockAcquisitionDurationMs, err := meter.Int64Histogram("management.store.global.lock.acquisition.duration.ms",
metric.WithUnit("milliseconds"),
metric.WithDescription("Duration of how long a process holds the acquired global lock in the store"),
)
if err != nil {
return nil, err
}
persistenceDurationMicro, err := meter.Int64Histogram("management.store.persistence.duration.micro",
metric.WithUnit("microseconds"))
metric.WithUnit("microseconds"),
metric.WithDescription("Duration of how long it takes to save or delete an account in the store"),
)
if err != nil {
return nil, err
}
persistenceDurationMs, err := meter.Int64Histogram("management.store.persistence.duration.ms")
persistenceDurationMs, err := meter.Int64Histogram("management.store.persistence.duration.ms",
metric.WithUnit("milliseconds"),
metric.WithDescription("Duration of how long it takes to save or delete an account in the store"),
)
if err != nil {
return nil, err
}
transactionDurationMs, err := meter.Int64Histogram("management.store.transaction.duration.ms")
transactionDurationMs, err := meter.Int64Histogram("management.store.transaction.duration.ms",
metric.WithUnit("milliseconds"),
metric.WithDescription("Duration of how long it takes to execute a transaction in the store"),
)
if err != nil {
return nil, err
}

View File

@ -23,42 +23,68 @@ type UpdateChannelMetrics struct {
// NewUpdateChannelMetrics creates an instance of UpdateChannel
func NewUpdateChannelMetrics(ctx context.Context, meter metric.Meter) (*UpdateChannelMetrics, error) {
createChannelDurationMicro, err := meter.Int64Histogram("management.updatechannel.create.duration.micro")
createChannelDurationMicro, err := meter.Int64Histogram("management.updatechannel.create.duration.micro",
metric.WithUnit("microseconds"),
metric.WithDescription("Duration of how long it takes to create a new peer update channel"),
)
if err != nil {
return nil, err
}
closeChannelDurationMicro, err := meter.Int64Histogram("management.updatechannel.close.one.duration.micro")
closeChannelDurationMicro, err := meter.Int64Histogram("management.updatechannel.close.one.duration.micro",
metric.WithUnit("microseconds"),
metric.WithDescription("Duration of how long it takes to close a peer update channel"),
)
if err != nil {
return nil, err
}
closeChannelsDurationMicro, err := meter.Int64Histogram("management.updatechannel.close.multiple.duration.micro")
closeChannelsDurationMicro, err := meter.Int64Histogram("management.updatechannel.close.multiple.duration.micro",
metric.WithUnit("microseconds"),
metric.WithDescription("Duration of how long it takes to close a set of peer update channels"),
)
if err != nil {
return nil, err
}
closeChannels, err := meter.Int64Histogram("management.updatechannel.close.multiple.channels")
closeChannels, err := meter.Int64Histogram("management.updatechannel.close.multiple.channels",
metric.WithUnit("1"),
metric.WithDescription("Number of peer update channels that have been closed"),
)
if err != nil {
return nil, err
}
sendUpdateDurationMicro, err := meter.Int64Histogram("management.updatechannel.send.duration.micro")
sendUpdateDurationMicro, err := meter.Int64Histogram("management.updatechannel.send.duration.micro",
metric.WithUnit("microseconds"),
metric.WithDescription("Duration of how long it takes to send an network map update to a peer"),
)
if err != nil {
return nil, err
}
getAllConnectedPeersDurationMicro, err := meter.Int64Histogram("management.updatechannel.get.all.duration.micro")
getAllConnectedPeersDurationMicro, err := meter.Int64Histogram("management.updatechannel.get.all.duration.micro",
metric.WithUnit("microseconds"),
metric.WithDescription("Duration of how long it takes to get all connected peers"),
)
if err != nil {
return nil, err
}
getAllConnectedPeers, err := meter.Int64Histogram("management.updatechannel.get.all.peers")
getAllConnectedPeers, err := meter.Int64Histogram("management.updatechannel.get.all.peers",
metric.WithUnit("1"),
metric.WithDescription("Number of connected peers"),
)
if err != nil {
return nil, err
}
hasChannelDurationMicro, err := meter.Int64Histogram("management.updatechannel.haschannel.duration.micro")
hasChannelDurationMicro, err := meter.Int64Histogram("management.updatechannel.haschannel.duration.micro",
metric.WithUnit("microseconds"),
metric.WithDescription("Duration of how long it takes to check if a peer has a channel"),
)
if err != nil {
return nil, err
}

View File

@ -289,14 +289,14 @@ func (a *Account) GetPeerNetworkMap(
}
if metrics != nil {
objectCount := int64(len(peersToConnect) + len(expiredPeers) + len(routesUpdate) + len(firewallRules))
objectCount := int64(len(peersToConnectIncludingRouters) + len(expiredPeers) + len(routesUpdate) + len(networkResourcesRoutes) + len(firewallRules) + +len(networkResourcesFirewallRules) + len(routesFirewallRules))
metrics.CountNetworkMapObjects(objectCount)
metrics.CountGetPeerNetworkMapDuration(time.Since(start))
if objectCount > 5000 {
log.WithContext(ctx).Tracef("account: %s has a total resource count of %d objects, "+
"peers to connect: %d, expired peers: %d, routes: %d, firewall rules: %d",
a.Id, objectCount, len(peersToConnect), len(expiredPeers), len(routesUpdate), len(firewallRules))
"peers to connect: %d, expired peers: %d, routes: %d, firewall rules: %d, network resources routes: %d, network resources firewall rules: %d, routes firewall rules: %d",
a.Id, objectCount, len(peersToConnectIncludingRouters), len(expiredPeers), len(routesUpdate), len(firewallRules), len(networkResourcesRoutes), len(networkResourcesFirewallRules), len(routesFirewallRules))
}
}

View File

@ -29,37 +29,53 @@ type Metrics struct {
}
func NewMetrics(ctx context.Context, meter metric.Meter) (*Metrics, error) {
bytesSent, err := meter.Int64Counter("relay_transfer_sent_bytes_total")
bytesSent, err := meter.Int64Counter("relay_transfer_sent_bytes_total",
metric.WithDescription("Total number of bytes sent to peers"),
)
if err != nil {
return nil, err
}
bytesRecv, err := meter.Int64Counter("relay_transfer_received_bytes_total")
bytesRecv, err := meter.Int64Counter("relay_transfer_received_bytes_total",
metric.WithDescription("Total number of bytes received from peers"),
)
if err != nil {
return nil, err
}
peers, err := meter.Int64UpDownCounter("relay_peers")
peers, err := meter.Int64UpDownCounter("relay_peers",
metric.WithDescription("Number of connected peers"),
)
if err != nil {
return nil, err
}
peersActive, err := meter.Int64ObservableGauge("relay_peers_active")
peersActive, err := meter.Int64ObservableGauge("relay_peers_active",
metric.WithDescription("Number of active connected peers"),
)
if err != nil {
return nil, err
}
peersIdle, err := meter.Int64ObservableGauge("relay_peers_idle")
peersIdle, err := meter.Int64ObservableGauge("relay_peers_idle",
metric.WithDescription("Number of idle connected peers"),
)
if err != nil {
return nil, err
}
authTime, err := meter.Float64Histogram("relay_peer_authentication_time_milliseconds", metric.WithExplicitBucketBoundaries(getStandardBucketBoundaries()...))
authTime, err := meter.Float64Histogram("relay_peer_authentication_time_milliseconds",
metric.WithExplicitBucketBoundaries(getStandardBucketBoundaries()...),
metric.WithDescription("Time taken to authenticate a peer"),
)
if err != nil {
return nil, err
}
peerStoreTime, err := meter.Float64Histogram("relay_peer_store_time_milliseconds", metric.WithExplicitBucketBoundaries(getStandardBucketBoundaries()...))
peerStoreTime, err := meter.Float64Histogram("relay_peer_store_time_milliseconds",
metric.WithExplicitBucketBoundaries(getStandardBucketBoundaries()...),
metric.WithDescription("Time taken to store a new peer connection"),
)
if err != nil {
return nil, err
}

View File

@ -23,56 +23,76 @@ type AppMetrics struct {
}
func NewAppMetrics(meter metric.Meter) (*AppMetrics, error) {
activePeers, err := meter.Int64UpDownCounter("active_peers")
activePeers, err := meter.Int64UpDownCounter("active_peers",
metric.WithDescription("Number of active connected peers"),
)
if err != nil {
return nil, err
}
peerConnectionDuration, err := meter.Int64Histogram("peer_connection_duration_seconds",
metric.WithExplicitBucketBoundaries(getPeerConnectionDurationBucketBoundaries()...))
metric.WithExplicitBucketBoundaries(getPeerConnectionDurationBucketBoundaries()...),
metric.WithDescription("Duration of how long a peer was connected"),
)
if err != nil {
return nil, err
}
registrations, err := meter.Int64Counter("registrations_total")
registrations, err := meter.Int64Counter("registrations_total",
metric.WithDescription("Total number of peer registrations"),
)
if err != nil {
return nil, err
}
deregistrations, err := meter.Int64Counter("deregistrations_total")
deregistrations, err := meter.Int64Counter("deregistrations_total",
metric.WithDescription("Total number of peer deregistrations"),
)
if err != nil {
return nil, err
}
registrationFailures, err := meter.Int64Counter("registration_failures_total")
registrationFailures, err := meter.Int64Counter("registration_failures_total",
metric.WithDescription("Total number of peer registration failures"),
)
if err != nil {
return nil, err
}
registrationDelay, err := meter.Float64Histogram("registration_delay_milliseconds",
metric.WithExplicitBucketBoundaries(getStandardBucketBoundaries()...))
metric.WithExplicitBucketBoundaries(getStandardBucketBoundaries()...),
metric.WithDescription("Duration of how long it takes to register a peer"),
)
if err != nil {
return nil, err
}
getRegistrationDelay, err := meter.Float64Histogram("get_registration_delay_milliseconds",
metric.WithExplicitBucketBoundaries(getStandardBucketBoundaries()...))
metric.WithExplicitBucketBoundaries(getStandardBucketBoundaries()...),
metric.WithDescription("Duration of how long it takes to load a connection from the registry"),
)
if err != nil {
return nil, err
}
messagesForwarded, err := meter.Int64Counter("messages_forwarded_total")
messagesForwarded, err := meter.Int64Counter("messages_forwarded_total",
metric.WithDescription("Total number of messages forwarded to peers"),
)
if err != nil {
return nil, err
}
messageForwardFailures, err := meter.Int64Counter("message_forward_failures_total")
messageForwardFailures, err := meter.Int64Counter("message_forward_failures_total",
metric.WithDescription("Total number of message forwarding failures"),
)
if err != nil {
return nil, err
}
messageForwardLatency, err := meter.Float64Histogram("message_forward_latency_milliseconds",
metric.WithExplicitBucketBoundaries(getStandardBucketBoundaries()...))
metric.WithExplicitBucketBoundaries(getStandardBucketBoundaries()...),
metric.WithDescription("Duration of how long it takes to forward a message to a peer"),
)
if err != nil {
return nil, err
}