[management] Refactor HTTP metrics (#2476)

* Add logging for slow SQL queries in SaveAccount and GetAccount

* Add resource count log for large accounts

* Refactor metrics middleware to simplify counters and histograms

* Update log levels and remove redundant resource count check
This commit is contained in:
Bethuel Mmbaga 2024-08-23 19:42:55 +03:00 committed by GitHub
parent 33b264e598
commit d97b03656f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 73 additions and 105 deletions

View File

@ -476,6 +476,12 @@ func (a *Account) GetPeerNetworkMap(
objectCount := int64(len(peersToConnect) + len(expiredPeers) + len(routesUpdate) + len(firewallRules)) objectCount := int64(len(peersToConnect) + len(expiredPeers) + len(routesUpdate) + len(firewallRules))
metrics.CountNetworkMapObjects(objectCount) metrics.CountNetworkMapObjects(objectCount)
metrics.CountGetPeerNetworkMapDuration(time.Since(start)) metrics.CountGetPeerNetworkMapDuration(time.Since(start))
if objectCount > 5000 {
log.WithContext(ctx).Tracef("account: %s has a total resource count of %d objects, "+
"peers to connect: %d, expired peers: %d, routes: %d, firewall rules: %d",
a.Id, objectCount, len(peersToConnect), len(expiredPeers), len(routesUpdate), len(firewallRules))
}
} }
return nm return nm

View File

@ -100,27 +100,6 @@ func APIHandler(ctx context.Context, accountManager s.AccountManager, LocationMa
api.addPostureCheckEndpoint() api.addPostureCheckEndpoint()
api.addLocationsEndpoint() api.addLocationsEndpoint()
err := api.Router.Walk(func(route *mux.Route, _ *mux.Router, _ []*mux.Route) error {
methods, err := route.GetMethods()
if err != nil { // we may have wildcard routes from integrations without methods, skip them for now
methods = []string{}
}
for _, method := range methods {
template, err := route.GetPathTemplate()
if err != nil {
return err
}
err = metricsMiddleware.AddHTTPRequestResponseCounter(template, method)
if err != nil {
return err
}
}
return nil
})
if err != nil {
return nil, err
}
return rootRouter, nil return rootRouter, nil
} }

View File

@ -134,6 +134,12 @@ func (s *SqlStore) AcquireReadLockByUID(ctx context.Context, uniqueID string) (u
func (s *SqlStore) SaveAccount(ctx context.Context, account *Account) error { func (s *SqlStore) SaveAccount(ctx context.Context, account *Account) error {
start := time.Now() start := time.Now()
defer func() {
elapsed := time.Since(start)
if elapsed > 1*time.Second {
log.WithContext(ctx).Tracef("SaveAccount for account %s exceeded 1s, took: %v", account.Id, elapsed)
}
}()
// todo: remove this check after the issue is resolved // todo: remove this check after the issue is resolved
s.checkAccountDomainBeforeSave(ctx, account.Id, account.Domain) s.checkAccountDomainBeforeSave(ctx, account.Id, account.Domain)
@ -513,6 +519,13 @@ func (s *SqlStore) GetAllAccounts(ctx context.Context) (all []*Account) {
} }
func (s *SqlStore) GetAccount(ctx context.Context, accountID string) (*Account, error) { func (s *SqlStore) GetAccount(ctx context.Context, accountID string) (*Account, error) {
start := time.Now()
defer func() {
elapsed := time.Since(start)
if elapsed > 1*time.Second {
log.WithContext(ctx).Tracef("GetAccount for account %s exceeded 1s, took: %v", accountID, elapsed)
}
}()
var account Account var account Account
result := s.db.Model(&account). result := s.db.Model(&account).

View File

@ -8,6 +8,7 @@ import (
"time" "time"
"github.com/google/uuid" "github.com/google/uuid"
"github.com/gorilla/mux"
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
"go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/metric" "go.opentelemetry.io/otel/metric"
@ -54,48 +55,66 @@ func (rw *WrappedResponseWriter) WriteHeader(code int) {
// HTTPMiddleware handler used to collect metrics of every request/response coming to the API. // HTTPMiddleware handler used to collect metrics of every request/response coming to the API.
// Also adds request tracing (logging). // Also adds request tracing (logging).
type HTTPMiddleware struct { type HTTPMiddleware struct {
meter metric.Meter
ctx context.Context ctx context.Context
// all HTTP requests by endpoint & method // all HTTP requests by endpoint & method
httpRequestCounters map[string]metric.Int64Counter httpRequestCounter metric.Int64Counter
// all HTTP responses by endpoint & method & status code // all HTTP responses by endpoint & method & status code
httpResponseCounters map[string]metric.Int64Counter httpResponseCounter metric.Int64Counter
// all HTTP requests // all HTTP requests
totalHTTPRequestsCounter metric.Int64Counter totalHTTPRequestsCounter metric.Int64Counter
// all HTTP responses // all HTTP responses
totalHTTPResponseCounter metric.Int64Counter totalHTTPResponseCounter metric.Int64Counter
// all HTTP responses by status code // all HTTP responses by status code
totalHTTPResponseCodeCounters map[int]metric.Int64Counter totalHTTPResponseCodeCounter metric.Int64Counter
// all HTTP requests durations by endpoint and method // all HTTP requests durations by endpoint and method
httpRequestDurations map[string]metric.Int64Histogram httpRequestDuration metric.Int64Histogram
// all HTTP requests durations // all HTTP requests durations
totalHTTPRequestDuration metric.Int64Histogram totalHTTPRequestDuration metric.Int64Histogram
} }
// NewMetricsMiddleware creates a new HTTPMiddleware // NewMetricsMiddleware creates a new HTTPMiddleware
func NewMetricsMiddleware(ctx context.Context, meter metric.Meter) (*HTTPMiddleware, error) { func NewMetricsMiddleware(ctx context.Context, meter metric.Meter) (*HTTPMiddleware, error) {
totalHTTPRequestsCounter, err := meter.Int64Counter(fmt.Sprintf("%s_total", httpRequestCounterPrefix), metric.WithUnit("1")) httpRequestCounter, err := meter.Int64Counter(httpRequestCounterPrefix, metric.WithUnit("1"))
if err != nil { if err != nil {
return nil, err return nil, err
} }
totalHTTPResponseCounter, err := meter.Int64Counter(fmt.Sprintf("%s_total", httpResponseCounterPrefix), metric.WithUnit("1")) httpResponseCounter, err := meter.Int64Counter(httpResponseCounterPrefix, metric.WithUnit("1"))
if err != nil { if err != nil {
return nil, err return nil, err
} }
totalHTTPRequestDuration, err := meter.Int64Histogram(fmt.Sprintf("%s_total", httpRequestDurationPrefix), metric.WithUnit("milliseconds")) totalHTTPRequestsCounter, err := meter.Int64Counter(fmt.Sprintf("%s.total", httpRequestCounterPrefix), metric.WithUnit("1"))
if err != nil {
return nil, err
}
totalHTTPResponseCounter, err := meter.Int64Counter(fmt.Sprintf("%s.total", httpResponseCounterPrefix), metric.WithUnit("1"))
if err != nil {
return nil, err
}
totalHTTPResponseCodeCounter, err := meter.Int64Counter(fmt.Sprintf("%s.code.total", httpResponseCounterPrefix), metric.WithUnit("1"))
if err != nil {
return nil, err
}
httpRequestDuration, err := meter.Int64Histogram(httpRequestDurationPrefix, metric.WithUnit("milliseconds"))
if err != nil {
return nil, err
}
totalHTTPRequestDuration, err := meter.Int64Histogram(fmt.Sprintf("%s.total", httpRequestDurationPrefix), metric.WithUnit("milliseconds"))
if err != nil { if err != nil {
return nil, err return nil, err
} }
return &HTTPMiddleware{ return &HTTPMiddleware{
ctx: ctx, ctx: ctx,
httpRequestCounters: map[string]metric.Int64Counter{}, httpRequestCounter: httpRequestCounter,
httpResponseCounters: map[string]metric.Int64Counter{}, httpResponseCounter: httpResponseCounter,
httpRequestDurations: map[string]metric.Int64Histogram{}, httpRequestDuration: httpRequestDuration,
totalHTTPResponseCodeCounters: map[int]metric.Int64Counter{}, totalHTTPResponseCodeCounter: totalHTTPResponseCodeCounter,
meter: meter,
totalHTTPRequestsCounter: totalHTTPRequestsCounter, totalHTTPRequestsCounter: totalHTTPRequestsCounter,
totalHTTPResponseCounter: totalHTTPResponseCounter, totalHTTPResponseCounter: totalHTTPResponseCounter,
totalHTTPRequestDuration: totalHTTPRequestDuration, totalHTTPRequestDuration: totalHTTPRequestDuration,
@ -103,63 +122,22 @@ func NewMetricsMiddleware(ctx context.Context, meter metric.Meter) (*HTTPMiddlew
nil nil
} }
// AddHTTPRequestResponseCounter adds a new meter for an HTTP defaultEndpoint and Method (GET, POST, etc)
// Creates one request counter and multiple response counters (one per http response status code).
func (m *HTTPMiddleware) AddHTTPRequestResponseCounter(endpoint string, method string) error {
meterKey := getRequestCounterKey(endpoint, method)
httpReqCounter, err := m.meter.Int64Counter(meterKey, metric.WithUnit("1"))
if err != nil {
return err
}
m.httpRequestCounters[meterKey] = httpReqCounter
durationKey := getRequestDurationKey(endpoint, method)
requestDuration, err := m.meter.Int64Histogram(durationKey, metric.WithUnit("milliseconds"))
if err != nil {
return err
}
m.httpRequestDurations[durationKey] = requestDuration
respCodes := []int{200, 204, 400, 401, 403, 404, 500, 502, 503}
for _, code := range respCodes {
meterKey = getResponseCounterKey(endpoint, method, code)
httpRespCounter, err := m.meter.Int64Counter(meterKey, metric.WithUnit("1"))
if err != nil {
return err
}
m.httpResponseCounters[meterKey] = httpRespCounter
meterKey = fmt.Sprintf("%s_%d_total", httpResponseCounterPrefix, code)
totalHTTPResponseCodeCounter, err := m.meter.Int64Counter(meterKey, metric.WithUnit("1"))
if err != nil {
return err
}
m.totalHTTPResponseCodeCounters[code] = totalHTTPResponseCodeCounter
}
return nil
}
func replaceEndpointChars(endpoint string) string { func replaceEndpointChars(endpoint string) string {
endpoint = strings.ReplaceAll(endpoint, "/", "_")
endpoint = strings.ReplaceAll(endpoint, "{", "") endpoint = strings.ReplaceAll(endpoint, "{", "")
endpoint = strings.ReplaceAll(endpoint, "}", "") endpoint = strings.ReplaceAll(endpoint, "}", "")
return endpoint return endpoint
} }
func getRequestCounterKey(endpoint, method string) string { func getEndpointMetricAttr(r *http.Request) string {
endpoint = replaceEndpointChars(endpoint) var endpoint string
return fmt.Sprintf("%s%s_%s", httpRequestCounterPrefix, endpoint, method) route := mux.CurrentRoute(r)
if route != nil {
pathTmpl, err := route.GetPathTemplate()
if err == nil {
endpoint = replaceEndpointChars(pathTmpl)
} }
func getRequestDurationKey(endpoint, method string) string {
endpoint = replaceEndpointChars(endpoint)
return fmt.Sprintf("%s%s_%s", httpRequestDurationPrefix, endpoint, method)
} }
return endpoint
func getResponseCounterKey(endpoint, method string, status int) string {
endpoint = replaceEndpointChars(endpoint)
return fmt.Sprintf("%s%s_%s_%d", httpResponseCounterPrefix, endpoint, method, status)
} }
// Handler logs every request and response and adds the, to metrics. // Handler logs every request and response and adds the, to metrics.
@ -176,11 +154,10 @@ func (m *HTTPMiddleware) Handler(h http.Handler) http.Handler {
log.WithContext(ctx).Tracef("HTTP request %v: %v %v", reqID, r.Method, r.URL) log.WithContext(ctx).Tracef("HTTP request %v: %v %v", reqID, r.Method, r.URL)
metricKey := getRequestCounterKey(r.URL.Path, r.Method) endpointAttr := attribute.String("endpoint", getEndpointMetricAttr(r))
methodAttr := attribute.String("method", r.Method)
if c, ok := m.httpRequestCounters[metricKey]; ok { m.httpRequestCounter.Add(m.ctx, 1, metric.WithAttributes(endpointAttr, methodAttr))
c.Add(m.ctx, 1)
}
m.totalHTTPRequestsCounter.Add(m.ctx, 1) m.totalHTTPRequestsCounter.Add(m.ctx, 1)
w := WrapResponseWriter(rw) w := WrapResponseWriter(rw)
@ -193,21 +170,14 @@ func (m *HTTPMiddleware) Handler(h http.Handler) http.Handler {
log.WithContext(ctx).Tracef("HTTP response %v: %v %v status %v", reqID, r.Method, r.URL, w.Status()) log.WithContext(ctx).Tracef("HTTP response %v: %v %v status %v", reqID, r.Method, r.URL, w.Status())
} }
metricKey = getResponseCounterKey(r.URL.Path, r.Method, w.Status()) statusCodeAttr := attribute.Int("code", w.Status())
if c, ok := m.httpResponseCounters[metricKey]; ok {
c.Add(m.ctx, 1)
}
m.httpResponseCounter.Add(m.ctx, 1, metric.WithAttributes(endpointAttr, methodAttr, statusCodeAttr))
m.totalHTTPResponseCounter.Add(m.ctx, 1) m.totalHTTPResponseCounter.Add(m.ctx, 1)
if c, ok := m.totalHTTPResponseCodeCounters[w.Status()]; ok { m.totalHTTPResponseCodeCounter.Add(m.ctx, 1, metric.WithAttributes(statusCodeAttr))
c.Add(m.ctx, 1)
}
durationKey := getRequestDurationKey(r.URL.Path, r.Method)
reqTook := time.Since(reqStart) reqTook := time.Since(reqStart)
if c, ok := m.httpRequestDurations[durationKey]; ok { m.httpRequestDuration.Record(m.ctx, reqTook.Milliseconds(), metric.WithAttributes(endpointAttr, methodAttr))
c.Record(m.ctx, reqTook.Milliseconds())
}
log.WithContext(ctx).Debugf("request %s %s took %d ms and finished with status %d", r.Method, r.URL.Path, reqTook.Milliseconds(), w.Status()) log.WithContext(ctx).Debugf("request %s %s took %d ms and finished with status %d", r.Method, r.URL.Path, reqTook.Milliseconds(), w.Status())
if w.Status() == 200 && (r.Method == http.MethodPut || r.Method == http.MethodPost || r.Method == http.MethodDelete) { if w.Status() == 200 && (r.Method == http.MethodPut || r.Method == http.MethodPost || r.Method == http.MethodDelete) {