zrepl/daemon/prometheus.go
Christian Schwarz · 10a14a8c50 · 2020-05-19 11:30:02 +02:00
[#307] add package trace, integrate it with logging, and adopt it throughout zrepl
package trace:

- introduce the concept of tasks and spans, tracked as a linked list within ctx (see the sketch after this list)
    - see package-level docs for an overview of the concepts
    - **main feature 1**: unique stack of task and span IDs
        - makes it easy to follow a series of log entries in concurrent code
    - **main feature 2**: ability to produce a chrome://tracing-compatible trace file
        - either via an env variable or a `zrepl pprof` subcommand
        - this is not a CPU profile; we already have Go pprof for that
        - but it is very useful for visually inspecting where the
          replication / snapshotter / pruner spends its time
          (fixes #307)
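
A minimal sketch of how tasks and spans nest (import path and function
signatures are assumptions based on the package description above):

```go
package replication

import (
    "context"

    "github.com/zrepl/zrepl/daemon/logging/trace"
)

func replicate(ctx context.Context) {
    // a task opens a fresh ID stack; the returned callback ends it
    ctx, endTask := trace.WithTask(ctx, "replication")
    defer endTask()

    // spans nest within the task (and within each other)
    ctx, endSpan := trace.WithSpan(ctx, "plan")
    defer endSpan()

    _ = ctx // log entries made with this ctx carry the task-and-span ID stack
}
```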

usage in package daemon/logging:

- goal: every log entry should have a trace field with the ID stack from package trace

- make `logging.GetLogger(ctx, Subsys)` the authoritative `logger.Logger` factory function (see the example after this list)
    - the context carries a linked list of injected fields which
      `logging.GetLogger` adds to the logger it returns
    - `logging.GetLogger` also uses package `trace` to get the
      task-and-span stack and injects it into the returned logger's fields
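
For example, a sketch of a call site (the subsystem constant is hypothetical;
the injected fields follow the description above):

```go
package snapshotter

import (
    "context"

    "github.com/zrepl/zrepl/daemon/logging"
)

func snapshot(ctx context.Context) {
    // GetLogger merges the ctx-injected fields and the trace ID stack
    // into the fields of the logger it returns
    log := logging.GetLogger(ctx, logging.SubsysSnapshot) // SubsysSnapshot is assumed
    log.Info("starting snapshot")                         // entry carries the trace field
}
```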


package daemon

import (
    "context"
    "net"
    "net/http"

    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promhttp"

    "github.com/zrepl/zrepl/config"
    "github.com/zrepl/zrepl/daemon/job"
    "github.com/zrepl/zrepl/daemon/logging"
    "github.com/zrepl/zrepl/endpoint"
    "github.com/zrepl/zrepl/logger"
    "github.com/zrepl/zrepl/rpc/dataconn/frameconn"
    "github.com/zrepl/zrepl/util/tcpsock"
    "github.com/zrepl/zrepl/zfs"
)

// prometheusJob is the internal daemon job that serves all registered
// metrics over HTTP.
type prometheusJob struct {
    listen   string
    freeBind bool
}

func newPrometheusJobFromConfig(in *config.PrometheusMonitoring) (*prometheusJob, error) {
    // validate the listen address up front so that misconfiguration fails at startup
    if _, _, err := net.SplitHostPort(in.Listen); err != nil {
        return nil, err
    }
    return &prometheusJob{in.Listen, in.ListenFreeBind}, nil
}
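
// For reference, the config stanza that maps to config.PrometheusMonitoring
// looks roughly like this in the zrepl config file (YAML keys assumed from
// the struct fields above; the listen address is an example):
//
//    global:
//      monitoring:
//        - type: prometheus
//          listen: ':9811'
//          listen_freebind: true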

var prom struct {
    taskLogEntries *prometheus.CounterVec
}

func init() {
    // count log entries per job and level; registered with the default
    // registry so the /metrics handler below picks it up
    prom.taskLogEntries = prometheus.NewCounterVec(prometheus.CounterOpts{
        Namespace: "zrepl",
        Subsystem: "daemon",
        Name:      "log_entries",
        Help:      "number of log entries per job task and level",
    }, []string{"zrepl_job", "level"})
    prometheus.MustRegister(prom.taskLogEntries)
}
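
// The counter registered above is exported as zrepl_daemon_log_entries and
// yields series like the following (the job name is hypothetical):
//
//    zrepl_daemon_log_entries{zrepl_job="prod_to_backups", level="error"} 42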

// prometheusJob implements job.Job; it owns no datasets and sends no data,
// so most of the interface methods are stubs.
func (j *prometheusJob) Name() string { return jobNamePrometheus }

func (j *prometheusJob) Status() *job.Status { return &job.Status{Type: job.TypeInternal} }

func (j *prometheusJob) OwnedDatasetSubtreeRoot() (p *zfs.DatasetPath, ok bool) { return nil, false }

func (j *prometheusJob) SenderConfig() *endpoint.SenderConfig { return nil }

func (j *prometheusJob) RegisterMetrics(registerer prometheus.Registerer) {}

func (j *prometheusJob) Run(ctx context.Context) {
    // register the zfs and frameconn package metrics; registration only
    // fails if the collectors were already registered, which would be a
    // bug, hence panic
    if err := zfs.PrometheusRegister(prometheus.DefaultRegisterer); err != nil {
        panic(err)
    }
    if err := frameconn.PrometheusRegister(prometheus.DefaultRegisterer); err != nil {
        panic(err)
    }

    log := job.GetLogger(ctx)

    l, err := tcpsock.Listen(j.listen, j.freeBind)
    if err != nil {
        log.WithError(err).Error("cannot listen")
        return
    }

    // close the listener on context cancellation so that http.Serve returns
    go func() {
        <-ctx.Done()
        l.Close()
    }()

    mux := http.NewServeMux()
    mux.Handle("/metrics", promhttp.Handler())

    // http.Serve returns an error once l is closed; only report it if the
    // context is still live, i.e. if this is not a regular shutdown
    err = http.Serve(l, mux)
    if err != nil && ctx.Err() == nil {
        log.WithError(err).Error("error while serving")
    }
}
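
// To check the endpoint by hand (the listen address is an example):
//
//    $ curl -s http://localhost:9811/metrics | grep zrepl_daemon_log_entries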

// prometheusJobOutlet is a logger.Outlet that does not write entries
// anywhere; it only counts them in the prom.taskLogEntries counter.
type prometheusJobOutlet struct {
}

var _ logger.Outlet = prometheusJobOutlet{}

func newPrometheusLogOutlet() prometheusJobOutlet {
    return prometheusJobOutlet{}
}

func (o prometheusJobOutlet) WriteEntry(entry logger.Entry) error {
    // entries that carry no job field are grouped under the pseudo-job "_nojobid"
    jobFieldVal, ok := entry.Fields[logging.JobField].(string)
    if !ok {
        jobFieldVal = "_nojobid"
    }
    prom.taskLogEntries.WithLabelValues(jobFieldVal, entry.Level.String()).Inc()
    return nil
}