[#307] add package trace, integrate it with logging, and adopt it throughout zrepl

package trace:

- introduce the concept of tasks and spans, tracked as linked list within ctx
    - see package-level docs for an overview of the concepts
    - **main feature 1**: unique stack of task and span IDs
        - makes it easy to follow a series of log entries in concurrent code
    - **main feature 2**: ability to produce a chrome://tracing-compatible trace file
        - either via an env variable or a `zrepl pprof` subcommand
        - this is not a CPU profile, we already have go pprof for that
        - but it is very useful to visually inspect where the
          replication / snapshotter / pruner spends its time
          ( fixes #307 )

usage in package daemon/logging:

- goal: every log entry should have a trace field with the ID stack from package trace

- make `logging.GetLogger(ctx, Subsys)` the authoritative `logger.Logger` factory function
    - the context carries a linked list of injected fields which
      `logging.GetLogger` adds to the logger it returns
    - `logging.GetLogger` also uses package `trace` to get the
      task-and-span-stack and injects it into the returned logger's fields
This commit is contained in:
Christian Schwarz
2020-04-11 15:49:41 +02:00
parent bcb5965617
commit 10a14a8c50
75 changed files with 1934 additions and 462 deletions

View File

@ -12,6 +12,7 @@ import (
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"github.com/zrepl/zrepl/daemon/logging/trace"
"github.com/zrepl/zrepl/config"
"github.com/zrepl/zrepl/daemon/job"
@ -23,9 +24,8 @@ import (
"github.com/zrepl/zrepl/zfs/zfscmd"
)
func Run(conf *config.Config) error {
ctx, cancel := context.WithCancel(context.Background())
func Run(ctx context.Context, conf *config.Config) error {
ctx, cancel := context.WithCancel(ctx)
defer cancel()
sigChan := make(chan os.Signal, 1)
@ -39,6 +39,7 @@ func Run(conf *config.Config) error {
if err != nil {
return errors.Wrap(err, "cannot build logging from config")
}
outlets.Add(newPrometheusLogOutlet(), logger.Debug)
confJobs, err := job.JobsFromConfig(conf)
if err != nil {
@ -48,14 +49,23 @@ func Run(conf *config.Config) error {
log := logger.NewLogger(outlets, 1*time.Second)
log.Info(version.NewZreplVersionInformation().String())
ctx = logging.WithLoggers(ctx, logging.SubsystemLoggersWithUniversalLogger(log))
trace.RegisterCallback(trace.Callback{
OnBegin: func(ctx context.Context) { logging.GetLogger(ctx, logging.SubsysTraceData).Debug("begin span") },
OnEnd: func(ctx context.Context, spanInfo trace.SpanInfo) {
logging.
GetLogger(ctx, logging.SubsysTraceData).
WithField("duration_s", spanInfo.EndedAt().Sub(spanInfo.StartedAt()).Seconds()).
Debug("finished span " + spanInfo.TaskAndSpanStack(trace.SpanStackKindAnnotation))
},
})
for _, job := range confJobs {
if IsInternalJobName(job.Name()) {
panic(fmt.Sprintf("internal job name used for config job '%s'", job.Name())) //FIXME
}
}
ctx = job.WithLogger(ctx, log)
jobs := newJobs()
// start control socket
@ -84,6 +94,7 @@ func Run(conf *config.Config) error {
// register global (=non job-local) metrics
zfscmd.RegisterMetrics(prometheus.DefaultRegisterer)
trace.RegisterMetrics(prometheus.DefaultRegisterer)
log.Info("starting daemon")
@ -98,6 +109,8 @@ func Run(conf *config.Config) error {
case <-ctx.Done():
log.WithError(ctx.Err()).Info("context finished")
}
log.Info("waiting for jobs to finish")
<-jobs.wait()
log.Info("daemon exiting")
return nil
}
@ -120,14 +133,11 @@ func newJobs() *jobs {
}
}
const (
logJobField string = "job"
)
func (s *jobs) wait() <-chan struct{} {
ch := make(chan struct{})
go func() {
s.wg.Wait()
close(ch)
}()
return ch
}
@ -202,9 +212,8 @@ func (s *jobs) start(ctx context.Context, j job.Job, internal bool) {
s.m.Lock()
defer s.m.Unlock()
jobLog := job.GetLogger(ctx).
WithField(logJobField, j.Name()).
WithOutlet(newPrometheusLogOutlet(j.Name()), logger.Debug)
ctx = logging.WithInjectedField(ctx, logging.JobField, j.Name())
jobName := j.Name()
if !internal && IsInternalJobName(jobName) {
panic(fmt.Sprintf("internal job name used for non-internal job %s", jobName))
@ -219,7 +228,6 @@ func (s *jobs) start(ctx context.Context, j job.Job, internal bool) {
j.RegisterMetrics(prometheus.DefaultRegisterer)
s.jobs[jobName] = j
ctx = job.WithLogger(ctx, jobLog)
ctx = zfscmd.WithJobID(ctx, j.Name())
ctx, wakeup := wakeup.Context(ctx)
ctx, resetFunc := reset.Context(ctx)
@ -229,8 +237,8 @@ func (s *jobs) start(ctx context.Context, j job.Job, internal bool) {
s.wg.Add(1)
go func() {
defer s.wg.Done()
jobLog.Info("starting job")
defer jobLog.Info("job exited")
job.GetLogger(ctx).Info("starting job")
defer job.GetLogger(ctx).Info("job exited")
j.Run(ctx)
}()
}