zrepl/daemon/logging/trace/trace.go

312 lines
9.3 KiB
Go

// package trace provides activity tracing via ctx through Tasks and Spans
//
// Basic Concepts
//
// Tracing can be used to identify where a piece of code spends its time.
//
// The Go standard library provides package runtime/trace which is useful to identify CPU bottlenecks or
// to understand what happens inside the Go runtime.
// However, it is not ideal for application level tracing, in particular if those traces should be understandable
// to tech-savvy users (albeit not developers).
//
// This package provides the concept of Tasks and Spans to express what activity is happening within an application:
//
// - Neither task nor span is really tangible but instead contained within the context.Context tree
// - Tasks represent concurrent activity (i.e. goroutines).
// - Spans represent a semantic stack trace within a task.
//
// As a consequence, whenever a context is propagated across goroutine boundary, you need to create a child task:
//
// go func(ctx context.Context) {
// ctx, endTask = WithTask(ctx, "what-happens-inside-the-child-task")
// defer endTask()
// // ...
// }(ctx)
//
// Within the task, you can open up a hierarchy of spans.
// In contrast to tasks, which have can multiple concurrently running child tasks,
// spans must nest and not cross the goroutine boundary.
//
// ctx, endSpan = WithSpan(ctx, "copy-dir")
// defer endSpan()
// for _, f := range dir.Files() {
// func() {
// ctx, endSpan := WithSpan(ctx, fmt.Sprintf("copy-file %q", f))
// defer endspan()
// b, _ := ioutil.ReadFile(f)
// _ = ioutil.WriteFile(f + ".copy", b, 0600)
// }()
// }
//
// In combination:
// ctx, endTask = WithTask(ctx, "copy-dirs")
// defer endTask()
// for i := range dirs {
// go func(dir string) {
// ctx, endTask := WithTask(ctx, "copy-dir")
// defer endTask()
// for _, f := range filesIn(dir) {
// func() {
// ctx, endSpan := WithSpan(ctx, fmt.Sprintf("copy-file %q", f))
// defer endspan()
// b, _ := ioutil.ReadFile(f)
// _ = ioutil.WriteFile(f + ".copy", b, 0600)
// }()
// }
// }()
// }
//
// Note that a span ends at the time you call endSpan - not before and not after that.
// If you violate the stack-like nesting of spans by forgetting an endSpan() invocation,
// the out-of-order endSpan() will panic.
//
// A similar rule applies to the endTask closure returned by WithTask:
// If a task has live child tasks at the time you call endTask(), the call will panic.
//
// Recovering from endSpan() or endTask() panics will corrupt the trace stack and lead to corrupt tracefile output.
//
//
// Best Practices For Naming Tasks And Spans
//
// Tasks should always have string constants as names, and must not contain the `#` character. WHy?
// First, the visualization by chrome://tracing draws a horizontal bar for each task in the trace.
// Also, the package appends `#NUM` for each concurrently running instance of a task name.
// Note that the `#NUM` suffix will be reused if a task has ended, in order to avoid an
// infinite number of horizontal bars in the visualization.
//
//
// Chrome-compatible Tracefile Support
//
// The activity trace generated by usage of WithTask and WithSpan can be rendered to a JSON output file
// that can be loaded into chrome://tracing .
// Apart from function GetSpanStackOrDefault, this is the main benefit of this package.
//
// First, there is a convenience environment variable 'ZREPL_ACTIVITY_TRACE' that can be set to an output path.
// From process start onward, a trace is written to that path.
//
// More consumers can attach to the activity trace through the ChrometraceClientWebsocketHandler websocket handler.
//
// If a write error is encountered with any consumer (including the env-var based one), the consumer is closed and
// will not receive further trace output.
package trace
import (
"context"
"fmt"
"strings"
"sync/atomic"
"time"
"github.com/prometheus/client_golang/prometheus"
)
var metrics struct {
activeTasks prometheus.Gauge
uniqueConcurrentTaskNameBitvecLength *prometheus.GaugeVec
}
func init() {
metrics.activeTasks = prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: "zrepl",
Subsystem: "trace",
Name: "active_tasks",
Help: "number of active (tracing-level) tasks in the daemon",
})
metrics.uniqueConcurrentTaskNameBitvecLength = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "zrepl",
Subsystem: "trace",
Name: "unique_concurrent_task_name_bitvec_length",
Help: "length of the bitvec used to find unique names for concurrent tasks",
}, []string{"task_name"})
}
func RegisterMetrics(r prometheus.Registerer) {
r.MustRegister(metrics.activeTasks)
r.MustRegister(metrics.uniqueConcurrentTaskNameBitvecLength)
}
var taskNamer = newUniqueTaskNamer(metrics.uniqueConcurrentTaskNameBitvecLength)
type traceNode struct {
id string
annotation string
parentTask *traceNode
activeChildTasks int32 // only for task nodes, insignificant for span nodes
parentSpan *traceNode
hasActiveChildSpan int32
startedAt time.Time
endedAt time.Time
ended int32
}
// Returned from WithTask or WithSpan.
// Must be called once the task or span ends.
// See package-level docs for nesting rules.
// Wrong call order / forgetting to call it will result in panics.
type DoneFunc func()
// Start a new root task or create a child task of an existing task.
//
// This is required when starting a new goroutine and
// passing an existing task context to it.
//
// taskName should be a constantand must not contain '#'
//
// The implementation ensures that,
// if multiple tasks with the same name exist simultaneously,
// a unique suffix is appended to uniquely identify the task opened with this function.
func WithTask(ctx context.Context, taskName string) (context.Context, DoneFunc) {
var parentTask *traceNode
nodeI := ctx.Value(contextKeyTraceNode)
if nodeI != nil {
node := nodeI.(*traceNode)
if node.parentSpan != nil {
parentTask = node.parentSpan // FIXME review this
} else {
parentTask = node
}
}
taskName, taskNameDone := taskNamer.UniqueConcurrentTaskName(taskName)
this := &traceNode{
id: genID(),
annotation: taskName,
parentTask: parentTask,
activeChildTasks: 0,
hasActiveChildSpan: 0,
parentSpan: nil,
startedAt: time.Now(),
ended: 0,
endedAt: time.Time{},
}
if this.parentTask != nil {
atomic.AddInt32(&this.parentTask.activeChildTasks, 1)
}
ctx = context.WithValue(ctx, contextKeyTraceNode, this)
chrometraceBeginTask(this)
metrics.activeTasks.Inc()
endTaskFunc := func() {
if nc := atomic.LoadInt32(&this.activeChildTasks); nc != 0 {
panic(fmt.Sprintf("this task must have 0 active child tasks, got %v", nc))
}
if !atomic.CompareAndSwapInt32(&this.ended, 0, 1) {
return
}
this.endedAt = time.Now()
if this.parentTask != nil {
if atomic.AddInt32(&this.parentTask.activeChildTasks, -1) < 0 {
panic("parent task with negative activeChildTasks count")
}
}
chrometraceEndTask(this)
metrics.activeTasks.Dec()
taskNameDone()
}
return ctx, endTaskFunc
}
// Start a new span.
// Important: ctx must have an active task (see WithTask)
func WithSpan(ctx context.Context, annotation string) (context.Context, DoneFunc) {
var parentSpan, parentTask *traceNode
nodeI := ctx.Value(contextKeyTraceNode)
if nodeI != nil {
parentSpan = nodeI.(*traceNode)
if parentSpan.parentSpan == nil {
parentTask = parentSpan
} else {
parentTask = parentSpan.parentTask
}
} else {
panic("must be called from within a task")
}
this := &traceNode{
id: genID(),
annotation: annotation,
parentTask: parentTask,
parentSpan: parentSpan,
hasActiveChildSpan: 0,
startedAt: time.Now(),
ended: 0,
endedAt: time.Time{},
}
if !atomic.CompareAndSwapInt32(&parentSpan.hasActiveChildSpan, 0, 1) {
panic("already has active child span")
}
ctx = context.WithValue(ctx, contextKeyTraceNode, this)
chrometraceBeginSpan(this)
endTaskFunc := func() {
if !atomic.CompareAndSwapInt32(&parentSpan.hasActiveChildSpan, 1, 0) {
panic("impl error: hasActiveChildSpan should not change to 0 while we hold it")
}
this.endedAt = time.Now()
chrometraceEndSpan(this)
}
return ctx, endTaskFunc
}
func currentTaskNameAndSpanStack(this *traceNode) (taskName string, spanIdStack string) {
task := this.parentTask
if this.parentSpan == nil {
task = this
}
var spansInTask []*traceNode
for s := this; s != nil; s = s.parentSpan {
spansInTask = append(spansInTask, s)
}
var tasks []*traceNode
for t := task; t != nil; t = t.parentTask {
tasks = append(tasks, t)
}
var taskIdsRev []string
for i := len(tasks) - 1; i >= 0; i-- {
taskIdsRev = append(taskIdsRev, tasks[i].id)
}
var spanIdsRev []string
for i := len(spansInTask) - 1; i >= 0; i-- {
spanIdsRev = append(spanIdsRev, spansInTask[i].id)
}
taskStack := strings.Join(taskIdsRev, "$")
spanIdStack = fmt.Sprintf("%s$%s", taskStack, strings.Join(spanIdsRev, "."))
return task.annotation, spanIdStack
}
func GetSpanStackOrDefault(ctx context.Context, def string) string {
if nI := ctx.Value(contextKeyTraceNode); nI != nil {
n := nI.(*traceNode)
_, spanStack := currentTaskNameAndSpanStack(n)
return spanStack
} else {
return def
}
}