mirror of
https://github.com/zrepl/zrepl.git
synced 2025-01-24 23:30:06 +01:00
312 lines
9.3 KiB
Go
312 lines
9.3 KiB
Go
// package trace provides activity tracing via ctx through Tasks and Spans
|
|
//
|
|
// Basic Concepts
|
|
//
|
|
// Tracing can be used to identify where a piece of code spends its time.
|
|
//
|
|
// The Go standard library provides package runtime/trace which is useful to identify CPU bottlenecks or
|
|
// to understand what happens inside the Go runtime.
|
|
// However, it is not ideal for application level tracing, in particular if those traces should be understandable
|
|
// to tech-savvy users (albeit not developers).
|
|
//
|
|
// This package provides the concept of Tasks and Spans to express what activity is happening within an application:
|
|
//
|
|
// - Neither task nor span is really tangible but instead contained within the context.Context tree
|
|
// - Tasks represent concurrent activity (i.e. goroutines).
|
|
// - Spans represent a semantic stack trace within a task.
|
|
//
|
|
// As a consequence, whenever a context is propagated across goroutine boundary, you need to create a child task:
|
|
//
|
|
// go func(ctx context.Context) {
|
|
// ctx, endTask = WithTask(ctx, "what-happens-inside-the-child-task")
|
|
// defer endTask()
|
|
// // ...
|
|
// }(ctx)
|
|
//
|
|
// Within the task, you can open up a hierarchy of spans.
|
|
// In contrast to tasks, which have can multiple concurrently running child tasks,
|
|
// spans must nest and not cross the goroutine boundary.
|
|
//
|
|
// ctx, endSpan = WithSpan(ctx, "copy-dir")
|
|
// defer endSpan()
|
|
// for _, f := range dir.Files() {
|
|
// func() {
|
|
// ctx, endSpan := WithSpan(ctx, fmt.Sprintf("copy-file %q", f))
|
|
// defer endspan()
|
|
// b, _ := ioutil.ReadFile(f)
|
|
// _ = ioutil.WriteFile(f + ".copy", b, 0600)
|
|
// }()
|
|
// }
|
|
//
|
|
// In combination:
|
|
// ctx, endTask = WithTask(ctx, "copy-dirs")
|
|
// defer endTask()
|
|
// for i := range dirs {
|
|
// go func(dir string) {
|
|
// ctx, endTask := WithTask(ctx, "copy-dir")
|
|
// defer endTask()
|
|
// for _, f := range filesIn(dir) {
|
|
// func() {
|
|
// ctx, endSpan := WithSpan(ctx, fmt.Sprintf("copy-file %q", f))
|
|
// defer endspan()
|
|
// b, _ := ioutil.ReadFile(f)
|
|
// _ = ioutil.WriteFile(f + ".copy", b, 0600)
|
|
// }()
|
|
// }
|
|
// }()
|
|
// }
|
|
//
|
|
// Note that a span ends at the time you call endSpan - not before and not after that.
|
|
// If you violate the stack-like nesting of spans by forgetting an endSpan() invocation,
|
|
// the out-of-order endSpan() will panic.
|
|
//
|
|
// A similar rule applies to the endTask closure returned by WithTask:
|
|
// If a task has live child tasks at the time you call endTask(), the call will panic.
|
|
//
|
|
// Recovering from endSpan() or endTask() panics will corrupt the trace stack and lead to corrupt tracefile output.
|
|
//
|
|
//
|
|
// Best Practices For Naming Tasks And Spans
|
|
//
|
|
// Tasks should always have string constants as names, and must not contain the `#` character. WHy?
|
|
// First, the visualization by chrome://tracing draws a horizontal bar for each task in the trace.
|
|
// Also, the package appends `#NUM` for each concurrently running instance of a task name.
|
|
// Note that the `#NUM` suffix will be reused if a task has ended, in order to avoid an
|
|
// infinite number of horizontal bars in the visualization.
|
|
//
|
|
//
|
|
// Chrome-compatible Tracefile Support
|
|
//
|
|
// The activity trace generated by usage of WithTask and WithSpan can be rendered to a JSON output file
|
|
// that can be loaded into chrome://tracing .
|
|
// Apart from function GetSpanStackOrDefault, this is the main benefit of this package.
|
|
//
|
|
// First, there is a convenience environment variable 'ZREPL_ACTIVITY_TRACE' that can be set to an output path.
|
|
// From process start onward, a trace is written to that path.
|
|
//
|
|
// More consumers can attach to the activity trace through the ChrometraceClientWebsocketHandler websocket handler.
|
|
//
|
|
// If a write error is encountered with any consumer (including the env-var based one), the consumer is closed and
|
|
// will not receive further trace output.
|
|
package trace
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"strings"
|
|
"sync/atomic"
|
|
"time"
|
|
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
)
|
|
|
|
var metrics struct {
|
|
activeTasks prometheus.Gauge
|
|
uniqueConcurrentTaskNameBitvecLength *prometheus.GaugeVec
|
|
}
|
|
|
|
func init() {
|
|
metrics.activeTasks = prometheus.NewGauge(prometheus.GaugeOpts{
|
|
Namespace: "zrepl",
|
|
Subsystem: "trace",
|
|
Name: "active_tasks",
|
|
Help: "number of active (tracing-level) tasks in the daemon",
|
|
})
|
|
metrics.uniqueConcurrentTaskNameBitvecLength = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
|
Namespace: "zrepl",
|
|
Subsystem: "trace",
|
|
Name: "unique_concurrent_task_name_bitvec_length",
|
|
Help: "length of the bitvec used to find unique names for concurrent tasks",
|
|
}, []string{"task_name"})
|
|
}
|
|
|
|
func RegisterMetrics(r prometheus.Registerer) {
|
|
r.MustRegister(metrics.activeTasks)
|
|
r.MustRegister(metrics.uniqueConcurrentTaskNameBitvecLength)
|
|
}
|
|
|
|
var taskNamer = newUniqueTaskNamer(metrics.uniqueConcurrentTaskNameBitvecLength)
|
|
|
|
type traceNode struct {
|
|
id string
|
|
annotation string
|
|
parentTask *traceNode
|
|
activeChildTasks int32 // only for task nodes, insignificant for span nodes
|
|
parentSpan *traceNode
|
|
hasActiveChildSpan int32
|
|
|
|
startedAt time.Time
|
|
endedAt time.Time
|
|
ended int32
|
|
}
|
|
|
|
// Returned from WithTask or WithSpan.
|
|
// Must be called once the task or span ends.
|
|
// See package-level docs for nesting rules.
|
|
// Wrong call order / forgetting to call it will result in panics.
|
|
type DoneFunc func()
|
|
|
|
// Start a new root task or create a child task of an existing task.
|
|
//
|
|
// This is required when starting a new goroutine and
|
|
// passing an existing task context to it.
|
|
//
|
|
// taskName should be a constantand must not contain '#'
|
|
//
|
|
// The implementation ensures that,
|
|
// if multiple tasks with the same name exist simultaneously,
|
|
// a unique suffix is appended to uniquely identify the task opened with this function.
|
|
func WithTask(ctx context.Context, taskName string) (context.Context, DoneFunc) {
|
|
|
|
var parentTask *traceNode
|
|
nodeI := ctx.Value(contextKeyTraceNode)
|
|
if nodeI != nil {
|
|
node := nodeI.(*traceNode)
|
|
if node.parentSpan != nil {
|
|
parentTask = node.parentSpan // FIXME review this
|
|
} else {
|
|
parentTask = node
|
|
}
|
|
}
|
|
|
|
taskName, taskNameDone := taskNamer.UniqueConcurrentTaskName(taskName)
|
|
|
|
this := &traceNode{
|
|
id: genID(),
|
|
annotation: taskName,
|
|
parentTask: parentTask,
|
|
activeChildTasks: 0,
|
|
hasActiveChildSpan: 0,
|
|
parentSpan: nil,
|
|
|
|
startedAt: time.Now(),
|
|
ended: 0,
|
|
endedAt: time.Time{},
|
|
}
|
|
|
|
if this.parentTask != nil {
|
|
atomic.AddInt32(&this.parentTask.activeChildTasks, 1)
|
|
}
|
|
|
|
ctx = context.WithValue(ctx, contextKeyTraceNode, this)
|
|
|
|
chrometraceBeginTask(this)
|
|
|
|
metrics.activeTasks.Inc()
|
|
|
|
endTaskFunc := func() {
|
|
if nc := atomic.LoadInt32(&this.activeChildTasks); nc != 0 {
|
|
panic(fmt.Sprintf("this task must have 0 active child tasks, got %v", nc))
|
|
}
|
|
|
|
if !atomic.CompareAndSwapInt32(&this.ended, 0, 1) {
|
|
return
|
|
}
|
|
this.endedAt = time.Now()
|
|
|
|
if this.parentTask != nil {
|
|
if atomic.AddInt32(&this.parentTask.activeChildTasks, -1) < 0 {
|
|
panic("parent task with negative activeChildTasks count")
|
|
}
|
|
}
|
|
|
|
chrometraceEndTask(this)
|
|
|
|
metrics.activeTasks.Dec()
|
|
|
|
taskNameDone()
|
|
}
|
|
|
|
return ctx, endTaskFunc
|
|
}
|
|
|
|
// Start a new span.
|
|
// Important: ctx must have an active task (see WithTask)
|
|
func WithSpan(ctx context.Context, annotation string) (context.Context, DoneFunc) {
|
|
var parentSpan, parentTask *traceNode
|
|
nodeI := ctx.Value(contextKeyTraceNode)
|
|
if nodeI != nil {
|
|
parentSpan = nodeI.(*traceNode)
|
|
if parentSpan.parentSpan == nil {
|
|
parentTask = parentSpan
|
|
} else {
|
|
parentTask = parentSpan.parentTask
|
|
}
|
|
} else {
|
|
panic("must be called from within a task")
|
|
}
|
|
|
|
this := &traceNode{
|
|
id: genID(),
|
|
annotation: annotation,
|
|
parentTask: parentTask,
|
|
parentSpan: parentSpan,
|
|
hasActiveChildSpan: 0,
|
|
|
|
startedAt: time.Now(),
|
|
ended: 0,
|
|
endedAt: time.Time{},
|
|
}
|
|
|
|
if !atomic.CompareAndSwapInt32(&parentSpan.hasActiveChildSpan, 0, 1) {
|
|
panic("already has active child span")
|
|
}
|
|
|
|
ctx = context.WithValue(ctx, contextKeyTraceNode, this)
|
|
chrometraceBeginSpan(this)
|
|
|
|
endTaskFunc := func() {
|
|
if !atomic.CompareAndSwapInt32(&parentSpan.hasActiveChildSpan, 1, 0) {
|
|
panic("impl error: hasActiveChildSpan should not change to 0 while we hold it")
|
|
}
|
|
this.endedAt = time.Now()
|
|
|
|
chrometraceEndSpan(this)
|
|
}
|
|
|
|
return ctx, endTaskFunc
|
|
}
|
|
|
|
func currentTaskNameAndSpanStack(this *traceNode) (taskName string, spanIdStack string) {
|
|
|
|
task := this.parentTask
|
|
if this.parentSpan == nil {
|
|
task = this
|
|
}
|
|
|
|
var spansInTask []*traceNode
|
|
for s := this; s != nil; s = s.parentSpan {
|
|
spansInTask = append(spansInTask, s)
|
|
}
|
|
|
|
var tasks []*traceNode
|
|
for t := task; t != nil; t = t.parentTask {
|
|
tasks = append(tasks, t)
|
|
}
|
|
|
|
var taskIdsRev []string
|
|
for i := len(tasks) - 1; i >= 0; i-- {
|
|
taskIdsRev = append(taskIdsRev, tasks[i].id)
|
|
}
|
|
|
|
var spanIdsRev []string
|
|
for i := len(spansInTask) - 1; i >= 0; i-- {
|
|
spanIdsRev = append(spanIdsRev, spansInTask[i].id)
|
|
}
|
|
|
|
taskStack := strings.Join(taskIdsRev, "$")
|
|
spanIdStack = fmt.Sprintf("%s$%s", taskStack, strings.Join(spanIdsRev, "."))
|
|
|
|
return task.annotation, spanIdStack
|
|
}
|
|
|
|
func GetSpanStackOrDefault(ctx context.Context, def string) string {
|
|
if nI := ctx.Value(contextKeyTraceNode); nI != nil {
|
|
n := nI.(*traceNode)
|
|
_, spanStack := currentTaskNameAndSpanStack(n)
|
|
return spanStack
|
|
} else {
|
|
return def
|
|
}
|
|
}
|