// package trace provides activity tracing via ctx through Tasks and Spans // // Basic Concepts // // Tracing can be used to identify where a piece of code spends its time. // // The Go standard library provides package runtime/trace which is useful to identify CPU bottlenecks or // to understand what happens inside the Go runtime. // However, it is not ideal for application level tracing, in particular if those traces should be understandable // to tech-savvy users (albeit not developers). // // This package provides the concept of Tasks and Spans to express what activity is happening within an application: // // - Neither task nor span is really tangible but instead contained within the context.Context tree // - Tasks represent concurrent activity (i.e. goroutines). // - Spans represent a semantic stack trace within a task. // // As a consequence, whenever a context is propagated across goroutine boundary, you need to create a child task: // // go func(ctx context.Context) { // ctx, endTask = WithTask(ctx, "what-happens-inside-the-child-task") // defer endTask() // // ... // }(ctx) // // Within the task, you can open up a hierarchy of spans. // In contrast to tasks, which have can multiple concurrently running child tasks, // spans must nest and not cross the goroutine boundary. // // ctx, endSpan = WithSpan(ctx, "copy-dir") // defer endSpan() // for _, f := range dir.Files() { // func() { // ctx, endSpan := WithSpan(ctx, fmt.Sprintf("copy-file %q", f)) // defer endspan() // b, _ := ioutil.ReadFile(f) // _ = ioutil.WriteFile(f + ".copy", b, 0600) // }() // } // // In combination: // ctx, endTask = WithTask(ctx, "copy-dirs") // defer endTask() // for i := range dirs { // go func(dir string) { // ctx, endTask := WithTask(ctx, "copy-dir") // defer endTask() // for _, f := range filesIn(dir) { // func() { // ctx, endSpan := WithSpan(ctx, fmt.Sprintf("copy-file %q", f)) // defer endspan() // b, _ := ioutil.ReadFile(f) // _ = ioutil.WriteFile(f + ".copy", b, 0600) // }() // } // }() // } // // Note that a span ends at the time you call endSpan - not before and not after that. // If you violate the stack-like nesting of spans by forgetting an endSpan() invocation, // the out-of-order endSpan() will panic. // // A similar rule applies to the endTask closure returned by WithTask: // If a task has live child tasks at the time you call endTask(), the call will panic. // // Recovering from endSpan() or endTask() panics will corrupt the trace stack and lead to corrupt tracefile output. // // // Best Practices For Naming Tasks And Spans // // Tasks should always have string constants as names, and must not contain the `#` character. WHy? // First, the visualization by chrome://tracing draws a horizontal bar for each task in the trace. // Also, the package appends `#NUM` for each concurrently running instance of a task name. // Note that the `#NUM` suffix will be reused if a task has ended, in order to avoid an // infinite number of horizontal bars in the visualization. // // // Chrome-compatible Tracefile Support // // The activity trace generated by usage of WithTask and WithSpan can be rendered to a JSON output file // that can be loaded into chrome://tracing . // Apart from function GetSpanStackOrDefault, this is the main benefit of this package. // // First, there is a convenience environment variable 'ZREPL_ACTIVITY_TRACE' that can be set to an output path. // From process start onward, a trace is written to that path. // // More consumers can attach to the activity trace through the ChrometraceClientWebsocketHandler websocket handler. // // If a write error is encountered with any consumer (including the env-var based one), the consumer is closed and // will not receive further trace output. package trace import ( "context" "fmt" "strings" "sync/atomic" "time" "github.com/prometheus/client_golang/prometheus" ) var metrics struct { activeTasks prometheus.Gauge uniqueConcurrentTaskNameBitvecLength *prometheus.GaugeVec } func init() { metrics.activeTasks = prometheus.NewGauge(prometheus.GaugeOpts{ Namespace: "zrepl", Subsystem: "trace", Name: "active_tasks", Help: "number of active (tracing-level) tasks in the daemon", }) metrics.uniqueConcurrentTaskNameBitvecLength = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "zrepl", Subsystem: "trace", Name: "unique_concurrent_task_name_bitvec_length", Help: "length of the bitvec used to find unique names for concurrent tasks", }, []string{"task_name"}) } func RegisterMetrics(r prometheus.Registerer) { r.MustRegister(metrics.activeTasks) r.MustRegister(metrics.uniqueConcurrentTaskNameBitvecLength) } var taskNamer = newUniqueTaskNamer(metrics.uniqueConcurrentTaskNameBitvecLength) type traceNode struct { id string annotation string parentTask *traceNode activeChildTasks int32 // only for task nodes, insignificant for span nodes parentSpan *traceNode hasActiveChildSpan int32 startedAt time.Time endedAt time.Time ended int32 } // Returned from WithTask or WithSpan. // Must be called once the task or span ends. // See package-level docs for nesting rules. // Wrong call order / forgetting to call it will result in panics. type DoneFunc func() // Start a new root task or create a child task of an existing task. // // This is required when starting a new goroutine and // passing an existing task context to it. // // taskName should be a constantand must not contain '#' // // The implementation ensures that, // if multiple tasks with the same name exist simultaneously, // a unique suffix is appended to uniquely identify the task opened with this function. func WithTask(ctx context.Context, taskName string) (context.Context, DoneFunc) { var parentTask *traceNode nodeI := ctx.Value(contextKeyTraceNode) if nodeI != nil { node := nodeI.(*traceNode) if node.parentSpan != nil { parentTask = node.parentSpan // FIXME review this } else { parentTask = node } } taskName, taskNameDone := taskNamer.UniqueConcurrentTaskName(taskName) this := &traceNode{ id: genID(), annotation: taskName, parentTask: parentTask, activeChildTasks: 0, hasActiveChildSpan: 0, parentSpan: nil, startedAt: time.Now(), ended: 0, endedAt: time.Time{}, } if this.parentTask != nil { atomic.AddInt32(&this.parentTask.activeChildTasks, 1) } ctx = context.WithValue(ctx, contextKeyTraceNode, this) chrometraceBeginTask(this) metrics.activeTasks.Inc() endTaskFunc := func() { if nc := atomic.LoadInt32(&this.activeChildTasks); nc != 0 { panic(fmt.Sprintf("this task must have 0 active child tasks, got %v", nc)) } if !atomic.CompareAndSwapInt32(&this.ended, 0, 1) { return } this.endedAt = time.Now() if this.parentTask != nil { if atomic.AddInt32(&this.parentTask.activeChildTasks, -1) < 0 { panic("parent task with negative activeChildTasks count") } } chrometraceEndTask(this) metrics.activeTasks.Dec() taskNameDone() } return ctx, endTaskFunc } // Start a new span. // Important: ctx must have an active task (see WithTask) func WithSpan(ctx context.Context, annotation string) (context.Context, DoneFunc) { var parentSpan, parentTask *traceNode nodeI := ctx.Value(contextKeyTraceNode) if nodeI != nil { parentSpan = nodeI.(*traceNode) if parentSpan.parentSpan == nil { parentTask = parentSpan } else { parentTask = parentSpan.parentTask } } else { panic("must be called from within a task") } this := &traceNode{ id: genID(), annotation: annotation, parentTask: parentTask, parentSpan: parentSpan, hasActiveChildSpan: 0, startedAt: time.Now(), ended: 0, endedAt: time.Time{}, } if !atomic.CompareAndSwapInt32(&parentSpan.hasActiveChildSpan, 0, 1) { panic("already has active child span") } ctx = context.WithValue(ctx, contextKeyTraceNode, this) chrometraceBeginSpan(this) endTaskFunc := func() { if !atomic.CompareAndSwapInt32(&parentSpan.hasActiveChildSpan, 1, 0) { panic("impl error: hasActiveChildSpan should not change to 0 while we hold it") } this.endedAt = time.Now() chrometraceEndSpan(this) } return ctx, endTaskFunc } func currentTaskNameAndSpanStack(this *traceNode) (taskName string, spanIdStack string) { task := this.parentTask if this.parentSpan == nil { task = this } var spansInTask []*traceNode for s := this; s != nil; s = s.parentSpan { spansInTask = append(spansInTask, s) } var tasks []*traceNode for t := task; t != nil; t = t.parentTask { tasks = append(tasks, t) } var taskIdsRev []string for i := len(tasks) - 1; i >= 0; i-- { taskIdsRev = append(taskIdsRev, tasks[i].id) } var spanIdsRev []string for i := len(spansInTask) - 1; i >= 0; i-- { spanIdsRev = append(spanIdsRev, spansInTask[i].id) } taskStack := strings.Join(taskIdsRev, "$") spanIdStack = fmt.Sprintf("%s$%s", taskStack, strings.Join(spanIdsRev, ".")) return task.annotation, spanIdStack } func GetSpanStackOrDefault(ctx context.Context, def string) string { if nI := ctx.Value(contextKeyTraceNode); nI != nil { n := nI.(*traceNode) _, spanStack := currentTaskNameAndSpanStack(n) return spanStack } else { return def } }