zrepl/client/status.go
Christian Schwarz 1ae087bfcf [WIP] add and use tracing API as part of package logging
- make `logging.GetLogger(ctx, Subsys)` the authoritative `logger.Logger` factory function
    - the context carries a linked list of injected fields which
      `logging.GetLogger` adds to the logger it returns
- introduce the concept of tasks and spans, also tracked as linked list within ctx
    - [ ] TODO automatic logging of span begins and ends, with a unique
      ID stack that makes it easy to follow a series of log entries in
      concurrent code
    - ability to produce a chrome://tracing-compatible trace file,
      either via an env variable or a `zrepl pprof` subcommand
        - this is not a CPU profile, we already have go pprof for that
        - but it is very useful to visually inspect where the
          replication / snapshotter / pruner spends its time
          ( fixes #307 )
2020-04-25 11:16:59 +02:00

812 lines
19 KiB
Go

package client
import (
"context"
"fmt"
"io"
"math"
"net/http"
"os"
"sort"
"strings"
"sync"
"time"
// tcell is the termbox-compatible library for abstracting away escape sequences, etc.
// as of tcell#252, the number of default distributed terminals is relatively limited
// additional terminal definitions can be included via side-effect import
// See https://github.com/gdamore/tcell/blob/master/terminfo/base/base.go
// See https://github.com/gdamore/tcell/issues/252#issuecomment-533836078
"github.com/gdamore/tcell/termbox"
_ "github.com/gdamore/tcell/terminfo/s/screen" // tmux on FreeBSD 11 & 12 without ncurses
"github.com/pkg/errors"
"github.com/spf13/pflag"
"github.com/zrepl/yaml-config"
"github.com/zrepl/zrepl/cli"
"github.com/zrepl/zrepl/daemon"
"github.com/zrepl/zrepl/daemon/job"
"github.com/zrepl/zrepl/daemon/pruner"
"github.com/zrepl/zrepl/daemon/snapper"
"github.com/zrepl/zrepl/replication/report"
)
type byteProgressMeasurement struct {
time time.Time
val int64
}
type bytesProgressHistory struct {
last *byteProgressMeasurement // pointer as poor man's optional
changeCount int
lastChange time.Time
bpsAvg float64
}
func (p *bytesProgressHistory) Update(currentVal int64) (bytesPerSecondAvg int64, changeCount int) {
if p.last == nil {
p.last = &byteProgressMeasurement{
time: time.Now(),
val: currentVal,
}
return 0, 0
}
if p.last.val != currentVal {
p.changeCount++
p.lastChange = time.Now()
}
if time.Since(p.lastChange) > 3*time.Second {
p.last = nil
return 0, 0
}
deltaV := currentVal - p.last.val
deltaT := time.Since(p.last.time)
rate := float64(deltaV) / deltaT.Seconds()
factor := 0.3
p.bpsAvg = (1-factor)*p.bpsAvg + factor*rate
p.last.time = time.Now()
p.last.val = currentVal
return int64(p.bpsAvg), p.changeCount
}
type tui struct {
x, y int
indent int
lock sync.Mutex //For report and error
report map[string]*job.Status
err error
jobFilter string
replicationProgress map[string]*bytesProgressHistory // by job name
}
func newTui() tui {
return tui{
replicationProgress: make(map[string]*bytesProgressHistory),
}
}
const INDENT_MULTIPLIER = 4
func (t *tui) moveLine(dl int, col int) {
t.y += dl
t.x = t.indent*INDENT_MULTIPLIER + col
}
func (t *tui) write(text string) {
for _, c := range text {
if c == '\n' {
t.newline()
continue
}
termbox.SetCell(t.x, t.y, c, termbox.ColorDefault, termbox.ColorDefault)
t.x += 1
}
}
func (t *tui) printf(text string, a ...interface{}) {
t.write(fmt.Sprintf(text, a...))
}
func wrap(s string, width int) string {
var b strings.Builder
for len(s) > 0 {
rem := width
if rem > len(s) {
rem = len(s)
}
if idx := strings.IndexAny(s, "\n\r"); idx != -1 && idx < rem {
rem = idx + 1
}
untilNewline := strings.TrimRight(s[:rem], "\n\r")
s = s[rem:]
if len(untilNewline) == 0 {
continue
}
b.WriteString(untilNewline)
b.WriteString("\n")
}
return strings.TrimRight(b.String(), "\n\r")
}
func (t *tui) printfDrawIndentedAndWrappedIfMultiline(format string, a ...interface{}) {
whole := fmt.Sprintf(format, a...)
width, _ := termbox.Size()
if !strings.ContainsAny(whole, "\n\r") && t.x+len(whole) <= width {
t.printf(format, a...)
} else {
t.addIndent(1)
t.newline()
t.write(wrap(whole, width-INDENT_MULTIPLIER*t.indent))
t.addIndent(-1)
}
}
func (t *tui) newline() {
t.moveLine(1, 0)
}
func (t *tui) setIndent(indent int) {
t.indent = indent
t.moveLine(0, 0)
}
func (t *tui) addIndent(indent int) {
t.indent += indent
t.moveLine(0, 0)
}
var statusFlags struct {
Raw bool
Job string
}
var StatusCmd = &cli.Subcommand{
Use: "status",
Short: "show job activity or dump as JSON for monitoring",
SetupFlags: func(f *pflag.FlagSet) {
f.BoolVar(&statusFlags.Raw, "raw", false, "dump raw status description from zrepl daemon")
f.StringVar(&statusFlags.Job, "job", "", "only dump specified job")
},
Run: runStatus,
}
func runStatus(ctx context.Context, s *cli.Subcommand, args []string) error {
httpc, err := controlHttpClient(s.Config().Global.Control.SockPath)
if err != nil {
return err
}
if statusFlags.Raw {
resp, err := httpc.Get("http://unix" + daemon.ControlJobEndpointStatus)
if err != nil {
return err
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
fmt.Fprintf(os.Stderr, "Received error response:\n")
_, err := io.CopyN(os.Stderr, resp.Body, 4096)
if err != nil {
return err
}
return errors.Errorf("exit")
}
if _, err := io.Copy(os.Stdout, resp.Body); err != nil {
return err
}
return nil
}
t := newTui()
t.lock.Lock()
t.err = errors.New("Got no report yet")
t.lock.Unlock()
t.jobFilter = statusFlags.Job
err = termbox.Init()
if err != nil {
return err
}
defer termbox.Close()
update := func() {
var m daemon.Status
err2 := jsonRequestResponse(httpc, daemon.ControlJobEndpointStatus,
struct{}{},
&m,
)
t.lock.Lock()
t.err = err2
t.report = m.Jobs
t.lock.Unlock()
t.draw()
}
update()
ticker := time.NewTicker(500 * time.Millisecond)
defer ticker.Stop()
go func() {
for range ticker.C {
update()
}
}()
termbox.HideCursor()
termbox.Clear(termbox.ColorDefault, termbox.ColorDefault)
loop:
for {
switch ev := termbox.PollEvent(); ev.Type {
case termbox.EventKey:
switch ev.Key {
case termbox.KeyEsc:
break loop
case termbox.KeyCtrlC:
break loop
}
case termbox.EventResize:
t.draw()
}
}
return nil
}
func (t *tui) getReplicationProgressHistory(jobName string) *bytesProgressHistory {
p, ok := t.replicationProgress[jobName]
if !ok {
p = &bytesProgressHistory{}
t.replicationProgress[jobName] = p
}
return p
}
func (t *tui) draw() {
t.lock.Lock()
defer t.lock.Unlock()
termbox.Clear(termbox.ColorDefault, termbox.ColorDefault)
t.x = 0
t.y = 0
t.indent = 0
if t.err != nil {
t.write(t.err.Error())
} else {
//Iterate over map in alphabetical order
keys := make([]string, 0, len(t.report))
for k := range t.report {
if len(k) == 0 || daemon.IsInternalJobName(k) { //Internal job
continue
}
if t.jobFilter != "" && k != t.jobFilter {
continue
}
keys = append(keys, k)
}
sort.Strings(keys)
if len(keys) == 0 {
t.setIndent(0)
t.printf("no jobs to display")
t.newline()
termbox.Flush()
return
}
for _, k := range keys {
v := t.report[k]
t.setIndent(0)
t.printf("Job: %s", k)
t.setIndent(1)
t.newline()
t.printf("Type: %s", v.Type)
t.setIndent(1)
t.newline()
if v.Type == job.TypePush || v.Type == job.TypePull {
activeStatus, ok := v.JobSpecific.(*job.ActiveSideStatus)
if !ok || activeStatus == nil {
t.printf("ActiveSideStatus is null")
t.newline()
continue
}
t.printf("Replication:")
t.newline()
t.addIndent(1)
t.renderReplicationReport(activeStatus.Replication, t.getReplicationProgressHistory(k))
t.addIndent(-1)
t.printf("Pruning Sender:")
t.newline()
t.addIndent(1)
t.renderPrunerReport(activeStatus.PruningSender)
t.addIndent(-1)
t.printf("Pruning Receiver:")
t.newline()
t.addIndent(1)
t.renderPrunerReport(activeStatus.PruningReceiver)
t.addIndent(-1)
if v.Type == job.TypePush {
t.printf("Snapshotting:")
t.newline()
t.addIndent(1)
t.renderSnapperReport(activeStatus.Snapshotting)
t.addIndent(-1)
}
} else if v.Type == job.TypeSnap {
snapStatus, ok := v.JobSpecific.(*job.SnapJobStatus)
if !ok || snapStatus == nil {
t.printf("SnapJobStatus is null")
t.newline()
continue
}
t.printf("Pruning snapshots:")
t.newline()
t.addIndent(1)
t.renderPrunerReport(snapStatus.Pruning)
t.addIndent(-1)
t.printf("Snapshotting:")
t.newline()
t.addIndent(1)
t.renderSnapperReport(snapStatus.Snapshotting)
t.addIndent(-1)
} else if v.Type == job.TypeSource {
st := v.JobSpecific.(*job.PassiveStatus)
t.printf("Snapshotting:\n")
t.addIndent(1)
t.renderSnapperReport(st.Snapper)
t.addIndent(-1)
} else {
t.printf("No status representation for job type '%s', dumping as YAML", v.Type)
t.newline()
asYaml, err := yaml.Marshal(v.JobSpecific)
if err != nil {
t.printf("Error marshaling status to YAML: %s", err)
t.newline()
continue
}
t.write(string(asYaml))
t.newline()
continue
}
}
}
termbox.Flush()
}
func (t *tui) renderReplicationReport(rep *report.Report, history *bytesProgressHistory) {
if rep == nil {
t.printf("...\n")
return
}
if rep.WaitReconnectError != nil {
t.printfDrawIndentedAndWrappedIfMultiline("Connectivity: %s", rep.WaitReconnectError)
t.newline()
}
if !rep.WaitReconnectSince.IsZero() {
delta := time.Until(rep.WaitReconnectUntil).Round(time.Second)
if rep.WaitReconnectUntil.IsZero() || delta > 0 {
var until string
if rep.WaitReconnectUntil.IsZero() {
until = "waiting indefinitely"
} else {
until = fmt.Sprintf("hard fail in %s @ %s", delta, rep.WaitReconnectUntil)
}
t.printfDrawIndentedAndWrappedIfMultiline("Connectivity: reconnecting with exponential backoff (since %s) (%s)",
rep.WaitReconnectSince, until)
} else {
t.printfDrawIndentedAndWrappedIfMultiline("Connectivity: reconnects reached hard-fail timeout @ %s", rep.WaitReconnectUntil)
}
t.newline()
}
// TODO visualize more than the latest attempt by folding all attempts into one
if len(rep.Attempts) == 0 {
t.printf("no attempts made yet")
return
} else {
t.printf("Attempt #%d", len(rep.Attempts))
if len(rep.Attempts) > 1 {
t.printf(". Previous attempts failed with the following statuses:")
t.newline()
t.addIndent(1)
for i, a := range rep.Attempts[:len(rep.Attempts)-1] {
t.printfDrawIndentedAndWrappedIfMultiline("#%d: %s (failed at %s) (ran %s)", i+1, a.State, a.FinishAt, a.FinishAt.Sub(a.StartAt))
t.newline()
}
t.addIndent(-1)
} else {
t.newline()
}
}
latest := rep.Attempts[len(rep.Attempts)-1]
sort.Slice(latest.Filesystems, func(i, j int) bool {
return latest.Filesystems[i].Info.Name < latest.Filesystems[j].Info.Name
})
t.printf("Status: %s", latest.State)
t.newline()
if latest.State == report.AttemptPlanningError {
t.printf("Problem: ")
t.printfDrawIndentedAndWrappedIfMultiline("%s", latest.PlanError)
t.newline()
} else if latest.State == report.AttemptFanOutError {
t.printf("Problem: one or more of the filesystems encountered errors")
t.newline()
}
if latest.State != report.AttemptPlanning && latest.State != report.AttemptPlanningError {
// Draw global progress bar
// Progress: [---------------]
expected, replicated, containsInvalidSizeEstimates := latest.BytesSum()
rate, changeCount := history.Update(replicated)
eta := time.Duration(0)
if rate > 0 {
eta = time.Duration((expected-replicated)/rate) * time.Second
}
t.write("Progress: ")
t.drawBar(50, replicated, expected, changeCount)
t.write(fmt.Sprintf(" %s / %s @ %s/s", ByteCountBinary(replicated), ByteCountBinary(expected), ByteCountBinary(rate)))
if eta != 0 {
t.write(fmt.Sprintf(" (%s remaining)", humanizeDuration(eta)))
}
t.newline()
if containsInvalidSizeEstimates {
t.write("NOTE: not all steps could be size-estimated, total estimate is likely imprecise!")
t.newline()
}
var maxFSLen int
for _, fs := range latest.Filesystems {
if len(fs.Info.Name) > maxFSLen {
maxFSLen = len(fs.Info.Name)
}
}
for _, fs := range latest.Filesystems {
t.printFilesystemStatus(fs, false, maxFSLen) // FIXME bring 'active' flag back
}
}
}
func (t *tui) renderPrunerReport(r *pruner.Report) {
if r == nil {
t.printf("...\n")
return
}
state, err := pruner.StateString(r.State)
if err != nil {
t.printf("Status: %q (parse error: %q)\n", r.State, err)
return
}
t.printf("Status: %s", state)
t.newline()
if r.Error != "" {
t.printf("Error: %s\n", r.Error)
}
type commonFS struct {
*pruner.FSReport
completed bool
}
all := make([]commonFS, 0, len(r.Pending)+len(r.Completed))
for i := range r.Pending {
all = append(all, commonFS{&r.Pending[i], false})
}
for i := range r.Completed {
all = append(all, commonFS{&r.Completed[i], true})
}
switch state {
case pruner.Plan:
fallthrough
case pruner.PlanErr:
return
}
if len(all) == 0 {
t.printf("nothing to do\n")
return
}
var totalDestroyCount, completedDestroyCount int
var maxFSname int
for _, fs := range all {
totalDestroyCount += len(fs.DestroyList)
if fs.completed {
completedDestroyCount += len(fs.DestroyList)
}
if maxFSname < len(fs.Filesystem) {
maxFSname = len(fs.Filesystem)
}
}
// global progress bar
progress := int(math.Round(80 * float64(completedDestroyCount) / float64(totalDestroyCount)))
t.write("Progress: ")
t.write("[")
t.write(times("=", progress))
t.write(">")
t.write(times("-", 80-progress))
t.write("]")
t.printf(" %d/%d snapshots", completedDestroyCount, totalDestroyCount)
t.newline()
sort.SliceStable(all, func(i, j int) bool {
return strings.Compare(all[i].Filesystem, all[j].Filesystem) == -1
})
// Draw a table-like representation of 'all'
for _, fs := range all {
t.write(rightPad(fs.Filesystem, maxFSname, " "))
t.write(" ")
if !fs.SkipReason.NotSkipped() {
t.printf("skipped: %s\n", fs.SkipReason)
continue
}
if fs.LastError != "" {
if strings.ContainsAny(fs.LastError, "\r\n") {
t.printf("ERROR:")
t.printfDrawIndentedAndWrappedIfMultiline("%s\n", fs.LastError)
} else {
t.printfDrawIndentedAndWrappedIfMultiline("ERROR: %s\n", fs.LastError)
}
t.newline()
continue
}
pruneRuleActionStr := fmt.Sprintf("(destroy %d of %d snapshots)",
len(fs.DestroyList), len(fs.SnapshotList))
if fs.completed {
t.printf("Completed %s\n", pruneRuleActionStr)
continue
}
t.write("Pending ") // whitespace is padding 10
if len(fs.DestroyList) == 1 {
t.write(fs.DestroyList[0].Name)
} else {
t.write(pruneRuleActionStr)
}
t.newline()
}
}
func (t *tui) renderSnapperReport(r *snapper.Report) {
if r == nil {
t.printf("<snapshot type does not have a report>\n")
return
}
t.printf("Status: %s", r.State)
t.newline()
if r.Error != "" {
t.printf("Error: %s\n", r.Error)
}
if !r.SleepUntil.IsZero() {
t.printf("Sleep until: %s\n", r.SleepUntil)
}
sort.Slice(r.Progress, func(i, j int) bool {
return strings.Compare(r.Progress[i].Path, r.Progress[j].Path) == -1
})
t.addIndent(1)
defer t.addIndent(-1)
dur := func(d time.Duration) string {
return d.Round(100 * time.Millisecond).String()
}
type row struct {
path, state, duration, remainder, hookReport string
}
var widths struct {
path, state, duration int
}
rows := make([]*row, len(r.Progress))
for i, fs := range r.Progress {
r := &row{
path: fs.Path,
state: fs.State.String(),
}
if fs.HooksHadError {
r.hookReport = fs.Hooks // FIXME render here, not in daemon
}
switch fs.State {
case snapper.SnapPending:
r.duration = "..."
r.remainder = ""
case snapper.SnapStarted:
r.duration = dur(time.Since(fs.StartAt))
r.remainder = fmt.Sprintf("snap name: %q", fs.SnapName)
case snapper.SnapDone:
fallthrough
case snapper.SnapError:
r.duration = dur(fs.DoneAt.Sub(fs.StartAt))
r.remainder = fmt.Sprintf("snap name: %q", fs.SnapName)
}
rows[i] = r
if len(r.path) > widths.path {
widths.path = len(r.path)
}
if len(r.state) > widths.state {
widths.state = len(r.state)
}
if len(r.duration) > widths.duration {
widths.duration = len(r.duration)
}
}
for _, r := range rows {
path := rightPad(r.path, widths.path, " ")
state := rightPad(r.state, widths.state, " ")
duration := rightPad(r.duration, widths.duration, " ")
t.printf("%s %s %s", path, state, duration)
t.printfDrawIndentedAndWrappedIfMultiline(" %s", r.remainder)
if r.hookReport != "" {
t.printfDrawIndentedAndWrappedIfMultiline("%s", r.hookReport)
}
t.newline()
}
}
func times(str string, n int) (out string) {
for i := 0; i < n; i++ {
out += str
}
return
}
func rightPad(str string, length int, pad string) string {
if len(str) > length {
return str[:length]
}
return str + strings.Repeat(pad, length-len(str))
}
var arrowPositions = `>\|/`
// changeCount = 0 indicates stall / no progress
func (t *tui) drawBar(length int, bytes, totalBytes int64, changeCount int) {
var completedLength int
if totalBytes > 0 {
completedLength = int(int64(length) * bytes / totalBytes)
if completedLength > length {
completedLength = length
}
} else if totalBytes == bytes {
completedLength = length
}
t.write("[")
t.write(times("=", completedLength))
t.write(string(arrowPositions[changeCount%len(arrowPositions)]))
t.write(times("-", length-completedLength))
t.write("]")
}
func (t *tui) printFilesystemStatus(rep *report.FilesystemReport, active bool, maxFS int) {
expected, replicated, containsInvalidSizeEstimates := rep.BytesSum()
sizeEstimationImpreciseNotice := ""
if containsInvalidSizeEstimates {
sizeEstimationImpreciseNotice = " (some steps lack size estimation)"
}
if rep.CurrentStep < len(rep.Steps) && rep.Steps[rep.CurrentStep].Info.BytesExpected == 0 {
sizeEstimationImpreciseNotice = " (step lacks size estimation)"
}
status := fmt.Sprintf("%s (step %d/%d, %s/%s)%s",
strings.ToUpper(string(rep.State)),
rep.CurrentStep, len(rep.Steps),
ByteCountBinary(replicated), ByteCountBinary(expected),
sizeEstimationImpreciseNotice,
)
activeIndicator := " "
if active {
activeIndicator = "*"
}
t.printf("%s %s %s ",
activeIndicator,
rightPad(rep.Info.Name, maxFS, " "),
status)
next := ""
if err := rep.Error(); err != nil {
next = err.Err
} else if rep.State != report.FilesystemDone {
if nextStep := rep.NextStep(); nextStep != nil {
if nextStep.IsIncremental() {
next = fmt.Sprintf("next: %s => %s", nextStep.Info.From, nextStep.Info.To)
} else {
next = fmt.Sprintf("next: full send %s", nextStep.Info.To)
}
attribs := []string{}
if nextStep.Info.Resumed {
attribs = append(attribs, "resumed")
}
attribs = append(attribs, fmt.Sprintf("encrypted=%s", nextStep.Info.Encrypted))
next += fmt.Sprintf(" (%s)", strings.Join(attribs, ", "))
} else {
next = "" // individual FSes may still be in planning state
}
}
t.printfDrawIndentedAndWrappedIfMultiline("%s", next)
t.newline()
}
func ByteCountBinary(b int64) string {
const unit = 1024
if b < unit {
return fmt.Sprintf("%d B", b)
}
div, exp := int64(unit), 0
for n := b / unit; n >= unit; n /= unit {
div *= unit
exp++
}
return fmt.Sprintf("%.1f %ciB", float64(b)/float64(div), "KMGTPE"[exp])
}
func humanizeDuration(duration time.Duration) string {
days := int64(duration.Hours() / 24)
hours := int64(math.Mod(duration.Hours(), 24))
minutes := int64(math.Mod(duration.Minutes(), 60))
seconds := int64(math.Mod(duration.Seconds(), 60))
var parts []string
force := false
chunks := []int64{days, hours, minutes, seconds}
for i, chunk := range chunks {
if force || chunk > 0 {
padding := 0
if force {
padding = 2
}
parts = append(parts, fmt.Sprintf("%*d%c", padding, chunk, "dhms"[i]))
force = true
}
}
return strings.Join(parts, " ")
}