remove most of the watchdog machinery

This commit is contained in:
InsanePrawn 2018-11-21 02:42:13 +01:00
parent 58dcc07430
commit 442d61918b

View File

@@ -6,16 +6,12 @@ import (
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/zrepl/zrepl/config"
 	"github.com/zrepl/zrepl/daemon/filters"
-	"github.com/zrepl/zrepl/daemon/job/reset"
 	"github.com/zrepl/zrepl/daemon/job/wakeup"
 	"github.com/zrepl/zrepl/daemon/logging"
 	"github.com/zrepl/zrepl/daemon/pruner"
 	"github.com/zrepl/zrepl/daemon/snapper"
 	"github.com/zrepl/zrepl/endpoint"
-	"github.com/zrepl/zrepl/util/envconst"
 	"github.com/zrepl/zrepl/zfs"
-	"sync"
-	"time"
 )

 type SnapJob struct {
@@ -27,19 +23,10 @@ type SnapJob struct {
 	promPruneSecs *prometheus.HistogramVec // labels: prune_side

-	tasksMtx sync.Mutex
-	tasks    snap_activeSideTasks
-}
-
-type snap_activeSideTasks struct {
-	state        ActiveSideState
 	pruner       *pruner.Pruner
-	prunerCancel context.CancelFunc
 }

 func (j *SnapJob) Name() string { return j.name }

 func (j *SnapJob) GetPruner(ctx context.Context, sender *endpoint.Sender) (*pruner.Pruner) {
@@ -48,20 +35,6 @@ func (j *SnapJob) GetPruner(ctx context.Context, sender *endpoint.Sender) (*prun
 }

-func (j *SnapJob) updateTasks(u func(*snap_activeSideTasks)) snap_activeSideTasks {
-	j.tasksMtx.Lock()
-	defer j.tasksMtx.Unlock()
-	var copy snap_activeSideTasks
-	copy = j.tasks
-	if u == nil {
-		return copy
-	}
-	u(&copy)
-	j.tasks = copy
-	return copy
-}
-
 func (j *SnapJob) Type() Type { return TypeSnap }

 func (j *SnapJob) RunPeriodic(ctx context.Context, wakeUpCommon chan <- struct{}) {
@@ -107,12 +80,12 @@ type SnapJobStatus struct {
 }

 func (j *SnapJob) Status() *Status {
-	tasks := j.updateTasks(nil)
+	//tasks := j.updateTasks(nil)
 	s := &SnapJobStatus{}
 	t := j.Type()
-	if tasks.pruner != nil {
-		s.Pruning = tasks.pruner.Report()
+	if j.pruner != nil {
+		s.Pruning = j.pruner.Report()
 	}
 	return &Status{Type: t, JobSpecific: s}
 }
@@ -142,98 +115,17 @@ outer:
 		}
 		invocationCount++
 		invLog := log.WithField("invocation", invocationCount)
-		j.do(WithLogger(ctx, invLog))
+		j.doPrune(WithLogger(ctx, invLog))
 	}
 }

-func (j *SnapJob) do(ctx context.Context) {
+func (j *SnapJob) doPrune(ctx context.Context) {
 	log := GetLogger(ctx)
 	ctx = logging.WithSubsystemLoggers(ctx, log)

-	// allow cancellation of an invocation (this function)
-	ctx, cancelThisRun := context.WithCancel(ctx)
-	defer cancelThisRun()
-	go func() {
-		select {
-		case <-reset.Wait(ctx):
-			log.Info("reset received, cancelling current invocation")
-			cancelThisRun()
-		case <-ctx.Done():
-		}
-	}()
-
-	// The code after this watchdog goroutine is sequential and transitions the state from
-	//   ActiveSideReplicating -> ActiveSidePruneSender -> ActiveSidePruneReceiver -> ActiveSideDone
-	// If any of those sequential tasks 'gets stuck' (livelock, no progress), the watchdog will eventually
-	// cancel its context.
-	// If the task is written to support context cancellation, it will return immediately (in permanent error state),
-	// and the sequential code above transitions to the next state.
-	go func() {
-		wdto := envconst.Duration("ZREPL_JOB_WATCHDOG_TIMEOUT", 10*time.Minute)
-		jitter := envconst.Duration("ZREPL_JOB_WATCHDOG_JITTER", 1*time.Second)
-		// shadowing!
-		log := log.WithField("watchdog_timeout", wdto.String())
-		log.Debug("starting watchdog")
-		defer log.Debug("watchdog stopped")
-		t := time.NewTicker(wdto)
-		defer t.Stop()
-		for {
-			select {
-			case <-ctx.Done():
-				return
-			case <-t.C: // fall
-			}
-			j.updateTasks(func(tasks *snap_activeSideTasks) {
-				// Since cancelling a task will cause the sequential code to transition to the next state immediately,
-				// we cannot check for its progress right then (no fallthrough).
-				// Instead, we return (not continue because we are in a closure) and give the new state another
-				// ZREPL_JOB_WATCHDOG_TIMEOUT interval to try make some progress.
-				log.WithField("state", tasks.state).Debug("watchdog firing")
-				const WATCHDOG_ENVCONST_NOTICE = " (adjust ZREPL_JOB_WATCHDOG_TIMEOUT env variable if inappropriate)"
-				switch tasks.state {
-				case ActiveSidePruneSender:
-					log.WithField("prune_sender_progress", "TEST DEBUG 123").
-						Debug("check pruner_sender progress")
-					if tasks.pruner.Progress.CheckTimeout(wdto, jitter) {
-						log.Error("pruner_sender did not make progress, cancelling" + WATCHDOG_ENVCONST_NOTICE)
-						tasks.prunerCancel()
-						return
-					}
-				case ActiveSideDone:
-					// ignore, ctx will be Done() in a few milliseconds and the watchdog will exit
-				default:
-					log.WithField("state", tasks.state).
-						Error("watchdog implementation error: unknown active side state")
-				}
-			})
-		}
-	}()
-
-	ctx, localCancel := context.WithCancel(ctx)
 	sender := endpoint.NewSender(j.FSFilter())
-	tasks := j.updateTasks(func(tasks *snap_activeSideTasks) {
-		tasks.pruner = j.GetPruner(ctx, sender)
-		tasks.prunerCancel = localCancel
-		tasks.state = ActiveSidePruneSender
-	})
-	log.Info("start pruning sender")
-	tasks.pruner.Prune()
-	log.Info("finished pruning sender")
-	localCancel()
-	j.updateTasks(func(tasks *snap_activeSideTasks) {
-		tasks.state = ActiveSideDone
-	})
+	j.pruner = j.GetPruner(ctx, sender)
+	log.Info("start pruning")
+	j.pruner.Prune()
+	log.Info("finished pruning")
 }