mirror of
https://github.com/zrepl/zrepl.git
synced 2024-11-26 10:25:05 +01:00
remove most of the watchdog machinery
This commit is contained in:
parent
58dcc07430
commit
442d61918b
@ -6,16 +6,12 @@ import (
|
|||||||
"github.com/prometheus/client_golang/prometheus"
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
"github.com/zrepl/zrepl/config"
|
"github.com/zrepl/zrepl/config"
|
||||||
"github.com/zrepl/zrepl/daemon/filters"
|
"github.com/zrepl/zrepl/daemon/filters"
|
||||||
"github.com/zrepl/zrepl/daemon/job/reset"
|
|
||||||
"github.com/zrepl/zrepl/daemon/job/wakeup"
|
"github.com/zrepl/zrepl/daemon/job/wakeup"
|
||||||
"github.com/zrepl/zrepl/daemon/logging"
|
"github.com/zrepl/zrepl/daemon/logging"
|
||||||
"github.com/zrepl/zrepl/daemon/pruner"
|
"github.com/zrepl/zrepl/daemon/pruner"
|
||||||
"github.com/zrepl/zrepl/daemon/snapper"
|
"github.com/zrepl/zrepl/daemon/snapper"
|
||||||
"github.com/zrepl/zrepl/endpoint"
|
"github.com/zrepl/zrepl/endpoint"
|
||||||
"github.com/zrepl/zrepl/util/envconst"
|
|
||||||
"github.com/zrepl/zrepl/zfs"
|
"github.com/zrepl/zrepl/zfs"
|
||||||
"sync"
|
|
||||||
"time"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
type SnapJob struct {
|
type SnapJob struct {
|
||||||
@ -27,19 +23,10 @@ type SnapJob struct {
|
|||||||
|
|
||||||
promPruneSecs *prometheus.HistogramVec // labels: prune_side
|
promPruneSecs *prometheus.HistogramVec // labels: prune_side
|
||||||
|
|
||||||
tasksMtx sync.Mutex
|
|
||||||
tasks snap_activeSideTasks
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
type snap_activeSideTasks struct {
|
|
||||||
state ActiveSideState
|
|
||||||
|
|
||||||
pruner *pruner.Pruner
|
pruner *pruner.Pruner
|
||||||
|
|
||||||
prunerCancel context.CancelFunc
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
func (j *SnapJob) Name() string { return j.name }
|
func (j *SnapJob) Name() string { return j.name }
|
||||||
|
|
||||||
func (j *SnapJob) GetPruner(ctx context.Context, sender *endpoint.Sender) (*pruner.Pruner) {
|
func (j *SnapJob) GetPruner(ctx context.Context, sender *endpoint.Sender) (*pruner.Pruner) {
|
||||||
@ -48,20 +35,6 @@ func (j *SnapJob) GetPruner(ctx context.Context, sender *endpoint.Sender) (*prun
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
func (j *SnapJob) updateTasks(u func(*snap_activeSideTasks)) snap_activeSideTasks {
|
|
||||||
j.tasksMtx.Lock()
|
|
||||||
defer j.tasksMtx.Unlock()
|
|
||||||
var copy snap_activeSideTasks
|
|
||||||
copy = j.tasks
|
|
||||||
if u == nil {
|
|
||||||
return copy
|
|
||||||
}
|
|
||||||
u(&copy)
|
|
||||||
j.tasks = copy
|
|
||||||
return copy
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
func (j *SnapJob) Type() Type { return TypeSnap }
|
func (j *SnapJob) Type() Type { return TypeSnap }
|
||||||
|
|
||||||
func (j *SnapJob) RunPeriodic(ctx context.Context, wakeUpCommon chan <- struct{}) {
|
func (j *SnapJob) RunPeriodic(ctx context.Context, wakeUpCommon chan <- struct{}) {
|
||||||
@ -107,12 +80,12 @@ type SnapJobStatus struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (j *SnapJob) Status() *Status {
|
func (j *SnapJob) Status() *Status {
|
||||||
tasks := j.updateTasks(nil)
|
//tasks := j.updateTasks(nil)
|
||||||
|
|
||||||
s := &SnapJobStatus{}
|
s := &SnapJobStatus{}
|
||||||
t := j.Type()
|
t := j.Type()
|
||||||
if tasks.pruner != nil {
|
if j.pruner != nil {
|
||||||
s.Pruning = tasks.pruner.Report()
|
s.Pruning = j.pruner.Report()
|
||||||
}
|
}
|
||||||
return &Status{Type: t, JobSpecific: s}
|
return &Status{Type: t, JobSpecific: s}
|
||||||
}
|
}
|
||||||
@ -142,98 +115,17 @@ outer:
|
|||||||
}
|
}
|
||||||
invocationCount++
|
invocationCount++
|
||||||
invLog := log.WithField("invocation", invocationCount)
|
invLog := log.WithField("invocation", invocationCount)
|
||||||
j.do(WithLogger(ctx, invLog))
|
j.doPrune(WithLogger(ctx, invLog))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (j *SnapJob) do(ctx context.Context) {
|
func (j *SnapJob) doPrune(ctx context.Context) {
|
||||||
|
|
||||||
log := GetLogger(ctx)
|
log := GetLogger(ctx)
|
||||||
ctx = logging.WithSubsystemLoggers(ctx, log)
|
ctx = logging.WithSubsystemLoggers(ctx, log)
|
||||||
|
|
||||||
// allow cancellation of an invocation (this function)
|
|
||||||
ctx, cancelThisRun := context.WithCancel(ctx)
|
|
||||||
defer cancelThisRun()
|
|
||||||
go func() {
|
|
||||||
select {
|
|
||||||
case <-reset.Wait(ctx):
|
|
||||||
log.Info("reset received, cancelling current invocation")
|
|
||||||
cancelThisRun()
|
|
||||||
case <-ctx.Done():
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
|
|
||||||
// The code after this watchdog goroutine is sequential and transitions the state from
|
|
||||||
// ActiveSideReplicating -> ActiveSidePruneSender -> ActiveSidePruneReceiver -> ActiveSideDone
|
|
||||||
// If any of those sequential tasks 'gets stuck' (livelock, no progress), the watchdog will eventually
|
|
||||||
// cancel its context.
|
|
||||||
// If the task is written to support context cancellation, it will return immediately (in permanent error state),
|
|
||||||
// and the sequential code above transitions to the next state.
|
|
||||||
go func() {
|
|
||||||
|
|
||||||
wdto := envconst.Duration("ZREPL_JOB_WATCHDOG_TIMEOUT", 10*time.Minute)
|
|
||||||
jitter := envconst.Duration("ZREPL_JOB_WATCHDOG_JITTER", 1*time.Second)
|
|
||||||
// shadowing!
|
|
||||||
log := log.WithField("watchdog_timeout", wdto.String())
|
|
||||||
|
|
||||||
log.Debug("starting watchdog")
|
|
||||||
defer log.Debug("watchdog stopped")
|
|
||||||
|
|
||||||
t := time.NewTicker(wdto)
|
|
||||||
defer t.Stop()
|
|
||||||
|
|
||||||
for {
|
|
||||||
select {
|
|
||||||
case <-ctx.Done():
|
|
||||||
return
|
|
||||||
case <-t.C: // fall
|
|
||||||
}
|
|
||||||
|
|
||||||
j.updateTasks(func(tasks *snap_activeSideTasks) {
|
|
||||||
// Since cancelling a task will cause the sequential code to transition to the next state immediately,
|
|
||||||
// we cannot check for its progress right then (no fallthrough).
|
|
||||||
// Instead, we return (not continue because we are in a closure) and give the new state another
|
|
||||||
// ZREPL_JOB_WATCHDOG_TIMEOUT interval to try make some progress.
|
|
||||||
|
|
||||||
log.WithField("state", tasks.state).Debug("watchdog firing")
|
|
||||||
|
|
||||||
const WATCHDOG_ENVCONST_NOTICE = " (adjust ZREPL_JOB_WATCHDOG_TIMEOUT env variable if inappropriate)"
|
|
||||||
|
|
||||||
switch tasks.state {
|
|
||||||
case ActiveSidePruneSender:
|
|
||||||
log.WithField("prune_sender_progress", "TEST DEBUG 123").
|
|
||||||
Debug("check pruner_sender progress")
|
|
||||||
if tasks.pruner.Progress.CheckTimeout(wdto, jitter) {
|
|
||||||
log.Error("pruner_sender did not make progress, cancelling" + WATCHDOG_ENVCONST_NOTICE)
|
|
||||||
tasks.prunerCancel()
|
|
||||||
return
|
|
||||||
}
|
|
||||||
case ActiveSideDone:
|
|
||||||
// ignore, ctx will be Done() in a few milliseconds and the watchdog will exit
|
|
||||||
default:
|
|
||||||
log.WithField("state", tasks.state).
|
|
||||||
Error("watchdog implementation error: unknown active side state")
|
|
||||||
}
|
|
||||||
})
|
|
||||||
|
|
||||||
}
|
|
||||||
}()
|
|
||||||
|
|
||||||
ctx, localCancel := context.WithCancel(ctx)
|
|
||||||
sender := endpoint.NewSender(j.FSFilter())
|
sender := endpoint.NewSender(j.FSFilter())
|
||||||
tasks := j.updateTasks(func(tasks *snap_activeSideTasks) {
|
j.pruner = j.GetPruner(ctx, sender)
|
||||||
tasks.pruner = j.GetPruner(ctx, sender)
|
log.Info("start pruning")
|
||||||
tasks.prunerCancel = localCancel
|
j.pruner.Prune()
|
||||||
tasks.state = ActiveSidePruneSender
|
log.Info("finished pruning")
|
||||||
})
|
|
||||||
|
|
||||||
log.Info("start pruning sender")
|
|
||||||
tasks.pruner.Prune()
|
|
||||||
log.Info("finished pruning sender")
|
|
||||||
localCancel()
|
|
||||||
j.updateTasks(func(tasks *snap_activeSideTasks) {
|
|
||||||
tasks.state = ActiveSideDone
|
|
||||||
})
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user