mirror of
https://github.com/zrepl/zrepl.git
synced 2025-06-20 09:47:50 +02:00
daemon/active + watchdog: simplify control flow using explicit ActiveSideState
This commit is contained in:
parent
f704b28cad
commit
190c7270d9
@ -6,19 +6,18 @@ import (
|
|||||||
"github.com/problame/go-streamrpc"
|
"github.com/problame/go-streamrpc"
|
||||||
"github.com/prometheus/client_golang/prometheus"
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
"github.com/zrepl/zrepl/config"
|
"github.com/zrepl/zrepl/config"
|
||||||
|
"github.com/zrepl/zrepl/daemon/filters"
|
||||||
"github.com/zrepl/zrepl/daemon/job/reset"
|
"github.com/zrepl/zrepl/daemon/job/reset"
|
||||||
"github.com/zrepl/zrepl/daemon/job/wakeup"
|
"github.com/zrepl/zrepl/daemon/job/wakeup"
|
||||||
"github.com/zrepl/zrepl/daemon/transport/connecter"
|
"github.com/zrepl/zrepl/daemon/logging"
|
||||||
"github.com/zrepl/zrepl/daemon/filters"
|
|
||||||
"github.com/zrepl/zrepl/daemon/pruner"
|
"github.com/zrepl/zrepl/daemon/pruner"
|
||||||
|
"github.com/zrepl/zrepl/daemon/snapper"
|
||||||
|
"github.com/zrepl/zrepl/daemon/transport/connecter"
|
||||||
"github.com/zrepl/zrepl/endpoint"
|
"github.com/zrepl/zrepl/endpoint"
|
||||||
"github.com/zrepl/zrepl/replication"
|
"github.com/zrepl/zrepl/replication"
|
||||||
"github.com/zrepl/zrepl/util/envconst"
|
"github.com/zrepl/zrepl/util/envconst"
|
||||||
"github.com/zrepl/zrepl/util/watchdog"
|
|
||||||
"github.com/zrepl/zrepl/zfs"
|
"github.com/zrepl/zrepl/zfs"
|
||||||
"sync"
|
"sync"
|
||||||
"github.com/zrepl/zrepl/daemon/logging"
|
|
||||||
"github.com/zrepl/zrepl/daemon/snapper"
|
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -273,11 +272,8 @@ func (j *ActiveSide) do(ctx context.Context) {
|
|||||||
// allow cancellation of an invocation (this function)
|
// allow cancellation of an invocation (this function)
|
||||||
ctx, cancelThisRun := context.WithCancel(ctx)
|
ctx, cancelThisRun := context.WithCancel(ctx)
|
||||||
defer cancelThisRun()
|
defer cancelThisRun()
|
||||||
runDone := make(chan struct{})
|
|
||||||
defer close(runDone)
|
|
||||||
go func() {
|
go func() {
|
||||||
select {
|
select {
|
||||||
case <-runDone:
|
|
||||||
case <-reset.Wait(ctx):
|
case <-reset.Wait(ctx):
|
||||||
log.Info("reset received, cancelling current invocation")
|
log.Info("reset received, cancelling current invocation")
|
||||||
cancelThisRun()
|
cancelThisRun()
|
||||||
@ -285,53 +281,72 @@ func (j *ActiveSide) do(ctx context.Context) {
|
|||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
// watchdog
|
// The code after this watchdog goroutine is sequential and transitions the state from
|
||||||
|
// ActiveSideReplicating -> ActiveSidePruneSender -> ActiveSidePruneReceiver -> ActiveSideDone
|
||||||
|
// If any of those sequential tasks 'gets stuck' (livelock, no progress), the watchdog will eventually
|
||||||
|
// cancel its context.
|
||||||
|
// If the task is written to support context cancellation, it will return immediately (in permanent error state),
|
||||||
|
// and the sequential code after this goroutine transitions to the next state.
|
||||||
go func() {
|
go func() {
|
||||||
// if no progress after 1 minute, kill the task
|
|
||||||
wdto := envconst.Duration("ZREPL_JOB_WATCHDOG_TIMEOUT", 1*time.Minute)
|
wdto := envconst.Duration("ZREPL_JOB_WATCHDOG_TIMEOUT", 1*time.Minute)
|
||||||
log.WithField("watchdog_timeout", wdto.String()).Debug("starting watchdog")
|
jitter := envconst.Duration("ZREPL_JOB_WATCHDOG_JITTER", 1*time.Second)
|
||||||
|
// shadowing!
|
||||||
|
log := log.WithField("watchdog_timeout", wdto.String())
|
||||||
|
|
||||||
|
log.Debug("starting watchdog")
|
||||||
|
defer log.Debug("watchdog stopped")
|
||||||
|
|
||||||
t := time.NewTicker(wdto)
|
t := time.NewTicker(wdto)
|
||||||
defer t.Stop()
|
defer t.Stop()
|
||||||
|
|
||||||
var (
|
|
||||||
rep, prunerSender, prunerReceiver watchdog.Progress
|
|
||||||
)
|
|
||||||
for {
|
for {
|
||||||
select {
|
select {
|
||||||
case <-runDone:
|
|
||||||
return
|
|
||||||
case <-ctx.Done():
|
case <-ctx.Done():
|
||||||
return
|
return
|
||||||
case <-t.C: // fall
|
case <-t.C: // fall
|
||||||
}
|
}
|
||||||
|
|
||||||
log := log.WithField("watchdog_timeout", wdto.String()) // shadowing!
|
|
||||||
|
|
||||||
j.updateTasks(func(tasks *activeSideTasks) {
|
j.updateTasks(func(tasks *activeSideTasks) {
|
||||||
if tasks.replication != nil &&
|
// Since cancelling a task will cause the sequential code to transition to the next state immediately,
|
||||||
!tasks.replication.Progress.ExpectProgress(&rep) &&
|
// we cannot check for its progress right then (no fallthrough).
|
||||||
!tasks.replication.State().IsTerminal() {
|
// Instead, we return (not continue because we are in a closure) and give the new state another
|
||||||
log.Error("replication did not make progress, cancelling")
|
// ZREPL_JOB_WATCHDOG_TIMEOUT interval to try to make some progress.
|
||||||
tasks.replicationCancel()
|
|
||||||
}
|
log.WithField("state", tasks.state).Debug("watchdog firing")
|
||||||
if tasks.prunerSender != nil &&
|
|
||||||
!tasks.prunerSender.Progress.ExpectProgress(&prunerSender) &&
|
switch tasks.state {
|
||||||
!tasks.prunerSender.State().IsTerminal() {
|
case ActiveSideReplicating:
|
||||||
log.Error("pruner:sender did not make progress, cancelling")
|
log.WithField("replication_progress", tasks.replication.Progress.String()).
|
||||||
tasks.prunerSenderCancel()
|
Debug("check replication progress")
|
||||||
}
|
if tasks.replication.Progress.CheckTimeout(wdto, jitter) {
|
||||||
if tasks.prunerReceiver != nil &&
|
log.Error("replication did not make progress, cancelling")
|
||||||
!tasks.prunerReceiver.Progress.ExpectProgress(&prunerReceiver) &&
|
tasks.replicationCancel()
|
||||||
!tasks.prunerReceiver.State().IsTerminal() {
|
return
|
||||||
log.Error("pruner:receiver did not make progress, cancelling")
|
}
|
||||||
tasks.prunerReceiverCancel()
|
case ActiveSidePruneSender:
|
||||||
|
log.WithField("prune_sender_progress", tasks.replication.Progress.String()).
|
||||||
|
Debug("check pruner_sender progress")
|
||||||
|
if tasks.prunerSender.Progress.CheckTimeout(wdto, jitter) {
|
||||||
|
log.Error("pruner_sender did not make progress, cancelling")
|
||||||
|
tasks.prunerSenderCancel()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
case ActiveSidePruneReceiver:
|
||||||
|
log.WithField("prune_receiver_progress", tasks.replication.Progress.String()).
|
||||||
|
Debug("check pruner_receiver progress")
|
||||||
|
if tasks.prunerReceiver.Progress.CheckTimeout(wdto, jitter) {
|
||||||
|
log.Error("pruner_receiver did not make progress, cancelling")
|
||||||
|
tasks.prunerReceiverCancel()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
case ActiveSideDone:
|
||||||
|
// ignore, ctx will be Done() in a few milliseconds and the watchdog will exit
|
||||||
|
default:
|
||||||
|
log.WithField("state", tasks.state).
|
||||||
|
Error("watchdog implementation error: unknown active side state")
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
log.WithField("replication_progress", rep.String()).
|
|
||||||
WithField("pruner_sender_progress", prunerSender.String()).
|
|
||||||
WithField("pruner_receiver_progress", prunerReceiver.String()).
|
|
||||||
Debug("watchdog did run")
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
@ -6,36 +6,26 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
type Progress struct {
|
type KeepAlive struct {
|
||||||
|
mtx sync.Mutex
|
||||||
lastUpd time.Time
|
lastUpd time.Time
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *Progress) String() string {
|
func (p *KeepAlive) String() string {
|
||||||
return fmt.Sprintf("last update at %s", p.lastUpd)
|
if p.lastUpd.IsZero() {
|
||||||
}
|
return fmt.Sprintf("never updated")
|
||||||
|
|
||||||
func (p *Progress) madeProgressSince(p2 *Progress) bool {
|
|
||||||
if p.lastUpd.IsZero() && p2.lastUpd.IsZero() {
|
|
||||||
return false
|
|
||||||
}
|
}
|
||||||
return p.lastUpd.After(p2.lastUpd)
|
return fmt.Sprintf("last update at %s", p.lastUpd)
|
||||||
}
|
|
||||||
|
|
||||||
type KeepAlive struct {
|
|
||||||
mtx sync.Mutex
|
|
||||||
p Progress
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (k *KeepAlive) MadeProgress() {
|
func (k *KeepAlive) MadeProgress() {
|
||||||
k.mtx.Lock()
|
k.mtx.Lock()
|
||||||
defer k.mtx.Unlock()
|
defer k.mtx.Unlock()
|
||||||
k.p.lastUpd = time.Now()
|
k.lastUpd = time.Now()
|
||||||
}
|
}
|
||||||
|
|
||||||
func (k *KeepAlive) ExpectProgress(last *Progress) (madeProgress bool) {
|
func (k *KeepAlive) CheckTimeout(timeout time.Duration, jitter time.Duration) (didTimeOut bool) {
|
||||||
k.mtx.Lock()
|
k.mtx.Lock()
|
||||||
defer k.mtx.Unlock()
|
defer k.mtx.Unlock()
|
||||||
madeProgress = k.p.madeProgressSince(last)
|
return k.lastUpd.Add(timeout - jitter).Before(time.Now())
|
||||||
*last = k.p
|
|
||||||
return madeProgress
|
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user