remove most of the watchdog machinery

This commit is contained in:
InsanePrawn 2018-11-21 02:42:13 +01:00
parent 58dcc07430
commit 442d61918b

View File

@@ -6,16 +6,12 @@ import (
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/zrepl/zrepl/config"
 	"github.com/zrepl/zrepl/daemon/filters"
-	"github.com/zrepl/zrepl/daemon/job/reset"
 	"github.com/zrepl/zrepl/daemon/job/wakeup"
 	"github.com/zrepl/zrepl/daemon/logging"
 	"github.com/zrepl/zrepl/daemon/pruner"
 	"github.com/zrepl/zrepl/daemon/snapper"
 	"github.com/zrepl/zrepl/endpoint"
-	"github.com/zrepl/zrepl/util/envconst"
 	"github.com/zrepl/zrepl/zfs"
-	"sync"
-	"time"
 )

 type SnapJob struct {
@@ -27,19 +23,10 @@ type SnapJob struct {
 	promPruneSecs *prometheus.HistogramVec // labels: prune_side

-	tasksMtx sync.Mutex
-	tasks    snap_activeSideTasks
-}
-
-type snap_activeSideTasks struct {
-	state        ActiveSideState
 	pruner       *pruner.Pruner
-	prunerCancel context.CancelFunc
 }

 func (j *SnapJob) Name() string { return j.name }

 func (j *SnapJob) GetPruner(ctx context.Context, sender *endpoint.Sender) (*pruner.Pruner) {
@@ -48,20 +35,6 @@ func (j *SnapJob) GetPruner(ctx context.Context, sender *endpoint.Sender) (*prun
 }

-func (j *SnapJob) updateTasks(u func(*snap_activeSideTasks)) snap_activeSideTasks {
-	j.tasksMtx.Lock()
-	defer j.tasksMtx.Unlock()
-	var copy snap_activeSideTasks
-	copy = j.tasks
-	if u == nil {
-		return copy
-	}
-	u(&copy)
-	j.tasks = copy
-	return copy
-}
-
 func (j *SnapJob) Type() Type { return TypeSnap }

 func (j *SnapJob) RunPeriodic(ctx context.Context, wakeUpCommon chan <- struct{}) {
@@ -107,12 +80,12 @@ type SnapJobStatus struct {
 }

 func (j *SnapJob) Status() *Status {
-	tasks := j.updateTasks(nil)
+	//tasks := j.updateTasks(nil)
 	s := &SnapJobStatus{}
 	t := j.Type()
-	if tasks.pruner != nil {
-		s.Pruning = tasks.pruner.Report()
+	if j.pruner != nil {
+		s.Pruning = j.pruner.Report()
 	}
 	return &Status{Type: t, JobSpecific: s}
 }
@@ -142,98 +115,17 @@ outer:
 		}
 		invocationCount++
 		invLog := log.WithField("invocation", invocationCount)
-		j.do(WithLogger(ctx, invLog))
+		j.doPrune(WithLogger(ctx, invLog))
 	}
 }

-func (j *SnapJob) do(ctx context.Context) {
+func (j *SnapJob) doPrune(ctx context.Context) {
 	log := GetLogger(ctx)
 	ctx = logging.WithSubsystemLoggers(ctx, log)

-	// allow cancellation of an invocation (this function)
-	ctx, cancelThisRun := context.WithCancel(ctx)
-	defer cancelThisRun()
-	go func() {
-		select {
-		case <-reset.Wait(ctx):
-			log.Info("reset received, cancelling current invocation")
-			cancelThisRun()
-		case <-ctx.Done():
-		}
-	}()
-
-	// The code after this watchdog goroutine is sequential and transitions the state from
-	//   ActiveSideReplicating -> ActiveSidePruneSender -> ActiveSidePruneReceiver -> ActiveSideDone
-	// If any of those sequential tasks 'gets stuck' (livelock, no progress), the watchdog will eventually
-	// cancel its context.
-	// If the task is written to support context cancellation, it will return immediately (in permanent error state),
-	// and the sequential code above transitions to the next state.
-	go func() {
-		wdto := envconst.Duration("ZREPL_JOB_WATCHDOG_TIMEOUT", 10*time.Minute)
-		jitter := envconst.Duration("ZREPL_JOB_WATCHDOG_JITTER", 1*time.Second)
-		// shadowing!
-		log := log.WithField("watchdog_timeout", wdto.String())
-		log.Debug("starting watchdog")
-		defer log.Debug("watchdog stopped")
-		t := time.NewTicker(wdto)
-		defer t.Stop()
-		for {
-			select {
-			case <-ctx.Done():
-				return
-			case <-t.C: // fall
-			}
-			j.updateTasks(func(tasks *snap_activeSideTasks) {
-				// Since cancelling a task will cause the sequential code to transition to the next state immediately,
-				// we cannot check for its progress right then (no fallthrough).
-				// Instead, we return (not continue because we are in a closure) and give the new state another
-				// ZREPL_JOB_WATCHDOG_TIMEOUT interval to try make some progress.
-				log.WithField("state", tasks.state).Debug("watchdog firing")
-				const WATCHDOG_ENVCONST_NOTICE = " (adjust ZREPL_JOB_WATCHDOG_TIMEOUT env variable if inappropriate)"
-				switch tasks.state {
-				case ActiveSidePruneSender:
-					log.WithField("prune_sender_progress", "TEST DEBUG 123").
-						Debug("check pruner_sender progress")
-					if tasks.pruner.Progress.CheckTimeout(wdto, jitter) {
-						log.Error("pruner_sender did not make progress, cancelling" + WATCHDOG_ENVCONST_NOTICE)
-						tasks.prunerCancel()
-						return
-					}
-				case ActiveSideDone:
-					// ignore, ctx will be Done() in a few milliseconds and the watchdog will exit
-				default:
-					log.WithField("state", tasks.state).
-						Error("watchdog implementation error: unknown active side state")
-				}
-			})
-		}
-	}()
-
-	ctx, localCancel := context.WithCancel(ctx)
 	sender := endpoint.NewSender(j.FSFilter())
-	tasks := j.updateTasks(func(tasks *snap_activeSideTasks) {
-		tasks.pruner = j.GetPruner(ctx, sender)
-		tasks.prunerCancel = localCancel
-		tasks.state = ActiveSidePruneSender
-	})
-	log.Info("start pruning sender")
-	tasks.pruner.Prune()
-	log.Info("finished pruning sender")
-	localCancel()
-	j.updateTasks(func(tasks *snap_activeSideTasks) {
-		tasks.state = ActiveSideDone
-	})
+	j.pruner = j.GetPruner(ctx, sender)
+	log.Info("start pruning")
+	j.pruner.Prune()
+	log.Info("finished pruning")
 }