replication: fix retry wait behavior

An fsrep.Replication is either Ready, Retry, or in a terminal state. The queue prefers Ready over Retry: Ready is sorted by nextStepDate to progress evenly. Retry is sorted by error count, to de-prioritize filesystems that fail often. This way we don't get stuck on individual filesystems and lose other working filesystems to the watchdog. fsrep.Replication no longer blocks in the Retry state; we have replication.WorkingWait for that.
2025-06-19 17:27:46 +02:00 · 2018-10-19 15:53:58 +02:00 · 2018-10-19 15:53:58 +02:00 · 45373168ad
commit 45373168ad
parent 69bfcb7bed
4 changed files with 53 additions and 52 deletions
--- a/replication/fsrep/fsfsm.go
+++ b/replication/fsrep/fsfsm.go
@ -76,7 +76,7 @@ type State uint

 const (
 	Ready State = 1 << iota
-	RetryWait
+	Retry
 	PermanentError
 	Completed
 )
@ -84,13 +84,17 @@ const (
 func (s State) fsrsf() state {
 	m := map[State]state{
 		Ready:          stateReady,
-		RetryWait:      stateRetryWait,
+		Retry:          stateRetry,
 		PermanentError: nil,
 		Completed:      nil,
 	}
 	return m[s]
 }

+func (s State) IsErrorState() bool {
+	return s & (Retry|PermanentError) != 0
+}
+
 type Replication struct {
 	promBytesReplicated prometheus.Counter

@ -99,7 +103,6 @@ type Replication struct {
 	state              State
 	fs                 string
 	err                error
-	retryWaitUntil     time.Time
 	completed, pending []*ReplicationStep
 }

@ -109,6 +112,15 @@ func (f *Replication) State() State {
 	return f.state
 }

+func (f *Replication) Err() error {
+	f.lock.Lock()
+	defer f.lock.Unlock()
+	if f.state & (Retry|PermanentError) != 0 {
+		return f.err
+	}
+	return nil
+}
+
 func (f *Replication) UpdateSizeEsitmate(ctx context.Context, sender Sender) error {
 	f.lock.Lock()
 	defer f.lock.Unlock()
@ -192,7 +204,7 @@ type ReplicationStep struct {
 	expectedSize int64 // 0 means no size estimate present / possible
 }

-func (f *Replication) TakeStep(ctx context.Context, ka *watchdog.KeepAlive, sender Sender, receiver Receiver) (post State, nextStepDate, retryWaitUntil time.Time) {
+func (f *Replication) TakeStep(ctx context.Context, ka *watchdog.KeepAlive, sender Sender, receiver Receiver) (post State, nextStepDate time.Time) {

 	var u updater = func(fu func(*Replication)) State {
 		f.lock.Lock()
@ -214,7 +226,6 @@ func (f *Replication) TakeStep(ctx context.Context, ka *watchdog.KeepAlive, send
 			return
 		}
 		nextStepDate = f.pending[0].to.SnapshotTime()
-		retryWaitUntil = f.retryWaitUntil
 	})

 	getLogger(ctx).
@ -223,21 +234,13 @@ func (f *Replication) TakeStep(ctx context.Context, ka *watchdog.KeepAlive, send
 		WithField("duration", delta).
 		Debug("fsr step taken")

-	return post, nextStepDate, retryWaitUntil
-}
-
-func (f *Replication) RetryWaitUntil() time.Time {
-	f.lock.Lock()
-	defer f.lock.Unlock()
-	return f.retryWaitUntil
+	return post, nextStepDate
 }

 type updater func(func(fsr *Replication)) State

 type state func(ctx context.Context, ka *watchdog.KeepAlive, sender Sender, receiver Receiver, u updater) state

-var RetrySleepDuration = 10 * time.Second // FIXME make configurable
-
 func stateReady(ctx context.Context, ka *watchdog.KeepAlive, sender Sender, receiver Receiver, u updater) state {

 	var current *ReplicationStep
@ -267,8 +270,7 @@ func stateReady(ctx context.Context, ka *watchdog.KeepAlive, sender Sender, rece
 		case StepReplicationRetry:
 			fallthrough
 		case StepMarkReplicatedRetry:
-			f.retryWaitUntil = time.Now().Add(RetrySleepDuration)
-			f.state = RetryWait
+			f.state = Retry
 		case StepPermanentError:
 			f.state = PermanentError
 			f.err = errors.New("a replication step failed with a permanent error")
@ -278,16 +280,9 @@ func stateReady(ctx context.Context, ka *watchdog.KeepAlive, sender Sender, rece
 	}).fsrsf()
 }

-func stateRetryWait(ctx context.Context, ka *watchdog.KeepAlive, sender Sender, receiver Receiver, u updater) state {
-	var sleepUntil time.Time
-	u(func(f *Replication) {
-		sleepUntil = f.retryWaitUntil
-	})
-	if time.Now().Before(sleepUntil) {
-		return u(nil).fsrsf()
-	}
-	return u(func(f *Replication) {
-		f.state = Ready
+func stateRetry(ctx context.Context, ka *watchdog.KeepAlive, sender Sender, receiver Receiver, u updater) state {
+	return u(func(fsr *Replication) {
+		fsr.state = Ready
 	}).fsrsf()
 }

@ -314,8 +309,8 @@ func (fsr *Replication) Report() *Report {
 		rep.Pending[i] = fsr.pending[i].Report()
 	}

-	if fsr.state&RetryWait != 0 {
-		if len(rep.Pending) != 0 { // should always be true for RetryWait == true?
+	if fsr.state&Retry != 0 {
+		if len(rep.Pending) != 0 { // should always be true for Retry == true?
 			rep.Problem = rep.Pending[0].Problem
 		}
 	}
--- a/replication/fsrep/state_string.go
+++ b/replication/fsrep/state_string.go
@ -5,13 +5,13 @@ package fsrep
 import "strconv"

 const (
-	_State_name_0 = "ReadyRetryWait"
+	_State_name_0 = "ReadyRetry"
 	_State_name_1 = "PermanentError"
 	_State_name_2 = "Completed"
 )

 var (
-	_State_index_0 = [...]uint8{0, 5, 14}
+	_State_index_0 = [...]uint8{0, 5, 10}
 )

 func (i State) String() string {
--- a/replication/internal/queue/queue.go
+++ b/replication/internal/queue/queue.go
@ -11,9 +11,8 @@ type replicationQueueItem struct {
 	// duplicates fsr.state to avoid accessing and locking fsr
 	state State
 	// duplicates fsr.current.nextStepDate to avoid accessing & locking fsr
-	nextStepDate time.Time
-	// duplicates fsr.retryWaitUntil to avoid accessing & locking fsr
-	retryWaitUntil time.Time
+	nextStepDate         time.Time
+	errorStateEnterCount int

 	fsr *Replication
 }
@ -40,10 +39,10 @@ var lessmap = map[State]lessmapEntry{
 			return a.nextStepDate.Before(b.nextStepDate)
 		},
 	},
-	RetryWait: {
+	Retry: {
 		prio: 1,
 		less: func(a, b *replicationQueueItem) bool {
-			return a.retryWaitUntil.Before(b.retryWaitUntil)
+			return a.errorStateEnterCount < b.errorStateEnterCount
 		},
 	},
 }
@ -114,8 +113,10 @@ func (h ReplicationQueueItemHandle) GetFSReplication() *Replication {
 	return h.i.fsr
 }

-func (h ReplicationQueueItemHandle) Update(newState State, nextStepDate, retryWaitUntil time.Time) {
+func (h ReplicationQueueItemHandle) Update(newState State, nextStepDate time.Time) {
 	h.i.state = newState
 	h.i.nextStepDate = nextStepDate
-	h.i.retryWaitUntil = retryWaitUntil
+	if h.i.state.IsErrorState() {
+		h.i.errorStateEnterCount++
+	}
 }
--- a/replication/mainfsm.go
+++ b/replication/mainfsm.go
@ -8,6 +8,7 @@ import (
 	"fmt"
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/zrepl/zrepl/daemon/job/wakeup"
+	"github.com/zrepl/zrepl/util/envconst"
 	"github.com/zrepl/zrepl/util/watchdog"
 	"math/bits"
 	"net"
@ -192,7 +193,7 @@ func resolveConflict(conflict error) (path []*pdu.FilesystemVersion, msg string)
 	return nil, "no automated way to handle conflict type"
 }

-var PlanningRetryInterval = 10 * time.Second // FIXME make constant onfigurable
+var RetryInterval = envconst.Duration("ZREPL_REPLICATION_RETRY_INTERVAL", 4 * time.Second)

 func isPermanent(err error) bool {
 	switch err {
@ -217,7 +218,7 @@ func statePlanning(ctx context.Context, ka *watchdog.KeepAlive, sender Sender, r
 			if isPermanent(err) {
 				r.state = PermanentError
 			} else {
-				r.sleepUntil = time.Now().Add(PlanningRetryInterval)
+				r.sleepUntil = time.Now().Add(RetryInterval)
 				r.state = PlanningError
 			}
 		}).rsf()
@ -367,17 +368,9 @@ func stateWorking(ctx context.Context, ka *watchdog.KeepAlive, sender Sender, re
 		return rsfNext
 	}

-	retryWaitUntil := active.GetFSReplication().RetryWaitUntil()
-	if retryWaitUntil.After(time.Now()) {
-		return u(func(r *Replication) {
-			r.sleepUntil = retryWaitUntil
-			r.state = WorkingWait
-		}).rsf()
-	}
-
-	state, nextStepDate, retryWaitUntil := active.GetFSReplication().TakeStep(ctx, ka, sender, receiver)
-	return u(func(r *Replication) {
-		active.Update(state, nextStepDate, retryWaitUntil)
+	state, nextStepDate := active.GetFSReplication().TakeStep(ctx, ka, sender, receiver)
+	u(func(r *Replication) {
+		active.Update(state, nextStepDate)
 		r.active = nil
 	}).rsf()

@ -390,6 +383,18 @@ func stateWorking(ctx context.Context, ka *watchdog.KeepAlive, sender Sender, re
 	default:
 	}

+	if err := active.GetFSReplication().Err(); err != nil {
+		return u(func(r *Replication) {
+			r.err = err
+			if isPermanent(err) {
+				r.state = PermanentError
+			} else {
+				r.sleepUntil = time.Now().Add(RetryInterval)
+				r.state = WorkingWait
+			}
+		}).rsf()
+	}
+
 	return u(nil).rsf()
 }

@ -398,8 +403,8 @@ func stateWorkingWait(ctx context.Context, ka *watchdog.KeepAlive, sender Sender
 	u(func(r *Replication) {
 		sleepUntil = r.sleepUntil
 	})
-	t := time.NewTimer(PlanningRetryInterval)
-	getLogger(ctx).WithField("until", sleepUntil).Info("retry wait because no filesystems are ready")
+	t := time.NewTimer(RetryInterval)
+	getLogger(ctx).WithField("until", sleepUntil).Info("retry wait after replication step error")
 	defer t.Stop()
 	select {
 	case <-ctx.Done():