Mirror of https://github.com/zrepl/zrepl.git (synced 2025-02-16 10:29:54 +01:00)
replication: wakeup + retry handling: make wakeups work in retry wait states
- handle wakeups in Planning state
- fsrep.Replication yields immediately in RetryWait
- once the queue only contains fsrep.Replication in retryWait: transition replication.Replication into WorkingWait state
- handle wakeups in WorkingWait state, too
Parent: d17ecc3b5c
Commit: cb83a26c90
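The heart of the change is the wait pattern used by both retry states: the select no longer performs the state transition inside the timer branch, it also completes on a user-requested wakeup, and the transition is moved after the select so both branches share it. Below is a minimal, self-contained sketch of that pattern; the function name and the plain channel standing in for zrepl's wakeup.Wait(ctx) are illustrative assumptions, not zrepl code.

package main

import (
    "context"
    "fmt"
    "time"
)

// waitRetry sketches the select used in statePlanningError / stateWorkingWait:
// it returns early on context cancellation, and otherwise waits until either
// the retry deadline passes or a wakeup is requested. Both of the latter fall
// through to the same "retry now" result.
func waitRetry(ctx context.Context, sleepUntil time.Time, wake <-chan struct{}) (retry bool) {
    t := time.NewTimer(time.Until(sleepUntil))
    defer t.Stop()
    select {
    case <-ctx.Done():
        return false // the real code transitions to ContextDone here
    case <-t.C:
    case <-wake:
    }
    return true // the real code transitions back to Planning / Working here
}

func main() {
    wake := make(chan struct{})
    go func() {
        time.Sleep(100 * time.Millisecond)
        close(wake) // simulated user-requested wakeup cuts the retry interval short
    }()
    start := time.Now()
    if waitRetry(context.Background(), time.Now().Add(10*time.Second), wake) {
        fmt.Printf("retrying after %s instead of the full 10s\n", time.Since(start).Round(time.Millisecond))
    }
}

Everything else in the diff feeds this pattern: fsrep.Replication exposes its retry deadline, the queue orders RetryWait items by that deadline, and the main FSM enters WorkingWait instead of blocking per filesystem.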
@@ -191,7 +191,7 @@ type ReplicationStep struct {
     expectedSize int64 // 0 means no size estimate present / possible
 }

-func (f *Replication) TakeStep(ctx context.Context, sender Sender, receiver Receiver) (post State, nextStepDate time.Time) {
+func (f *Replication) TakeStep(ctx context.Context, sender Sender, receiver Receiver) (post State, nextStepDate, retryWaitUntil time.Time) {

     var u updater = func(fu func(*Replication)) State {
         f.lock.Lock()
@@ -213,6 +213,7 @@ func (f *Replication) TakeStep(ctx context.Context, sender Sender, receiver Rece
             return
         }
         nextStepDate = f.pending[0].to.SnapshotTime()
+        retryWaitUntil = f.retryWaitUntil
     })

     getLogger(ctx).
@@ -221,7 +222,13 @@ func (f *Replication) TakeStep(ctx context.Context, sender Sender, receiver Rece
         WithField("duration", delta).
         Debug("fsr step taken")

-    return post, nextStepDate
+    return post, nextStepDate, retryWaitUntil
 }

+func (f *Replication) RetryWaitUntil() time.Time {
+    f.lock.Lock()
+    defer f.lock.Unlock()
+    return f.retryWaitUntil
+}
+
 type updater func(func(fsr *Replication)) State
@@ -275,15 +282,8 @@ func stateRetryWait(ctx context.Context, sender Sender, receiver Receiver, u upd
     u(func(f *Replication) {
         sleepUntil = f.retryWaitUntil
     })
-    t := time.NewTimer(sleepUntil.Sub(time.Now()))
-    defer t.Stop()
-    select {
-    case <-ctx.Done():
-        return u(func(f *Replication) {
-            f.state = PermanentError
-            f.err = ctx.Err()
-        }).fsrsf()
-    case <-t.C:
-    }
+    if time.Now().Before(sleepUntil) {
+        return u(nil).fsrsf()
+    }
     return u(func(f *Replication) {
         f.state = Ready
@@ -8,11 +8,12 @@ import (
 )

 type replicationQueueItem struct {
     retriesSinceLastError int
     // duplicates fsr.state to avoid accessing and locking fsr
     state State
     // duplicates fsr.current.nextStepDate to avoid accessing & locking fsr
     nextStepDate time.Time
+    // duplicates fsr.retryWaitUntil to avoid accessing & locking fsr
+    retryWaitUntil time.Time

     fsr *Replication
 }
@@ -42,7 +43,7 @@ var lessmap = map[State]lessmapEntry{
     RetryWait: {
         prio: 1,
         less: func(a, b *replicationQueueItem) bool {
-            return a.retriesSinceLastError < b.retriesSinceLastError
+            return a.retryWaitUntil.Before(b.retryWaitUntil)
         },
     },
 }
@@ -113,12 +114,8 @@ func (h ReplicationQueueItemHandle) GetFSReplication() *Replication {
     return h.i.fsr
 }

-func (h ReplicationQueueItemHandle) Update(newState State, nextStepDate time.Time) {
+func (h ReplicationQueueItemHandle) Update(newState State, nextStepDate, retryWaitUntil time.Time) {
     h.i.state = newState
     h.i.nextStepDate = nextStepDate
-    if h.i.state&Ready != 0 {
-        h.i.retriesSinceLastError = 0
-    } else if h.i.state&RetryWait != 0 {
-        h.i.retriesSinceLastError++
-    }
+    h.i.retryWaitUntil = retryWaitUntil
 }
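With fsrep.Replication yielding immediately in RetryWait, the queue's ordering for that state switches from the retry counter to the retry deadline, so the filesystem whose wait expires first is handed out first. A rough stand-alone illustration of that comparison, using a hypothetical item type rather than zrepl's queue internals:

package main

import (
    "fmt"
    "sort"
    "time"
)

// queueItem mimics the two fields the new less function compares on.
type queueItem struct {
    fs             string
    retryWaitUntil time.Time
}

func main() {
    now := time.Now()
    items := []queueItem{
        {fs: "tank/b", retryWaitUntil: now.Add(30 * time.Second)},
        {fs: "tank/a", retryWaitUntil: now.Add(5 * time.Second)},
    }
    // Old order: fewest retriesSinceLastError first.
    // New order: earliest retryWaitUntil first.
    sort.Slice(items, func(i, j int) bool {
        return items[i].retryWaitUntil.Before(items[j].retryWaitUntil)
    })
    fmt.Println("next RetryWait candidate:", items[0].fs) // tank/a
}

The retriesSinceLastError bookkeeping that the old Update method maintained is dropped along with the old ordering.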
@@ -7,6 +7,7 @@ import (
     "errors"
     "fmt"
     "github.com/prometheus/client_golang/prometheus"
+    "github.com/zrepl/zrepl/daemon/job/wakeup"
     "math/bits"
     "sync"
     "time"
@@ -186,6 +187,8 @@ func resolveConflict(conflict error) (path []*pdu.FilesystemVersion, msg string)
     return nil, "no automated way to handle conflict type"
 }

+var PlanningRetryInterval = 10 * time.Second // FIXME make constant configurable
+
 func statePlanning(ctx context.Context, sender Sender, receiver Receiver, u updater) state {

     log := getLogger(ctx)
@@ -193,7 +196,9 @@ func statePlanning(ctx context.Context, sender Sender, receiver Receiver, u upda
     log.Info("start planning")

     handlePlanningError := func(err error) state {
+        // FIXME classify error as temporary or permanent / max retry counter
         return u(func(r *Replication) {
+            r.sleepUntil = time.Now().Add(PlanningRetryInterval)
             r.planningError = err
             r.state = PlanningError
         }).rsf()
@@ -301,15 +306,12 @@ func statePlanning(ctx context.Context, sender Sender, receiver Receiver, u upda
     }).rsf()
 }

-var RetrySleepDuration = 10 * time.Second // FIXME make constant configurable
-
 func statePlanningError(ctx context.Context, sender Sender, receiver Receiver, u updater) state {
-
-    sleepUntil := time.Now().Add(RetrySleepDuration)
+    var sleepUntil time.Time
     u(func(r *Replication) {
-        r.sleepUntil = sleepUntil
+        sleepUntil = r.sleepUntil
     })
-    t := time.NewTimer(RetrySleepDuration)
+    t := time.NewTimer(sleepUntil.Sub(time.Now()))
     getLogger(ctx).WithField("until", sleepUntil).Info("retry wait after planning error")
     defer t.Stop()
     select {
@@ -319,10 +321,11 @@ func statePlanningError(ctx context.Context, sender Sender, receiver Receiver, u
             r.contextError = ctx.Err()
         }).rsf()
     case <-t.C:
-        return u(func(r *Replication) {
-            r.state = Planning
-        }).rsf()
+    case <-wakeup.Wait(ctx):
     }
+    return u(func(r *Replication) {
+        r.state = Planning
+    }).rsf()
 }

 func stateWorking(ctx context.Context, sender Sender, receiver Receiver, u updater) state {
@@ -342,21 +345,28 @@ func stateWorking(ctx context.Context, sender Sender, receiver Receiver, u updat
         return rsfNext
     }

-    state, nextStepDate := active.GetFSReplication().TakeStep(ctx, sender, receiver)
+    retryWaitUntil := active.GetFSReplication().RetryWaitUntil()
+    if retryWaitUntil.After(time.Now()) {
+        return u(func(r *Replication) {
+            r.sleepUntil = retryWaitUntil
+            r.state = WorkingWait
+        }).rsf()
+    }

+    state, nextStepDate, retryWaitUntil := active.GetFSReplication().TakeStep(ctx, sender, receiver)
     return u(func(r *Replication) {
-        active.Update(state, nextStepDate)
+        active.Update(state, nextStepDate, retryWaitUntil)
         r.active = nil
     }).rsf()
 }

 func stateWorkingWait(ctx context.Context, sender Sender, receiver Receiver, u updater) state {
-    sleepUntil := time.Now().Add(RetrySleepDuration)
+    var sleepUntil time.Time
     u(func(r *Replication) {
-        r.sleepUntil = sleepUntil
+        sleepUntil = r.sleepUntil
     })
-    t := time.NewTimer(RetrySleepDuration)
-    getLogger(ctx).WithField("until", sleepUntil).Info("retry wait after send/recv error")
+    t := time.NewTimer(PlanningRetryInterval)
+    getLogger(ctx).WithField("until", sleepUntil).Info("retry wait because no filesystems are ready")
     defer t.Stop()
     select {
     case <-ctx.Done():
@@ -364,11 +374,13 @@ func stateWorkingWait(ctx context.Context, sender Sender, receiver Receiver, u u
             r.state = ContextDone
             r.contextError = ctx.Err()
         }).rsf()
+
     case <-t.C:
-        return u(func(r *Replication) {
-            r.state = Working
-        }).rsf()
+    case <-wakeup.Wait(ctx):
     }
+    return u(func(r *Replication) {
+        r.state = Working
+    }).rsf()
 }

 // Report provides a summary of the progress of the Replication,