replication: wakeup + retry handling: make wakeups work in retry wait states

- handle wakeups in Planning state
- fsrep.Replication yields immediately in RetryWait
- once the queue only contains fsrep.Replication in RetryWait:
  transition replication.Replication into WorkingWait state
- handle wakeups in WorkingWait state, too
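
For context, the whole change revolves around one Go pattern: a retry wait that can be ended by a timer, a context cancellation, or an operator wakeup. The sketch below illustrates that pattern in isolation; it is a simplified stand-in, not code from this commit — waitOrWakeup and the plain wake channel are hypothetical substitutes for zrepl's wakeup.Wait(ctx) and for the state-machine transitions shown in the diff.

    package main

    import (
        "context"
        "fmt"
        "time"
    )

    // waitOrWakeup blocks until sleepUntil passes, ctx is cancelled, or a
    // wakeup arrives on wake. It reports whether the caller should retry now
    // (true) or abort because the context is done (false).
    // Hypothetical helper: the real code selects on wakeup.Wait(ctx) instead
    // of a plain channel and transitions FSM states instead of returning a bool.
    func waitOrWakeup(ctx context.Context, sleepUntil time.Time, wake <-chan struct{}) bool {
        t := time.NewTimer(time.Until(sleepUntil))
        defer t.Stop()
        select {
        case <-ctx.Done():
            return false // give up; caller moves to a terminal error state
        case <-t.C:
            return true // retry interval elapsed, retry as scheduled
        case <-wake:
            return true // operator wakeup: retry immediately instead of sleeping out the timer
        }
    }

    func main() {
        wake := make(chan struct{})
        go func() {
            time.Sleep(200 * time.Millisecond)
            close(wake) // simulate an operator-triggered wakeup
        }()
        retry := waitOrWakeup(context.Background(), time.Now().Add(10*time.Second), wake)
        fmt.Println("retry now:", retry) // unblocks after ~200 ms, not after 10 s
    }

In the diff below, statePlanningError and stateWorkingWait gain exactly that extra case (<-wakeup.Wait(ctx)), while fsrep's stateRetryWait loses its own timer and simply yields whenever its retry deadline has not yet passed, so the main state machine can decide whether to sleep (WorkingWait) or run another filesystem.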
Christian Schwarz 2018-10-12 13:12:28 +02:00
parent d17ecc3b5c
commit cb83a26c90
3 changed files with 46 additions and 37 deletions


@@ -191,7 +191,7 @@ type ReplicationStep struct {
     expectedSize int64 // 0 means no size estimate present / possible
 }
 
-func (f *Replication) TakeStep(ctx context.Context, sender Sender, receiver Receiver) (post State, nextStepDate time.Time) {
+func (f *Replication) TakeStep(ctx context.Context, sender Sender, receiver Receiver) (post State, nextStepDate, retryWaitUntil time.Time) {
     var u updater = func(fu func(*Replication)) State {
         f.lock.Lock()
@@ -213,6 +213,7 @@ func (f *Replication) TakeStep(ctx context.Context, sender Sender, receiver Rece
             return
         }
         nextStepDate = f.pending[0].to.SnapshotTime()
+        retryWaitUntil = f.retryWaitUntil
     })
 
     getLogger(ctx).
@@ -221,7 +222,13 @@ func (f *Replication) TakeStep(ctx context.Context, sender Sender, receiver Rece
         WithField("duration", delta).
         Debug("fsr step taken")
 
-    return post, nextStepDate
+    return post, nextStepDate, retryWaitUntil
 }
 
+func (f *Replication) RetryWaitUntil() time.Time {
+    f.lock.Lock()
+    defer f.lock.Unlock()
+    return f.retryWaitUntil
+}
+
 type updater func(func(fsr *Replication)) State
@@ -275,15 +282,8 @@ func stateRetryWait(ctx context.Context, sender Sender, receiver Receiver, u upd
     u(func(f *Replication) {
         sleepUntil = f.retryWaitUntil
     })
-    t := time.NewTimer(sleepUntil.Sub(time.Now()))
-    defer t.Stop()
-    select {
-    case <-ctx.Done():
-        return u(func(f *Replication) {
-            f.state = PermanentError
-            f.err = ctx.Err()
-        }).fsrsf()
-    case <-t.C:
+    if time.Now().Before(sleepUntil) {
+        return u(nil).fsrsf()
     }
     return u(func(f *Replication) {
         f.state = Ready


@@ -8,11 +8,12 @@ import (
 )
 
 type replicationQueueItem struct {
-    retriesSinceLastError int
     // duplicates fsr.state to avoid accessing and locking fsr
     state State
     // duplicates fsr.current.nextStepDate to avoid accessing & locking fsr
     nextStepDate time.Time
+    // duplicates fsr.retryWaitUntil to avoid accessing & locking fsr
+    retryWaitUntil time.Time
 
     fsr *Replication
 }
@@ -42,7 +43,7 @@ var lessmap = map[State]lessmapEntry{
     RetryWait: {
         prio: 1,
         less: func(a, b *replicationQueueItem) bool {
-            return a.retriesSinceLastError < b.retriesSinceLastError
+            return a.retryWaitUntil.Before(b.retryWaitUntil)
         },
     },
 }
@@ -113,12 +114,8 @@ func (h ReplicationQueueItemHandle) GetFSReplication() *Replication {
     return h.i.fsr
 }
 
-func (h ReplicationQueueItemHandle) Update(newState State, nextStepDate time.Time) {
+func (h ReplicationQueueItemHandle) Update(newState State, nextStepDate, retryWaitUntil time.Time) {
     h.i.state = newState
     h.i.nextStepDate = nextStepDate
-    if h.i.state&Ready != 0 {
-        h.i.retriesSinceLastError = 0
-    } else if h.i.state&RetryWait != 0 {
-        h.i.retriesSinceLastError++
-    }
+    h.i.retryWaitUntil = retryWaitUntil
 }


@@ -7,6 +7,7 @@ import (
     "errors"
     "fmt"
     "github.com/prometheus/client_golang/prometheus"
+    "github.com/zrepl/zrepl/daemon/job/wakeup"
     "math/bits"
     "sync"
     "time"
@@ -186,6 +187,8 @@ func resolveConflict(conflict error) (path []*pdu.FilesystemVersion, msg string)
     return nil, "no automated way to handle conflict type"
 }
 
+var PlanningRetryInterval = 10 * time.Second // FIXME make constant onfigurable
+
 func statePlanning(ctx context.Context, sender Sender, receiver Receiver, u updater) state {
 
     log := getLogger(ctx)
@@ -193,7 +196,9 @@ func statePlanning(ctx context.Context, sender Sender, receiver Receiver, u upda
     log.Info("start planning")
 
     handlePlanningError := func(err error) state {
+        // FIXME classify error as temporary or permanent / max retry counter
         return u(func(r *Replication) {
+            r.sleepUntil = time.Now().Add(PlanningRetryInterval)
             r.planningError = err
             r.state = PlanningError
         }).rsf()
@@ -301,15 +306,12 @@ func statePlanning(ctx context.Context, sender Sender, receiver Receiver, u upda
     }).rsf()
 }
 
-var RetrySleepDuration = 10 * time.Second // FIXME make constant onfigurable
-
 func statePlanningError(ctx context.Context, sender Sender, receiver Receiver, u updater) state {
-    sleepUntil := time.Now().Add(RetrySleepDuration)
+    var sleepUntil time.Time
     u(func(r *Replication) {
-        r.sleepUntil = sleepUntil
+        sleepUntil = r.sleepUntil
     })
-    t := time.NewTimer(RetrySleepDuration)
+    t := time.NewTimer(sleepUntil.Sub(time.Now()))
     getLogger(ctx).WithField("until", sleepUntil).Info("retry wait after planning error")
     defer t.Stop()
     select {
@@ -319,10 +321,11 @@ func statePlanningError(ctx context.Context, sender Sender, receiver Receiver, u
             r.contextError = ctx.Err()
         }).rsf()
     case <-t.C:
-        return u(func(r *Replication) {
-            r.state = Planning
-        }).rsf()
+    case <-wakeup.Wait(ctx):
     }
+    return u(func(r *Replication) {
+        r.state = Planning
+    }).rsf()
 }
 
 func stateWorking(ctx context.Context, sender Sender, receiver Receiver, u updater) state {
@@ -342,21 +345,28 @@ func stateWorking(ctx context.Context, sender Sender, receiver Receiver, u updat
         return rsfNext
     }
 
-    state, nextStepDate := active.GetFSReplication().TakeStep(ctx, sender, receiver)
+    retryWaitUntil := active.GetFSReplication().RetryWaitUntil()
+    if retryWaitUntil.After(time.Now()) {
+        return u(func(r *Replication) {
+            r.sleepUntil = retryWaitUntil
+            r.state = WorkingWait
+        }).rsf()
+    }
+
+    state, nextStepDate, retryWaitUntil := active.GetFSReplication().TakeStep(ctx, sender, receiver)
     return u(func(r *Replication) {
-        active.Update(state, nextStepDate)
+        active.Update(state, nextStepDate, retryWaitUntil)
         r.active = nil
     }).rsf()
 }
 
 func stateWorkingWait(ctx context.Context, sender Sender, receiver Receiver, u updater) state {
-    sleepUntil := time.Now().Add(RetrySleepDuration)
+    var sleepUntil time.Time
     u(func(r *Replication) {
-        r.sleepUntil = sleepUntil
+        sleepUntil = r.sleepUntil
     })
-    t := time.NewTimer(RetrySleepDuration)
-    getLogger(ctx).WithField("until", sleepUntil).Info("retry wait after send/recv error")
+    t := time.NewTimer(PlanningRetryInterval)
+    getLogger(ctx).WithField("until", sleepUntil).Info("retry wait because no filesystems are ready")
     defer t.Stop()
     select {
     case <-ctx.Done():
@@ -364,11 +374,13 @@ func stateWorkingWait(ctx context.Context, sender Sender, receiver Receiver, u u
             r.state = ContextDone
             r.contextError = ctx.Err()
         }).rsf()
     case <-t.C:
-        return u(func(r *Replication) {
-            r.state = Working
-        }).rsf()
+    case <-wakeup.Wait(ctx):
     }
+    return u(func(r *Replication) {
+        r.state = Working
+    }).rsf()
 }
 
 // Report provides a summary of the progress of the Replication,