replication: wakeup + retry handling: make wakeups work in retry wait states

- handle wakeups in Planning state
- fsrep.Replication yields immediately in RetryWait
- once the queue only contains fsrep.Replication in retryWait:
transition replication.Replication into WorkingWait state
- handle wakeups in WorkingWait state, too
This commit is contained in:
Christian Schwarz 2018-10-12 13:12:28 +02:00
parent d17ecc3b5c
commit cb83a26c90
3 changed files with 46 additions and 37 deletions

View File

@ -191,7 +191,7 @@ type ReplicationStep struct {
expectedSize int64 // 0 means no size estimate present / possible expectedSize int64 // 0 means no size estimate present / possible
} }
func (f *Replication) TakeStep(ctx context.Context, sender Sender, receiver Receiver) (post State, nextStepDate time.Time) { func (f *Replication) TakeStep(ctx context.Context, sender Sender, receiver Receiver) (post State, nextStepDate, retryWaitUntil time.Time) {
var u updater = func(fu func(*Replication)) State { var u updater = func(fu func(*Replication)) State {
f.lock.Lock() f.lock.Lock()
@ -213,6 +213,7 @@ func (f *Replication) TakeStep(ctx context.Context, sender Sender, receiver Rece
return return
} }
nextStepDate = f.pending[0].to.SnapshotTime() nextStepDate = f.pending[0].to.SnapshotTime()
retryWaitUntil = f.retryWaitUntil
}) })
getLogger(ctx). getLogger(ctx).
@ -221,7 +222,13 @@ func (f *Replication) TakeStep(ctx context.Context, sender Sender, receiver Rece
WithField("duration", delta). WithField("duration", delta).
Debug("fsr step taken") Debug("fsr step taken")
return post, nextStepDate return post, nextStepDate, retryWaitUntil
}
func (f *Replication) RetryWaitUntil() time.Time {
f.lock.Lock()
defer f.lock.Unlock()
return f.retryWaitUntil
} }
type updater func(func(fsr *Replication)) State type updater func(func(fsr *Replication)) State
@ -275,15 +282,8 @@ func stateRetryWait(ctx context.Context, sender Sender, receiver Receiver, u upd
u(func(f *Replication) { u(func(f *Replication) {
sleepUntil = f.retryWaitUntil sleepUntil = f.retryWaitUntil
}) })
t := time.NewTimer(sleepUntil.Sub(time.Now())) if time.Now().Before(sleepUntil) {
defer t.Stop() return u(nil).fsrsf()
select {
case <-ctx.Done():
return u(func(f *Replication) {
f.state = PermanentError
f.err = ctx.Err()
}).fsrsf()
case <-t.C:
} }
return u(func(f *Replication) { return u(func(f *Replication) {
f.state = Ready f.state = Ready

View File

@ -8,11 +8,12 @@ import (
) )
type replicationQueueItem struct { type replicationQueueItem struct {
retriesSinceLastError int
// duplicates fsr.state to avoid accessing and locking fsr // duplicates fsr.state to avoid accessing and locking fsr
state State state State
// duplicates fsr.current.nextStepDate to avoid accessing & locking fsr // duplicates fsr.current.nextStepDate to avoid accessing & locking fsr
nextStepDate time.Time nextStepDate time.Time
// duplicates fsr.retryWaitUntil to avoid accessing & locking fsr
retryWaitUntil time.Time
fsr *Replication fsr *Replication
} }
@ -42,7 +43,7 @@ var lessmap = map[State]lessmapEntry{
RetryWait: { RetryWait: {
prio: 1, prio: 1,
less: func(a, b *replicationQueueItem) bool { less: func(a, b *replicationQueueItem) bool {
return a.retriesSinceLastError < b.retriesSinceLastError return a.retryWaitUntil.Before(b.retryWaitUntil)
}, },
}, },
} }
@ -113,12 +114,8 @@ func (h ReplicationQueueItemHandle) GetFSReplication() *Replication {
return h.i.fsr return h.i.fsr
} }
func (h ReplicationQueueItemHandle) Update(newState State, nextStepDate time.Time) { func (h ReplicationQueueItemHandle) Update(newState State, nextStepDate, retryWaitUntil time.Time) {
h.i.state = newState h.i.state = newState
h.i.nextStepDate = nextStepDate h.i.nextStepDate = nextStepDate
if h.i.state&Ready != 0 { h.i.retryWaitUntil = retryWaitUntil
h.i.retriesSinceLastError = 0
} else if h.i.state&RetryWait != 0 {
h.i.retriesSinceLastError++
}
} }

View File

@ -7,6 +7,7 @@ import (
"errors" "errors"
"fmt" "fmt"
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus"
"github.com/zrepl/zrepl/daemon/job/wakeup"
"math/bits" "math/bits"
"sync" "sync"
"time" "time"
@ -186,6 +187,8 @@ func resolveConflict(conflict error) (path []*pdu.FilesystemVersion, msg string)
return nil, "no automated way to handle conflict type" return nil, "no automated way to handle conflict type"
} }
var PlanningRetryInterval = 10 * time.Second // FIXME make constant configurable
func statePlanning(ctx context.Context, sender Sender, receiver Receiver, u updater) state { func statePlanning(ctx context.Context, sender Sender, receiver Receiver, u updater) state {
log := getLogger(ctx) log := getLogger(ctx)
@ -193,7 +196,9 @@ func statePlanning(ctx context.Context, sender Sender, receiver Receiver, u upda
log.Info("start planning") log.Info("start planning")
handlePlanningError := func(err error) state { handlePlanningError := func(err error) state {
// FIXME classify error as temporary or permanent / max retry counter
return u(func(r *Replication) { return u(func(r *Replication) {
r.sleepUntil = time.Now().Add(PlanningRetryInterval)
r.planningError = err r.planningError = err
r.state = PlanningError r.state = PlanningError
}).rsf() }).rsf()
@ -301,15 +306,12 @@ func statePlanning(ctx context.Context, sender Sender, receiver Receiver, u upda
}).rsf() }).rsf()
} }
var RetrySleepDuration = 10 * time.Second // FIXME make constant configurable
func statePlanningError(ctx context.Context, sender Sender, receiver Receiver, u updater) state { func statePlanningError(ctx context.Context, sender Sender, receiver Receiver, u updater) state {
var sleepUntil time.Time
sleepUntil := time.Now().Add(RetrySleepDuration)
u(func(r *Replication) { u(func(r *Replication) {
r.sleepUntil = sleepUntil sleepUntil = r.sleepUntil
}) })
t := time.NewTimer(RetrySleepDuration) t := time.NewTimer(sleepUntil.Sub(time.Now()))
getLogger(ctx).WithField("until", sleepUntil).Info("retry wait after planning error") getLogger(ctx).WithField("until", sleepUntil).Info("retry wait after planning error")
defer t.Stop() defer t.Stop()
select { select {
@ -319,11 +321,12 @@ func statePlanningError(ctx context.Context, sender Sender, receiver Receiver, u
r.contextError = ctx.Err() r.contextError = ctx.Err()
}).rsf() }).rsf()
case <-t.C: case <-t.C:
case <-wakeup.Wait(ctx):
}
return u(func(r *Replication) { return u(func(r *Replication) {
r.state = Planning r.state = Planning
}).rsf() }).rsf()
} }
}
func stateWorking(ctx context.Context, sender Sender, receiver Receiver, u updater) state { func stateWorking(ctx context.Context, sender Sender, receiver Receiver, u updater) state {
@ -342,21 +345,28 @@ func stateWorking(ctx context.Context, sender Sender, receiver Receiver, u updat
return rsfNext return rsfNext
} }
state, nextStepDate := active.GetFSReplication().TakeStep(ctx, sender, receiver) retryWaitUntil := active.GetFSReplication().RetryWaitUntil()
if retryWaitUntil.After(time.Now()) {
return u(func(r *Replication) { return u(func(r *Replication) {
active.Update(state, nextStepDate) r.sleepUntil = retryWaitUntil
r.state = WorkingWait
}).rsf()
}
state, nextStepDate, retryWaitUntil := active.GetFSReplication().TakeStep(ctx, sender, receiver)
return u(func(r *Replication) {
active.Update(state, nextStepDate, retryWaitUntil)
r.active = nil r.active = nil
}).rsf() }).rsf()
} }
func stateWorkingWait(ctx context.Context, sender Sender, receiver Receiver, u updater) state { func stateWorkingWait(ctx context.Context, sender Sender, receiver Receiver, u updater) state {
sleepUntil := time.Now().Add(RetrySleepDuration) var sleepUntil time.Time
u(func(r *Replication) { u(func(r *Replication) {
r.sleepUntil = sleepUntil sleepUntil = r.sleepUntil
}) })
t := time.NewTimer(RetrySleepDuration) t := time.NewTimer(PlanningRetryInterval)
getLogger(ctx).WithField("until", sleepUntil).Info("retry wait after send/recv error") getLogger(ctx).WithField("until", sleepUntil).Info("retry wait because no filesystems are ready")
defer t.Stop() defer t.Stop()
select { select {
case <-ctx.Done(): case <-ctx.Done():
@ -364,12 +374,14 @@ func stateWorkingWait(ctx context.Context, sender Sender, receiver Receiver, u u
r.state = ContextDone r.state = ContextDone
r.contextError = ctx.Err() r.contextError = ctx.Err()
}).rsf() }).rsf()
case <-t.C: case <-t.C:
case <-wakeup.Wait(ctx):
}
return u(func(r *Replication) { return u(func(r *Replication) {
r.state = Working r.state = Working
}).rsf() }).rsf()
} }
}
// Report provides a summary of the progress of the Replication, // Report provides a summary of the progress of the Replication,
// i.e., a condensed dump of the internal state machine. // i.e., a condensed dump of the internal state machine.