proper locking on FSReplication

This commit is contained in:
Christian Schwarz 2018-08-16 12:01:51 +02:00
parent 5479463783
commit 93929b61e4
3 changed files with 103 additions and 76 deletions

View File

@ -5,14 +5,13 @@ package replication
import "strconv" import "strconv"
const ( const (
_FSReplicationState_name_0 = "FSQueuedFSActive" _FSReplicationState_name_0 = "FSReadyFSRetryWait"
_FSReplicationState_name_1 = "FSRetryWait" _FSReplicationState_name_1 = "FSPermanentError"
_FSReplicationState_name_2 = "FSPermanentError" _FSReplicationState_name_2 = "FSCompleted"
_FSReplicationState_name_3 = "FSCompleted"
) )
var ( var (
_FSReplicationState_index_0 = [...]uint8{0, 8, 16} _FSReplicationState_index_0 = [...]uint8{0, 7, 18}
) )
func (i FSReplicationState) String() string { func (i FSReplicationState) String() string {
@ -24,8 +23,6 @@ func (i FSReplicationState) String() string {
return _FSReplicationState_name_1 return _FSReplicationState_name_1
case i == 8: case i == 8:
return _FSReplicationState_name_2 return _FSReplicationState_name_2
case i == 16:
return _FSReplicationState_name_3
default: default:
return "FSReplicationState(" + strconv.FormatInt(int64(i), 10) + ")" return "FSReplicationState(" + strconv.FormatInt(int64(i), 10) + ")"
} }

View File

@ -69,30 +69,45 @@ type replicationUpdater func(func(*Replication)) (newState ReplicationState)
type replicationStateFunc func(context.Context, EndpointPair, replicationUpdater) replicationStateFunc type replicationStateFunc func(context.Context, EndpointPair, replicationUpdater) replicationStateFunc
//go:generate stringer -type=FSReplicationState //go:generate stringer -type=FSReplicationState
type FSReplicationState int type FSReplicationState uint
const ( const (
FSQueued FSReplicationState = 1 << iota FSReady FSReplicationState = 1 << iota
FSActive
FSRetryWait FSRetryWait
FSPermanentError FSPermanentError
FSCompleted FSCompleted
) )
func (s FSReplicationState) fsrsf() fsrsf {
idx := bits.TrailingZeros(uint(s))
if idx == bits.UintSize {
panic(s)
}
m := []fsrsf{
fsrsfReady,
fsrsfRetryWait,
nil,
nil,
}
return m[idx]
}
type FSReplication struct { type FSReplication struct {
lock sync.Mutex // lock protects all fields in this struct, but not the data behind pointers
lock sync.Mutex
state FSReplicationState state FSReplicationState
fs *Filesystem fs *Filesystem
permanentError error err error
retryWaitUntil time.Time
completed, pending []*FSReplicationStep completed, pending []*FSReplicationStep
active *FSReplicationStep current *FSReplicationStep
} }
func newReplicationQueueItemPermanentError(fs *Filesystem, err error) *replicationQueueItem { func newReplicationQueueItemPermanentError(fs *Filesystem, err error) *replicationQueueItem {
return &replicationQueueItem{0, &FSReplication{ return &replicationQueueItem{0, &FSReplication{
state: FSPermanentError, state: FSPermanentError,
fs: fs, fs: fs,
permanentError: err, err: err,
}} }}
} }
@ -123,7 +138,7 @@ func (b *replicationQueueItemBuilder) AddStep(from, to *FilesystemVersion) *repl
func (b *replicationQueueItemBuilder) Complete() *replicationQueueItem { func (b *replicationQueueItemBuilder) Complete() *replicationQueueItem {
if len(b.r.pending) > 0 { if len(b.r.pending) > 0 {
b.r.state = FSQueued b.r.state = FSReady
} else { } else {
b.r.state = FSCompleted b.r.state = FSCompleted
} }
@ -280,7 +295,7 @@ func rsfPlanning(ctx context.Context, ep EndpointPair, u replicationUpdater) rep
switch qitem.fsr.state { switch qitem.fsr.state {
case FSCompleted: case FSCompleted:
completed = append(completed, qitem) completed = append(completed, qitem)
case FSQueued: case FSReady:
pending = append(pending, qitem) pending = append(pending, qitem)
default: default:
panic(qitem) panic(qitem)
@ -297,8 +312,8 @@ func rsfPlanning(ctx context.Context, ep EndpointPair, u replicationUpdater) rep
} }
func rsfPlanningError(ctx context.Context, ep EndpointPair, u replicationUpdater) replicationStateFunc { func rsfPlanningError(ctx context.Context, ep EndpointPair, u replicationUpdater) replicationStateFunc {
sleepTime := 10*time.Second sleepTime := 10 * time.Second
u(func(r *Replication){ u(func(r *Replication) {
r.sleepUntil = time.Now().Add(sleepTime) r.sleepUntil = time.Now().Add(sleepTime)
}) })
t := time.NewTimer(sleepTime) // FIXME make constant configurable t := time.NewTimer(sleepTime) // FIXME make constant configurable
@ -334,10 +349,10 @@ func rsfWorking(ctx context.Context, ep EndpointPair, u replicationUpdater) repl
sort.Slice(r.pending, func(i, j int) bool { sort.Slice(r.pending, func(i, j int) bool {
a, b := r.pending[i], r.pending[j] a, b := r.pending[i], r.pending[j]
statePrio := func(x *replicationQueueItem) int { statePrio := func(x *replicationQueueItem) int {
if x.fsr.state&(FSQueued|FSRetryWait) == 0 { if x.fsr.state&(FSReady|FSRetryWait) == 0 {
panic(x) panic(x)
} }
if x.fsr.state == FSQueued { if x.fsr.state == FSReady {
return 0 return 0
} else { } else {
return 1 return 1
@ -348,7 +363,7 @@ func rsfWorking(ctx context.Context, ep EndpointPair, u replicationUpdater) repl
return aprio < bprio return aprio < bprio
} }
// now we know they are the same state // now we know they are the same state
if a.fsr.state == FSQueued { if a.fsr.state == FSReady {
return a.fsr.nextStepDate().Before(b.fsr.nextStepDate()) return a.fsr.nextStepDate().Before(b.fsr.nextStepDate())
} }
if a.fsr.state == FSRetryWait { if a.fsr.state == FSRetryWait {
@ -367,20 +382,11 @@ func rsfWorking(ctx context.Context, ep EndpointPair, u replicationUpdater) repl
return rsfNext return rsfNext
} }
if active.fsr.state == FSRetryWait { fsState := active.fsr.takeStep(ctx, ep)
return u(func(r *Replication) {
r.state = WorkingWait
}).rsf()
}
if active.fsr.state != FSQueued {
panic(active)
}
fsState := active.fsr.drive(ctx, ep)
return u(func(r *Replication) { return u(func(r *Replication) {
if fsState&FSQueued != 0 { if fsState&FSReady != 0 {
r.active.retriesSinceLastError = 0 r.active.retriesSinceLastError = 0
} else if fsState&FSRetryWait != 0 { } else if fsState&FSRetryWait != 0 {
r.active.retriesSinceLastError++ r.active.retriesSinceLastError++
@ -416,7 +422,7 @@ func rsfWorkingWait(ctx context.Context, ep EndpointPair, u replicationUpdater)
// caller must have exclusive access to f // caller must have exclusive access to f
func (f *FSReplication) nextStepDate() time.Time { func (f *FSReplication) nextStepDate() time.Time {
if f.state != FSQueued { if f.state != FSReady {
panic(f) panic(f)
} }
ct, err := f.pending[0].to.CreationAsTime() ct, err := f.pending[0].to.CreationAsTime()
@ -426,70 +432,94 @@ func (f *FSReplication) nextStepDate() time.Time {
return ct return ct
} }
func (f *FSReplication) drive(ctx context.Context, ep EndpointPair) FSReplicationState { func (f *FSReplication) takeStep(ctx context.Context, ep EndpointPair) FSReplicationState {
f.lock.Lock()
defer f.lock.Unlock() var u fsrUpdater = func(fu func(*FSReplication)) FSReplicationState {
for f.state&(FSRetryWait|FSPermanentError|FSCompleted) == 0 { f.lock.Lock()
pre := f.state defer f.lock.Unlock()
if fu != nil {
fu(f)
}
return f.state
}
var s fsrsf = u(nil).fsrsf()
for s != nil {
pre := u(nil)
preTime := time.Now() preTime := time.Now()
f.doDrive(ctx, ep) s = s(ctx, ep, u)
delta := time.Now().Sub(preTime) delta := time.Now().Sub(preTime)
post := f.state post := u(nil)
getLogger(ctx). getLogger(ctx).
WithField("fs", f.fs.Path).
WithField("transition", fmt.Sprintf("%s => %s", pre, post)). WithField("transition", fmt.Sprintf("%s => %s", pre, post)).
WithField("duration", delta). WithField("duration", delta).
Debug("fsr state transition") Debug("fsr state transition")
} }
return f.state return u(nil)
} }
// caller must hold f.lock type fsrUpdater func(func(fsr *FSReplication)) FSReplicationState
func (f *FSReplication) doDrive(ctx context.Context, ep EndpointPair) FSReplicationState { type fsrsf func(ctx context.Context, ep EndpointPair, u fsrUpdater) fsrsf
switch f.state {
case FSPermanentError: func fsrsfReady(ctx context.Context, ep EndpointPair, u fsrUpdater) fsrsf {
fallthrough
case FSCompleted: var current *FSReplicationStep
return f.state s := u(func(f *FSReplication) {
case FSRetryWait: if f.current == nil {
f.state = FSQueued
return f.state
case FSQueued:
if f.active == nil {
if len(f.pending) == 0 { if len(f.pending) == 0 {
f.state = FSCompleted f.state = FSCompleted
return f.state return
} }
f.active = f.pending[0] f.current = f.pending[0]
f.pending = f.pending[1:] f.pending = f.pending[1:]
} }
f.state = FSActive current = f.current
return f.state })
if s != FSReady {
return s.fsrsf()
}
case FSActive: stepState := current.do(ctx, ep)
var stepState FSReplicationStepState
func() { // drop lock during long call return u(func(f *FSReplication) {
f.lock.Unlock()
defer f.lock.Lock()
stepState = f.active.do(ctx, ep)
}()
switch stepState { switch stepState {
case StepCompleted: case StepCompleted:
f.completed = append(f.completed, f.active) f.completed = append(f.completed, f.current)
f.active = nil f.current = nil
if len(f.pending) > 0 { if len(f.pending) > 0 {
f.state = FSQueued f.state = FSReady
} else { } else {
f.state = FSCompleted f.state = FSCompleted
} }
case StepRetry: case StepRetry:
f.retryWaitUntil = time.Now().Add(10 * time.Second) // FIXME make configurable
f.state = FSRetryWait f.state = FSRetryWait
case StepPermanentError: case StepPermanentError:
f.state = FSPermanentError f.state = FSPermanentError
default:
panic(f)
} }
return f.state }).fsrsf()
} }
panic(f) func fsrsfRetryWait(ctx context.Context, ep EndpointPair, u fsrUpdater) fsrsf {
var sleepUntil time.Time
u(func(f *FSReplication) {
sleepUntil = f.retryWaitUntil
})
t := time.NewTimer(sleepUntil.Sub(time.Now()))
defer t.Stop()
select {
case <-ctx.Done():
return u(func(f *FSReplication) {
f.state = FSPermanentError
f.err = ctx.Err()
}).fsrsf()
case <-t.C:
}
return u(func(f *FSReplication) {
f.state = FSReady
}).fsrsf()
} }
func (s *FSReplicationStep) do(ctx context.Context, ep EndpointPair) FSReplicationStepState { func (s *FSReplicationStep) do(ctx context.Context, ep EndpointPair) FSReplicationStepState {

View File

@ -46,7 +46,7 @@ func filesystemReplicationReportFromQueueItem(qitem *replicationQueueItem) *File
} }
if fsr.state&FSPermanentError != 0 { if fsr.state&FSPermanentError != 0 {
rep.Problem = fsr.permanentError.Error() rep.Problem = fsr.err.Error()
return &rep return &rep
} }
@ -54,8 +54,8 @@ func filesystemReplicationReportFromQueueItem(qitem *replicationQueueItem) *File
for _, step := range fsr.completed { for _, step := range fsr.completed {
rep.Steps = append(rep.Steps, stepReportFromStep(step)) rep.Steps = append(rep.Steps, stepReportFromStep(step))
} }
if fsr.active != nil { if fsr.current != nil {
rep.Steps = append(rep.Steps, stepReportFromStep(fsr.active)) rep.Steps = append(rep.Steps, stepReportFromStep(fsr.current))
} }
for _, step := range fsr.pending { for _, step := range fsr.pending {
rep.Steps = append(rep.Steps, stepReportFromStep(step)) rep.Steps = append(rep.Steps, stepReportFromStep(step))