mirror of
https://github.com/zrepl/zrepl.git
synced 2024-11-22 08:23:50 +01:00
proper locking on FSReplication
This commit is contained in:
parent
5479463783
commit
93929b61e4
@ -5,14 +5,13 @@ package replication
|
|||||||
import "strconv"
|
import "strconv"
|
||||||
|
|
||||||
const (
|
const (
|
||||||
_FSReplicationState_name_0 = "FSQueuedFSActive"
|
_FSReplicationState_name_0 = "FSReadyFSRetryWait"
|
||||||
_FSReplicationState_name_1 = "FSRetryWait"
|
_FSReplicationState_name_1 = "FSPermanentError"
|
||||||
_FSReplicationState_name_2 = "FSPermanentError"
|
_FSReplicationState_name_2 = "FSCompleted"
|
||||||
_FSReplicationState_name_3 = "FSCompleted"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
_FSReplicationState_index_0 = [...]uint8{0, 8, 16}
|
_FSReplicationState_index_0 = [...]uint8{0, 7, 18}
|
||||||
)
|
)
|
||||||
|
|
||||||
func (i FSReplicationState) String() string {
|
func (i FSReplicationState) String() string {
|
||||||
@ -24,8 +23,6 @@ func (i FSReplicationState) String() string {
|
|||||||
return _FSReplicationState_name_1
|
return _FSReplicationState_name_1
|
||||||
case i == 8:
|
case i == 8:
|
||||||
return _FSReplicationState_name_2
|
return _FSReplicationState_name_2
|
||||||
case i == 16:
|
|
||||||
return _FSReplicationState_name_3
|
|
||||||
default:
|
default:
|
||||||
return "FSReplicationState(" + strconv.FormatInt(int64(i), 10) + ")"
|
return "FSReplicationState(" + strconv.FormatInt(int64(i), 10) + ")"
|
||||||
}
|
}
|
||||||
|
@ -69,30 +69,45 @@ type replicationUpdater func(func(*Replication)) (newState ReplicationState)
|
|||||||
type replicationStateFunc func(context.Context, EndpointPair, replicationUpdater) replicationStateFunc
|
type replicationStateFunc func(context.Context, EndpointPair, replicationUpdater) replicationStateFunc
|
||||||
|
|
||||||
//go:generate stringer -type=FSReplicationState
|
//go:generate stringer -type=FSReplicationState
|
||||||
type FSReplicationState int
|
type FSReplicationState uint
|
||||||
|
|
||||||
const (
|
const (
|
||||||
FSQueued FSReplicationState = 1 << iota
|
FSReady FSReplicationState = 1 << iota
|
||||||
FSActive
|
|
||||||
FSRetryWait
|
FSRetryWait
|
||||||
FSPermanentError
|
FSPermanentError
|
||||||
FSCompleted
|
FSCompleted
|
||||||
)
|
)
|
||||||
|
|
||||||
|
func (s FSReplicationState) fsrsf() fsrsf {
|
||||||
|
idx := bits.TrailingZeros(uint(s))
|
||||||
|
if idx == bits.UintSize {
|
||||||
|
panic(s)
|
||||||
|
}
|
||||||
|
m := []fsrsf{
|
||||||
|
fsrsfReady,
|
||||||
|
fsrsfRetryWait,
|
||||||
|
nil,
|
||||||
|
nil,
|
||||||
|
}
|
||||||
|
return m[idx]
|
||||||
|
}
|
||||||
|
|
||||||
type FSReplication struct {
|
type FSReplication struct {
|
||||||
lock sync.Mutex
|
// lock protects all fields in this struct, but not the data behind pointers
|
||||||
|
lock sync.Mutex
|
||||||
state FSReplicationState
|
state FSReplicationState
|
||||||
fs *Filesystem
|
fs *Filesystem
|
||||||
permanentError error
|
err error
|
||||||
|
retryWaitUntil time.Time
|
||||||
completed, pending []*FSReplicationStep
|
completed, pending []*FSReplicationStep
|
||||||
active *FSReplicationStep
|
current *FSReplicationStep
|
||||||
}
|
}
|
||||||
|
|
||||||
func newReplicationQueueItemPermanentError(fs *Filesystem, err error) *replicationQueueItem {
|
func newReplicationQueueItemPermanentError(fs *Filesystem, err error) *replicationQueueItem {
|
||||||
return &replicationQueueItem{0, &FSReplication{
|
return &replicationQueueItem{0, &FSReplication{
|
||||||
state: FSPermanentError,
|
state: FSPermanentError,
|
||||||
fs: fs,
|
fs: fs,
|
||||||
permanentError: err,
|
err: err,
|
||||||
}}
|
}}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -123,7 +138,7 @@ func (b *replicationQueueItemBuilder) AddStep(from, to *FilesystemVersion) *repl
|
|||||||
|
|
||||||
func (b *replicationQueueItemBuilder) Complete() *replicationQueueItem {
|
func (b *replicationQueueItemBuilder) Complete() *replicationQueueItem {
|
||||||
if len(b.r.pending) > 0 {
|
if len(b.r.pending) > 0 {
|
||||||
b.r.state = FSQueued
|
b.r.state = FSReady
|
||||||
} else {
|
} else {
|
||||||
b.r.state = FSCompleted
|
b.r.state = FSCompleted
|
||||||
}
|
}
|
||||||
@ -185,7 +200,7 @@ func (r *Replication) Drive(ctx context.Context, ep EndpointPair, retryNow chan
|
|||||||
}
|
}
|
||||||
|
|
||||||
func rsfPlanning(ctx context.Context, ep EndpointPair, u replicationUpdater) replicationStateFunc {
|
func rsfPlanning(ctx context.Context, ep EndpointPair, u replicationUpdater) replicationStateFunc {
|
||||||
|
|
||||||
log := getLogger(ctx)
|
log := getLogger(ctx)
|
||||||
|
|
||||||
handlePlanningError := func(err error) replicationStateFunc {
|
handlePlanningError := func(err error) replicationStateFunc {
|
||||||
@ -280,7 +295,7 @@ func rsfPlanning(ctx context.Context, ep EndpointPair, u replicationUpdater) rep
|
|||||||
switch qitem.fsr.state {
|
switch qitem.fsr.state {
|
||||||
case FSCompleted:
|
case FSCompleted:
|
||||||
completed = append(completed, qitem)
|
completed = append(completed, qitem)
|
||||||
case FSQueued:
|
case FSReady:
|
||||||
pending = append(pending, qitem)
|
pending = append(pending, qitem)
|
||||||
default:
|
default:
|
||||||
panic(qitem)
|
panic(qitem)
|
||||||
@ -297,8 +312,8 @@ func rsfPlanning(ctx context.Context, ep EndpointPair, u replicationUpdater) rep
|
|||||||
}
|
}
|
||||||
|
|
||||||
func rsfPlanningError(ctx context.Context, ep EndpointPair, u replicationUpdater) replicationStateFunc {
|
func rsfPlanningError(ctx context.Context, ep EndpointPair, u replicationUpdater) replicationStateFunc {
|
||||||
sleepTime := 10*time.Second
|
sleepTime := 10 * time.Second
|
||||||
u(func(r *Replication){
|
u(func(r *Replication) {
|
||||||
r.sleepUntil = time.Now().Add(sleepTime)
|
r.sleepUntil = time.Now().Add(sleepTime)
|
||||||
})
|
})
|
||||||
t := time.NewTimer(sleepTime) // FIXME make constant onfigurable
|
t := time.NewTimer(sleepTime) // FIXME make constant onfigurable
|
||||||
@ -334,10 +349,10 @@ func rsfWorking(ctx context.Context, ep EndpointPair, u replicationUpdater) repl
|
|||||||
sort.Slice(r.pending, func(i, j int) bool {
|
sort.Slice(r.pending, func(i, j int) bool {
|
||||||
a, b := r.pending[i], r.pending[j]
|
a, b := r.pending[i], r.pending[j]
|
||||||
statePrio := func(x *replicationQueueItem) int {
|
statePrio := func(x *replicationQueueItem) int {
|
||||||
if x.fsr.state&(FSQueued|FSRetryWait) == 0 {
|
if x.fsr.state&(FSReady|FSRetryWait) == 0 {
|
||||||
panic(x)
|
panic(x)
|
||||||
}
|
}
|
||||||
if x.fsr.state == FSQueued {
|
if x.fsr.state == FSReady {
|
||||||
return 0
|
return 0
|
||||||
} else {
|
} else {
|
||||||
return 1
|
return 1
|
||||||
@ -348,7 +363,7 @@ func rsfWorking(ctx context.Context, ep EndpointPair, u replicationUpdater) repl
|
|||||||
return aprio < bprio
|
return aprio < bprio
|
||||||
}
|
}
|
||||||
// now we know they are the same state
|
// now we know they are the same state
|
||||||
if a.fsr.state == FSQueued {
|
if a.fsr.state == FSReady {
|
||||||
return a.fsr.nextStepDate().Before(b.fsr.nextStepDate())
|
return a.fsr.nextStepDate().Before(b.fsr.nextStepDate())
|
||||||
}
|
}
|
||||||
if a.fsr.state == FSRetryWait {
|
if a.fsr.state == FSRetryWait {
|
||||||
@ -367,20 +382,11 @@ func rsfWorking(ctx context.Context, ep EndpointPair, u replicationUpdater) repl
|
|||||||
return rsfNext
|
return rsfNext
|
||||||
}
|
}
|
||||||
|
|
||||||
if active.fsr.state == FSRetryWait {
|
fsState := active.fsr.takeStep(ctx, ep)
|
||||||
return u(func(r *Replication) {
|
|
||||||
r.state = WorkingWait
|
|
||||||
}).rsf()
|
|
||||||
}
|
|
||||||
if active.fsr.state != FSQueued {
|
|
||||||
panic(active)
|
|
||||||
}
|
|
||||||
|
|
||||||
fsState := active.fsr.drive(ctx, ep)
|
|
||||||
|
|
||||||
return u(func(r *Replication) {
|
return u(func(r *Replication) {
|
||||||
|
|
||||||
if fsState&FSQueued != 0 {
|
if fsState&FSReady != 0 {
|
||||||
r.active.retriesSinceLastError = 0
|
r.active.retriesSinceLastError = 0
|
||||||
} else if fsState&FSRetryWait != 0 {
|
} else if fsState&FSRetryWait != 0 {
|
||||||
r.active.retriesSinceLastError++
|
r.active.retriesSinceLastError++
|
||||||
@ -416,7 +422,7 @@ func rsfWorkingWait(ctx context.Context, ep EndpointPair, u replicationUpdater)
|
|||||||
|
|
||||||
// caller must have exclusive access to f
|
// caller must have exclusive access to f
|
||||||
func (f *FSReplication) nextStepDate() time.Time {
|
func (f *FSReplication) nextStepDate() time.Time {
|
||||||
if f.state != FSQueued {
|
if f.state != FSReady {
|
||||||
panic(f)
|
panic(f)
|
||||||
}
|
}
|
||||||
ct, err := f.pending[0].to.CreationAsTime()
|
ct, err := f.pending[0].to.CreationAsTime()
|
||||||
@ -426,70 +432,94 @@ func (f *FSReplication) nextStepDate() time.Time {
|
|||||||
return ct
|
return ct
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f *FSReplication) drive(ctx context.Context, ep EndpointPair) FSReplicationState {
|
func (f *FSReplication) takeStep(ctx context.Context, ep EndpointPair) FSReplicationState {
|
||||||
f.lock.Lock()
|
|
||||||
defer f.lock.Unlock()
|
var u fsrUpdater = func(fu func(*FSReplication)) FSReplicationState {
|
||||||
for f.state&(FSRetryWait|FSPermanentError|FSCompleted) == 0 {
|
f.lock.Lock()
|
||||||
pre := f.state
|
defer f.lock.Unlock()
|
||||||
|
if fu != nil {
|
||||||
|
fu(f)
|
||||||
|
}
|
||||||
|
return f.state
|
||||||
|
}
|
||||||
|
var s fsrsf = u(nil).fsrsf()
|
||||||
|
for s != nil {
|
||||||
|
pre := u(nil)
|
||||||
preTime := time.Now()
|
preTime := time.Now()
|
||||||
f.doDrive(ctx, ep)
|
s = s(ctx, ep, u)
|
||||||
delta := time.Now().Sub(preTime)
|
delta := time.Now().Sub(preTime)
|
||||||
post := f.state
|
post := u(nil)
|
||||||
getLogger(ctx).
|
getLogger(ctx).
|
||||||
|
WithField("fs", f.fs.Path).
|
||||||
WithField("transition", fmt.Sprintf("%s => %s", pre, post)).
|
WithField("transition", fmt.Sprintf("%s => %s", pre, post)).
|
||||||
WithField("duration", delta).
|
WithField("duration", delta).
|
||||||
Debug("fsr state transition")
|
Debug("fsr state transition")
|
||||||
}
|
}
|
||||||
return f.state
|
return u(nil)
|
||||||
}
|
}
|
||||||
|
|
||||||
// caller must hold f.lock
|
type fsrUpdater func(func(fsr *FSReplication)) FSReplicationState
|
||||||
func (f *FSReplication) doDrive(ctx context.Context, ep EndpointPair) FSReplicationState {
|
type fsrsf func(ctx context.Context, ep EndpointPair, u fsrUpdater) fsrsf
|
||||||
switch f.state {
|
|
||||||
case FSPermanentError:
|
func fsrsfReady(ctx context.Context, ep EndpointPair, u fsrUpdater) fsrsf {
|
||||||
fallthrough
|
|
||||||
case FSCompleted:
|
var current *FSReplicationStep
|
||||||
return f.state
|
s := u(func(f *FSReplication) {
|
||||||
case FSRetryWait:
|
if f.current == nil {
|
||||||
f.state = FSQueued
|
|
||||||
return f.state
|
|
||||||
case FSQueued:
|
|
||||||
if f.active == nil {
|
|
||||||
if len(f.pending) == 0 {
|
if len(f.pending) == 0 {
|
||||||
f.state = FSCompleted
|
f.state = FSCompleted
|
||||||
return f.state
|
return
|
||||||
}
|
}
|
||||||
f.active = f.pending[0]
|
f.current = f.pending[0]
|
||||||
f.pending = f.pending[1:]
|
f.pending = f.pending[1:]
|
||||||
}
|
}
|
||||||
f.state = FSActive
|
current = f.current
|
||||||
return f.state
|
})
|
||||||
|
if s != FSReady {
|
||||||
|
return s.fsrsf()
|
||||||
|
}
|
||||||
|
|
||||||
case FSActive:
|
stepState := current.do(ctx, ep)
|
||||||
var stepState FSReplicationStepState
|
|
||||||
func() { // drop lock during long call
|
return u(func(f *FSReplication) {
|
||||||
f.lock.Unlock()
|
|
||||||
defer f.lock.Lock()
|
|
||||||
stepState = f.active.do(ctx, ep)
|
|
||||||
}()
|
|
||||||
switch stepState {
|
switch stepState {
|
||||||
case StepCompleted:
|
case StepCompleted:
|
||||||
f.completed = append(f.completed, f.active)
|
f.completed = append(f.completed, f.current)
|
||||||
f.active = nil
|
f.current = nil
|
||||||
if len(f.pending) > 0 {
|
if len(f.pending) > 0 {
|
||||||
f.state = FSQueued
|
f.state = FSReady
|
||||||
} else {
|
} else {
|
||||||
f.state = FSCompleted
|
f.state = FSCompleted
|
||||||
}
|
}
|
||||||
case StepRetry:
|
case StepRetry:
|
||||||
|
f.retryWaitUntil = time.Now().Add(10 * time.Second) // FIXME make configurable
|
||||||
f.state = FSRetryWait
|
f.state = FSRetryWait
|
||||||
case StepPermanentError:
|
case StepPermanentError:
|
||||||
f.state = FSPermanentError
|
f.state = FSPermanentError
|
||||||
|
default:
|
||||||
|
panic(f)
|
||||||
}
|
}
|
||||||
return f.state
|
}).fsrsf()
|
||||||
}
|
}
|
||||||
|
|
||||||
panic(f)
|
func fsrsfRetryWait(ctx context.Context, ep EndpointPair, u fsrUpdater) fsrsf {
|
||||||
|
var sleepUntil time.Time
|
||||||
|
u(func(f *FSReplication) {
|
||||||
|
sleepUntil = f.retryWaitUntil
|
||||||
|
})
|
||||||
|
t := time.NewTimer(sleepUntil.Sub(time.Now()))
|
||||||
|
defer t.Stop()
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return u(func(f *FSReplication) {
|
||||||
|
f.state = FSPermanentError
|
||||||
|
f.err = ctx.Err()
|
||||||
|
}).fsrsf()
|
||||||
|
case <-t.C:
|
||||||
|
}
|
||||||
|
return u(func(f *FSReplication) {
|
||||||
|
f.state = FSReady
|
||||||
|
}).fsrsf()
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *FSReplicationStep) do(ctx context.Context, ep EndpointPair) FSReplicationStepState {
|
func (s *FSReplicationStep) do(ctx context.Context, ep EndpointPair) FSReplicationStepState {
|
||||||
|
@ -46,7 +46,7 @@ func filesystemReplicationReportFromQueueItem(qitem *replicationQueueItem) *File
|
|||||||
}
|
}
|
||||||
|
|
||||||
if fsr.state&FSPermanentError != 0 {
|
if fsr.state&FSPermanentError != 0 {
|
||||||
rep.Problem = fsr.permanentError.Error()
|
rep.Problem = fsr.err.Error()
|
||||||
return &rep
|
return &rep
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -54,8 +54,8 @@ func filesystemReplicationReportFromQueueItem(qitem *replicationQueueItem) *File
|
|||||||
for _, step := range fsr.completed {
|
for _, step := range fsr.completed {
|
||||||
rep.Steps = append(rep.Steps, stepReportFromStep(step))
|
rep.Steps = append(rep.Steps, stepReportFromStep(step))
|
||||||
}
|
}
|
||||||
if fsr.active != nil {
|
if fsr.current != nil {
|
||||||
rep.Steps = append(rep.Steps, stepReportFromStep(fsr.active))
|
rep.Steps = append(rep.Steps, stepReportFromStep(fsr.current))
|
||||||
}
|
}
|
||||||
for _, step := range fsr.pending {
|
for _, step := range fsr.pending {
|
||||||
rep.Steps = append(rep.Steps, stepReportFromStep(step))
|
rep.Steps = append(rep.Steps, stepReportFromStep(step))
|
||||||
|
Loading…
Reference in New Issue
Block a user