mirror of
https://github.com/zrepl/zrepl.git
synced 2025-04-10 19:38:16 +02:00
Revert "daemon/job/active: push mode: awful hack for handling of concurrent snapshots + stale remote operation"
This reverts commit aeb87ffbcf
.
This commit is contained in:
parent
53ac853cb4
commit
82f0060eec
@ -238,11 +238,11 @@ outer:
|
|||||||
}
|
}
|
||||||
invocationCount++
|
invocationCount++
|
||||||
invLog := log.WithField("invocation", invocationCount)
|
invLog := log.WithField("invocation", invocationCount)
|
||||||
j.do(WithLogger(ctx, invLog), periodicDone)
|
j.do(WithLogger(ctx, invLog))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (j *ActiveSide) do(ctx context.Context, periodicWakeup <-chan struct{}) {
|
func (j *ActiveSide) do(ctx context.Context) {
|
||||||
|
|
||||||
log := GetLogger(ctx)
|
log := GetLogger(ctx)
|
||||||
ctx = logging.WithSubsystemLoggers(ctx, log)
|
ctx = logging.WithSubsystemLoggers(ctx, log)
|
||||||
@ -277,129 +277,14 @@ func (j *ActiveSide) do(ctx context.Context, periodicWakeup <-chan struct{}) {
|
|||||||
})
|
})
|
||||||
|
|
||||||
log.Info("start replication")
|
log.Info("start replication")
|
||||||
replicationDone := make(chan struct{})
|
tasks.replication.Drive(ctx, sender, receiver)
|
||||||
replicationCtx, replicationCancel := context.WithCancel(ctx)
|
|
||||||
defer replicationCancel()
|
|
||||||
go func() {
|
|
||||||
tasks.replication.Drive(replicationCtx, sender, receiver)
|
|
||||||
close(replicationDone)
|
|
||||||
}()
|
|
||||||
outer:
|
|
||||||
for {
|
|
||||||
select {
|
|
||||||
case <-replicationDone:
|
|
||||||
// fine!
|
|
||||||
break outer
|
|
||||||
case <-periodicWakeup:
|
|
||||||
// Replication took longer than the periodic interval.
|
|
||||||
//
|
|
||||||
// For pull jobs, this isn't so bad because nothing changes on the active side
|
|
||||||
// if replication doesn't go forward.
|
|
||||||
//
|
|
||||||
// For push jobs, this means snapshots were taken.
|
|
||||||
// We need to invoke the pruner now, because otherwise an infinitely stuck replication
|
|
||||||
// will cause this side to fill up with snapshots.
|
|
||||||
//
|
|
||||||
// However, there are cases where replication progresses and just takes longer,
|
|
||||||
// and we don't want these situations be interrupted by a prune, which will require
|
|
||||||
// re-planning and starting over (think of initial replication as an example).
|
|
||||||
//
|
|
||||||
// Therefore, we prohibit pruning of snapshots that are part of the current replication plan.
|
|
||||||
// If there is no such plan, we kill the replication.
|
|
||||||
|
|
||||||
if j.mode.Type() == TypePush {
|
tasks = j.updateTasks(func(tasks *activeSideTasks) {
|
||||||
|
tasks.prunerSender = j.prunerFactory.BuildSenderPruner(ctx, sender, sender)
|
||||||
rep := tasks.replication.Report()
|
tasks.prunerReceiver = j.prunerFactory.BuildReceiverPruner(ctx, receiver, sender)
|
||||||
state, err := replication.StateString(rep.Status)
|
})
|
||||||
if err != nil {
|
|
||||||
panic(err)
|
|
||||||
}
|
|
||||||
|
|
||||||
switch state {
|
|
||||||
case replication.Planning:
|
|
||||||
fallthrough
|
|
||||||
case replication.PlanningError:
|
|
||||||
fallthrough
|
|
||||||
case replication.WorkingWait:
|
|
||||||
log.WithField("repl_state", state.String()).
|
|
||||||
Info("cancelling replication after new snapshots invalidated its current state")
|
|
||||||
replicationCancel()
|
|
||||||
log.Info("waiting for replication to stop")
|
|
||||||
<-replicationDone // no need to wait for ctx.Done, replication is already bound to global cancel
|
|
||||||
break outer
|
|
||||||
default:
|
|
||||||
log.WithField("repl_state", state.String()).
|
|
||||||
Warn("new snapshots while replication is running and making progress")
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
var pruningWg sync.WaitGroup
|
|
||||||
log.Info("start pruning sender")
|
log.Info("start pruning sender")
|
||||||
pruningWg.Add(1)
|
tasks.prunerSender.Prune()
|
||||||
go func() {
|
|
||||||
defer pruningWg.Done()
|
|
||||||
tasks := j.updateTasks(func(tasks *activeSideTasks) {
|
|
||||||
tasks.prunerSender = j.prunerFactory.BuildSenderPruner(ctx, sender, sender)
|
|
||||||
})
|
|
||||||
tasks.prunerSender.Prune()
|
|
||||||
// FIXME no need to do the cancellation dance with sender, we know it's local for push
|
|
||||||
// FIXME and we don't worry about pull ATM
|
|
||||||
}()
|
|
||||||
log.Info("start pruning receiver")
|
log.Info("start pruning receiver")
|
||||||
pruningWg.Add(1)
|
tasks.prunerReceiver.Prune()
|
||||||
go func() {
|
|
||||||
defer pruningWg.Done()
|
|
||||||
|
|
||||||
receiverPrunerCtx, receiverPrunerCancel := context.WithCancel(ctx)
|
|
||||||
defer receiverPrunerCancel()
|
|
||||||
tasks := j.updateTasks(func(tasks *activeSideTasks) {
|
|
||||||
tasks.prunerReceiver = j.prunerFactory.BuildReceiverPruner(receiverPrunerCtx, receiver, sender)
|
|
||||||
})
|
|
||||||
receiverPrunerDone := make(chan struct{})
|
|
||||||
go func() {
|
|
||||||
defer close(receiverPrunerDone)
|
|
||||||
tasks.prunerReceiver.Prune()
|
|
||||||
}()
|
|
||||||
|
|
||||||
outer:
|
|
||||||
for {
|
|
||||||
select {
|
|
||||||
case <-receiverPrunerDone:
|
|
||||||
// fine!
|
|
||||||
break outer
|
|
||||||
case <-periodicWakeup:
|
|
||||||
// see comments for similar apporach with replication above
|
|
||||||
if j.mode.Type() == TypePush {
|
|
||||||
rep := tasks.prunerReceiver.Report()
|
|
||||||
state, err := pruner.StateString(rep.State)
|
|
||||||
if err != nil {
|
|
||||||
panic(err)
|
|
||||||
}
|
|
||||||
switch state {
|
|
||||||
case pruner.PlanWait:
|
|
||||||
fallthrough
|
|
||||||
case pruner.ExecWait:
|
|
||||||
log.WithField("pruner_state", state.String()).
|
|
||||||
Info("cancelling failing prune on receiver because new snapshots were taken on sender")
|
|
||||||
receiverPrunerCancel()
|
|
||||||
log.Info("waiting for receiver pruner to stop")
|
|
||||||
<-receiverPrunerDone
|
|
||||||
break outer
|
|
||||||
default:
|
|
||||||
log.WithField("pruner_state", state.String()).
|
|
||||||
Warn("new snapshots while prune on receiver is still running")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}()
|
|
||||||
|
|
||||||
pruningWg.Wait() // if pruners handle ctx cancellation correctly, we don't need to wait for it here
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user