mirror of https://github.com/zrepl/zrepl.git (synced 2025-06-19 17:27:46 +02:00)

WIP state-machine based replication

This commit is contained in:
  parent c1f3076eb3
  commit 7303d91abf
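The commit replaces the ticker-driven replication loop with a state machine: a Replication value moves through Planning, PlanningError, Working, WorkingWait, Completed and ContextDone, and each filesystem carries its own FSReplication/FSReplicationStep states (see cmd/replication.v2/plan.go below). For orientation, a minimal standalone sketch of the driver pattern with toy states; this is an illustration, not the zrepl types:

package main

import "fmt"

type state int

const (
	planning state = iota
	working
	completed
)

func (s state) String() string {
	return [...]string{"planning", "working", "completed"}[s]
}

// drive performs one transition at a time until a terminal state is reached,
// mirroring the shape of Replication.Drive/doDrive in the diff below.
func drive(s state) {
	for s != completed {
		pre := s
		switch s {
		case planning:
			s = working // pretend planning always succeeds
		case working:
			s = completed // pretend all steps are done
		}
		fmt.Printf("transition: %s => %s\n", pre, s)
	}
}

func main() {
	drive(planning)
}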
@@ -8,7 +8,7 @@ import (
 	"github.com/pkg/errors"
 	"github.com/zrepl/zrepl/zfs"
 	"sync"
-	"github.com/zrepl/zrepl/cmd/replication"
+	"github.com/zrepl/zrepl/cmd/replication.v2"
 )

 type LocalJob struct {
@@ -146,7 +146,7 @@ outer:
 	j.mainTask.Log().Debug("replicating from lhs to rhs")
 	j.mainTask.Enter("replicate")

-	replication.Replicate(ctx, replication.NewEndpointPairPull(sender, receiver))
+	replication.Replicate(ctx, replication.NewEndpointPairPull(sender, receiver), nil) // FIXME

 	j.mainTask.Finish()

@@ -1,6 +1,7 @@
 package cmd

 import (
+	"net"
 	"os"
 	"os/signal"
 	"syscall"
@@ -12,7 +13,7 @@ import (
 	"github.com/mitchellh/mapstructure"
 	"github.com/pkg/errors"
 	"github.com/problame/go-streamrpc"
-	"github.com/zrepl/zrepl/cmd/replication"
+	"github.com/zrepl/zrepl/cmd/replication.v2"
 )

 type PullJob struct {
@@ -165,7 +166,10 @@ func (j *PullJob) doRun(ctx context.Context) {
 		ConnConfig: STREAMRPC_CONFIG,
 	}

-	client, err := streamrpc.NewClient(j.Connect, clientConf)
+	//client, err := streamrpc.NewClient(j.Connect, clientConf)
+	client, err := streamrpc.NewClient(&tcpConnecter{net.Dialer{
+		Timeout: 10 * time.Second,
+	}}, clientConf)
 	defer client.Close()

 	j.task.Enter("pull")
@@ -182,10 +186,26 @@ func (j *PullJob) doRun(ctx context.Context) {
 		return
 	}

+	usr2 := make(chan os.Signal)
+	defer close(usr2)
+	signal.Notify(usr2, syscall.SIGUSR2)
+	defer signal.Stop(usr2)
+	retryNow := make(chan struct{}, 1) // buffered so we don't leak the goroutine
+	go func() {
+		for {
+			sig := <-usr2
+			if sig != nil {
+				retryNow <- struct{}{}
+			} else {
+				break
+			}
+		}
+	}()
+
 	ctx = replication.ContextWithLogger(ctx, replicationLogAdaptor{j.task.Log().WithField("subsystem", "replication")})
 	ctx = streamrpc.ContextWithLogger(ctx, streamrpcLogAdaptor{j.task.Log().WithField("subsystem", "rpc.protocol")})
 	ctx = context.WithValue(ctx, contextKeyLog, j.task.Log().WithField("subsystem", "rpc.endpoint"))
-	replication.Replicate(ctx, replication.NewEndpointPairPull(sender, puller))
+	replication.Replicate(ctx, replication.NewEndpointPairPull(sender, puller), retryNow)

 	client.Close()
 	j.task.Finish()
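The hunk above wires SIGUSR2 into a buffered retryNow channel that the new Replicate signature consumes, so an operator can wake a waiting replication early. A standalone sketch of the same signal-to-channel pattern (toy program, names and durations are illustrative, not zrepl code); run it and send kill -USR2 <pid> to trigger the early wake-up:

package main

import (
	"fmt"
	"os"
	"os/signal"
	"syscall"
	"time"
)

func main() {
	usr2 := make(chan os.Signal, 1)
	signal.Notify(usr2, syscall.SIGUSR2)
	defer signal.Stop(usr2)

	// Buffered, so the forwarding goroutine never blocks if nobody is
	// currently waiting for a retry.
	retryNow := make(chan struct{}, 1)
	go func() {
		for range usr2 {
			select {
			case retryNow <- struct{}{}:
			default: // a retry is already pending
			}
		}
	}()

	fmt.Println("pid", os.Getpid(), "- send SIGUSR2 to trigger an early retry")
	select {
	case <-retryNow:
		fmt.Println("retrying now (woken by SIGUSR2)")
	case <-time.After(10 * time.Second):
		fmt.Println("retry timer fired")
	}
}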
@@ -146,7 +146,9 @@ func (j *SourceJob) Pruner(task *Task, side PrunePolicySide, dryRun bool) (p Pru

 func (j *SourceJob) serve(ctx context.Context, task *Task) {

-	listener, err := j.Serve.Listen()
+	//listener, err := j.Serve.Listen()
+
+	listener, err := net.Listen("tcp", ":8888")
 	if err != nil {
 		task.Log().WithError(err).Error("error listening")
 		return
@@ -2,7 +2,7 @@ package cmd

 import (
 	"fmt"
-	"github.com/zrepl/zrepl/cmd/replication"
+	"github.com/zrepl/zrepl/cmd/replication.v2"
 	"github.com/problame/go-streamrpc"
 	"github.com/zrepl/zrepl/zfs"
 	"io"
new file: cmd/replication.v2/fsreplicationstate_string.go (32 lines)
@@ -0,0 +1,32 @@
// Code generated by "stringer -type=FSReplicationState"; DO NOT EDIT.

package replication

import "strconv"

const (
	_FSReplicationState_name_0 = "FSQueuedFSActive"
	_FSReplicationState_name_1 = "FSRetry"
	_FSReplicationState_name_2 = "FSPermanentError"
	_FSReplicationState_name_3 = "FSCompleted"
)

var (
	_FSReplicationState_index_0 = [...]uint8{0, 8, 16}
)

func (i FSReplicationState) String() string {
	switch {
	case 1 <= i && i <= 2:
		i -= 1
		return _FSReplicationState_name_0[_FSReplicationState_index_0[i]:_FSReplicationState_index_0[i+1]]
	case i == 4:
		return _FSReplicationState_name_1
	case i == 8:
		return _FSReplicationState_name_2
	case i == 16:
		return _FSReplicationState_name_3
	default:
		return "FSReplicationState(" + strconv.FormatInt(int64(i), 10) + ")"
	}
}
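The values 1, 2, 4, 8 and 16 handled by this generated String method reflect that FSReplicationState is declared as bit flags (FSQueued FSReplicationState = 1 << iota in plan.go further down), which lets callers test several states with a single mask, as doDrive does with fsState&(FSPermanentError|FSCompleted) != 0. A standalone illustration of that mask test (lowercase toy names, not the package's identifiers):

package main

import "fmt"

type fsState int

// Same flag layout as FSReplicationState in plan.go: each state is one bit.
const (
	fsQueued         fsState = 1 << iota // 1
	fsActive                             // 2
	fsRetry                              // 4
	fsPermanentError                     // 8
	fsCompleted                          // 16
)

// terminal reports whether a filesystem replication needs no further work.
func terminal(s fsState) bool {
	return s&(fsPermanentError|fsCompleted) != 0
}

func main() {
	for _, s := range []fsState{fsQueued, fsRetry, fsPermanentError, fsCompleted} {
		fmt.Printf("state=%2d terminal=%v\n", s, terminal(s))
	}
}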
new file: cmd/replication.v2/fsreplicationstepstate_string.go (16 lines)
@@ -0,0 +1,16 @@
// Code generated by "stringer -type=FSReplicationStepState"; DO NOT EDIT.

package replication

import "strconv"

const _FSReplicationStepState_name = "StepPendingStepActiveStepRetryStepPermanentErrorStepCompleted"

var _FSReplicationStepState_index = [...]uint8{0, 11, 21, 30, 48, 61}

func (i FSReplicationStepState) String() string {
	if i < 0 || i >= FSReplicationStepState(len(_FSReplicationStepState_index)-1) {
		return "FSReplicationStepState(" + strconv.FormatInt(int64(i), 10) + ")"
	}
	return _FSReplicationStepState_name[_FSReplicationStepState_index[i]:_FSReplicationStepState_index[i+1]]
}
new file: cmd/replication.v2/plan.go (474 lines)
@@ -0,0 +1,474 @@
package replication

import (
	"context"
	"errors"
	"fmt"
	"io"
	"net"
	"sort"
	"time"
)

//go:generate stringer -type=ReplicationState
type ReplicationState int

const (
	Planning ReplicationState = iota
	PlanningError
	Working
	WorkingWait
	Completed
	ContextDone
)

type Replication struct {
	state ReplicationState

	// Working / WorkingWait

	pending, completed []*FSReplication

	// PlanningError
	planningError error

	// ContextDone
	contextError error
}

type FSReplicationState int

//go:generate stringer -type=FSReplicationState
const (
	FSQueued FSReplicationState = 1 << iota
	FSActive
	FSRetry
	FSPermanentError
	FSCompleted
)

type FSReplication struct {
	state              FSReplicationState
	fs                 *Filesystem
	permanentError     error
	retryAt            time.Time
	completed, pending []*FSReplicationStep
}

func newFSReplicationPermanentError(fs *Filesystem, err error) *FSReplication {
	return &FSReplication{
		state:          FSPermanentError,
		fs:             fs,
		permanentError: err,
	}
}

type FSReplicationBuilder struct {
	r     *FSReplication
	steps []*FSReplicationStep
}

func buildNewFSReplication(fs *Filesystem) *FSReplicationBuilder {
	return &FSReplicationBuilder{
		r: &FSReplication{
			fs:      fs,
			pending: make([]*FSReplicationStep, 0),
		},
	}
}

func (b *FSReplicationBuilder) AddStep(from, to *FilesystemVersion) *FSReplication {
	step := &FSReplicationStep{
		state: StepPending,
		fsrep: b.r,
		from:  from,
		to:    to,
	}
	b.r.pending = append(b.r.pending, step)
	return b.r
}

func (b *FSReplicationBuilder) Complete() *FSReplication {
	if len(b.r.pending) > 0 {
		b.r.state = FSQueued
	} else {
		b.r.state = FSCompleted
	}
	r := b.r
	return r
}

//go:generate stringer -type=FSReplicationStepState
type FSReplicationStepState int

const (
	StepPending FSReplicationStepState = iota
	StepActive
	StepRetry
	StepPermanentError
	StepCompleted
)

type FSReplicationStep struct {
	state    FSReplicationStepState
	from, to *FilesystemVersion
	fsrep    *FSReplication

	// both retry and permanent error
	err error
}

func (r *Replication) Drive(ctx context.Context, ep EndpointPair, retryNow chan struct{}) {
	for !(r.state == Completed || r.state == ContextDone) {
		pre := r.state
		preTime := time.Now()
		r.doDrive(ctx, ep, retryNow)
		delta := time.Now().Sub(preTime)
		post := r.state
		getLogger(ctx).
			WithField("transition", fmt.Sprintf("%s => %s", pre, post)).
			WithField("duration", delta).
			Debug("state transition")
	}
}

func (r *Replication) doDrive(ctx context.Context, ep EndpointPair, retryNow chan struct{}) {

	switch r.state {

	case Planning:
		r.tryBuildPlan(ctx, ep)

	case PlanningError:
		w := time.NewTimer(10 * time.Second) // FIXME constant make configurable
		defer w.Stop()
		select {
		case <-ctx.Done():
			r.state = ContextDone
			r.contextError = ctx.Err()
		case <-retryNow:
			r.state = Planning
			r.planningError = nil
		case <-w.C:
			r.state = Planning
			r.planningError = nil
		}

	case Working:

		if len(r.pending) == 0 {
			r.state = Completed
			return
		}

		sort.Slice(r.pending, func(i, j int) bool {
			a, b := r.pending[i], r.pending[j]
			statePrio := func(x *FSReplication) int {
				if !(x.state == FSQueued || x.state == FSRetry) {
					panic(x)
				}
				if x.state == FSQueued {
					return 0
				} else {
					return 1
				}
			}
			aprio, bprio := statePrio(a), statePrio(b)
			if aprio != bprio {
				return aprio < bprio
			}
			// now we know they are the same state
			if a.state == FSQueued {
				return a.nextStepDate().Before(b.nextStepDate())
			}
			if a.state == FSRetry {
				return a.retryAt.Before(b.retryAt)
			}
			panic("should not be reached")
		})

		fsrep := r.pending[0]

		if fsrep.state == FSRetry {
			r.state = WorkingWait
			return
		}
		if fsrep.state != FSQueued {
			panic(fsrep)
		}

		fsState := fsrep.takeStep(ctx, ep)
		if fsState&(FSPermanentError|FSCompleted) != 0 {
			r.pending = r.pending[1:]
			r.completed = append(r.completed, fsrep)
		}

	case WorkingWait:
		fsrep := r.pending[0]
		w := time.NewTimer(fsrep.retryAt.Sub(time.Now()))
		defer w.Stop()
		select {
		case <-ctx.Done():
			r.state = ContextDone
			r.contextError = ctx.Err()
		case <-retryNow:
			for _, fsr := range r.pending {
				fsr.retryNow()
			}
			r.state = Working
		case <-w.C:
			fsrep.retryNow() // avoid timer jitter
			r.state = Working
		}
	}
}
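Both wait states above (PlanningError and WorkingWait) block on the same three-way select: context cancellation, the external retryNow channel, or a timer. A standalone sketch of that wait helper; the helper name and durations are illustrative, not from the diff:

package main

import (
	"context"
	"fmt"
	"time"
)

// waitRetry returns true if work should be retried (timer fired or an
// external retry was requested) and false if the context was cancelled.
func waitRetry(ctx context.Context, d time.Duration, retryNow <-chan struct{}) bool {
	w := time.NewTimer(d)
	defer w.Stop()
	select {
	case <-ctx.Done():
		return false
	case <-retryNow:
		return true
	case <-w.C:
		return true
	}
}

func main() {
	retryNow := make(chan struct{}, 1)
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	go func() { retryNow <- struct{}{} }() // simulate SIGUSR2 / manual wake-up
	fmt.Println("retry?", waitRetry(ctx, 10*time.Second, retryNow))

	cancel()
	fmt.Println("retry?", waitRetry(ctx, 10*time.Second, retryNow))
}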
func (r *Replication) tryBuildPlan(ctx context.Context, ep EndpointPair) ReplicationState {

	log := getLogger(ctx)

	planningError := func(err error) ReplicationState {
		r.state = PlanningError
		r.planningError = err
		return r.state
	}
	done := func() ReplicationState {
		r.state = Working
		r.planningError = nil
		return r.state
	}

	sfss, err := ep.Sender().ListFilesystems(ctx)
	if err != nil {
		log.WithError(err).Error("error listing sender filesystems")
		return planningError(err)
	}

	rfss, err := ep.Receiver().ListFilesystems(ctx)
	if err != nil {
		log.WithError(err).Error("error listing receiver filesystems")
		return planningError(err)
	}

	r.pending = make([]*FSReplication, 0, len(sfss))
	r.completed = make([]*FSReplication, 0, len(sfss))
	mainlog := log
	for _, fs := range sfss {

		log := mainlog.WithField("filesystem", fs.Path)

		log.Info("assessing filesystem")

		sfsvs, err := ep.Sender().ListFilesystemVersions(ctx, fs.Path)
		if err != nil {
			log.WithError(err).Error("cannot get remote filesystem versions")
			return planningError(err)
		}

		if len(sfsvs) <= 1 {
			err := errors.New("sender does not have any versions")
			log.Error(err.Error())
			r.completed = append(r.completed, newFSReplicationPermanentError(fs, err))
			continue
		}

		receiverFSExists := false
		for _, rfs := range rfss {
			if rfs.Path == fs.Path {
				receiverFSExists = true
			}
		}

		var rfsvs []*FilesystemVersion
		if receiverFSExists {
			rfsvs, err = ep.Receiver().ListFilesystemVersions(ctx, fs.Path)
			if err != nil {
				if _, ok := err.(FilteredError); ok {
					log.Info("receiver ignores filesystem")
					continue
				}
				log.WithError(err).Error("receiver error")
				return planningError(err)
			}
		} else {
			rfsvs = []*FilesystemVersion{}
		}

		path, conflict := IncrementalPath(rfsvs, sfsvs)
		if conflict != nil {
			var msg string
			path, msg = resolveConflict(conflict) // no shadowing allowed!
			if path != nil {
				log.WithField("conflict", conflict).Info("conflict")
				log.WithField("resolution", msg).Info("automatically resolved")
			} else {
				log.WithField("conflict", conflict).Error("conflict")
				log.WithField("problem", msg).Error("cannot resolve conflict")
			}
		}
		if path == nil {
			r.completed = append(r.completed, newFSReplicationPermanentError(fs, conflict))
			continue
		}

		fsreplbuilder := buildNewFSReplication(fs)
		if len(path) == 1 {
			fsreplbuilder.AddStep(nil, path[0])
		} else {
			for i := 0; i < len(path)-1; i++ {
				fsreplbuilder.AddStep(path[i], path[i+1])
			}
		}
		fsrepl := fsreplbuilder.Complete()
		switch fsrepl.state {
		case FSCompleted:
			r.completed = append(r.completed, fsreplbuilder.Complete())
		case FSQueued:
			r.pending = append(r.pending, fsreplbuilder.Complete())
		default:
			panic(fsrepl)
		}

	}

	return done()
}

func (f *FSReplication) nextStepDate() time.Time {
	if f.state != FSQueued {
		panic(f)
	}
	ct, err := f.pending[0].to.CreationAsTime()
	if err != nil {
		panic(err) // FIXME
	}
	return ct
}

func (f *FSReplication) takeStep(ctx context.Context, ep EndpointPair) FSReplicationState {
	if f.state != FSQueued {
		panic(f)
	}

	f.state = FSActive
	step := f.pending[0]
	stepState := step.do(ctx, ep)

	switch stepState {
	case StepCompleted:
		f.pending = f.pending[1:]
		f.completed = append(f.completed, step)
		if len(f.pending) > 0 {
			f.state = FSQueued
		} else {
			f.state = FSCompleted
		}

	case StepRetry:
		f.state = FSRetry
		f.retryAt = time.Now().Add(10 * time.Second) // FIXME hardcoded constant

	case StepPermanentError:
		f.state = FSPermanentError

	}
	return f.state
}

func (f *FSReplication) retryNow() {
	if f.state != FSRetry {
		panic(f)
	}
	f.retryAt = time.Time{}
	f.state = FSQueued
}

func (s *FSReplicationStep) do(ctx context.Context, ep EndpointPair) FSReplicationStepState {

	fs := s.fsrep.fs

	log := getLogger(ctx).
		WithField("filesystem", fs.Path).
		WithField("step", s.String())

	updateStateError := func(err error) FSReplicationStepState {
		s.err = err
		switch err {
		case io.EOF: fallthrough
		case io.ErrUnexpectedEOF: fallthrough
		case io.ErrClosedPipe:
			return StepRetry
		}
		if _, ok := err.(net.Error); ok {
			return StepRetry
		}
		return StepPermanentError
	}

	updateStateCompleted := func() FSReplicationStepState {
		s.err = nil
		s.state = StepCompleted
		return s.state
	}

	// FIXME refresh fs resume token
	fs.ResumeToken = ""

	var sr *SendReq
	if fs.ResumeToken != "" {
		sr = &SendReq{
			Filesystem:  fs.Path,
			ResumeToken: fs.ResumeToken,
		}
	} else if s.from == nil {
		sr = &SendReq{
			Filesystem: fs.Path,
			From:       s.to.RelName(), // FIXME fix protocol to use To, like zfs does internally
		}
	} else {
		sr = &SendReq{
			Filesystem: fs.Path,
			From:       s.from.RelName(),
			To:         s.to.RelName(),
		}
	}

	log.WithField("request", sr).Debug("initiate send request")
	sres, sstream, err := ep.Sender().Send(ctx, sr)
	if err != nil {
		log.WithError(err).Error("send request failed")
		return updateStateError(err)
	}
	if sstream == nil {
		err := errors.New("send request did not return a stream, broken endpoint implementation")
		return updateStateError(err)
	}

	rr := &ReceiveReq{
		Filesystem:       fs.Path,
		ClearResumeToken: !sres.UsedResumeToken,
	}
	log.WithField("request", rr).Debug("initiate receive request")
	err = ep.Receiver().Receive(ctx, rr, sstream)
	if err != nil {
		log.WithError(err).Error("receive request failed (might also be error on sender)")
		sstream.Close()
		// This failure could be due to
		//  - an unexpected exit of ZFS on the sending side
		//  - an unexpected exit of ZFS on the receiving side
		//  - a connectivity issue
		return updateStateError(err)
	}
	log.Info("receive finished")
	return updateStateCompleted()

}

func (s *FSReplicationStep) String() string {
	if s.from == nil { // FIXME: ZFS semantics are that to is nil on non-incremental send
		return fmt.Sprintf("%s%s (full)", s.fsrep.fs.Path, s.to.RelName())
	} else {
		return fmt.Sprintf("%s(%s => %s)", s.fsrep.fs.Path, s.from.RelName(), s.to.RelName())
	}
}
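The updateStateError closure in FSReplicationStep.do classifies endpoint errors: io.EOF, io.ErrUnexpectedEOF, io.ErrClosedPipe and net.Error values map to StepRetry, everything else to StepPermanentError. A standalone version of that classification (the function name is illustrative, not from the diff):

package main

import (
	"errors"
	"fmt"
	"io"
	"net"
)

// retryable mirrors the classification in FSReplicationStep.do: connection
// and stream interruptions are worth retrying, anything else is permanent.
func retryable(err error) bool {
	switch err {
	case io.EOF, io.ErrUnexpectedEOF, io.ErrClosedPipe:
		return true
	}
	if _, ok := err.(net.Error); ok {
		return true
	}
	return false
}

func main() {
	fmt.Println(retryable(io.ErrUnexpectedEOF))                  // true
	fmt.Println(retryable(&net.OpError{Op: "read"}))             // true: *net.OpError implements net.Error
	fmt.Println(retryable(errors.New("zfs recv exited with 1"))) // false
}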
new file: cmd/replication.v2/replication.go (137 lines)
@@ -0,0 +1,137 @@
package replication

import (
	"context"
	"fmt"
	"github.com/zrepl/zrepl/logger"
	"io"
)

type ReplicationEndpoint interface {
	// Does not include placeholder filesystems
	ListFilesystems(ctx context.Context) ([]*Filesystem, error)
	ListFilesystemVersions(ctx context.Context, fs string) ([]*FilesystemVersion, error) // fix depS
	Send(ctx context.Context, r *SendReq) (*SendRes, io.ReadCloser, error)
	Receive(ctx context.Context, r *ReceiveReq, sendStream io.ReadCloser) error
}

type FilteredError struct{ fs string }

func NewFilteredError(fs string) FilteredError {
	return FilteredError{fs}
}

func (f FilteredError) Error() string { return "endpoint does not allow access to filesystem " + f.fs }

type ReplicationMode int

const (
	ReplicationModePull ReplicationMode = iota
	ReplicationModePush
)

type EndpointPair struct {
	a, b ReplicationEndpoint
	m    ReplicationMode
}

func NewEndpointPairPull(sender, receiver ReplicationEndpoint) EndpointPair {
	return EndpointPair{sender, receiver, ReplicationModePull}
}

func NewEndpointPairPush(sender, receiver ReplicationEndpoint) EndpointPair {
	return EndpointPair{receiver, sender, ReplicationModePush}
}

func (p EndpointPair) Sender() ReplicationEndpoint {
	switch p.m {
	case ReplicationModePull:
		return p.a
	case ReplicationModePush:
		return p.b
	}
	panic("should not be reached")
	return nil
}

func (p EndpointPair) Receiver() ReplicationEndpoint {
	switch p.m {
	case ReplicationModePull:
		return p.b
	case ReplicationModePush:
		return p.a
	}
	panic("should not be reached")
	return nil
}

func (p EndpointPair) Mode() ReplicationMode {
	return p.m
}

type contextKey int

const (
	contextKeyLog contextKey = iota
)

//type Logger interface {
//	Infof(fmt string, args ...interface{})
//	Errorf(fmt string, args ...interface{})
//}

//var _ Logger = nullLogger{}

//type nullLogger struct{}
//
//func (nullLogger) Infof(fmt string, args ...interface{}) {}
//func (nullLogger) Errorf(fmt string, args ...interface{}) {}

type Logger = logger.Logger

func ContextWithLogger(ctx context.Context, l Logger) context.Context {
	return context.WithValue(ctx, contextKeyLog, l)
}

func getLogger(ctx context.Context) Logger {
	l, ok := ctx.Value(contextKeyLog).(Logger)
	if !ok {
		l = logger.NewNullLogger()
	}
	return l
}

func resolveConflict(conflict error) (path []*FilesystemVersion, msg string) {
	if noCommonAncestor, ok := conflict.(*ConflictNoCommonAncestor); ok {
		if len(noCommonAncestor.SortedReceiverVersions) == 0 {
			// FIXME hard-coded replication policy: most recent
			// snapshot as source
			var mostRecentSnap *FilesystemVersion
			for n := len(noCommonAncestor.SortedSenderVersions) - 1; n >= 0; n-- {
				if noCommonAncestor.SortedSenderVersions[n].Type == FilesystemVersion_Snapshot {
					mostRecentSnap = noCommonAncestor.SortedSenderVersions[n]
					break
				}
			}
			if mostRecentSnap == nil {
				return nil, "no snapshots available on sender side"
			}
			return []*FilesystemVersion{mostRecentSnap}, fmt.Sprintf("start replication at most recent snapshot %s", mostRecentSnap.RelName())
		}
	}
	return nil, "no automated way to handle conflict type"
}

// Replicate replicates filesystems from ep.Sender() to ep.Receiver().
//
// All filesystems presented by the sending side are replicated,
// unless the receiver rejects a Receive request with a *FilteredError.
//
// If an error occurs when replicating a filesystem, that error is logged to the logger in ctx.
// Replicate continues with the replication of the remaining file systems.
// Depending on the type of error, failed replications are retried in an unspecified order (currently FIFO).
func Replicate(ctx context.Context, ep EndpointPair, retryNow chan struct{}) {
	r := Replication{}
	r.Drive(ctx, ep, retryNow)
}
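ContextWithLogger/getLogger above use the standard typed-context-key pattern with a null-logger fallback, so package code can always call getLogger(ctx) without nil checks. A standalone sketch of the pattern with a trivial logger type instead of zrepl/logger (names here are illustrative):

package main

import (
	"context"
	"fmt"
)

type logger interface{ Info(msg string) }

type printLogger struct{}

func (printLogger) Info(msg string) { fmt.Println("INFO:", msg) }

type nullLogger struct{}

func (nullLogger) Info(string) {}

// Unexported key type prevents collisions with other packages' context values.
type contextKey int

const contextKeyLog contextKey = 0

func withLogger(ctx context.Context, l logger) context.Context {
	return context.WithValue(ctx, contextKeyLog, l)
}

func getLogger(ctx context.Context) logger {
	if l, ok := ctx.Value(contextKeyLog).(logger); ok {
		return l
	}
	return nullLogger{} // never nil, callers need no checks
}

func main() {
	getLogger(context.Background()).Info("dropped silently")
	ctx := withLogger(context.Background(), printLogger{})
	getLogger(ctx).Info("visible")
}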
@@ -83,18 +83,18 @@ func (m *MockIncrementalPathRecorder) Finished() bool {
 //
 //}

-type testLog struct {
-	t *testing.T
-}
-
-var _ replication.Logger = testLog{}
-
-func (t testLog) Infof(fmt string, args ...interface{}) {
-	t.t.Logf(fmt, args)
-}
-func (t testLog) Errorf(fmt string, args ...interface{}) {
-	t.t.Logf(fmt, args)
-}
+//type testLog struct {
+//	t *testing.T
+//}
+//
+//var _ replication.Logger = testLog{}
+//
+//func (t testLog) Infof(fmt string, args ...interface{}) {
+//	t.t.Logf(fmt, args)
+//}
+//func (t testLog) Errorf(fmt string, args ...interface{}) {
+//	t.t.Logf(fmt, args)
+//}


 //func TestIncrementalPathReplicator_Replicate(t *testing.T) {
new file: cmd/replication.v2/replicationstate_string.go (16 lines)
@@ -0,0 +1,16 @@
// Code generated by "stringer -type=ReplicationState"; DO NOT EDIT.

package replication

import "strconv"

const _ReplicationState_name = "PlanningPlanningErrorWorkingWorkingWaitCompletedContextDone"

var _ReplicationState_index = [...]uint8{0, 8, 21, 28, 39, 48, 59}

func (i ReplicationState) String() string {
	if i < 0 || i >= ReplicationState(len(_ReplicationState_index)-1) {
		return "ReplicationState(" + strconv.FormatInt(int64(i), 10) + ")"
	}
	return _ReplicationState_name[_ReplicationState_index[i]:_ReplicationState_index[i+1]]
}
deleted file (472 lines):
@@ -1,472 +0,0 @@
package replication

import (
	"context"
	"fmt"
	"github.com/zrepl/zrepl/logger"
	"io"
	"net"
	"sort"
	"time"
)

type ReplicationEndpoint interface {
	// Does not include placeholder filesystems
	ListFilesystems(ctx context.Context) ([]*Filesystem, error)
	ListFilesystemVersions(ctx context.Context, fs string) ([]*FilesystemVersion, error) // fix depS
	Send(ctx context.Context, r *SendReq) (*SendRes, io.ReadCloser, error)
	Receive(ctx context.Context, r *ReceiveReq, sendStream io.ReadCloser) error
}

type FilteredError struct{ fs string }

func NewFilteredError(fs string) FilteredError {
	return FilteredError{fs}
}

func (f FilteredError) Error() string { return "endpoint does not allow access to filesystem " + f.fs }

type ReplicationMode int

const (
	ReplicationModePull ReplicationMode = iota
	ReplicationModePush
)

type EndpointPair struct {
	a, b ReplicationEndpoint
	m    ReplicationMode
}

func NewEndpointPairPull(sender, receiver ReplicationEndpoint) EndpointPair {
	return EndpointPair{sender, receiver, ReplicationModePull}
}

func NewEndpointPairPush(sender, receiver ReplicationEndpoint) EndpointPair {
	return EndpointPair{receiver, sender, ReplicationModePush}
}

func (p EndpointPair) Sender() ReplicationEndpoint {
	switch p.m {
	case ReplicationModePull:
		return p.a
	case ReplicationModePush:
		return p.b
	}
	panic("should not be reached")
	return nil
}

func (p EndpointPair) Receiver() ReplicationEndpoint {
	switch p.m {
	case ReplicationModePull:
		return p.b
	case ReplicationModePush:
		return p.a
	}
	panic("should not be reached")
	return nil
}

func (p EndpointPair) Mode() ReplicationMode {
	return p.m
}

type contextKey int

const (
	contextKeyLog contextKey = iota
)

//type Logger interface {
//	Infof(fmt string, args ...interface{})
//	Errorf(fmt string, args ...interface{})
//}

//var _ Logger = nullLogger{}

//type nullLogger struct{}
//
//func (nullLogger) Infof(fmt string, args ...interface{}) {}
//func (nullLogger) Errorf(fmt string, args ...interface{}) {}

type Logger = logger.Logger

func ContextWithLogger(ctx context.Context, l Logger) context.Context {
	return context.WithValue(ctx, contextKeyLog, l)
}

func getLogger(ctx context.Context) Logger {
	l, ok := ctx.Value(contextKeyLog).(Logger)
	if !ok {
		l = logger.NewNullLogger()
	}
	return l
}

type replicationStep struct {
	from, to *FilesystemVersion
	fswork   *replicateFSWork
}

func (s *replicationStep) String() string {
	if s.from == nil { // FIXME: ZFS semantics are that to is nil on non-incremental send
		return fmt.Sprintf("%s%s (full)", s.fswork.fs.Path, s.to.RelName())
	} else {
		return fmt.Sprintf("%s(%s => %s)", s.fswork.fs.Path, s.from.RelName(), s.to.RelName())
	}
}

func newReplicationStep(from, to *FilesystemVersion) *replicationStep {
	return &replicationStep{from: from, to: to}
}

type replicateFSWork struct {
	fs          *Filesystem
	steps       []*replicationStep
	currentStep int
	errorCount  int
}

func newReplicateFSWork(fs *Filesystem) *replicateFSWork {
	if fs == nil {
		panic("implementation error")
	}
	return &replicateFSWork{
		fs:    fs,
		steps: make([]*replicationStep, 0),
	}
}

func newReplicateFSWorkWithConflict(fs *Filesystem, conflict error) *replicateFSWork {
	// FIXME ignore conflict for now, but will be useful later when we make the replicationPlan exportable
	return &replicateFSWork{
		fs:    fs,
		steps: make([]*replicationStep, 0),
	}
}

func (r *replicateFSWork) AddStep(step *replicationStep) {
	if step == nil {
		panic("implementation error")
	}
	if step.fswork != nil {
		panic("implementation error")
	}
	step.fswork = r
	r.steps = append(r.steps, step)
}

func (w *replicateFSWork) CurrentStepDate() time.Time {
	if len(w.steps) == 0 {
		return time.Time{}
	}
	toTime, err := w.steps[w.currentStep].to.CreationAsTime()
	if err != nil {
		panic(err) // implementation inconsistent: should not admit invalid FilesystemVersion objects
	}
	return toTime
}

func (w *replicateFSWork) CurrentStep() *replicationStep {
	if w.currentStep >= len(w.steps) {
		return nil
	}
	return w.steps[w.currentStep]
}

func (w *replicateFSWork) CompleteStep() {
	w.currentStep++
}

type replicationPlan struct {
	fsws []*replicateFSWork
}

func newReplicationPlan() *replicationPlan {
	return &replicationPlan{
		fsws: make([]*replicateFSWork, 0),
	}
}

func (p *replicationPlan) addWork(work *replicateFSWork) {
	p.fsws = append(p.fsws, work)
}

func (p *replicationPlan) executeOldestFirst(ctx context.Context, doStep func(fs *Filesystem, from, to *FilesystemVersion) tryRes) {
	log := getLogger(ctx)

	for {
		select {
		case <-ctx.Done():
			log.WithError(ctx.Err()).Info("aborting replication due to context error")
			return
		default:
		}

		// FIXME poor man's nested priority queue
		pending := make([]*replicateFSWork, 0, len(p.fsws))
		for _, fsw := range p.fsws {
			if fsw.CurrentStep() != nil {
				pending = append(pending, fsw)
			}
		}
		sort.Slice(pending, func(i, j int) bool {
			if pending[i].errorCount == pending[j].errorCount {
				return pending[i].CurrentStepDate().Before(pending[j].CurrentStepDate())
			}
			return pending[i].errorCount < pending[j].errorCount
		})
		// pending is now sorted ascending by errorCount,CurrentStep().Creation

		if len(pending) == 0 {
			log.Info("replication complete")
			return
		}

		fsw := pending[0]
		step := fsw.CurrentStep()
		if step == nil {
			panic("implementation error")
		}

		log.WithField("step", step).Info("begin replication step")
		res := doStep(step.fswork.fs, step.from, step.to)

		if res.done {
			log.Info("replication step successful")
			fsw.errorCount = 0
			fsw.CompleteStep()
		} else {
			log.Error("replication step failed, queuing for retry result")
			fsw.errorCount++
		}

	}

}

func resolveConflict(conflict error) (path []*FilesystemVersion, msg string) {
	if noCommonAncestor, ok := conflict.(*ConflictNoCommonAncestor); ok {
		if len(noCommonAncestor.SortedReceiverVersions) == 0 {
			// FIXME hard-coded replication policy: most recent
			// snapshot as source
			var mostRecentSnap *FilesystemVersion
			for n := len(noCommonAncestor.SortedSenderVersions) - 1; n >= 0; n-- {
				if noCommonAncestor.SortedSenderVersions[n].Type == FilesystemVersion_Snapshot {
					mostRecentSnap = noCommonAncestor.SortedSenderVersions[n]
					break
				}
			}
			if mostRecentSnap == nil {
				return nil, "no snapshots available on sender side"
			}
			return []*FilesystemVersion{mostRecentSnap}, fmt.Sprintf("start replication at most recent snapshot %s", mostRecentSnap.RelName())
		}
	}
	return nil, "no automated way to handle conflict type"
}

// Replicate replicates filesystems from ep.Sender() to ep.Receiver().
//
// All filesystems presented by the sending side are replicated,
// unless the receiver rejects a Receive request with a *FilteredError.
//
// If an error occurs when replicating a filesystem, that error is logged to the logger in ctx.
// Replicate continues with the replication of the remaining file systems.
// Depending on the type of error, failed replications are retried in an unspecified order (currently FIFO).
func Replicate(ctx context.Context, ep EndpointPair) {

	log := getLogger(ctx)

	retryPlanTicker := time.NewTicker(15 * time.Second) // FIXME make configurable
	defer retryPlanTicker.Stop()

	var (
		plan *replicationPlan
		res  tryRes
	)
	for {
		log.Info("build replication plan")
		plan, res = tryBuildReplicationPlan(ctx, ep)
		if plan != nil {
			break
		}
		log.WithField("result", res).Error("building replication plan failed, wait for retry timer result")
		select {
		case <-ctx.Done():
			log.WithError(ctx.Err()).Info("aborting replication because context is done")
			return
		case <-retryPlanTicker.C:
			// TODO also accept an external channel that allows us to tick
		}
	}
	retryPlanTicker.Stop()

	mainlog := log
	plan.executeOldestFirst(ctx, func(fs *Filesystem, from, to *FilesystemVersion) tryRes {

		log := mainlog.WithField("filesystem", fs.Path)

		// FIXME refresh fs resume token
		fs.ResumeToken = ""

		var sr *SendReq
		if fs.ResumeToken != "" {
			sr = &SendReq{
				Filesystem:  fs.Path,
				ResumeToken: fs.ResumeToken,
			}
		} else if from == nil {
			sr = &SendReq{
				Filesystem: fs.Path,
				From:       to.RelName(), // FIXME fix protocol to use To, like zfs does internally
			}
		} else {
			sr = &SendReq{
				Filesystem: fs.Path,
				From:       from.RelName(),
				To:         to.RelName(),
			}
		}

		log.WithField("request", sr).Debug("initiate send request")
		sres, sstream, err := ep.Sender().Send(ctx, sr)
		if err != nil {
			log.WithError(err).Error("send request failed")
			return tryResFromEndpointError(err)
		}
		if sstream == nil {
			log.Error("send request did not return a stream, broken endpoint implementation")
			return tryRes{unfixable: true}
		}

		rr := &ReceiveReq{
			Filesystem:       fs.Path,
			ClearResumeToken: !sres.UsedResumeToken,
		}
		log.WithField("request", rr).Debug("initiate receive request")
		err = ep.Receiver().Receive(ctx, rr, sstream)
		if err != nil {
			log.WithError(err).Error("receive request failed (might also be error on sender)")
			sstream.Close()
			// This failure could be due to
			//  - an unexpected exit of ZFS on the sending side
			//  - an unexpected exit of ZFS on the receiving side
			//  - a connectivity issue
			return tryResFromEndpointError(err)
		}
		log.Info("receive finished")
		return tryRes{done: true}

	})

}

type tryRes struct {
	done      bool
	retry     bool
	unfixable bool
}

func tryResFromEndpointError(err error) tryRes {
	if _, ok := err.(net.Error); ok {
		return tryRes{retry: true}
	}
	return tryRes{unfixable: true}
}

func tryBuildReplicationPlan(ctx context.Context, ep EndpointPair) (*replicationPlan, tryRes) {

	log := getLogger(ctx)

	early := func(err error) (*replicationPlan, tryRes) {
		return nil, tryResFromEndpointError(err)
	}

	sfss, err := ep.Sender().ListFilesystems(ctx)
	if err != nil {
		log.WithError(err).Error("error listing sender filesystems")
		return early(err)
	}

	rfss, err := ep.Receiver().ListFilesystems(ctx)
	if err != nil {
		log.WithError(err).Error("error listing receiver filesystems")
		return early(err)
	}

	plan := newReplicationPlan()
	mainlog := log
	for _, fs := range sfss {

		log := mainlog.WithField("filesystem", fs.Path)

		log.Info("assessing filesystem")

		sfsvs, err := ep.Sender().ListFilesystemVersions(ctx, fs.Path)
		if err != nil {
			log.WithError(err).Error("cannot get remote filesystem versions")
			return early(err)
		}

		if len(sfsvs) <= 1 {
			log.Error("sender does not have any versions")
			return nil, tryRes{unfixable: true}
		}

		receiverFSExists := false
		for _, rfs := range rfss {
			if rfs.Path == fs.Path {
				receiverFSExists = true
			}
		}

		var rfsvs []*FilesystemVersion
		if receiverFSExists {
			rfsvs, err = ep.Receiver().ListFilesystemVersions(ctx, fs.Path)
			if err != nil {
				if _, ok := err.(FilteredError); ok {
					log.Info("receiver ignores filesystem")
					continue
				}
				log.WithError(err).Error("receiver error")
				return early(err)
			}
		} else {
			rfsvs = []*FilesystemVersion{}
		}

		path, conflict := IncrementalPath(rfsvs, sfsvs)
		if conflict != nil {
			var msg string
			path, msg = resolveConflict(conflict) // no shadowing allowed!
			if path != nil {
				log.WithField("conflict", conflict).Info("conflict")
				log.WithField("resolution", msg).Info("automatically resolved")
			} else {
				log.WithField("conflict", conflict).Error("conflict")
				log.WithField("problem", msg).Error("cannot resolve conflict")
			}
		}
		if path == nil {
			plan.addWork(newReplicateFSWorkWithConflict(fs, conflict))
			continue
		}

		w := newReplicateFSWork(fs)
		if len(path) == 1 {
			step := newReplicationStep(nil, path[0])
			w.AddStep(step)
		} else {
			for i := 0; i < len(path)-1; i++ {
				step := newReplicationStep(path[i], path[i+1])
				w.AddStep(step)
			}
		}
		plan.addWork(w)

	}

	return plan, tryRes{done: true}
}
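The removed executeOldestFirst picked the next unit of work by sorting pending filesystems on (errorCount, creation date of the next step); the new doDrive keeps the same two-key idea but keys on the FSReplication state first. A standalone sketch of that two-key sort.Slice ordering (toy work type, illustrative only):

package main

import (
	"fmt"
	"sort"
	"time"
)

type work struct {
	name       string
	errorCount int
	nextStep   time.Time
}

func main() {
	now := time.Now()
	pending := []work{
		{"tank/b", 1, now.Add(-2 * time.Hour)},
		{"tank/a", 0, now.Add(-1 * time.Hour)},
		{"tank/c", 0, now.Add(-3 * time.Hour)},
	}
	// Fewest errors first; within the same error count, oldest next step first.
	sort.Slice(pending, func(i, j int) bool {
		if pending[i].errorCount == pending[j].errorCount {
			return pending[i].nextStep.Before(pending[j].nextStep)
		}
		return pending[i].errorCount < pending[j].errorCount
	})
	for _, w := range pending {
		fmt.Println(w.name, w.errorCount)
	}
}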