replication: refactor driving logic (no more explicit state machine)

This commit is contained in:
Christian Schwarz 2019-02-22 11:40:27 +01:00
parent 0230c6321f
commit 07b43bffa4
33 changed files with 1594 additions and 1515 deletions

38
Gopkg.lock generated
View File

@ -161,6 +161,14 @@
revision = "3247c84500bff8d9fb6d579d800f20b3e091582c"
version = "v1.0.0"
[[projects]]
digest = "1:4ff67dde814694496d7aa31be44b900f9717a10c8bc9136b13f49c8ef97f439a"
name = "github.com/montanaflynn/stats"
packages = ["."]
pruneopts = ""
revision = "63fbb2597b7a13043b453a4b819945badb8f8926"
version = "v0.5.0"
[[projects]]
branch = "master"
digest = "1:f60ff065b58bd53e641112b38bbda9d2684deb828393c7ffb89c69a1ee301d17"
@ -245,6 +253,14 @@
pruneopts = ""
revision = "8b1c2da0d56deffdbb9e48d4414b4e674bd8083e"
[[projects]]
digest = "1:3962f553b77bf6c03fc07cd687a22dd3b00fe11aa14d31194f5505f5bb65cdc8"
name = "github.com/sergi/go-diff"
packages = ["diffmatchpatch"]
pruneopts = ""
revision = "1744e2970ca51c86172c8190fadad617561ed6e7"
version = "v1.0.0"
[[projects]]
branch = "master"
digest = "1:146327ce93be37e68bd3ff8541090d96da8cb3adc9e35d57570e9170a29f6bf6"
@ -280,6 +296,25 @@
revision = "93babf24513d0e8277635da8169fcc5a46ae3f6a"
version = "v1.11.0"
[[projects]]
digest = "1:529ed3f98838f69e13761788d0cc71b44e130058fab13bae2ce09f7a176bced4"
name = "github.com/yudai/gojsondiff"
packages = [
".",
"formatter",
]
pruneopts = ""
revision = "7b1b7adf999dab73a6eb02669c3d82dbb27a3dd6"
version = "1.0.0"
[[projects]]
branch = "master"
digest = "1:9857bb2293f372b2181004d8b62179bbdb4ab0982ec6f762abe6cf2bfedaff85"
name = "github.com/yudai/golcs"
packages = ["."]
pruneopts = ""
revision = "ecda9a501e8220fae3b4b600c3db4b0ba22cfc68"
[[projects]]
branch = "v2"
digest = "1:6b8a6afafde7ed31cd0c577ba40d88ce39e8f1c5eb76d7836be7d5b74f1c534a"
@ -406,6 +441,7 @@
"github.com/jinzhu/copier",
"github.com/kr/pretty",
"github.com/mattn/go-isatty",
"github.com/montanaflynn/stats",
"github.com/pkg/errors",
"github.com/pkg/profile",
"github.com/problame/go-netssh",
@ -415,6 +451,8 @@
"github.com/spf13/pflag",
"github.com/stretchr/testify/assert",
"github.com/stretchr/testify/require",
"github.com/yudai/gojsondiff",
"github.com/yudai/gojsondiff/formatter",
"github.com/zrepl/yaml-config",
"golang.org/x/net/context",
"golang.org/x/sys/unix",

View File

@ -27,7 +27,7 @@ vendordeps:
dep ensure -v -vendor-only
generate: #not part of the build, must do that manually
protoc -I=replication/pdu --go_out=plugins=grpc:replication/pdu replication/pdu/pdu.proto
protoc -I=replication/logic/pdu --go_out=plugins=grpc:replication/logic/pdu replication/logic/pdu/pdu.proto
go generate -x ./...
build:

View File

@ -10,8 +10,7 @@ import (
"github.com/zrepl/zrepl/daemon"
"github.com/zrepl/zrepl/daemon/job"
"github.com/zrepl/zrepl/daemon/pruner"
"github.com/zrepl/zrepl/replication"
"github.com/zrepl/zrepl/replication/fsrep"
"github.com/zrepl/zrepl/replication/report"
"io"
"math"
"net/http"
@ -342,74 +341,58 @@ func (t *tui) draw() {
termbox.Flush()
}
func (t *tui) renderReplicationReport(rep *replication.Report, history *bytesProgressHistory) {
func (t *tui) renderReplicationReport(rep *report.Report, history *bytesProgressHistory) {
if rep == nil {
t.printf("...\n")
return
}
all := make([]*fsrep.Report, 0, len(rep.Completed)+len(rep.Pending) + 1)
all = append(all, rep.Completed...)
all = append(all, rep.Pending...)
if rep.Active != nil {
all = append(all, rep.Active)
}
sort.Slice(all, func(i, j int) bool {
return all[i].Filesystem < all[j].Filesystem
})
state, err := replication.StateString(rep.Status)
if err != nil {
t.printf("Status: %q (parse error: %q)\n", rep.Status, err)
// TODO visualize more than the latest attempt by folding all attempts into one
if len(rep.Attempts) == 0 {
t.printf("no attempts made yet")
return
}
t.printf("Status: %s", state)
latest := rep.Attempts[len(rep.Attempts)-1]
sort.Slice(latest.Filesystems, func(i, j int) bool {
return latest.Filesystems[i].Info.Name < latest.Filesystems[j].Info.Name
})
t.printf("Status: %s", latest.State)
t.newline()
if rep.Problem != "" {
if latest.State == report.AttemptPlanningError {
t.printf("Problem: ")
t.printfDrawIndentedAndWrappedIfMultiline("%s", rep.Problem)
t.printfDrawIndentedAndWrappedIfMultiline("%s", latest.PlanError)
t.newline()
} else if latest.State == report.AttemptFanOutError {
t.printf("Problem: one or more of the filesystems encountered errors")
t.newline()
}
if rep.SleepUntil.After(time.Now()) && !state.IsTerminal() {
t.printf("Sleeping until %s (%s left)\n", rep.SleepUntil, rep.SleepUntil.Sub(time.Now()))
}
if state != replication.Planning && state != replication.PlanningError {
// TODO report sleep time between retry attempts once that is implemented
if latest.State != report.AttemptPlanning && latest.State != report.AttemptPlanningError {
// Draw global progress bar
// Progress: [---------------]
sumUpFSRep := func(rep *fsrep.Report) (transferred, total int64) {
for _, s := range rep.Pending {
transferred += s.Bytes
total += s.ExpectedBytes
}
for _, s := range rep.Completed {
transferred += s.Bytes
total += s.ExpectedBytes
}
return
}
var transferred, total int64
for _, fs := range all {
fstx, fstotal := sumUpFSRep(fs)
transferred += fstx
total += fstotal
}
rate, changeCount := history.Update(transferred)
expected, replicated := latest.BytesSum()
rate, changeCount := history.Update(replicated)
t.write("Progress: ")
t.drawBar(50, transferred, total, changeCount)
t.write(fmt.Sprintf(" %s / %s @ %s/s", ByteCountBinary(transferred), ByteCountBinary(total), ByteCountBinary(rate)))
t.drawBar(50, replicated, expected, changeCount)
t.write(fmt.Sprintf(" %s / %s @ %s/s", ByteCountBinary(replicated), ByteCountBinary(expected), ByteCountBinary(rate)))
t.newline()
var maxFSLen int
for _, fs := range latest.Filesystems {
if len(fs.Info.Name) > maxFSLen {
maxFSLen = len(fs.Info.Name)
}
}
for _, fs := range latest.Filesystems {
t.printFilesystemStatus(fs, false, maxFSLen) // FIXME bring 'active' flag back
}
}
var maxFSLen int
for _, fs := range all {
if len(fs.Filesystem) > maxFSLen {
maxFSLen = len(fs.Filesystem)
}
}
for _, fs := range all {
t.printFilesystemStatus(fs, fs == rep.Active, maxFSLen)
}
}
func (t *tui) renderPrunerReport(r *pruner.Report) {
@ -513,25 +496,6 @@ func (t *tui) renderPrunerReport(r *pruner.Report) {
}
const snapshotIndent = 1
func calculateMaxFSLength(all []*fsrep.Report) (maxFS, maxStatus int) {
for _, e := range all {
if len(e.Filesystem) > maxFS {
maxFS = len(e.Filesystem)
}
all2 := make([]*fsrep.StepReport, 0, len(e.Pending) + len(e.Completed))
all2 = append(all2, e.Pending...)
all2 = append(all2, e.Completed...)
for _, e2 := range all2 {
elen := len(e2.Problem) + len(e2.From) + len(e2.To) + 60 // random spacing, units, labels, etc
if elen > maxStatus {
maxStatus = elen
}
}
}
return
}
func times(str string, n int) (out string) {
for i := 0; i < n; i++ {
out += str
@ -575,35 +539,13 @@ func (t *tui) drawBar(length int, bytes, totalBytes int64, changeCount int) {
t.write("]")
}
func StringStepState(s fsrep.StepState) string {
switch s {
case fsrep.StepReplicationReady: return "Ready"
case fsrep.StepMarkReplicatedReady: return "MarkReady"
case fsrep.StepCompleted: return "Completed"
default:
return fmt.Sprintf("UNKNOWN %d", s)
}
}
func (t *tui) printFilesystemStatus(rep *fsrep.Report, active bool, maxFS int) {
bytes := int64(0)
totalBytes := int64(0)
for _, s := range rep.Pending {
bytes += s.Bytes
totalBytes += s.ExpectedBytes
}
for _, s := range rep.Completed {
bytes += s.Bytes
totalBytes += s.ExpectedBytes
}
func (t *tui) printFilesystemStatus(rep *report.FilesystemReport, active bool, maxFS int) {
expected, replicated := rep.BytesSum()
status := fmt.Sprintf("%s (step %d/%d, %s/%s)",
rep.Status,
len(rep.Completed), len(rep.Pending) + len(rep.Completed),
ByteCountBinary(bytes), ByteCountBinary(totalBytes),
strings.ToUpper(string(rep.State)),
rep.CurrentStep, len(rep.Steps),
ByteCountBinary(replicated), ByteCountBinary(expected),
)
activeIndicator := " "
@ -612,18 +554,23 @@ func (t *tui) printFilesystemStatus(rep *fsrep.Report, active bool, maxFS int) {
}
t.printf("%s %s %s ",
activeIndicator,
rightPad(rep.Filesystem, maxFS, " "),
rightPad(rep.Info.Name, maxFS, " "),
status)
next := ""
if rep.Problem != "" {
next = rep.Problem
} else if len(rep.Pending) > 0 {
if rep.Pending[0].From != "" {
next = fmt.Sprintf("next: %s => %s", rep.Pending[0].From, rep.Pending[0].To)
if err := rep.Error(); err != nil {
next = err.Err
} else if rep.State != report.FilesystemDone {
if nextStep := rep.NextStep(); nextStep != nil {
if nextStep.IsIncremental() {
next = fmt.Sprintf("next: %s => %s", nextStep.Info.From, nextStep.Info.To)
} else {
next = fmt.Sprintf("next: %s (full)", nextStep.Info.To)
}
} else {
next = fmt.Sprintf("next: %s (full)", rep.Pending[0].To)
next = "" // individual FSes may still be in planning state
}
}
t.printfDrawIndentedAndWrappedIfMultiline("%s", next)

View File

@ -17,10 +17,12 @@ import (
"github.com/zrepl/zrepl/daemon/snapper"
"github.com/zrepl/zrepl/endpoint"
"github.com/zrepl/zrepl/replication"
"github.com/zrepl/zrepl/replication/driver"
"github.com/zrepl/zrepl/replication/logic"
"github.com/zrepl/zrepl/replication/report"
"github.com/zrepl/zrepl/rpc"
"github.com/zrepl/zrepl/transport"
"github.com/zrepl/zrepl/transport/fromconfig"
"github.com/zrepl/zrepl/util/envconst"
"github.com/zrepl/zrepl/zfs"
)
@ -53,7 +55,7 @@ type activeSideTasks struct {
state ActiveSideState
// valid for state ActiveSideReplicating, ActiveSidePruneSender, ActiveSidePruneReceiver, ActiveSideDone
replication *replication.Replication
replicationReport driver.ReportFunc
replicationCancel context.CancelFunc
// valid for state ActiveSidePruneSender, ActiveSidePruneReceiver, ActiveSideDone
@ -79,7 +81,7 @@ func (a *ActiveSide) updateTasks(u func(*activeSideTasks)) activeSideTasks {
type activeMode interface {
ConnectEndpoints(rpcLoggers rpc.Loggers, connecter transport.Connecter)
DisconnectEndpoints()
SenderReceiver() (replication.Sender, replication.Receiver)
SenderReceiver() (logic.Sender, logic.Receiver)
Type() Type
RunPeriodic(ctx context.Context, wakeUpCommon chan<- struct{})
ResetConnectBackoff()
@ -111,7 +113,7 @@ func (m *modePush) DisconnectEndpoints() {
m.receiver = nil
}
func (m *modePush) SenderReceiver() (replication.Sender, replication.Receiver) {
func (m *modePush) SenderReceiver() (logic.Sender, logic.Receiver) {
m.setupMtx.Lock()
defer m.setupMtx.Unlock()
return m.sender, m.receiver
@ -172,7 +174,7 @@ func (m *modePull) DisconnectEndpoints() {
m.receiver = nil
}
func (m *modePull) SenderReceiver() (replication.Sender, replication.Receiver) {
func (m *modePull) SenderReceiver() (logic.Sender, logic.Receiver) {
m.setupMtx.Lock()
defer m.setupMtx.Unlock()
return m.sender, m.receiver
@ -274,7 +276,7 @@ func (j *ActiveSide) RegisterMetrics(registerer prometheus.Registerer) {
func (j *ActiveSide) Name() string { return j.name }
type ActiveSideStatus struct {
Replication *replication.Report
Replication *report.Report
PruningSender, PruningReceiver *pruner.Report
}
@ -283,8 +285,8 @@ func (j *ActiveSide) Status() *Status {
s := &ActiveSideStatus{}
t := j.mode.Type()
if tasks.replication != nil {
s.Replication = tasks.replication.Report()
if tasks.replicationReport != nil {
s.Replication = tasks.replicationReport()
}
if tasks.prunerSender != nil {
s.PruningSender = tasks.prunerSender.Report()
@ -345,78 +347,6 @@ func (j *ActiveSide) do(ctx context.Context) {
}
}()
// The code after this watchdog goroutine is sequential and transitions the state from
// ActiveSideReplicating -> ActiveSidePruneSender -> ActiveSidePruneReceiver -> ActiveSideDone
// If any of those sequential tasks 'gets stuck' (livelock, no progress), the watchdog will eventually
// cancel its context.
// If the task is written to support context cancellation, it will return immediately (in permanent error state),
// and the sequential code above transitions to the next state.
go func() {
wdto := envconst.Duration("ZREPL_JOB_WATCHDOG_TIMEOUT", 10*time.Minute)
jitter := envconst.Duration("ZREPL_JOB_WATCHDOG_JITTER", 1*time.Second)
// shadowing!
log := log.WithField("watchdog_timeout", wdto.String())
log.Debug("starting watchdog")
defer log.Debug("watchdog stopped")
t := time.NewTicker(wdto)
defer t.Stop()
for {
select {
case <-ctx.Done():
return
case <-t.C: // fall
}
j.updateTasks(func(tasks *activeSideTasks) {
// Since cancelling a task will cause the sequential code to transition to the next state immediately,
// we cannot check for its progress right then (no fallthrough).
// Instead, we return (not continue because we are in a closure) and give the new state another
// ZREPL_JOB_WATCHDOG_TIMEOUT interval to try make some progress.
log.WithField("state", tasks.state).Debug("watchdog firing")
const WATCHDOG_ENVCONST_NOTICE = " (adjust ZREPL_JOB_WATCHDOG_TIMEOUT env variable if inappropriate)"
switch tasks.state {
case ActiveSideReplicating:
log.WithField("replication_progress", tasks.replication.Progress.String()).
Debug("check replication progress")
if tasks.replication.Progress.CheckTimeout(wdto, jitter) {
log.Error("replication did not make progress, cancelling" + WATCHDOG_ENVCONST_NOTICE)
tasks.replicationCancel()
return
}
case ActiveSidePruneSender:
log.WithField("prune_sender_progress", tasks.replication.Progress.String()).
Debug("check pruner_sender progress")
if tasks.prunerSender.Progress.CheckTimeout(wdto, jitter) {
log.Error("pruner_sender did not make progress, cancelling" + WATCHDOG_ENVCONST_NOTICE)
tasks.prunerSenderCancel()
return
}
case ActiveSidePruneReceiver:
log.WithField("prune_receiver_progress", tasks.replication.Progress.String()).
Debug("check pruner_receiver progress")
if tasks.prunerReceiver.Progress.CheckTimeout(wdto, jitter) {
log.Error("pruner_receiver did not make progress, cancelling" + WATCHDOG_ENVCONST_NOTICE)
tasks.prunerReceiverCancel()
return
}
case ActiveSideDone:
// ignore, ctx will be Done() in a few milliseconds and the watchdog will exit
default:
log.WithField("state", tasks.state).
Error("watchdog implementation error: unknown active side state")
}
})
}
}()
sender, receiver := j.mode.SenderReceiver()
{
@ -426,16 +356,19 @@ func (j *ActiveSide) do(ctx context.Context) {
default:
}
ctx, repCancel := context.WithCancel(ctx)
tasks := j.updateTasks(func(tasks *activeSideTasks) {
var repWait driver.WaitFunc
j.updateTasks(func(tasks *activeSideTasks) {
// reset it
*tasks = activeSideTasks{}
tasks.replicationCancel = repCancel
tasks.replication = replication.NewReplication(j.promRepStateSecs, j.promBytesReplicated)
tasks.replicationReport, repWait = replication.Do(
ctx, logic.NewPlanner(j.promRepStateSecs, j.promBytesReplicated, sender, receiver),
)
tasks.state = ActiveSideReplicating
})
log.Info("start replication")
tasks.replication.Drive(ctx, sender, receiver)
repCancel() // always cancel to free up context resources
repWait(true) // wait blocking
repCancel() // always cancel to free up context resources
}
{

View File

@ -14,7 +14,6 @@ import (
"github.com/zrepl/zrepl/daemon/snapper"
"github.com/zrepl/zrepl/endpoint"
"github.com/zrepl/zrepl/logger"
"github.com/zrepl/zrepl/replication"
"github.com/zrepl/zrepl/rpc"
"github.com/zrepl/zrepl/rpc/transportmux"
"github.com/zrepl/zrepl/tlsconf"
@ -78,7 +77,6 @@ const (
)
func WithSubsystemLoggers(ctx context.Context, log logger.Logger) context.Context {
ctx = replication.WithLogger(ctx, log.WithField(SubsysField, SubsysReplication))
ctx = endpoint.WithLogger(ctx, log.WithField(SubsysField, SubsyEndpoint))
ctx = pruner.WithLogger(ctx, log.WithField(SubsysField, SubsysPruning))
ctx = snapper.WithLogger(ctx, log.WithField(SubsysField, SubsysSnapshot))

View File

@ -8,7 +8,7 @@ import (
"github.com/zrepl/zrepl/config"
"github.com/zrepl/zrepl/logger"
"github.com/zrepl/zrepl/pruning"
"github.com/zrepl/zrepl/replication/pdu"
"github.com/zrepl/zrepl/replication/logic/pdu"
"github.com/zrepl/zrepl/util/envconst"
"github.com/zrepl/zrepl/util/watchdog"
"net"

View File

@ -6,7 +6,7 @@ import (
"github.com/stretchr/testify/assert"
"github.com/zrepl/zrepl/logger"
"github.com/zrepl/zrepl/pruning"
"github.com/zrepl/zrepl/replication/pdu"
"github.com/zrepl/zrepl/replication/logic/pdu"
"net"
"testing"
"time"

View File

@ -7,8 +7,7 @@ import (
"path"
"github.com/pkg/errors"
"github.com/zrepl/zrepl/replication"
"github.com/zrepl/zrepl/replication/pdu"
"github.com/zrepl/zrepl/replication/logic/pdu"
"github.com/zrepl/zrepl/zfs"
)
@ -34,7 +33,7 @@ func (s *Sender) filterCheckFS(fs string) (*zfs.DatasetPath, error) {
return nil, err
}
if !pass {
return nil, replication.NewFilteredError(fs)
return nil, fmt.Errorf("endpoint does not allow access to filesystem %s", fs)
}
return dp, nil
}

View File

@ -0,0 +1,301 @@
package driver
import (
"context"
"sync"
"time"
"github.com/zrepl/zrepl/replication/report"
"github.com/zrepl/zrepl/util/chainlock"
)
type run struct {
l *chainlock.L
startedAt, finishedAt time.Time
// the attempts attempted so far:
// All but the last in this slice must have finished with some errors.
// The last attempt may not be finished and may not have errors.
attempts []*attempt
}
type Planner interface {
Plan(context.Context) ([]FS, error)
}
// an attempt represents a single planning & execution of fs replications
type attempt struct {
planner Planner
l *chainlock.L
startedAt, finishedAt time.Time
// after Planner.Plan was called, planErr and fss are mutually exclusive with regards to nil-ness
// if both are nil, it must be assumed that Planner.Plan is active
planErr *timedError
fss []*fs
}
type timedError struct {
Err error
Time time.Time
}
func newTimedError(err error, t time.Time) *timedError {
if err == nil {
panic("error must be non-nil")
}
if t.IsZero() {
panic("t must be non-zero")
}
return &timedError{err, t}
}
func (e *timedError) IntoReportError() *report.TimedError {
if e == nil {
return nil
}
return report.NewTimedError(e.Err.Error(), e.Time)
}
type FS interface {
PlanFS(context.Context) ([]Step, error)
ReportInfo() *report.FilesystemInfo
}
type Step interface {
TargetDate() time.Time
Step(context.Context) error
ReportInfo() *report.StepInfo
}
type fs struct {
fs FS
l *chainlock.L
planning struct {
done bool
err *timedError
}
// valid iff planning.done && planning.err == nil
planned struct {
// valid iff planning.done && planning.err == nil
stepErr *timedError
// all steps, in the order in which they must be completed
steps []*step
// index into steps, pointing at the step that is currently executing
// if step >= len(steps), no more work needs to be done
step int
}
}
type step struct {
l *chainlock.L
step Step
}
type ReportFunc func() *report.Report
type WaitFunc func(block bool) (done bool)
func Do(ctx context.Context, planner Planner) (ReportFunc, WaitFunc) {
l := chainlock.New()
run := &run{
l: l,
startedAt: time.Now(),
}
done := make(chan struct{})
go func() {
defer close(done)
run.l.Lock()
a1 := &attempt{
l: l,
startedAt: time.Now(),
planner: planner,
}
run.attempts = append(run.attempts, a1)
run.l.Unlock()
a1.do(ctx)
}()
wait := func(block bool) bool {
if block {
<-done
}
select {
case <-done:
return true
default:
return false
}
}
report := func() *report.Report {
defer run.l.Lock().Unlock()
return run.report()
}
return report, wait
}
func (a *attempt) do(ctx context.Context) {
pfss, err := a.planner.Plan(ctx)
errTime := time.Now()
defer a.l.Lock().Unlock()
if err != nil {
a.planErr = newTimedError(err, errTime)
a.fss = nil
a.finishedAt = time.Now()
return
}
for _, pfs := range pfss {
fs := &fs{
fs: pfs,
l: a.l,
}
a.fss = append(a.fss, fs)
}
stepQueue := newStepQueue()
defer stepQueue.Start(1)()
var fssesDone sync.WaitGroup
for _, f := range a.fss {
fssesDone.Add(1)
go func(f *fs) {
defer fssesDone.Done()
f.do(ctx, stepQueue)
}(f)
}
a.l.DropWhile(func() {
fssesDone.Wait()
})
a.finishedAt = time.Now()
}
func (fs *fs) do(ctx context.Context, pq *stepQueue) {
psteps, err := fs.fs.PlanFS(ctx)
errTime := time.Now()
defer fs.l.Lock().Unlock()
fs.planning.done = true
if err != nil {
fs.planning.err = newTimedError(err, errTime)
return
}
for _, pstep := range psteps {
step := &step{
l: fs.l,
step: pstep,
}
fs.planned.steps = append(fs.planned.steps, step)
}
for i, s := range fs.planned.steps {
var (
err error
errTime time.Time
)
// lock must not be held while executing step in order for reporting to work
fs.l.DropWhile(func() {
targetDate := s.step.TargetDate()
defer pq.WaitReady(fs, targetDate)()
err = s.step.Step(ctx) // no shadow
errTime = time.Now() // no shadow
})
if err != nil {
fs.planned.stepErr = newTimedError(err, errTime)
break
}
fs.planned.step = i + 1 // fs.planned.step must be == len(fs.planned.steps) if all went OK
}
}
// caller must hold lock l
func (r *run) report() *report.Report {
report := &report.Report{
Attempts: make([]*report.AttemptReport, len(r.attempts)),
StartAt: r.startedAt,
FinishAt: r.finishedAt,
}
for i := range report.Attempts {
report.Attempts[i] = r.attempts[i].report()
}
return report
}
// caller must hold lock l
func (a *attempt) report() *report.AttemptReport {
r := &report.AttemptReport{
// State is set below
Filesystems: make([]*report.FilesystemReport, len(a.fss)),
StartAt: a.startedAt,
FinishAt: a.finishedAt,
PlanError: a.planErr.IntoReportError(),
}
for i := range r.Filesystems {
r.Filesystems[i] = a.fss[i].report()
}
state := report.AttemptPlanning
if a.planErr != nil {
state = report.AttemptPlanningError
} else if a.fss != nil {
if a.finishedAt.IsZero() {
state = report.AttemptFanOutFSs
} else {
fsWithError := false
for _, s := range r.Filesystems {
fsWithError = fsWithError || s.Error() != nil
}
state = report.AttemptDone
if fsWithError {
state = report.AttemptFanOutError
}
}
}
r.State = state
return r
}
// caller must hold lock l
func (f *fs) report() *report.FilesystemReport {
state := report.FilesystemPlanningErrored
if f.planning.err == nil {
if f.planning.done {
if f.planned.stepErr != nil {
state = report.FilesystemSteppingErrored
} else if f.planned.step < len(f.planned.steps) {
state = report.FilesystemStepping
} else {
state = report.FilesystemDone
}
} else {
state = report.FilesystemPlanning
}
}
r := &report.FilesystemReport{
Info: f.fs.ReportInfo(),
State: state,
PlanError: f.planning.err.IntoReportError(),
StepError: f.planned.stepErr.IntoReportError(),
Steps: make([]*report.StepReport, len(f.planned.steps)),
CurrentStep: f.planned.step,
}
for i := range r.Steps {
r.Steps[i] = f.planned.steps[i].report()
}
return r
}
// caller must hold lock l
func (s *step) report() *report.StepReport {
r := &report.StepReport{
Info: s.step.ReportInfo(),
}
return r
}

View File

@ -0,0 +1,204 @@
package driver
import (
"context"
"encoding/json"
"fmt"
"sort"
"sync/atomic"
"testing"
"time"
"github.com/stretchr/testify/require"
"github.com/zrepl/zrepl/replication/report"
"github.com/stretchr/testify/assert"
jsondiff "github.com/yudai/gojsondiff"
jsondiffformatter "github.com/yudai/gojsondiff/formatter"
)
type mockPlanner struct {
stepCounter uint32
fss []FS // *mockFS
}
func (p *mockPlanner) Plan(ctx context.Context) ([]FS, error) {
time.Sleep(1 * time.Second)
p.fss = []FS{
&mockFS{
&p.stepCounter,
"zroot/one",
nil,
},
&mockFS{
&p.stepCounter,
"zroot/two",
nil,
},
}
return p.fss, nil
}
type mockFS struct {
globalStepCounter *uint32
name string
steps []Step
}
func (f *mockFS) PlanFS(ctx context.Context) ([]Step, error) {
if f.steps != nil {
panic("PlanFS used twice")
}
switch f.name {
case "zroot/one":
f.steps = []Step{
&mockStep{
fs: f,
ident: "a",
duration: 1 * time.Second,
targetDate: time.Unix(2, 0),
},
&mockStep{
fs: f,
ident: "b",
duration: 1 * time.Second,
targetDate: time.Unix(10, 0),
},
&mockStep{
fs: f,
ident: "c",
duration: 1 * time.Second,
targetDate: time.Unix(20, 0),
},
}
case "zroot/two":
f.steps = []Step{
&mockStep{
fs: f,
ident: "u",
duration: 500 * time.Millisecond,
targetDate: time.Unix(15, 0),
},
&mockStep{
fs: f,
duration: 500 * time.Millisecond,
ident: "v",
targetDate: time.Unix(30, 0),
},
}
default:
panic("unimplemented")
}
return f.steps, nil
}
func (f *mockFS) ReportInfo() *report.FilesystemInfo {
return &report.FilesystemInfo{Name: f.name}
}
type mockStep struct {
fs *mockFS
ident string
duration time.Duration
targetDate time.Time
// filled by method Step
globalCtr uint32
}
func (f *mockStep) String() string {
return fmt.Sprintf("%s{%s} targetDate=%s globalCtr=%v", f.fs.name, f.ident, f.targetDate, f.globalCtr)
}
func (f *mockStep) Step(ctx context.Context) error {
f.globalCtr = atomic.AddUint32(f.fs.globalStepCounter, 1)
time.Sleep(f.duration)
return nil
}
func (f *mockStep) TargetDate() time.Time {
return f.targetDate
}
func (f *mockStep) ReportInfo() *report.StepInfo {
return &report.StepInfo{From: f.ident, To: f.ident, BytesExpected: 100, BytesReplicated: 25}
}
// TODO: add meaningful validation (i.e. actual checks)
// Since the stepqueue is not deterministic due to scheduler jitter,
// we cannot test for any definitive sequence of steps here.
// Such checks would further only be sensible for a non-concurrent step-queue,
// but we're going to have concurrent replication in the future.
//
// For the time being, let's just exercise the code a bit.
func TestReplication(t *testing.T) {
ctx := context.Background()
mp := &mockPlanner{}
getReport, wait := Do(ctx, mp)
begin := time.Now()
fireAt := []time.Duration{
// the following values are relative to the start
500 * time.Millisecond, // planning
1500 * time.Millisecond, // nothing is done, a is running
2500 * time.Millisecond, // a done, b running
3250 * time.Millisecond, // a,b done, u running
3750 * time.Millisecond, // a,b,u done, c running
4750 * time.Millisecond, // a,b,u,c done, v running
5250 * time.Millisecond, // a,b,u,c,v done
}
reports := make([]*report.Report, len(fireAt))
for i := range fireAt {
sleepUntil := begin.Add(fireAt[i])
time.Sleep(sleepUntil.Sub(time.Now()))
reports[i] = getReport()
// uncomment for viewing non-diffed results
// t.Logf("report @ %6.4f:\n%s", fireAt[i].Seconds(), pretty.Sprint(reports[i]))
}
waitBegin := time.Now()
wait(true)
waitDuration := time.Now().Sub(waitBegin)
assert.True(t, waitDuration < 10*time.Millisecond, "%v", waitDuration) // and that's gratious
prev, err := json.Marshal(reports[0])
require.NoError(t, err)
for _, r := range reports[1:] {
this, err := json.Marshal(r)
require.NoError(t, err)
differ := jsondiff.New()
diff, err := differ.Compare(prev, this)
require.NoError(t, err)
df := jsondiffformatter.NewDeltaFormatter()
res, err := df.Format(diff)
res = res
require.NoError(t, err)
// uncomment the following line to get json diffs between each captured step
// t.Logf("%s", res)
prev, err = json.Marshal(r)
require.NoError(t, err)
}
steps := make([]*mockStep, 0)
for _, fs := range mp.fss {
for _, step := range fs.(*mockFS).steps {
steps = append(steps, step.(*mockStep))
}
}
// sort steps in pq order (although, remember, pq is not deterministic)
sort.Slice(steps, func(i, j int) bool {
return steps[i].targetDate.Before(steps[j].targetDate)
})
// manual inspection of the globalCtr value should show that, despite
// scheduler-dependent behavior of pq, steps should generally be taken
// from oldest to newest target date (globally, not per FS).
t.Logf("steps sorted by target date:")
for _, step := range steps {
t.Logf("\t%s", step)
}
}

View File

@ -0,0 +1,163 @@
package driver
import (
"container/heap"
"time"
"github.com/zrepl/zrepl/util/chainlock"
)
type stepQueueRec struct {
ident interface{}
targetDate time.Time
wakeup chan StepCompletedFunc
}
type stepQueue struct {
stop chan struct{}
reqs chan stepQueueRec
}
type stepQueueHeapItem struct {
idx int
req stepQueueRec
}
type stepQueueHeap []*stepQueueHeapItem
func (h stepQueueHeap) Less(i, j int) bool {
return h[i].req.targetDate.Before(h[j].req.targetDate)
}
func (h stepQueueHeap) Swap(i, j int) {
h[i], h[j] = h[j], h[i]
h[i].idx = i
h[j].idx = j
}
func (h stepQueueHeap) Len() int {
return len(h)
}
func (h *stepQueueHeap) Push(elem interface{}) {
hitem := elem.(*stepQueueHeapItem)
hitem.idx = h.Len()
*h = append(*h, hitem)
}
func (h *stepQueueHeap) Pop() interface{} {
elem := (*h)[h.Len()-1]
elem.idx = -1
*h = (*h)[:h.Len()-1]
return elem
}
// returned stepQueue must be closed with method Close
func newStepQueue() *stepQueue {
q := &stepQueue{
stop: make(chan struct{}),
reqs: make(chan stepQueueRec),
}
return q
}
// the returned done function must be called to free resources
// allocated by the call to Start
//
// No WaitReady calls must be active at the time done is called
// The behavior of calling WaitReady after done was called is undefined
func (q *stepQueue) Start(concurrency int) (done func()) {
if concurrency < 1 {
panic("concurrency must be >= 1")
}
// l protects pending and queueItems
l := chainlock.New()
pendingCond := l.NewCond()
// priority queue
pending := &stepQueueHeap{}
// ident => queueItem
queueItems := make(map[interface{}]*stepQueueHeapItem)
// stopped is used for cancellation of "wake" goroutine
stopped := false
active := 0
go func() { // "stopper" goroutine
<-q.stop
defer l.Lock().Unlock()
stopped = true
pendingCond.Broadcast()
}()
go func() { // "reqs" goroutine
for {
select {
case <-q.stop:
select {
case <-q.reqs:
panic("WaitReady call active while calling Close")
default:
return
}
case req := <-q.reqs:
func() {
defer l.Lock().Unlock()
if _, ok := queueItems[req.ident]; ok {
panic("WaitReady must not be called twice for the same ident")
}
qitem := &stepQueueHeapItem{
req: req,
}
queueItems[req.ident] = qitem
heap.Push(pending, qitem)
pendingCond.Broadcast()
}()
}
}
}()
go func() { // "wake" goroutine
defer l.Lock().Unlock()
for {
for !stopped && (active >= concurrency || pending.Len() == 0) {
pendingCond.Wait()
}
if stopped {
return
}
if pending.Len() <= 0 {
return
}
active++
next := heap.Pop(pending).(*stepQueueHeapItem).req
delete(queueItems, next.ident)
next.wakeup <- func() {
defer l.Lock().Unlock()
active--
pendingCond.Broadcast()
}
}
}()
done = func() {
close(q.stop)
}
return done
}
type StepCompletedFunc func()
func (q *stepQueue) sendAndWaitForWakeup(ident interface{}, targetDate time.Time) StepCompletedFunc {
req := stepQueueRec{
ident,
targetDate,
make(chan StepCompletedFunc),
}
q.reqs <- req
return <-req.wakeup
}
// Wait for the ident with targetDate to be selected to run.
func (q *stepQueue) WaitReady(ident interface{}, targetDate time.Time) StepCompletedFunc {
if targetDate.IsZero() {
panic("targetDate of zero is reserved for marking Done")
}
return q.sendAndWaitForWakeup(ident, targetDate)
}

View File

@ -0,0 +1,166 @@
package driver
import (
"fmt"
"math"
"sort"
"sync"
"sync/atomic"
"testing"
"time"
"github.com/montanaflynn/stats"
"github.com/stretchr/testify/assert"
)
func TestPqNotconcurrent(t *testing.T) {
var ctr uint32
q := newStepQueue()
var wg sync.WaitGroup
wg.Add(3)
go func() {
defer wg.Done()
defer q.WaitReady("1", time.Unix(1, 0))()
ret := atomic.AddUint32(&ctr, 1)
assert.Equal(t, uint32(1), ret)
}()
go func() {
defer wg.Done()
defer q.WaitReady("2", time.Unix(2, 0))()
ret := atomic.AddUint32(&ctr, 1)
assert.Equal(t, uint32(2), ret)
}()
go func() {
defer wg.Done()
defer q.WaitReady("3", time.Unix(3, 0))()
ret := atomic.AddUint32(&ctr, 1)
assert.Equal(t, uint32(3), ret)
}()
time.Sleep(1 * time.Second)
defer q.Start(1)()
wg.Wait()
}
type record struct {
fs int
step int
globalCtr uint32
wakeAt time.Duration // relative to begin
}
func (r record) String() string {
return fmt.Sprintf("fs %08d step %08d globalCtr %08d wakeAt %2.8f", r.fs, r.step, r.globalCtr, r.wakeAt.Seconds())
}
// This tests uses stepPq concurrently, simulating the following scenario:
// Given a number of filesystems F, each filesystem has N steps to take.
// The number of concurrent steps is limited to C.
// The target date for each step is the step number N.
// Hence, there are always F filesystems runnable (calling WaitReady)
// The priority queue prioritizes steps with lower target data (= lower step number).
// Hence, all steps with lower numbers should be woken up before steps with higher numbers.
// However, scheduling is not 100% deterministic (runtime, OS scheduler, etc).
// Hence, perform some statistics on the wakeup times and assert that the mean wakeup
// times for each step are close together.
func TestPqConcurrent(t *testing.T) {
q := newStepQueue()
var wg sync.WaitGroup
filesystems := 100
stepsPerFS := 20
sleepTimePerStep := 50 * time.Millisecond
wg.Add(filesystems)
var globalCtr uint32
begin := time.Now()
records := make(chan []record, filesystems)
for fs := 0; fs < filesystems; fs++ {
go func(fs int) {
defer wg.Done()
recs := make([]record, 0)
for step := 0; step < stepsPerFS; step++ {
pos := atomic.AddUint32(&globalCtr, 1)
t := time.Unix(int64(step), 0)
done := q.WaitReady(fs, t)
wakeAt := time.Now().Sub(begin)
time.Sleep(sleepTimePerStep)
done()
recs = append(recs, record{fs, step, pos, wakeAt})
}
records <- recs
}(fs)
}
concurrency := 5
defer q.Start(concurrency)()
wg.Wait()
close(records)
t.Logf("loop done")
flattenedRecs := make([]record, 0)
for recs := range records {
flattenedRecs = append(flattenedRecs, recs...)
}
sort.Slice(flattenedRecs, func(i, j int) bool {
return flattenedRecs[i].globalCtr < flattenedRecs[j].globalCtr
})
wakeTimesByStep := map[int][]float64{}
for _, rec := range flattenedRecs {
wakeTimes, ok := wakeTimesByStep[rec.step]
if !ok {
wakeTimes = []float64{}
}
wakeTimes = append(wakeTimes, rec.wakeAt.Seconds())
wakeTimesByStep[rec.step] = wakeTimes
}
meansByStepId := make([]float64, stepsPerFS)
interQuartileRangesByStepIdx := make([]float64, stepsPerFS)
for step := 0; step < stepsPerFS; step++ {
t.Logf("step %d", step)
mean, _ := stats.Mean(wakeTimesByStep[step])
meansByStepId[step] = mean
t.Logf("\tmean: %v", mean)
median, _ := stats.Median(wakeTimesByStep[step])
t.Logf("\tmedian: %v", median)
midhinge, _ := stats.Midhinge(wakeTimesByStep[step])
t.Logf("\tmidhinge: %v", midhinge)
min, _ := stats.Min(wakeTimesByStep[step])
t.Logf("\tmin: %v", min)
max, _ := stats.Max(wakeTimesByStep[step])
t.Logf("\tmax: %v", max)
quartiles, _ := stats.Quartile(wakeTimesByStep[step])
t.Logf("\t%#v", quartiles)
interQuartileRange, _ := stats.InterQuartileRange(wakeTimesByStep[step])
t.Logf("\tinter-quartile range: %v", interQuartileRange)
interQuartileRangesByStepIdx[step] = interQuartileRange
}
iqrMean, _ := stats.Mean(interQuartileRangesByStepIdx)
t.Logf("inter-quartile-range mean: %v", iqrMean)
iqrDev, _ := stats.StandardDeviation(interQuartileRangesByStepIdx)
t.Logf("inter-quartile-range deviation: %v", iqrDev)
// each step should have the same "distribution" (=~ "spread")
assert.True(t, iqrDev < 0.01)
minTimeForAllStepsWithIdxI := sleepTimePerStep.Seconds() * float64(filesystems) / float64(concurrency)
t.Logf("minTimeForAllStepsWithIdxI = %11.8f", minTimeForAllStepsWithIdxI)
for i, mean := range meansByStepId {
// we can't just do (i + 0.5) * minTimeforAllStepsWithIdxI
// because this doesn't account for drift
idealMean := 0.5 * minTimeForAllStepsWithIdxI
if i > 0 {
previousMean := meansByStepId[i-1]
idealMean = previousMean + minTimeForAllStepsWithIdxI
}
deltaFromIdeal := idealMean - mean
t.Logf("step %02d delta from ideal mean wake time: %11.8f - %11.8f = %11.8f", i, idealMean, mean, deltaFromIdeal)
assert.True(t, math.Abs(deltaFromIdeal) < 0.05)
}
}

View File

@ -1,557 +0,0 @@
// Package fsrep implements replication of a single file system with existing versions
// from a sender to a receiver.
package fsrep
import (
"context"
"errors"
"fmt"
"net"
"sync"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/zrepl/zrepl/logger"
"github.com/zrepl/zrepl/replication/pdu"
"github.com/zrepl/zrepl/util/bytecounter"
"github.com/zrepl/zrepl/util/watchdog"
"github.com/zrepl/zrepl/zfs"
)
type contextKey int
const (
contextKeyLogger contextKey = iota
)
type Logger = logger.Logger
func WithLogger(ctx context.Context, log Logger) context.Context {
return context.WithValue(ctx, contextKeyLogger, log)
}
func getLogger(ctx context.Context) Logger {
l, ok := ctx.Value(contextKeyLogger).(Logger)
if !ok {
l = logger.NewNullLogger()
}
return l
}
// A Sender is usually part of a github.com/zrepl/zrepl/replication.Endpoint.
type Sender interface {
// If a non-nil io.ReadCloser is returned, it is guaranteed to be closed before
// any next call to the parent github.com/zrepl/zrepl/replication.Endpoint.
// If the send request is for dry run the io.ReadCloser will be nil
Send(ctx context.Context, r *pdu.SendReq) (*pdu.SendRes, zfs.StreamCopier, error)
ReplicationCursor(ctx context.Context, req *pdu.ReplicationCursorReq) (*pdu.ReplicationCursorRes, error)
}
// A Sender is usually part of a github.com/zrepl/zrepl/replication.Endpoint.
type Receiver interface {
// Receive sends r and sendStream (the latter containing a ZFS send stream)
// to the parent github.com/zrepl/zrepl/replication.Endpoint.
Receive(ctx context.Context, req *pdu.ReceiveReq, receive zfs.StreamCopier) (*pdu.ReceiveRes, error)
}
type StepReport struct {
From, To string
Status StepState
Problem string
Bytes int64
ExpectedBytes int64 // 0 means no size estimate possible
}
type Report struct {
Filesystem string
Status string
Problem string
Completed, Pending []*StepReport
}
//go:generate enumer -type=State
type State uint
const (
Ready State = 1 << iota
Completed
)
type Error interface {
error
Temporary() bool
ContextErr() bool
LocalToFS() bool
}
type Replication struct {
promBytesReplicated prometheus.Counter
fs string
// lock protects all fields below it in this struct, but not the data behind pointers
lock sync.Mutex
state State
err Error
completed, pending []*ReplicationStep
}
func (f *Replication) State() State {
f.lock.Lock()
defer f.lock.Unlock()
return f.state
}
func (f *Replication) FS() string { return f.fs }
// returns zero value time.Time{} if no more pending steps
func (f *Replication) NextStepDate() time.Time {
if len(f.pending) == 0 {
return time.Time{}
}
return f.pending[0].to.SnapshotTime()
}
func (f *Replication) Err() Error {
f.lock.Lock()
defer f.lock.Unlock()
return f.err
}
func (f *Replication) CanRetry() bool {
f.lock.Lock()
defer f.lock.Unlock()
if f.state == Completed {
return false
}
if f.state != Ready {
panic(fmt.Sprintf("implementation error: %v", f.state))
}
if f.err == nil {
return true
}
return f.err.Temporary()
}
func (f *Replication) UpdateSizeEsitmate(ctx context.Context, sender Sender) error {
f.lock.Lock()
defer f.lock.Unlock()
for _, e := range f.pending {
if err := e.updateSizeEstimate(ctx, sender); err != nil {
return err
}
}
return nil
}
type ReplicationBuilder struct {
r *Replication
}
func BuildReplication(fs string, promBytesReplicated prometheus.Counter) *ReplicationBuilder {
return &ReplicationBuilder{&Replication{fs: fs, promBytesReplicated: promBytesReplicated}}
}
func (b *ReplicationBuilder) AddStep(from, to FilesystemVersion) *ReplicationBuilder {
step := &ReplicationStep{
state: StepReplicationReady,
parent: b.r,
from: from,
to: to,
}
b.r.pending = append(b.r.pending, step)
return b
}
func (b *ReplicationBuilder) Done() (r *Replication) {
if len(b.r.pending) > 0 {
b.r.state = Ready
} else {
b.r.state = Completed
}
r = b.r
b.r = nil
return r
}
type ReplicationConflictError struct {
Err error
}
func (e *ReplicationConflictError) Timeout() bool { return false }
func (e *ReplicationConflictError) Temporary() bool { return false }
func (e *ReplicationConflictError) Error() string { return fmt.Sprintf("permanent error: %s", e.Err.Error()) }
func (e *ReplicationConflictError) LocalToFS() bool { return true }
func (e *ReplicationConflictError) ContextErr() bool { return false }
func NewReplicationConflictError(fs string, err error) *Replication {
return &Replication{
state: Completed,
fs: fs,
err: &ReplicationConflictError{Err: err},
}
}
//go:generate enumer -type=StepState
type StepState uint
const (
StepReplicationReady StepState = 1 << iota
StepMarkReplicatedReady
StepCompleted
)
func (s StepState) IsTerminal() bool { return s == StepCompleted }
type FilesystemVersion interface {
SnapshotTime() time.Time
GetName() string // name without @ or #
RelName() string // name with @ or #
}
type ReplicationStep struct {
// only protects state, err
// from, to and parent are assumed to be immutable
lock sync.Mutex
state StepState
from, to FilesystemVersion
parent *Replication
// both retry and permanent error
err error
byteCounter bytecounter.StreamCopier
expectedSize int64 // 0 means no size estimate present / possible
}
func (f *Replication) Retry(ctx context.Context, ka *watchdog.KeepAlive, sender Sender, receiver Receiver) Error {
var u updater = func(fu func(*Replication)) State {
f.lock.Lock()
defer f.lock.Unlock()
if fu != nil {
fu(f)
}
return f.state
}
var current *ReplicationStep
pre := u(nil)
getLogger(ctx).WithField("fsrep_state", pre).Debug("begin fsrep.Retry")
defer func() {
post := u(nil)
getLogger(ctx).WithField("fsrep_transition", post).Debug("end fsrep.Retry")
}()
st := u(func(f *Replication) {
if len(f.pending) == 0 {
f.state = Completed
return
}
current = f.pending[0]
})
if st == Completed {
return nil
}
if st != Ready {
panic(fmt.Sprintf("implementation error: %v", st))
}
stepCtx := WithLogger(ctx, getLogger(ctx).WithField("step", current))
getLogger(stepCtx).Debug("take step")
err := current.Retry(stepCtx, ka, sender, receiver)
if err != nil {
getLogger(stepCtx).WithError(err).Error("step could not be completed")
}
u(func(fsr *Replication) {
if err != nil {
f.err = &StepError{stepStr: current.String(), err: err}
return
}
if err == nil && current.state != StepCompleted {
panic(fmt.Sprintf("implementation error: %v", current.state))
}
f.err = nil
f.completed = append(f.completed, current)
f.pending = f.pending[1:]
if len(f.pending) > 0 {
f.state = Ready
} else {
f.state = Completed
}
})
var retErr Error = nil
u(func(fsr *Replication) {
retErr = fsr.err
})
return retErr
}
type updater func(func(fsr *Replication)) State
type state func(ctx context.Context, ka *watchdog.KeepAlive, sender Sender, receiver Receiver, u updater) state
type StepError struct {
stepStr string
err error
}
var _ Error = &StepError{}
func (e StepError) Error() string {
if e.LocalToFS() {
return fmt.Sprintf("step %s failed: %s", e.stepStr, e.err)
}
return e.err.Error()
}
func (e StepError) Timeout() bool {
if neterr, ok := e.err.(net.Error); ok {
return neterr.Timeout()
}
return false
}
func (e StepError) Temporary() bool {
if neterr, ok := e.err.(net.Error); ok {
return neterr.Temporary()
}
return false
}
func (e StepError) LocalToFS() bool {
if _, ok := e.err.(net.Error); ok {
return false
}
return true // conservative approximation: we'd like to check for specific errors returned over RPC here...
}
func (e StepError) ContextErr() bool {
switch e.err {
case context.Canceled:
return true
case context.DeadlineExceeded:
return true
}
return false
}
func (fsr *Replication) Report() *Report {
fsr.lock.Lock()
defer fsr.lock.Unlock()
rep := Report{
Filesystem: fsr.fs,
Status: fsr.state.String(),
}
if fsr.err != nil && fsr.err.LocalToFS() {
rep.Problem = fsr.err.Error()
}
rep.Completed = make([]*StepReport, len(fsr.completed))
for i := range fsr.completed {
rep.Completed[i] = fsr.completed[i].Report()
}
rep.Pending = make([]*StepReport, len(fsr.pending))
for i := range fsr.pending {
rep.Pending[i] = fsr.pending[i].Report()
}
return &rep
}
func (s *ReplicationStep) Retry(ctx context.Context, ka *watchdog.KeepAlive, sender Sender, receiver Receiver) error {
switch s.state {
case StepReplicationReady:
return s.doReplication(ctx, ka, sender, receiver)
case StepMarkReplicatedReady:
return s.doMarkReplicated(ctx, ka, sender)
case StepCompleted:
return nil
}
panic(fmt.Sprintf("implementation error: %v", s.state))
}
func (s *ReplicationStep) Error() error {
if s.state & (StepReplicationReady|StepMarkReplicatedReady) != 0 {
return s.err
}
return nil
}
func (s *ReplicationStep) doReplication(ctx context.Context, ka *watchdog.KeepAlive, sender Sender, receiver Receiver) error {
if s.state != StepReplicationReady {
panic(fmt.Sprintf("implementation error: %v", s.state))
}
fs := s.parent.fs
log := getLogger(ctx)
sr := s.buildSendRequest(false)
log.Debug("initiate send request")
sres, sstreamCopier, err := sender.Send(ctx, sr)
if err != nil {
log.WithError(err).Error("send request failed")
return err
}
if sstreamCopier == nil {
err := errors.New("send request did not return a stream, broken endpoint implementation")
return err
}
defer sstreamCopier.Close()
// Install a byte counter to track progress + for status report
s.byteCounter = bytecounter.NewStreamCopier(sstreamCopier)
byteCounterStopProgress := make(chan struct{})
defer close(byteCounterStopProgress)
go func() {
var lastCount int64
t := time.NewTicker(1 * time.Second)
defer t.Stop()
for {
select {
case <-byteCounterStopProgress:
return
case <-t.C:
newCount := s.byteCounter.Count()
if lastCount != newCount {
ka.MadeProgress()
} else {
lastCount = newCount
}
}
}
}()
defer func() {
s.parent.promBytesReplicated.Add(float64(s.byteCounter.Count()))
}()
rr := &pdu.ReceiveReq{
Filesystem: fs,
ClearResumeToken: !sres.UsedResumeToken,
}
log.Debug("initiate receive request")
_, err = receiver.Receive(ctx, rr, s.byteCounter)
if err != nil {
log.
WithError(err).
WithField("errType", fmt.Sprintf("%T", err)).
Error("receive request failed (might also be error on sender)")
// This failure could be due to
// - an unexpected exit of ZFS on the sending side
// - an unexpected exit of ZFS on the receiving side
// - a connectivity issue
return err
}
log.Debug("receive finished")
ka.MadeProgress()
s.state = StepMarkReplicatedReady
return s.doMarkReplicated(ctx, ka, sender)
}
func (s *ReplicationStep) doMarkReplicated(ctx context.Context, ka *watchdog.KeepAlive, sender Sender) error {
if s.state != StepMarkReplicatedReady {
panic(fmt.Sprintf("implementation error: %v", s.state))
}
log := getLogger(ctx)
log.Debug("advance replication cursor")
req := &pdu.ReplicationCursorReq{
Filesystem: s.parent.fs,
Op: &pdu.ReplicationCursorReq_Set{
Set: &pdu.ReplicationCursorReq_SetOp{
Snapshot: s.to.GetName(),
},
},
}
_, err := sender.ReplicationCursor(ctx, req)
if err != nil {
log.WithError(err).Error("error advancing replication cursor")
return err
}
ka.MadeProgress()
s.state = StepCompleted
return err
}
func (s *ReplicationStep) updateSizeEstimate(ctx context.Context, sender Sender) error {
log := getLogger(ctx)
sr := s.buildSendRequest(true)
log.Debug("initiate dry run send request")
sres, _, err := sender.Send(ctx, sr)
if err != nil {
log.WithError(err).Error("dry run send request failed")
return err
}
s.expectedSize = sres.ExpectedSize
return nil
}
func (s *ReplicationStep) buildSendRequest(dryRun bool) (sr *pdu.SendReq) {
fs := s.parent.fs
if s.from == nil {
sr = &pdu.SendReq{
Filesystem: fs,
To: s.to.RelName(),
DryRun: dryRun,
}
} else {
sr = &pdu.SendReq{
Filesystem: fs,
From: s.from.RelName(),
To: s.to.RelName(),
DryRun: dryRun,
}
}
return sr
}
func (s *ReplicationStep) String() string {
if s.from == nil { // FIXME: ZFS semantics are that to is nil on non-incremental send
return fmt.Sprintf("%s%s (full)", s.parent.fs, s.to.RelName())
} else {
return fmt.Sprintf("%s(%s => %s)", s.parent.fs, s.from.RelName(), s.to.RelName())
}
}
func (s *ReplicationStep) Report() *StepReport {
var from string // FIXME follow same convention as ZFS: to should be nil on full send
if s.from != nil {
from = s.from.RelName()
}
bytes := int64(0)
if s.byteCounter != nil {
bytes = s.byteCounter.Count()
}
problem := ""
if s.err != nil {
problem = s.err.Error()
}
rep := StepReport{
From: from,
To: s.to.RelName(),
Status: s.state,
Problem: problem,
Bytes: bytes,
ExpectedBytes: s.expectedSize,
}
return &rep
}

View File

@ -1,50 +0,0 @@
// Code generated by "enumer -type=State"; DO NOT EDIT.
package fsrep
import (
"fmt"
)
const _StateName = "ReadyCompleted"
var _StateIndex = [...]uint8{0, 5, 14}
func (i State) String() string {
i -= 1
if i >= State(len(_StateIndex)-1) {
return fmt.Sprintf("State(%d)", i+1)
}
return _StateName[_StateIndex[i]:_StateIndex[i+1]]
}
var _StateValues = []State{1, 2}
var _StateNameToValueMap = map[string]State{
_StateName[0:5]: 1,
_StateName[5:14]: 2,
}
// StateString retrieves an enum value from the enum constants string name.
// Throws an error if the param is not part of the enum.
func StateString(s string) (State, error) {
if val, ok := _StateNameToValueMap[s]; ok {
return val, nil
}
return 0, fmt.Errorf("%s does not belong to State values", s)
}
// StateValues returns all values of the enum
func StateValues() []State {
return _StateValues
}
// IsAState returns "true" if the value is listed in the enum definition. "false" otherwise
func (i State) IsAState() bool {
for _, v := range _StateValues {
if i == v {
return true
}
}
return false
}

View File

@ -1,61 +0,0 @@
// Code generated by "enumer -type=StepState"; DO NOT EDIT.
package fsrep
import (
"fmt"
)
const (
_StepStateName_0 = "StepReplicationReadyStepMarkReplicatedReady"
_StepStateName_1 = "StepCompleted"
)
var (
_StepStateIndex_0 = [...]uint8{0, 20, 43}
_StepStateIndex_1 = [...]uint8{0, 13}
)
func (i StepState) String() string {
switch {
case 1 <= i && i <= 2:
i -= 1
return _StepStateName_0[_StepStateIndex_0[i]:_StepStateIndex_0[i+1]]
case i == 4:
return _StepStateName_1
default:
return fmt.Sprintf("StepState(%d)", i)
}
}
var _StepStateValues = []StepState{1, 2, 4}
var _StepStateNameToValueMap = map[string]StepState{
_StepStateName_0[0:20]: 1,
_StepStateName_0[20:43]: 2,
_StepStateName_1[0:13]: 4,
}
// StepStateString retrieves an enum value from the enum constants string name.
// Throws an error if the param is not part of the enum.
func StepStateString(s string) (StepState, error) {
if val, ok := _StepStateNameToValueMap[s]; ok {
return val, nil
}
return 0, fmt.Errorf("%s does not belong to StepState values", s)
}
// StepStateValues returns all values of the enum
func StepStateValues() []StepState {
return _StepStateValues
}
// IsAStepState returns "true" if the value is listed in the enum definition. "false" otherwise
func (i StepState) IsAStepState() bool {
for _, v := range _StepStateValues {
if i == v {
return true
}
}
return false
}

View File

@ -3,7 +3,7 @@ package mainfsm
import (
"sort"
. "github.com/zrepl/zrepl/replication/pdu"
. "github.com/zrepl/zrepl/replication/logic/pdu"
)
type ConflictNoCommonAncestor struct {

View File

@ -0,0 +1,430 @@
package logic
import (
"context"
"errors"
"fmt"
"sync"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/zrepl/zrepl/replication/driver"
. "github.com/zrepl/zrepl/replication/logic/diff"
"github.com/zrepl/zrepl/replication/logic/pdu"
"github.com/zrepl/zrepl/replication/report"
"github.com/zrepl/zrepl/util/bytecounter"
"github.com/zrepl/zrepl/zfs"
)
// Endpoint represents one side of the replication.
//
// An endpoint is either in Sender or Receiver mode, represented by the correspondingly
// named interfaces defined in this package.
type Endpoint interface {
// Does not include placeholder filesystems
ListFilesystems(ctx context.Context, req *pdu.ListFilesystemReq) (*pdu.ListFilesystemRes, error)
ListFilesystemVersions(ctx context.Context, req *pdu.ListFilesystemVersionsReq) (*pdu.ListFilesystemVersionsRes, error)
DestroySnapshots(ctx context.Context, req *pdu.DestroySnapshotsReq) (*pdu.DestroySnapshotsRes, error)
}
type Sender interface {
Endpoint
// If a non-nil io.ReadCloser is returned, it is guaranteed to be closed before
// any next call to the parent github.com/zrepl/zrepl/replication.Endpoint.
// If the send request is for dry run the io.ReadCloser will be nil
Send(ctx context.Context, r *pdu.SendReq) (*pdu.SendRes, zfs.StreamCopier, error)
ReplicationCursor(ctx context.Context, req *pdu.ReplicationCursorReq) (*pdu.ReplicationCursorRes, error)
}
type Receiver interface {
Endpoint
// Receive sends r and sendStream (the latter containing a ZFS send stream)
// to the parent github.com/zrepl/zrepl/replication.Endpoint.
Receive(ctx context.Context, req *pdu.ReceiveReq, receive zfs.StreamCopier) (*pdu.ReceiveRes, error)
}
type Planner struct {
sender Sender
receiver Receiver
promSecsPerState *prometheus.HistogramVec // labels: state
promBytesReplicated *prometheus.CounterVec // labels: filesystem
}
func (p *Planner) Plan(ctx context.Context) ([]driver.FS, error) {
fss, err := p.doPlanning(ctx)
if err != nil {
return nil, err
}
dfss := make([]driver.FS, len(fss))
for i := range dfss {
dfss[i] = fss[i]
}
return dfss, nil
}
type Filesystem struct {
sender Sender
receiver Receiver
Path string // compat
receiverFSExists bool // compat
promBytesReplicated prometheus.Counter // compat
}
func (f *Filesystem) PlanFS(ctx context.Context) ([]driver.Step, error) {
steps, err := f.doPlanning(ctx)
if err != nil {
return nil, err
}
dsteps := make([]driver.Step, len(steps))
for i := range dsteps {
dsteps[i] = steps[i]
}
return dsteps, nil
}
func (f *Filesystem) ReportInfo() *report.FilesystemInfo {
return &report.FilesystemInfo{Name: f.Path} // FIXME compat name
}
type Step struct {
sender Sender
receiver Receiver
parent *Filesystem
from, to *pdu.FilesystemVersion // compat
byteCounter bytecounter.StreamCopier
expectedSize int64 // 0 means no size estimate present / possible
}
func (s *Step) TargetDate() time.Time {
return s.to.SnapshotTime() // FIXME compat name
}
func (s *Step) Step(ctx context.Context) error {
return s.doReplication(ctx)
}
func (s *Step) ReportInfo() *report.StepInfo {
var byteCounter int64
if s.byteCounter != nil {
byteCounter = s.byteCounter.Count()
}
return &report.StepInfo{
From: s.from.RelName(),
To: s.to.RelName(),
BytesExpected: s.expectedSize,
BytesReplicated: byteCounter,
}
}
func NewPlanner(secsPerState *prometheus.HistogramVec, bytesReplicated *prometheus.CounterVec, sender Sender, receiver Receiver) *Planner {
return &Planner{
sender: sender,
receiver: receiver,
promSecsPerState: secsPerState,
promBytesReplicated: bytesReplicated,
}
}
func resolveConflict(conflict error) (path []*pdu.FilesystemVersion, msg string) {
if noCommonAncestor, ok := conflict.(*ConflictNoCommonAncestor); ok {
if len(noCommonAncestor.SortedReceiverVersions) == 0 {
// TODO this is hard-coded replication policy: most recent snapshot as source
var mostRecentSnap *pdu.FilesystemVersion
for n := len(noCommonAncestor.SortedSenderVersions) - 1; n >= 0; n-- {
if noCommonAncestor.SortedSenderVersions[n].Type == pdu.FilesystemVersion_Snapshot {
mostRecentSnap = noCommonAncestor.SortedSenderVersions[n]
break
}
}
if mostRecentSnap == nil {
return nil, "no snapshots available on sender side"
}
return []*pdu.FilesystemVersion{mostRecentSnap}, fmt.Sprintf("start replication at most recent snapshot %s", mostRecentSnap.RelName())
}
}
return nil, "no automated way to handle conflict type"
}
func (p *Planner) doPlanning(ctx context.Context) ([]*Filesystem, error) {
log := getLogger(ctx)
log.Info("start planning")
slfssres, err := p.sender.ListFilesystems(ctx, &pdu.ListFilesystemReq{})
if err != nil {
log.WithError(err).WithField("errType", fmt.Sprintf("%T", err)).Error("error listing sender filesystems")
return nil, err
}
sfss := slfssres.GetFilesystems()
// no progress here since we could run in a live-lock on connectivity issues
rlfssres, err := p.receiver.ListFilesystems(ctx, &pdu.ListFilesystemReq{})
if err != nil {
log.WithError(err).WithField("errType", fmt.Sprintf("%T", err)).Error("error listing receiver filesystems")
return nil, err
}
rfss := rlfssres.GetFilesystems()
q := make([]*Filesystem, 0, len(sfss))
for _, fs := range sfss {
receiverFSExists := false
for _, rfs := range rfss {
if rfs.Path == fs.Path {
receiverFSExists = true
}
}
ctr := p.promBytesReplicated.WithLabelValues(fs.Path)
q = append(q, &Filesystem{
sender: p.sender,
receiver: p.receiver,
Path: fs.Path,
receiverFSExists: receiverFSExists,
promBytesReplicated: ctr,
})
}
return q, nil
}
func (fs *Filesystem) doPlanning(ctx context.Context) ([]*Step, error) {
log := getLogger(ctx).WithField("filesystem", fs.Path)
log.Debug("assessing filesystem")
sfsvsres, err := fs.sender.ListFilesystemVersions(ctx, &pdu.ListFilesystemVersionsReq{Filesystem: fs.Path})
if err != nil {
log.WithError(err).Error("cannot get remote filesystem versions")
return nil, err
}
sfsvs := sfsvsres.GetVersions()
if len(sfsvs) < 1 {
err := errors.New("sender does not have any versions")
log.Error(err.Error())
return nil, err
}
var rfsvs []*pdu.FilesystemVersion
if fs.receiverFSExists {
rfsvsres, err := fs.receiver.ListFilesystemVersions(ctx, &pdu.ListFilesystemVersionsReq{Filesystem: fs.Path})
if err != nil {
log.WithError(err).Error("receiver error")
return nil, err
}
rfsvs = rfsvsres.GetVersions()
} else {
rfsvs = []*pdu.FilesystemVersion{}
}
path, conflict := IncrementalPath(rfsvs, sfsvs)
if conflict != nil {
var msg string
path, msg = resolveConflict(conflict) // no shadowing allowed!
if path != nil {
log.WithField("conflict", conflict).Info("conflict")
log.WithField("resolution", msg).Info("automatically resolved")
} else {
log.WithField("conflict", conflict).Error("conflict")
log.WithField("problem", msg).Error("cannot resolve conflict")
}
}
if path == nil {
return nil, conflict
}
steps := make([]*Step, 0, len(path))
// FIXME unify struct declarations => initializer?
if len(path) == 1 {
steps = append(steps, &Step{
parent: fs,
sender: fs.sender,
receiver: fs.receiver,
from: nil,
to: path[0],
})
} else {
for i := 0; i < len(path)-1; i++ {
steps = append(steps, &Step{
parent: fs,
sender: fs.sender,
receiver: fs.receiver,
from: path[i],
to: path[i+1],
})
}
}
log.Debug("compute send size estimate")
errs := make(chan error, len(steps))
var wg sync.WaitGroup
fanOutCtx, fanOutCancel := context.WithCancel(ctx)
defer fanOutCancel()
for _, step := range steps {
wg.Add(1)
go func(step *Step) {
defer wg.Done()
err := step.updateSizeEstimate(fanOutCtx)
if err != nil {
log.WithError(err).WithField("step", step).Error("error computing size estimate")
fanOutCancel()
}
errs <- err
}(step)
}
wg.Wait()
close(errs)
var significantErr error = nil
for err := range errs {
if err != nil {
if significantErr == nil || significantErr == context.Canceled {
significantErr = err
}
}
}
if significantErr != nil {
return nil, significantErr
}
log.Debug("filesystem planning finished")
return steps, nil
}
// type FilesystemsReplicationFailedError struct {
// FilesystemsWithError []*fsrep.Replication
// }
// func (e FilesystemsReplicationFailedError) Error() string {
// allSame := true
// lastErr := e.FilesystemsWithError[0].Err().Error()
// for _, fs := range e.FilesystemsWithError {
// fsErr := fs.Err().Error()
// allSame = allSame && lastErr == fsErr
// }
// fsstr := "multiple filesystems"
// if len(e.FilesystemsWithError) == 1 {
// fsstr = fmt.Sprintf("filesystem %s", e.FilesystemsWithError[0].FS())
// }
// errorStr := lastErr
// if !allSame {
// errorStr = "multiple different errors"
// }
// return fmt.Sprintf("%s could not be replicated: %s", fsstr, errorStr)
// }
func (s *Step) updateSizeEstimate(ctx context.Context) error {
log := getLogger(ctx)
sr := s.buildSendRequest(true)
log.Debug("initiate dry run send request")
sres, _, err := s.sender.Send(ctx, sr)
if err != nil {
log.WithError(err).Error("dry run send request failed")
return err
}
s.expectedSize = sres.ExpectedSize
return nil
}
func (s *Step) buildSendRequest(dryRun bool) (sr *pdu.SendReq) {
fs := s.parent.Path
if s.from == nil {
sr = &pdu.SendReq{
Filesystem: fs,
To: s.to.RelName(),
DryRun: dryRun,
}
} else {
sr = &pdu.SendReq{
Filesystem: fs,
From: s.from.RelName(),
To: s.to.RelName(),
DryRun: dryRun,
}
}
return sr
}
func (s *Step) doReplication(ctx context.Context) error {
fs := s.parent.Path
log := getLogger(ctx)
sr := s.buildSendRequest(false)
log.Debug("initiate send request")
sres, sstreamCopier, err := s.sender.Send(ctx, sr)
if err != nil {
log.WithError(err).Error("send request failed")
return err
}
if sstreamCopier == nil {
err := errors.New("send request did not return a stream, broken endpoint implementation")
return err
}
defer sstreamCopier.Close()
// Install a byte counter to track progress + for status report
s.byteCounter = bytecounter.NewStreamCopier(sstreamCopier)
defer func() {
s.parent.promBytesReplicated.Add(float64(s.byteCounter.Count()))
}()
rr := &pdu.ReceiveReq{
Filesystem: fs,
ClearResumeToken: !sres.UsedResumeToken,
}
log.Debug("initiate receive request")
_, err = s.receiver.Receive(ctx, rr, s.byteCounter)
if err != nil {
log.
WithError(err).
WithField("errType", fmt.Sprintf("%T", err)).
Error("receive request failed (might also be error on sender)")
// This failure could be due to
// - an unexpected exit of ZFS on the sending side
// - an unexpected exit of ZFS on the receiving side
// - a connectivity issue
return err
}
log.Debug("receive finished")
log.Debug("advance replication cursor")
req := &pdu.ReplicationCursorReq{
Filesystem: fs,
Op: &pdu.ReplicationCursorReq_Set{
Set: &pdu.ReplicationCursorReq_SetOp{
Snapshot: s.to.GetName(),
},
},
}
_, err = s.sender.ReplicationCursor(ctx, req)
if err != nil {
log.WithError(err).Error("error advancing replication cursor")
// If this fails and replication planning restarts, the diff algorithm will find
// that cursor out of place. This is not a problem because then, it would just use another FS
// However, we FIXME have no means to just update the cursor in a
// second replication attempt right after this one where we don't have new snaps yet
return err
}
return err
}
func (s *Step) String() string {
if s.from == nil { // FIXME: ZFS semantics are that to is nil on non-incremental send
return fmt.Sprintf("%s%s (full)", s.parent.Path, s.to.RelName())
} else {
return fmt.Sprintf("%s(%s => %s)", s.parent.Path, s.from.RelName(), s.to.RelName())
}
}

View File

@ -1,9 +1,9 @@
package replication
package logic
import (
"context"
"github.com/zrepl/zrepl/logger"
"github.com/zrepl/zrepl/replication/fsrep"
)
type contextKey int
@ -16,7 +16,6 @@ type Logger = logger.Logger
func WithLogger(ctx context.Context, l Logger) context.Context {
ctx = context.WithValue(ctx, contextKeyLog, l)
ctx = fsrep.WithLogger(ctx, l)
return ctx
}

View File

@ -1,560 +0,0 @@
// Package replication implements replication of filesystems with existing
// versions (snapshots) from a sender to a receiver.
package replication
import (
"context"
"errors"
"fmt"
"github.com/prometheus/client_golang/prometheus"
"github.com/zrepl/zrepl/daemon/job/wakeup"
"github.com/zrepl/zrepl/util/envconst"
"github.com/zrepl/zrepl/util/watchdog"
"math/bits"
"net"
"sort"
"sync"
"time"
"github.com/zrepl/zrepl/replication/fsrep"
. "github.com/zrepl/zrepl/replication/internal/diff"
"github.com/zrepl/zrepl/replication/pdu"
)
//go:generate enumer -type=State
type State uint
const (
Planning State = 1 << iota
PlanningError
Working
WorkingWait
Completed
PermanentError
)
func (s State) rsf() state {
idx := bits.TrailingZeros(uint(s))
if idx == bits.UintSize {
panic(s) // invalid value
}
m := []state{
statePlanning,
statePlanningError,
stateWorking,
stateWorkingWait,
nil,
nil,
}
return m[idx]
}
func (s State) IsTerminal() bool {
return s.rsf() == nil
}
// Replication implements the replication of multiple file systems from a Sender to a Receiver.
//
// It is a state machine that is driven by the Drive method
// and provides asynchronous reporting via the Report method (i.e. from another goroutine).
type Replication struct {
// not protected by lock
promSecsPerState *prometheus.HistogramVec // labels: state
promBytesReplicated *prometheus.CounterVec // labels: filesystem
Progress watchdog.KeepAlive
// lock protects all fields of this struct (but not the fields behind pointers!)
lock sync.Mutex
state State
// Working, WorkingWait, Completed, ContextDone
queue []*fsrep.Replication
completed []*fsrep.Replication
active *fsrep.Replication // == queue[0] or nil, unlike in Report
// for PlanningError, WorkingWait and ContextError and Completed
err error
// PlanningError, WorkingWait
sleepUntil time.Time
}
type Report struct {
Status string
Problem string
SleepUntil time.Time
Completed []*fsrep.Report
Pending []*fsrep.Report
Active *fsrep.Report // not contained in Pending, unlike in struct Replication
}
func NewReplication(secsPerState *prometheus.HistogramVec, bytesReplicated *prometheus.CounterVec) *Replication {
r := Replication{
promSecsPerState: secsPerState,
promBytesReplicated: bytesReplicated,
state: Planning,
}
return &r
}
// Endpoint represents one side of the replication.
//
// An endpoint is either in Sender or Receiver mode, represented by the correspondingly
// named interfaces defined in this package.
type Endpoint interface {
// Does not include placeholder filesystems
ListFilesystems(ctx context.Context, req *pdu.ListFilesystemReq) (*pdu.ListFilesystemRes, error)
ListFilesystemVersions(ctx context.Context, req *pdu.ListFilesystemVersionsReq) (*pdu.ListFilesystemVersionsRes, error)
DestroySnapshots(ctx context.Context, req *pdu.DestroySnapshotsReq) (*pdu.DestroySnapshotsRes, error)
}
type Sender interface {
Endpoint
fsrep.Sender
}
type Receiver interface {
Endpoint
fsrep.Receiver
}
type FilteredError struct{ fs string }
func NewFilteredError(fs string) *FilteredError {
return &FilteredError{fs}
}
func (f FilteredError) Error() string { return "endpoint does not allow access to filesystem " + f.fs }
type updater func(func(*Replication)) (newState State)
type state func(ctx context.Context, ka *watchdog.KeepAlive, sender Sender, receiver Receiver, u updater) state
// Drive starts the state machine and returns only after replication has finished (with or without errors).
// The Logger in ctx is used for both debug and error logging, but is not guaranteed to be stable
// or end-user friendly.
// User-facing replication progress reports and can be obtained using the Report method,
// whose output will not change after Drive returns.
//
// FIXME: Drive may be only called once per instance of Replication
func (r *Replication) Drive(ctx context.Context, sender Sender, receiver Receiver) {
var u updater = func(f func(*Replication)) State {
r.lock.Lock()
defer r.lock.Unlock()
if f != nil {
f(r)
}
return r.state
}
var s state = statePlanning
var pre, post State
for s != nil {
preTime := time.Now()
pre = u(nil)
s = s(ctx, &r.Progress, sender, receiver, u)
delta := time.Now().Sub(preTime)
r.promSecsPerState.WithLabelValues(pre.String()).Observe(delta.Seconds())
post = u(nil)
getLogger(ctx).
WithField("transition", fmt.Sprintf("%s => %s", pre, post)).
WithField("duration", delta).
Debug("main state transition")
if post == Working && pre != post {
getLogger(ctx).Info("start working")
}
}
getLogger(ctx).
WithField("final_state", post).
Debug("main final state")
}
func resolveConflict(conflict error) (path []*pdu.FilesystemVersion, msg string) {
if noCommonAncestor, ok := conflict.(*ConflictNoCommonAncestor); ok {
if len(noCommonAncestor.SortedReceiverVersions) == 0 {
// TODO this is hard-coded replication policy: most recent snapshot as source
var mostRecentSnap *pdu.FilesystemVersion
for n := len(noCommonAncestor.SortedSenderVersions) - 1; n >= 0; n-- {
if noCommonAncestor.SortedSenderVersions[n].Type == pdu.FilesystemVersion_Snapshot {
mostRecentSnap = noCommonAncestor.SortedSenderVersions[n]
break
}
}
if mostRecentSnap == nil {
return nil, "no snapshots available on sender side"
}
return []*pdu.FilesystemVersion{mostRecentSnap}, fmt.Sprintf("start replication at most recent snapshot %s", mostRecentSnap.RelName())
}
}
return nil, "no automated way to handle conflict type"
}
var RetryInterval = envconst.Duration("ZREPL_REPLICATION_RETRY_INTERVAL", 10 * time.Second)
type Error interface {
error
Temporary() bool
}
var _ Error = fsrep.Error(nil)
var _ Error = net.Error(nil)
func isPermanent(err error) bool {
if e, ok := err.(Error); ok {
return !e.Temporary()
}
return true
}
func statePlanning(ctx context.Context, ka *watchdog.KeepAlive, sender Sender, receiver Receiver, u updater) state {
log := getLogger(ctx)
log.Info("start planning")
handlePlanningError := func(err error) state {
return u(func(r *Replication) {
ge := GlobalError{Err: err, Temporary: !isPermanent(err)}
log.WithError(ge).Error("encountered global error while planning replication")
r.err = ge
if !ge.Temporary {
r.state = PermanentError
} else {
r.sleepUntil = time.Now().Add(RetryInterval)
r.state = PlanningError
}
}).rsf()
}
slfssres, err := sender.ListFilesystems(ctx, &pdu.ListFilesystemReq{})
if err != nil {
log.WithError(err).WithField("errType", fmt.Sprintf("%T", err)).Error("error listing sender filesystems")
return handlePlanningError(err)
}
sfss := slfssres.GetFilesystems()
// no progress here since we could run in a live-lock on connectivity issues
rlfssres, err := receiver.ListFilesystems(ctx, &pdu.ListFilesystemReq{})
if err != nil {
log.WithError(err).WithField("errType", fmt.Sprintf("%T", err)).Error("error listing receiver filesystems")
return handlePlanningError(err)
}
rfss := rlfssres.GetFilesystems()
ka.MadeProgress() // for both sender and receiver
q := make([]*fsrep.Replication, 0, len(sfss))
mainlog := log
for _, fs := range sfss {
log := mainlog.WithField("filesystem", fs.Path)
log.Debug("assessing filesystem")
sfsvsres, err := sender.ListFilesystemVersions(ctx, &pdu.ListFilesystemVersionsReq{Filesystem: fs.Path})
if err != nil {
log.WithError(err).Error("cannot get remote filesystem versions")
return handlePlanningError(err)
}
sfsvs := sfsvsres.GetVersions()
ka.MadeProgress()
if len(sfsvs) < 1 {
err := errors.New("sender does not have any versions")
log.Error(err.Error())
q = append(q, fsrep.NewReplicationConflictError(fs.Path, err))
continue
}
receiverFSExists := false
for _, rfs := range rfss {
if rfs.Path == fs.Path {
receiverFSExists = true
}
}
var rfsvs []*pdu.FilesystemVersion
if receiverFSExists {
rfsvsres, err := receiver.ListFilesystemVersions(ctx, &pdu.ListFilesystemVersionsReq{Filesystem: fs.Path})
if err != nil {
if _, ok := err.(*FilteredError); ok {
log.Info("receiver ignores filesystem")
continue
}
log.WithError(err).Error("receiver error")
return handlePlanningError(err)
}
rfsvs = rfsvsres.GetVersions()
} else {
rfsvs = []*pdu.FilesystemVersion{}
}
ka.MadeProgress()
path, conflict := IncrementalPath(rfsvs, sfsvs)
if conflict != nil {
var msg string
path, msg = resolveConflict(conflict) // no shadowing allowed!
if path != nil {
log.WithField("conflict", conflict).Info("conflict")
log.WithField("resolution", msg).Info("automatically resolved")
} else {
log.WithField("conflict", conflict).Error("conflict")
log.WithField("problem", msg).Error("cannot resolve conflict")
}
}
ka.MadeProgress()
if path == nil {
q = append(q, fsrep.NewReplicationConflictError(fs.Path, conflict))
continue
}
var promBytesReplicated *prometheus.CounterVec
u(func(replication *Replication) { // FIXME args struct like in pruner (also use for sender and receiver)
promBytesReplicated = replication.promBytesReplicated
})
fsrfsm := fsrep.BuildReplication(fs.Path, promBytesReplicated.WithLabelValues(fs.Path))
if len(path) == 1 {
fsrfsm.AddStep(nil, path[0])
} else {
for i := 0; i < len(path)-1; i++ {
fsrfsm.AddStep(path[i], path[i+1])
}
}
qitem := fsrfsm.Done()
ka.MadeProgress()
log.Debug("compute send size estimate")
if err = qitem.UpdateSizeEsitmate(ctx, sender); err != nil {
log.WithError(err).Error("error computing size estimate")
return handlePlanningError(err)
}
ka.MadeProgress()
q = append(q, qitem)
}
ka.MadeProgress()
return u(func(r *Replication) {
r.completed = nil
r.queue = q
r.err = nil
r.state = Working
}).rsf()
}
func statePlanningError(ctx context.Context, ka *watchdog.KeepAlive, sender Sender, receiver Receiver, u updater) state {
var sleepUntil time.Time
u(func(r *Replication) {
sleepUntil = r.sleepUntil
})
t := time.NewTimer(sleepUntil.Sub(time.Now()))
getLogger(ctx).WithField("until", sleepUntil).Info("retry wait after planning error")
defer t.Stop()
select {
case <-ctx.Done():
return u(func(r *Replication) {
r.state = PermanentError
r.err = ctx.Err()
}).rsf()
case <-t.C:
case <-wakeup.Wait(ctx):
}
return u(func(r *Replication) {
r.state = Planning
}).rsf()
}
type GlobalError struct {
Err error
Temporary bool
}
func (e GlobalError) Error() string {
errClass := "temporary"
if !e.Temporary {
errClass = "permanent"
}
return fmt.Sprintf("%s global error: %s", errClass, e.Err)
}
type FilesystemsReplicationFailedError struct {
FilesystemsWithError []*fsrep.Replication
}
func (e FilesystemsReplicationFailedError) Error() string {
allSame := true
lastErr := e.FilesystemsWithError[0].Err().Error()
for _, fs := range e.FilesystemsWithError {
fsErr := fs.Err().Error()
allSame = allSame && lastErr == fsErr
}
fsstr := "multiple filesystems"
if len(e.FilesystemsWithError) == 1 {
fsstr = fmt.Sprintf("filesystem %s", e.FilesystemsWithError[0].FS())
}
errorStr := lastErr
if !allSame {
errorStr = "multiple different errors"
}
return fmt.Sprintf("%s could not be replicated: %s", fsstr, errorStr)
}
func stateWorking(ctx context.Context, ka *watchdog.KeepAlive, sender Sender, receiver Receiver, u updater) state {
var active *fsrep.Replication
rsfNext := u(func(r *Replication) {
r.err = nil
newq := make([]*fsrep.Replication, 0, len(r.queue))
for i := range r.queue {
if r.queue[i].CanRetry() {
newq = append(newq, r.queue[i])
} else {
r.completed = append(r.completed, r.queue[i])
}
}
sort.SliceStable(newq, func(i, j int) bool {
return newq[i].NextStepDate().Before(newq[j].NextStepDate())
})
r.queue = newq
if len(r.queue) == 0 {
r.state = Completed
fsWithErr := FilesystemsReplicationFailedError{ // prepare it
FilesystemsWithError: make([]*fsrep.Replication, 0, len(r.completed)),
}
for _, fs := range r.completed {
if fs.CanRetry() {
panic(fmt.Sprintf("implementation error: completed contains retryable FS %s %#v",
fs.FS(), fs.Err()))
}
if fs.Err() != nil {
fsWithErr.FilesystemsWithError = append(fsWithErr.FilesystemsWithError, fs)
}
}
if len(fsWithErr.FilesystemsWithError) > 0 {
r.err = fsWithErr
r.state = PermanentError
}
return
}
active = r.queue[0] // do not dequeue: if it's done, it will be sorted the next time we check for more work
r.active = active
}).rsf()
if active == nil {
return rsfNext
}
activeCtx := fsrep.WithLogger(ctx, getLogger(ctx).WithField("fs", active.FS()))
err := active.Retry(activeCtx, ka, sender, receiver)
u(func(r *Replication) {
r.active = nil
}).rsf()
if err != nil {
if err.ContextErr() && ctx.Err() != nil {
getLogger(ctx).WithError(err).
Info("filesystem replication was cancelled")
u(func(r*Replication) {
r.err = GlobalError{Err: err, Temporary: false}
r.state = PermanentError
})
} else if err.LocalToFS() {
getLogger(ctx).WithError(err).
Error("filesystem replication encountered a filesystem-specific error")
// we stay in this state and let the queuing logic above de-prioritize this failing FS
} else if err.Temporary() {
getLogger(ctx).WithError(err).
Error("filesystem encountered a non-filesystem-specific temporary error, enter retry-wait")
u(func(r *Replication) {
r.err = GlobalError{Err: err, Temporary: true}
r.sleepUntil = time.Now().Add(RetryInterval)
r.state = WorkingWait
}).rsf()
} else {
getLogger(ctx).WithError(err).
Error("encountered a permanent non-filesystem-specific error")
u(func(r *Replication) {
r.err = GlobalError{Err: err, Temporary: false}
r.state = PermanentError
}).rsf()
}
}
return u(nil).rsf()
}
func stateWorkingWait(ctx context.Context, ka *watchdog.KeepAlive, sender Sender, receiver Receiver, u updater) state {
var sleepUntil time.Time
u(func(r *Replication) {
sleepUntil = r.sleepUntil
})
t := time.NewTimer(RetryInterval)
getLogger(ctx).WithField("until", sleepUntil).Info("retry wait after error")
defer t.Stop()
select {
case <-ctx.Done():
return u(func(r *Replication) {
r.state = PermanentError
r.err = ctx.Err()
}).rsf()
case <-t.C:
case <-wakeup.Wait(ctx):
}
return u(func(r *Replication) {
r.state = Working
}).rsf()
}
// Report provides a summary of the progress of the Replication,
// i.e., a condensed dump of the internal state machine.
// Report is safe to be called asynchronously while Drive is running.
func (r *Replication) Report() *Report {
r.lock.Lock()
defer r.lock.Unlock()
rep := Report{
Status: r.state.String(),
SleepUntil: r.sleepUntil,
}
if r.err != nil {
rep.Problem = r.err.Error()
}
if r.state&(Planning|PlanningError) != 0 {
return &rep
}
rep.Pending = make([]*fsrep.Report, 0, len(r.queue))
rep.Completed = make([]*fsrep.Report, 0, len(r.completed)) // room for active (potentially)
// since r.active == r.queue[0], do not contain it in pending output
pending := r.queue
if r.active != nil {
rep.Active = r.active.Report()
pending = r.queue[1:]
}
for _, fsr := range pending {
rep.Pending= append(rep.Pending, fsr.Report())
}
for _, fsr := range r.completed {
rep.Completed = append(rep.Completed, fsr.Report())
}
return &rep
}
func (r *Replication) State() State {
r.lock.Lock()
defer r.lock.Unlock()
return r.state
}

View File

@ -0,0 +1,13 @@
// Package replication implements replication of filesystems with existing
// versions (snapshots) from a sender to a receiver.
package replication
import (
"context"
"github.com/zrepl/zrepl/replication/driver"
)
func Do(ctx context.Context, planner driver.Planner) (driver.ReportFunc, driver.WaitFunc) {
return driver.Do(ctx, planner)
}

View File

@ -0,0 +1,150 @@
package report
import (
"encoding/json"
"time"
)
type Report struct {
StartAt, FinishAt time.Time
Attempts []*AttemptReport
}
var _, _ = json.Marshal(&Report{})
type TimedError struct {
Err string
Time time.Time
}
func NewTimedError(err string, t time.Time) *TimedError {
if err == "" {
panic("error must be empty")
}
if t.IsZero() {
panic("t must be non-zero")
}
return &TimedError{err, t}
}
func (s *TimedError) Error() string {
return s.Err
}
var _, _ = json.Marshal(&TimedError{})
type AttemptReport struct {
State AttemptState
StartAt, FinishAt time.Time
PlanError *TimedError
Filesystems []*FilesystemReport
}
type AttemptState string
const (
AttemptPlanning AttemptState = "planning"
AttemptPlanningError AttemptState = "planning-error"
AttemptFanOutFSs AttemptState = "fan-out-filesystems"
AttemptFanOutError AttemptState = "filesystem-error"
AttemptDone AttemptState = "done"
)
type FilesystemState string
const (
FilesystemPlanning FilesystemState = "planning"
FilesystemPlanningErrored FilesystemState = "planning-error"
FilesystemStepping FilesystemState = "stepping"
FilesystemSteppingErrored FilesystemState = "step-error"
FilesystemDone FilesystemState = "done"
)
type FilesystemReport struct {
Info *FilesystemInfo
State FilesystemState
// Valid in State = FilesystemPlanningErrored
PlanError *TimedError
// Valid in State = FilesystemSteppingErrored
StepError *TimedError
// Valid in State = FilesystemStepping
CurrentStep int
Steps []*StepReport
}
type FilesystemInfo struct {
Name string
}
type StepReport struct {
Info *StepInfo
}
type StepInfo struct {
From, To string
BytesExpected int64
BytesReplicated int64
}
func (a *AttemptReport) BytesSum() (expected, replicated int64) {
for _, fs := range a.Filesystems {
e, r := fs.BytesSum()
expected += e
replicated += r
}
return expected, replicated
}
func (f *FilesystemReport) BytesSum() (expected, replicated int64) {
for _, step := range f.Steps {
expected += step.Info.BytesExpected
replicated += step.Info.BytesReplicated
}
return
}
func (f *AttemptReport) FilesystemsByState() map[FilesystemState][]*FilesystemReport {
r := make(map[FilesystemState][]*FilesystemReport, 4)
for _, fs := range f.Filesystems {
l := r[fs.State]
l = append(l, fs)
r[fs.State] = l
}
return r
}
func (f *FilesystemReport) Error() *TimedError {
switch f.State {
case FilesystemPlanningErrored:
return f.PlanError
case FilesystemSteppingErrored:
return f.StepError
}
return nil
}
// may return nil
func (f *FilesystemReport) NextStep() *StepReport {
switch f.State {
case FilesystemDone:
return nil
case FilesystemPlanningErrored:
return nil
case FilesystemSteppingErrored:
return nil
case FilesystemPlanning:
return nil
case FilesystemStepping:
// invariant is that this is always correct
// TODO what about 0-length Steps but short intermediary state?
return f.Steps[f.CurrentStep]
}
panic("unreachable")
}
func (f *StepReport) IsIncremental() bool {
return f.Info.From != "" // FIXME change to ZFS semantics (To != "")
}

View File

@ -1,76 +0,0 @@
// Code generated by "enumer -type=State"; DO NOT EDIT.
package replication
import (
"fmt"
)
const (
_StateName_0 = "PlanningPlanningError"
_StateName_1 = "Working"
_StateName_2 = "WorkingWait"
_StateName_3 = "Completed"
_StateName_4 = "PermanentError"
)
var (
_StateIndex_0 = [...]uint8{0, 8, 21}
_StateIndex_1 = [...]uint8{0, 7}
_StateIndex_2 = [...]uint8{0, 11}
_StateIndex_3 = [...]uint8{0, 9}
_StateIndex_4 = [...]uint8{0, 14}
)
func (i State) String() string {
switch {
case 1 <= i && i <= 2:
i -= 1
return _StateName_0[_StateIndex_0[i]:_StateIndex_0[i+1]]
case i == 4:
return _StateName_1
case i == 8:
return _StateName_2
case i == 16:
return _StateName_3
case i == 32:
return _StateName_4
default:
return fmt.Sprintf("State(%d)", i)
}
}
var _StateValues = []State{1, 2, 4, 8, 16, 32}
var _StateNameToValueMap = map[string]State{
_StateName_0[0:8]: 1,
_StateName_0[8:21]: 2,
_StateName_1[0:7]: 4,
_StateName_2[0:11]: 8,
_StateName_3[0:9]: 16,
_StateName_4[0:14]: 32,
}
// StateString retrieves an enum value from the enum constants string name.
// Throws an error if the param is not part of the enum.
func StateString(s string) (State, error) {
if val, ok := _StateNameToValueMap[s]; ok {
return val, nil
}
return 0, fmt.Errorf("%s does not belong to State values", s)
}
// StateValues returns all values of the enum
func StateValues() []State {
return _StateValues
}
// IsAState returns "true" if the value is listed in the enum definition. "false" otherwise
func (i State) IsAState() bool {
for _, v := range _StateValues {
if i == v {
return true
}
}
return false
}

View File

@ -7,7 +7,7 @@ import (
"strings"
"github.com/golang/protobuf/proto"
"github.com/zrepl/zrepl/replication/pdu"
"github.com/zrepl/zrepl/replication/logic/pdu"
"github.com/zrepl/zrepl/rpc/dataconn/stream"
"github.com/zrepl/zrepl/transport"
"github.com/zrepl/zrepl/zfs"

View File

@ -7,7 +7,7 @@ import (
"github.com/golang/protobuf/proto"
"github.com/zrepl/zrepl/logger"
"github.com/zrepl/zrepl/replication/pdu"
"github.com/zrepl/zrepl/replication/logic/pdu"
"github.com/zrepl/zrepl/rpc/dataconn/stream"
"github.com/zrepl/zrepl/transport"
"github.com/zrepl/zrepl/zfs"

View File

@ -24,7 +24,7 @@ import (
"github.com/pkg/profile"
"github.com/zrepl/zrepl/logger"
"github.com/zrepl/zrepl/replication/pdu"
"github.com/zrepl/zrepl/replication/logic/pdu"
"github.com/zrepl/zrepl/rpc/dataconn"
"github.com/zrepl/zrepl/rpc/dataconn/timeoutconn"
"github.com/zrepl/zrepl/transport"

View File

@ -36,7 +36,7 @@ func ClientConn(cn transport.Connecter, log Logger) *grpc.ClientConn {
})
dialerOption := grpc.WithDialer(grpcclientidentity.NewDialer(log, cn))
cred := grpc.WithTransportCredentials(grpcclientidentity.NewTransportCredentials(log))
ctx, cancel := context.WithTimeout(context.TODO(), 5*time.Second)
ctx, cancel := context.WithTimeout(context.TODO(), 5*time.Second) // FIXME constant
defer cancel()
cc, err := grpc.DialContext(ctx, "doesn't matter done by dialer", dialerOption, cred, ka)
if err != nil {

View File

@ -7,8 +7,8 @@ import (
"google.golang.org/grpc"
"github.com/zrepl/zrepl/replication"
"github.com/zrepl/zrepl/replication/pdu"
"github.com/zrepl/zrepl/replication/logic"
"github.com/zrepl/zrepl/replication/logic/pdu"
"github.com/zrepl/zrepl/rpc/dataconn"
"github.com/zrepl/zrepl/rpc/grpcclientidentity/grpchelper"
"github.com/zrepl/zrepl/rpc/versionhandshake"
@ -26,9 +26,9 @@ type Client struct {
loggers Loggers
}
var _ replication.Endpoint = &Client{}
var _ replication.Sender = &Client{}
var _ replication.Receiver = &Client{}
var _ logic.Endpoint = &Client{}
var _ logic.Sender = &Client{}
var _ logic.Receiver = &Client{}
type DialContextFunc = func(ctx context.Context, network string, addr string) (net.Conn, error)

View File

@ -7,7 +7,7 @@ import (
"google.golang.org/grpc"
"github.com/zrepl/zrepl/endpoint"
"github.com/zrepl/zrepl/replication/pdu"
"github.com/zrepl/zrepl/replication/logic/pdu"
"github.com/zrepl/zrepl/rpc/dataconn"
"github.com/zrepl/zrepl/rpc/grpcclientidentity"
"github.com/zrepl/zrepl/rpc/netadaptor"

View File

@ -0,0 +1,42 @@
// package chainlock implements a mutex whose Lock and Unlock
// methods return the lock itself, to enable chaining.
//
// Intended Usage
//
// defer s.lock().unlock()
// // drop lock while waiting for wait group
// func() {
// defer a.l.Unlock().Lock()
// fssesDone.Wait()
// }()
//
package chainlock
import "sync"
type L struct {
mtx sync.Mutex
}
func New() *L {
return &L{}
}
func (l *L) Lock() *L {
l.mtx.Lock()
return l
}
func (l *L) Unlock() *L {
l.mtx.Unlock()
return l
}
func (l *L) NewCond() *sync.Cond {
return sync.NewCond(&l.mtx)
}
func (l *L) DropWhile(f func()) {
defer l.Unlock().Lock()
f()
}