replication: refactor driving logic (no more explicit state machine)

2025-06-24 11:41:27 +02:00 · 2019-02-22 11:40:27 +01:00 · 2019-02-22 11:40:27 +01:00 · 07b43bffa4
commit 07b43bffa4
parent 0230c6321f
33 changed files with 1594 additions and 1515 deletions
--- a/Gopkg.lock
+++ b/Gopkg.lock
@ -161,6 +161,14 @@
  revision = "3247c84500bff8d9fb6d579d800f20b3e091582c"
  version = "v1.0.0"

+[[projects]]
+  digest = "1:4ff67dde814694496d7aa31be44b900f9717a10c8bc9136b13f49c8ef97f439a"
+  name = "github.com/montanaflynn/stats"
+  packages = ["."]
+  pruneopts = ""
+  revision = "63fbb2597b7a13043b453a4b819945badb8f8926"
+  version = "v0.5.0"
+
 [[projects]]
  branch = "master"
  digest = "1:f60ff065b58bd53e641112b38bbda9d2684deb828393c7ffb89c69a1ee301d17"
@ -245,6 +253,14 @@
  pruneopts = ""
  revision = "8b1c2da0d56deffdbb9e48d4414b4e674bd8083e"

+[[projects]]
+  digest = "1:3962f553b77bf6c03fc07cd687a22dd3b00fe11aa14d31194f5505f5bb65cdc8"
+  name = "github.com/sergi/go-diff"
+  packages = ["diffmatchpatch"]
+  pruneopts = ""
+  revision = "1744e2970ca51c86172c8190fadad617561ed6e7"
+  version = "v1.0.0"
+
 [[projects]]
  branch = "master"
  digest = "1:146327ce93be37e68bd3ff8541090d96da8cb3adc9e35d57570e9170a29f6bf6"
@ -280,6 +296,25 @@
  revision = "93babf24513d0e8277635da8169fcc5a46ae3f6a"
  version = "v1.11.0"

+[[projects]]
+  digest = "1:529ed3f98838f69e13761788d0cc71b44e130058fab13bae2ce09f7a176bced4"
+  name = "github.com/yudai/gojsondiff"
+  packages = [
+    ".",
+    "formatter",
+  ]
+  pruneopts = ""
+  revision = "7b1b7adf999dab73a6eb02669c3d82dbb27a3dd6"
+  version = "1.0.0"
+
+[[projects]]
+  branch = "master"
+  digest = "1:9857bb2293f372b2181004d8b62179bbdb4ab0982ec6f762abe6cf2bfedaff85"
+  name = "github.com/yudai/golcs"
+  packages = ["."]
+  pruneopts = ""
+  revision = "ecda9a501e8220fae3b4b600c3db4b0ba22cfc68"
+
 [[projects]]
  branch = "v2"
  digest = "1:6b8a6afafde7ed31cd0c577ba40d88ce39e8f1c5eb76d7836be7d5b74f1c534a"
@ -406,6 +441,7 @@
    "github.com/jinzhu/copier",
    "github.com/kr/pretty",
    "github.com/mattn/go-isatty",
+    "github.com/montanaflynn/stats",
    "github.com/pkg/errors",
    "github.com/pkg/profile",
    "github.com/problame/go-netssh",
@ -415,6 +451,8 @@
    "github.com/spf13/pflag",
    "github.com/stretchr/testify/assert",
    "github.com/stretchr/testify/require",
+    "github.com/yudai/gojsondiff",
+    "github.com/yudai/gojsondiff/formatter",
    "github.com/zrepl/yaml-config",
    "golang.org/x/net/context",
    "golang.org/x/sys/unix",
--- a/2
+++ b/2
@ -27,7 +27,7 @@ vendordeps:
 	dep ensure -v -vendor-only

 generate: #not part of the build, must do that manually
-	protoc -I=replication/pdu --go_out=plugins=grpc:replication/pdu replication/pdu/pdu.proto
+	protoc -I=replication/logic/pdu --go_out=plugins=grpc:replication/logic/pdu replication/logic/pdu/pdu.proto
 	go generate -x ./...

 build:
--- a/client/status.go
+++ b/client/status.go
@ -10,8 +10,7 @@ import (
 	"github.com/zrepl/zrepl/daemon"
 	"github.com/zrepl/zrepl/daemon/job"
 	"github.com/zrepl/zrepl/daemon/pruner"
-	"github.com/zrepl/zrepl/replication"
-	"github.com/zrepl/zrepl/replication/fsrep"
+	"github.com/zrepl/zrepl/replication/report"
 	"io"
 	"math"
 	"net/http"
@ -342,74 +341,58 @@ func (t *tui) draw() {
 	termbox.Flush()
 }

-func (t *tui) renderReplicationReport(rep *replication.Report, history *bytesProgressHistory) {
+func (t *tui) renderReplicationReport(rep *report.Report, history *bytesProgressHistory) {
 	if rep == nil {
 		t.printf("...\n")
 		return
 	}

-	all := make([]*fsrep.Report, 0, len(rep.Completed)+len(rep.Pending) + 1)
-	all = append(all, rep.Completed...)
-	all = append(all, rep.Pending...)
-	if rep.Active != nil {
-		all = append(all, rep.Active)
-	}
-	sort.Slice(all, func(i, j int) bool {
-		return all[i].Filesystem < all[j].Filesystem
-	})
-
-	state, err := replication.StateString(rep.Status)
-	if err != nil {
-		t.printf("Status: %q (parse error: %q)\n", rep.Status, err)
+	// TODO visualize more than the latest attempt by folding all attempts into one
+	if len(rep.Attempts) == 0 {
+		t.printf("no attempts made yet")
 		return
 	}

-	t.printf("Status: %s", state)
+	latest := rep.Attempts[len(rep.Attempts)-1]
+	sort.Slice(latest.Filesystems, func(i, j int) bool {
+		return latest.Filesystems[i].Info.Name < latest.Filesystems[j].Info.Name
+	})
+
+	t.printf("Status: %s", latest.State)
 	t.newline()
-	if rep.Problem != "" {
+	if latest.State == report.AttemptPlanningError {
 		t.printf("Problem: ")
-		t.printfDrawIndentedAndWrappedIfMultiline("%s", rep.Problem)
+		t.printfDrawIndentedAndWrappedIfMultiline("%s", latest.PlanError)
+		t.newline()
+	} else if latest.State == report.AttemptFanOutError {
+		t.printf("Problem: one or more of the filesystems encountered errors")
 		t.newline()
 	}
-	if rep.SleepUntil.After(time.Now()) && !state.IsTerminal() {
-		t.printf("Sleeping until %s (%s left)\n", rep.SleepUntil, rep.SleepUntil.Sub(time.Now()))
-	}

-	if state != replication.Planning && state != replication.PlanningError {
+	// TODO report sleep time between retry attempts once that is implemented
+
+	if latest.State != report.AttemptPlanning && latest.State != report.AttemptPlanningError {
+		// Draw global progress bar
 		// Progress: [---------------]
-		sumUpFSRep := func(rep *fsrep.Report) (transferred, total int64) {
-			for _, s := range rep.Pending {
-				transferred += s.Bytes
-				total  += s.ExpectedBytes
-			}
-			for _, s := range rep.Completed {
-				transferred += s.Bytes
-				total += s.ExpectedBytes
-			}
-			return
-		}
-		var transferred, total int64
-		for _, fs := range all {
-			fstx, fstotal := sumUpFSRep(fs)
-			transferred += fstx
-			total += fstotal
-		}
-		rate, changeCount := history.Update(transferred)
+		expected, replicated := latest.BytesSum()
+		rate, changeCount := history.Update(replicated)
 		t.write("Progress: ")
-		t.drawBar(50, transferred, total, changeCount)
-		t.write(fmt.Sprintf(" %s / %s @ %s/s", ByteCountBinary(transferred), ByteCountBinary(total), ByteCountBinary(rate)))
+		t.drawBar(50, replicated, expected, changeCount)
+		t.write(fmt.Sprintf(" %s / %s @ %s/s", ByteCountBinary(replicated), ByteCountBinary(expected), ByteCountBinary(rate)))
 		t.newline()
+
+		var maxFSLen int
+		for _, fs := range latest.Filesystems {
+			if len(fs.Info.Name) > maxFSLen {
+				maxFSLen = len(fs.Info.Name)
+			}
+		}
+		for _, fs := range latest.Filesystems {
+			t.printFilesystemStatus(fs, false, maxFSLen) // FIXME bring 'active' flag back
+		}
+
 	}

-	var maxFSLen int
-	for _, fs := range all {
-		if len(fs.Filesystem) > maxFSLen {
-			maxFSLen = len(fs.Filesystem)
-		}
-	}
-	for _, fs := range all {
-		t.printFilesystemStatus(fs, fs == rep.Active, maxFSLen)
-	}
 }

 func (t *tui) renderPrunerReport(r *pruner.Report) {
@ -513,25 +496,6 @@ func (t *tui) renderPrunerReport(r *pruner.Report) {

 }

-const snapshotIndent = 1
-func calculateMaxFSLength(all []*fsrep.Report) (maxFS, maxStatus int) {
-	for _, e := range all {
-		if len(e.Filesystem) > maxFS {
-			maxFS = len(e.Filesystem)
-		}
-		all2 := make([]*fsrep.StepReport, 0, len(e.Pending) + len(e.Completed))
-		all2 = append(all2, e.Pending...)
-		all2 = append(all2, e.Completed...)
-		for _, e2 := range all2 {
-			elen := len(e2.Problem) + len(e2.From) + len(e2.To) + 60 // random spacing, units, labels, etc
-			if elen > maxStatus {
-				maxStatus = elen
-			}
-		}
-	}
-	return
-}
-
 func times(str string, n int) (out string) {
 	for i := 0; i < n; i++ {
 		out += str
@ -575,35 +539,13 @@ func (t *tui) drawBar(length int, bytes, totalBytes int64, changeCount int) {
 	t.write("]")
 }

-func StringStepState(s fsrep.StepState) string {
-	switch s {
-	case fsrep.StepReplicationReady: return "Ready"
-	case fsrep.StepMarkReplicatedReady: return "MarkReady"
-	case fsrep.StepCompleted: return "Completed"
-	default:
-		return fmt.Sprintf("UNKNOWN %d", s)
-	}
-}
-
-func (t *tui) printFilesystemStatus(rep *fsrep.Report, active bool, maxFS int) {
-
-	bytes := int64(0)
-	totalBytes := int64(0)
-	for _, s := range rep.Pending {
-		bytes += s.Bytes
-		totalBytes += s.ExpectedBytes
-	}
-	for _, s := range rep.Completed {
-		bytes += s.Bytes
-		totalBytes += s.ExpectedBytes
-	}
-
+func (t *tui) printFilesystemStatus(rep *report.FilesystemReport, active bool, maxFS int) {

+	expected, replicated := rep.BytesSum()
 	status := fmt.Sprintf("%s (step %d/%d, %s/%s)",
-		rep.Status,
-		len(rep.Completed), len(rep.Pending) + len(rep.Completed),
-		ByteCountBinary(bytes), ByteCountBinary(totalBytes),
-
+		strings.ToUpper(string(rep.State)),
+		rep.CurrentStep, len(rep.Steps),
+		ByteCountBinary(replicated), ByteCountBinary(expected),
 	)

 	activeIndicator := " "
@ -612,18 +554,23 @@ func (t *tui) printFilesystemStatus(rep *fsrep.Report, active bool, maxFS int) {
 	}
 	t.printf("%s %s %s ",
 		activeIndicator,
-		rightPad(rep.Filesystem, maxFS, " "),
+		rightPad(rep.Info.Name, maxFS, " "),
 		status)

 	next := ""
-	if rep.Problem != "" {
-		next = rep.Problem
-	} else if len(rep.Pending) > 0 {
-		if rep.Pending[0].From != "" {
-			next = fmt.Sprintf("next: %s => %s", rep.Pending[0].From, rep.Pending[0].To)
+	if err := rep.Error(); err != nil {
+		next = err.Err
+	} else if rep.State != report.FilesystemDone {
+		if nextStep := rep.NextStep(); nextStep != nil {
+			if nextStep.IsIncremental() {
+				next = fmt.Sprintf("next: %s => %s", nextStep.Info.From, nextStep.Info.To)
+			} else {
+				next = fmt.Sprintf("next: %s (full)", nextStep.Info.To)
+			}
 		} else {
-			next = fmt.Sprintf("next: %s (full)", rep.Pending[0].To)
+			next = "" // individual FSes may still be in planning state
 		}
+
 	}
 	t.printfDrawIndentedAndWrappedIfMultiline("%s", next)

--- a/daemon/job/active.go
+++ b/daemon/job/active.go
@ -17,10 +17,12 @@ import (
 	"github.com/zrepl/zrepl/daemon/snapper"
 	"github.com/zrepl/zrepl/endpoint"
 	"github.com/zrepl/zrepl/replication"
+	"github.com/zrepl/zrepl/replication/driver"
+	"github.com/zrepl/zrepl/replication/logic"
+	"github.com/zrepl/zrepl/replication/report"
 	"github.com/zrepl/zrepl/rpc"
 	"github.com/zrepl/zrepl/transport"
 	"github.com/zrepl/zrepl/transport/fromconfig"
-	"github.com/zrepl/zrepl/util/envconst"
 	"github.com/zrepl/zrepl/zfs"
 )

@ -53,7 +55,7 @@ type activeSideTasks struct {
 	state ActiveSideState

 	// valid for state ActiveSideReplicating, ActiveSidePruneSender, ActiveSidePruneReceiver, ActiveSideDone
-	replication       *replication.Replication
+	replicationReport driver.ReportFunc
 	replicationCancel context.CancelFunc

 	// valid for state ActiveSidePruneSender, ActiveSidePruneReceiver, ActiveSideDone
@ -79,7 +81,7 @@ func (a *ActiveSide) updateTasks(u func(*activeSideTasks)) activeSideTasks {
 type activeMode interface {
 	ConnectEndpoints(rpcLoggers rpc.Loggers, connecter transport.Connecter)
 	DisconnectEndpoints()
-	SenderReceiver() (replication.Sender, replication.Receiver)
+	SenderReceiver() (logic.Sender, logic.Receiver)
 	Type() Type
 	RunPeriodic(ctx context.Context, wakeUpCommon chan<- struct{})
 	ResetConnectBackoff()
@ -111,7 +113,7 @@ func (m *modePush) DisconnectEndpoints() {
 	m.receiver = nil
 }

-func (m *modePush) SenderReceiver() (replication.Sender, replication.Receiver) {
+func (m *modePush) SenderReceiver() (logic.Sender, logic.Receiver) {
 	m.setupMtx.Lock()
 	defer m.setupMtx.Unlock()
 	return m.sender, m.receiver
@ -172,7 +174,7 @@ func (m *modePull) DisconnectEndpoints() {
 	m.receiver = nil
 }

-func (m *modePull) SenderReceiver() (replication.Sender, replication.Receiver) {
+func (m *modePull) SenderReceiver() (logic.Sender, logic.Receiver) {
 	m.setupMtx.Lock()
 	defer m.setupMtx.Unlock()
 	return m.sender, m.receiver
@ -274,7 +276,7 @@ func (j *ActiveSide) RegisterMetrics(registerer prometheus.Registerer) {
 func (j *ActiveSide) Name() string { return j.name }

 type ActiveSideStatus struct {
-	Replication                    *replication.Report
+	Replication                    *report.Report
 	PruningSender, PruningReceiver *pruner.Report
 }

@ -283,8 +285,8 @@ func (j *ActiveSide) Status() *Status {

 	s := &ActiveSideStatus{}
 	t := j.mode.Type()
-	if tasks.replication != nil {
-		s.Replication = tasks.replication.Report()
+	if tasks.replicationReport != nil {
+		s.Replication = tasks.replicationReport()
 	}
 	if tasks.prunerSender != nil {
 		s.PruningSender = tasks.prunerSender.Report()
@ -345,78 +347,6 @@ func (j *ActiveSide) do(ctx context.Context) {
 		}
 	}()

-	// The code after this watchdog goroutine is sequential and transitions the state from
-	//   ActiveSideReplicating -> ActiveSidePruneSender -> ActiveSidePruneReceiver -> ActiveSideDone
-	// If any of those sequential tasks 'gets stuck' (livelock, no progress), the watchdog will eventually
-	// cancel its context.
-	// If the task is written to support context cancellation, it will return immediately (in permanent error state),
-	// and the sequential code above transitions to the next state.
-	go func() {
-
-		wdto := envconst.Duration("ZREPL_JOB_WATCHDOG_TIMEOUT", 10*time.Minute)
-		jitter := envconst.Duration("ZREPL_JOB_WATCHDOG_JITTER", 1*time.Second)
-		// shadowing!
-		log := log.WithField("watchdog_timeout", wdto.String())
-
-		log.Debug("starting watchdog")
-		defer log.Debug("watchdog stopped")
-
-		t := time.NewTicker(wdto)
-		defer t.Stop()
-
-		for {
-			select {
-			case <-ctx.Done():
-				return
-			case <-t.C: // fall
-			}
-
-			j.updateTasks(func(tasks *activeSideTasks) {
-				// Since cancelling a task will cause the sequential code to transition to the next state immediately,
-				// we cannot check for its progress right then (no fallthrough).
-				// Instead, we return (not continue because we are in a closure) and give the new state another
-				// ZREPL_JOB_WATCHDOG_TIMEOUT interval to try make some progress.
-
-				log.WithField("state", tasks.state).Debug("watchdog firing")
-
-				const WATCHDOG_ENVCONST_NOTICE = " (adjust ZREPL_JOB_WATCHDOG_TIMEOUT env variable if inappropriate)"
-
-				switch tasks.state {
-				case ActiveSideReplicating:
-					log.WithField("replication_progress", tasks.replication.Progress.String()).
-						Debug("check replication progress")
-					if tasks.replication.Progress.CheckTimeout(wdto, jitter) {
-						log.Error("replication did not make progress, cancelling" + WATCHDOG_ENVCONST_NOTICE)
-						tasks.replicationCancel()
-						return
-					}
-				case ActiveSidePruneSender:
-					log.WithField("prune_sender_progress", tasks.replication.Progress.String()).
-						Debug("check pruner_sender progress")
-					if tasks.prunerSender.Progress.CheckTimeout(wdto, jitter) {
-						log.Error("pruner_sender did not make progress, cancelling" + WATCHDOG_ENVCONST_NOTICE)
-						tasks.prunerSenderCancel()
-						return
-					}
-				case ActiveSidePruneReceiver:
-					log.WithField("prune_receiver_progress", tasks.replication.Progress.String()).
-						Debug("check pruner_receiver progress")
-					if tasks.prunerReceiver.Progress.CheckTimeout(wdto, jitter) {
-						log.Error("pruner_receiver did not make progress, cancelling" + WATCHDOG_ENVCONST_NOTICE)
-						tasks.prunerReceiverCancel()
-						return
-					}
-				case ActiveSideDone:
-					// ignore, ctx will be Done() in a few milliseconds and the watchdog will exit
-				default:
-					log.WithField("state", tasks.state).
-						Error("watchdog implementation error: unknown active side state")
-				}
-			})
-
-		}
-	}()
-
 	sender, receiver := j.mode.SenderReceiver()

 	{
@ -426,16 +356,19 @@ func (j *ActiveSide) do(ctx context.Context) {
 		default:
 		}
 		ctx, repCancel := context.WithCancel(ctx)
-		tasks := j.updateTasks(func(tasks *activeSideTasks) {
+		var repWait driver.WaitFunc
+		j.updateTasks(func(tasks *activeSideTasks) {
 			// reset it
 			*tasks = activeSideTasks{}
 			tasks.replicationCancel = repCancel
-			tasks.replication = replication.NewReplication(j.promRepStateSecs, j.promBytesReplicated)
+			tasks.replicationReport, repWait = replication.Do(
+				ctx, logic.NewPlanner(j.promRepStateSecs, j.promBytesReplicated, sender, receiver),
+			)
 			tasks.state = ActiveSideReplicating
 		})
 		log.Info("start replication")
-		tasks.replication.Drive(ctx, sender, receiver)
-		repCancel() // always cancel to free up context resources
+		repWait(true) // wait blocking
+		repCancel()   // always cancel to free up context resources
 	}

 	{
--- a/daemon/logging/build_logging.go
+++ b/daemon/logging/build_logging.go
@ -14,7 +14,6 @@ import (
 	"github.com/zrepl/zrepl/daemon/snapper"
 	"github.com/zrepl/zrepl/endpoint"
 	"github.com/zrepl/zrepl/logger"
-	"github.com/zrepl/zrepl/replication"
 	"github.com/zrepl/zrepl/rpc"
 	"github.com/zrepl/zrepl/rpc/transportmux"
 	"github.com/zrepl/zrepl/tlsconf"
@ -78,7 +77,6 @@ const (
 )

 func WithSubsystemLoggers(ctx context.Context, log logger.Logger) context.Context {
-	ctx = replication.WithLogger(ctx, log.WithField(SubsysField, SubsysReplication))
 	ctx = endpoint.WithLogger(ctx, log.WithField(SubsysField, SubsyEndpoint))
 	ctx = pruner.WithLogger(ctx, log.WithField(SubsysField, SubsysPruning))
 	ctx = snapper.WithLogger(ctx, log.WithField(SubsysField, SubsysSnapshot))
--- a/daemon/pruner/pruner.go
+++ b/daemon/pruner/pruner.go
@ -8,7 +8,7 @@ import (
 	"github.com/zrepl/zrepl/config"
 	"github.com/zrepl/zrepl/logger"
 	"github.com/zrepl/zrepl/pruning"
-	"github.com/zrepl/zrepl/replication/pdu"
+	"github.com/zrepl/zrepl/replication/logic/pdu"
 	"github.com/zrepl/zrepl/util/envconst"
 	"github.com/zrepl/zrepl/util/watchdog"
 	"net"
--- a/daemon/pruner/pruner_test.go
+++ b/daemon/pruner/pruner_test.go
@ -6,7 +6,7 @@ import (
 	"github.com/stretchr/testify/assert"
 	"github.com/zrepl/zrepl/logger"
 	"github.com/zrepl/zrepl/pruning"
-	"github.com/zrepl/zrepl/replication/pdu"
+	"github.com/zrepl/zrepl/replication/logic/pdu"
 	"net"
 	"testing"
 	"time"
--- a/endpoint/endpoint.go
+++ b/endpoint/endpoint.go
@ -7,8 +7,7 @@ import (
 	"path"

 	"github.com/pkg/errors"
-	"github.com/zrepl/zrepl/replication"
-	"github.com/zrepl/zrepl/replication/pdu"
+	"github.com/zrepl/zrepl/replication/logic/pdu"
 	"github.com/zrepl/zrepl/zfs"
 )

@ -34,7 +33,7 @@ func (s *Sender) filterCheckFS(fs string) (*zfs.DatasetPath, error) {
 		return nil, err
 	}
 	if !pass {
-		return nil, replication.NewFilteredError(fs)
+		return nil, fmt.Errorf("endpoint does not allow access to filesystem %s", fs)
 	}
 	return dp, nil
 }
--- a/replication/driver/replication_driver.go
+++ b/replication/driver/replication_driver.go
@ -0,0 +1,301 @@
+package driver
+
+import (
+	"context"
+	"sync"
+	"time"
+
+	"github.com/zrepl/zrepl/replication/report"
+	"github.com/zrepl/zrepl/util/chainlock"
+)
+
+type run struct {
+	l *chainlock.L
+
+	startedAt, finishedAt time.Time
+
+	// the attempts attempted so far:
+	// All but the last in this slice must have finished with some errors.
+	// The last attempt may not be finished and may not have errors.
+	attempts []*attempt
+}
+
+type Planner interface {
+	Plan(context.Context) ([]FS, error)
+}
+
+// an attempt represents a single planning & execution of fs replications
+type attempt struct {
+	planner Planner
+
+	l *chainlock.L
+
+	startedAt, finishedAt time.Time
+	// after Planner.Plan was called, planErr and fss are mutually exclusive with regards to nil-ness
+	// if both are nil, it must be assumed that Planner.Plan is active
+	planErr *timedError
+	fss     []*fs
+}
+
+type timedError struct {
+	Err  error
+	Time time.Time
+}
+
+func newTimedError(err error, t time.Time) *timedError {
+	if err == nil {
+		panic("error must be non-nil")
+	}
+	if t.IsZero() {
+		panic("t must be non-zero")
+	}
+	return &timedError{err, t}
+}
+
+func (e *timedError) IntoReportError() *report.TimedError {
+	if e == nil {
+		return nil
+	}
+	return report.NewTimedError(e.Err.Error(), e.Time)
+}
+
+type FS interface {
+	PlanFS(context.Context) ([]Step, error)
+	ReportInfo() *report.FilesystemInfo
+}
+
+type Step interface {
+	TargetDate() time.Time
+	Step(context.Context) error
+	ReportInfo() *report.StepInfo
+}
+
+type fs struct {
+	fs FS
+
+	l *chainlock.L
+
+	planning struct {
+		done bool
+		err  *timedError
+	}
+
+	// valid iff planning.done && planning.err == nil
+	planned struct {
+		// valid iff planning.done && planning.err == nil
+		stepErr *timedError
+		// all steps, in the order in which they must be completed
+		steps []*step
+		// index into steps, pointing at the step that is currently executing
+		// if step >= len(steps), no more work needs to be done
+		step int
+	}
+}
+
+type step struct {
+	l    *chainlock.L
+	step Step
+}
+
+type ReportFunc func() *report.Report
+type WaitFunc func(block bool) (done bool)
+
+func Do(ctx context.Context, planner Planner) (ReportFunc, WaitFunc) {
+	l := chainlock.New()
+	run := &run{
+		l:         l,
+		startedAt: time.Now(),
+	}
+
+	done := make(chan struct{})
+	go func() {
+		defer close(done)
+
+		run.l.Lock()
+		a1 := &attempt{
+			l:         l,
+			startedAt: time.Now(),
+			planner:   planner,
+		}
+		run.attempts = append(run.attempts, a1)
+		run.l.Unlock()
+		a1.do(ctx)
+
+	}()
+
+	wait := func(block bool) bool {
+		if block {
+			<-done
+		}
+		select {
+		case <-done:
+			return true
+		default:
+			return false
+		}
+	}
+	report := func() *report.Report {
+		defer run.l.Lock().Unlock()
+		return run.report()
+	}
+	return report, wait
+}
+
+func (a *attempt) do(ctx context.Context) {
+	pfss, err := a.planner.Plan(ctx)
+	errTime := time.Now()
+	defer a.l.Lock().Unlock()
+	if err != nil {
+		a.planErr = newTimedError(err, errTime)
+		a.fss = nil
+		a.finishedAt = time.Now()
+		return
+	}
+
+	for _, pfs := range pfss {
+		fs := &fs{
+			fs: pfs,
+			l:  a.l,
+		}
+		a.fss = append(a.fss, fs)
+	}
+
+	stepQueue := newStepQueue()
+	defer stepQueue.Start(1)()
+	var fssesDone sync.WaitGroup
+	for _, f := range a.fss {
+		fssesDone.Add(1)
+		go func(f *fs) {
+			defer fssesDone.Done()
+			f.do(ctx, stepQueue)
+		}(f)
+	}
+	a.l.DropWhile(func() {
+		fssesDone.Wait()
+	})
+	a.finishedAt = time.Now()
+}
+
+func (fs *fs) do(ctx context.Context, pq *stepQueue) {
+	psteps, err := fs.fs.PlanFS(ctx)
+	errTime := time.Now()
+	defer fs.l.Lock().Unlock()
+	fs.planning.done = true
+	if err != nil {
+		fs.planning.err = newTimedError(err, errTime)
+		return
+	}
+	for _, pstep := range psteps {
+		step := &step{
+			l:    fs.l,
+			step: pstep,
+		}
+		fs.planned.steps = append(fs.planned.steps, step)
+	}
+	for i, s := range fs.planned.steps {
+		var (
+			err     error
+			errTime time.Time
+		)
+		// lock must not be held while executing step in order for reporting to work
+		fs.l.DropWhile(func() {
+			targetDate := s.step.TargetDate()
+			defer pq.WaitReady(fs, targetDate)()
+			err = s.step.Step(ctx) // no shadow
+			errTime = time.Now()   // no shadow
+		})
+		if err != nil {
+			fs.planned.stepErr = newTimedError(err, errTime)
+			break
+		}
+		fs.planned.step = i + 1 // fs.planned.step must be == len(fs.planned.steps) if all went OK
+	}
+}
+
+// caller must hold lock l
+func (r *run) report() *report.Report {
+	report := &report.Report{
+		Attempts: make([]*report.AttemptReport, len(r.attempts)),
+		StartAt:  r.startedAt,
+		FinishAt: r.finishedAt,
+	}
+	for i := range report.Attempts {
+		report.Attempts[i] = r.attempts[i].report()
+	}
+	return report
+}
+
+// caller must hold lock l
+func (a *attempt) report() *report.AttemptReport {
+
+	r := &report.AttemptReport{
+		// State is set below
+		Filesystems: make([]*report.FilesystemReport, len(a.fss)),
+		StartAt:     a.startedAt,
+		FinishAt:    a.finishedAt,
+		PlanError:   a.planErr.IntoReportError(),
+	}
+
+	for i := range r.Filesystems {
+		r.Filesystems[i] = a.fss[i].report()
+	}
+
+	state := report.AttemptPlanning
+	if a.planErr != nil {
+		state = report.AttemptPlanningError
+	} else if a.fss != nil {
+		if a.finishedAt.IsZero() {
+			state = report.AttemptFanOutFSs
+		} else {
+			fsWithError := false
+			for _, s := range r.Filesystems {
+				fsWithError = fsWithError || s.Error() != nil
+			}
+			state = report.AttemptDone
+			if fsWithError {
+				state = report.AttemptFanOutError
+			}
+		}
+	}
+	r.State = state
+
+	return r
+}
+
+// caller must hold lock l
+func (f *fs) report() *report.FilesystemReport {
+	state := report.FilesystemPlanningErrored
+	if f.planning.err == nil {
+		if f.planning.done {
+			if f.planned.stepErr != nil {
+				state = report.FilesystemSteppingErrored
+			} else if f.planned.step < len(f.planned.steps) {
+				state = report.FilesystemStepping
+			} else {
+				state = report.FilesystemDone
+			}
+		} else {
+			state = report.FilesystemPlanning
+		}
+	}
+	r := &report.FilesystemReport{
+		Info:        f.fs.ReportInfo(),
+		State:       state,
+		PlanError:   f.planning.err.IntoReportError(),
+		StepError:   f.planned.stepErr.IntoReportError(),
+		Steps:       make([]*report.StepReport, len(f.planned.steps)),
+		CurrentStep: f.planned.step,
+	}
+	for i := range r.Steps {
+		r.Steps[i] = f.planned.steps[i].report()
+	}
+	return r
+}
+
+// caller must hold lock l
+func (s *step) report() *report.StepReport {
+	r := &report.StepReport{
+		Info: s.step.ReportInfo(),
+	}
+	return r
+}
--- a/replication/driver/replication_driver_test.go
+++ b/replication/driver/replication_driver_test.go
@ -0,0 +1,204 @@
+package driver
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"sort"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/require"
+	"github.com/zrepl/zrepl/replication/report"
+
+	"github.com/stretchr/testify/assert"
+
+	jsondiff "github.com/yudai/gojsondiff"
+	jsondiffformatter "github.com/yudai/gojsondiff/formatter"
+)
+
+type mockPlanner struct {
+	stepCounter uint32
+	fss         []FS // *mockFS
+}
+
+func (p *mockPlanner) Plan(ctx context.Context) ([]FS, error) {
+	time.Sleep(1 * time.Second)
+	p.fss = []FS{
+		&mockFS{
+			&p.stepCounter,
+			"zroot/one",
+			nil,
+		},
+		&mockFS{
+			&p.stepCounter,
+			"zroot/two",
+			nil,
+		},
+	}
+	return p.fss, nil
+}
+
+type mockFS struct {
+	globalStepCounter *uint32
+	name              string
+	steps             []Step
+}
+
+func (f *mockFS) PlanFS(ctx context.Context) ([]Step, error) {
+	if f.steps != nil {
+		panic("PlanFS used twice")
+	}
+	switch f.name {
+	case "zroot/one":
+		f.steps = []Step{
+			&mockStep{
+				fs:         f,
+				ident:      "a",
+				duration:   1 * time.Second,
+				targetDate: time.Unix(2, 0),
+			},
+			&mockStep{
+				fs:         f,
+				ident:      "b",
+				duration:   1 * time.Second,
+				targetDate: time.Unix(10, 0),
+			},
+			&mockStep{
+				fs:         f,
+				ident:      "c",
+				duration:   1 * time.Second,
+				targetDate: time.Unix(20, 0),
+			},
+		}
+	case "zroot/two":
+		f.steps = []Step{
+			&mockStep{
+				fs:         f,
+				ident:      "u",
+				duration:   500 * time.Millisecond,
+				targetDate: time.Unix(15, 0),
+			},
+			&mockStep{
+				fs:         f,
+				duration:   500 * time.Millisecond,
+				ident:      "v",
+				targetDate: time.Unix(30, 0),
+			},
+		}
+	default:
+		panic("unimplemented")
+	}
+
+	return f.steps, nil
+}
+
+func (f *mockFS) ReportInfo() *report.FilesystemInfo {
+	return &report.FilesystemInfo{Name: f.name}
+}
+
+type mockStep struct {
+	fs         *mockFS
+	ident      string
+	duration   time.Duration
+	targetDate time.Time
+
+	// filled by method Step
+	globalCtr uint32
+}
+
+func (f *mockStep) String() string {
+	return fmt.Sprintf("%s{%s} targetDate=%s globalCtr=%v", f.fs.name, f.ident, f.targetDate, f.globalCtr)
+}
+
+func (f *mockStep) Step(ctx context.Context) error {
+	f.globalCtr = atomic.AddUint32(f.fs.globalStepCounter, 1)
+	time.Sleep(f.duration)
+	return nil
+}
+
+func (f *mockStep) TargetDate() time.Time {
+	return f.targetDate
+}
+
+func (f *mockStep) ReportInfo() *report.StepInfo {
+	return &report.StepInfo{From: f.ident, To: f.ident, BytesExpected: 100, BytesReplicated: 25}
+}
+
+// TODO: add meaningful validation (i.e. actual checks)
+// Since the stepqueue is not deterministic due to scheduler jitter,
+// we cannot test for any definitive sequence of steps here.
+// Such checks would further only be sensible for a non-concurrent step-queue,
+// but we're going to have concurrent replication in the future.
+//
+// For the time being, let's just exercise the code a bit.
+func TestReplication(t *testing.T) {
+
+	ctx := context.Background()
+
+	mp := &mockPlanner{}
+	getReport, wait := Do(ctx, mp)
+	begin := time.Now()
+	fireAt := []time.Duration{
+		// the following values are relative to the start
+		500 * time.Millisecond,  // planning
+		1500 * time.Millisecond, // nothing is done, a is running
+		2500 * time.Millisecond, // a done,  b running
+		3250 * time.Millisecond, // a,b done, u running
+		3750 * time.Millisecond, // a,b,u done, c running
+		4750 * time.Millisecond, // a,b,u,c done, v running
+		5250 * time.Millisecond, // a,b,u,c,v done
+	}
+	reports := make([]*report.Report, len(fireAt))
+	for i := range fireAt {
+		sleepUntil := begin.Add(fireAt[i])
+		time.Sleep(sleepUntil.Sub(time.Now()))
+		reports[i] = getReport()
+		// uncomment for viewing non-diffed results
+		// t.Logf("report @ %6.4f:\n%s", fireAt[i].Seconds(), pretty.Sprint(reports[i]))
+	}
+	waitBegin := time.Now()
+	wait(true)
+	waitDuration := time.Now().Sub(waitBegin)
+	assert.True(t, waitDuration < 10*time.Millisecond, "%v", waitDuration) // and that's gratious
+
+	prev, err := json.Marshal(reports[0])
+	require.NoError(t, err)
+	for _, r := range reports[1:] {
+		this, err := json.Marshal(r)
+		require.NoError(t, err)
+		differ := jsondiff.New()
+		diff, err := differ.Compare(prev, this)
+		require.NoError(t, err)
+		df := jsondiffformatter.NewDeltaFormatter()
+		res, err := df.Format(diff)
+		res = res
+		require.NoError(t, err)
+		// uncomment the following line to get json diffs between each captured step
+		// t.Logf("%s", res)
+		prev, err = json.Marshal(r)
+		require.NoError(t, err)
+	}
+
+	steps := make([]*mockStep, 0)
+	for _, fs := range mp.fss {
+		for _, step := range fs.(*mockFS).steps {
+			steps = append(steps, step.(*mockStep))
+		}
+	}
+
+	// sort steps in pq order (although, remember, pq is not deterministic)
+	sort.Slice(steps, func(i, j int) bool {
+		return steps[i].targetDate.Before(steps[j].targetDate)
+	})
+
+	// manual inspection of the globalCtr value should show that, despite
+	// scheduler-dependent behavior of pq, steps should generally be taken
+	// from oldest to newest target date (globally, not per FS).
+	t.Logf("steps sorted by target date:")
+	for _, step := range steps {
+		t.Logf("\t%s", step)
+	}
+
+}
--- a/replication/driver/replication_stepqueue.go
+++ b/replication/driver/replication_stepqueue.go
@ -0,0 +1,163 @@
+package driver
+
+import (
+	"container/heap"
+	"time"
+
+	"github.com/zrepl/zrepl/util/chainlock"
+)
+
+type stepQueueRec struct {
+	ident      interface{}
+	targetDate time.Time
+	wakeup     chan StepCompletedFunc
+}
+
+type stepQueue struct {
+	stop chan struct{}
+	reqs chan stepQueueRec
+}
+
+type stepQueueHeapItem struct {
+	idx int
+	req stepQueueRec
+}
+type stepQueueHeap []*stepQueueHeapItem
+
+func (h stepQueueHeap) Less(i, j int) bool {
+	return h[i].req.targetDate.Before(h[j].req.targetDate)
+}
+
+func (h stepQueueHeap) Swap(i, j int) {
+	h[i], h[j] = h[j], h[i]
+	h[i].idx = i
+	h[j].idx = j
+}
+
+func (h stepQueueHeap) Len() int {
+	return len(h)
+}
+
+func (h *stepQueueHeap) Push(elem interface{}) {
+	hitem := elem.(*stepQueueHeapItem)
+	hitem.idx = h.Len()
+	*h = append(*h, hitem)
+}
+
+func (h *stepQueueHeap) Pop() interface{} {
+	elem := (*h)[h.Len()-1]
+	elem.idx = -1
+	*h = (*h)[:h.Len()-1]
+	return elem
+}
+
+// returned stepQueue must be closed with method Close
+func newStepQueue() *stepQueue {
+	q := &stepQueue{
+		stop: make(chan struct{}),
+		reqs: make(chan stepQueueRec),
+	}
+	return q
+}
+
+// the returned done function must be called to free resources
+// allocated by the call to Start
+//
+// No WaitReady calls must be active at the time done is called
+// The behavior of calling WaitReady after done was called is undefined
+func (q *stepQueue) Start(concurrency int) (done func()) {
+	if concurrency < 1 {
+		panic("concurrency must be >= 1")
+	}
+	// l protects pending and queueItems
+	l := chainlock.New()
+	pendingCond := l.NewCond()
+	// priority queue
+	pending := &stepQueueHeap{}
+	// ident => queueItem
+	queueItems := make(map[interface{}]*stepQueueHeapItem)
+	// stopped is used for cancellation of "wake" goroutine
+	stopped := false
+	active := 0
+	go func() { // "stopper" goroutine
+		<-q.stop
+		defer l.Lock().Unlock()
+		stopped = true
+		pendingCond.Broadcast()
+	}()
+	go func() { // "reqs" goroutine
+		for {
+			select {
+			case <-q.stop:
+				select {
+				case <-q.reqs:
+					panic("WaitReady call active while calling Close")
+				default:
+					return
+				}
+			case req := <-q.reqs:
+				func() {
+					defer l.Lock().Unlock()
+					if _, ok := queueItems[req.ident]; ok {
+						panic("WaitReady must not be called twice for the same ident")
+					}
+					qitem := &stepQueueHeapItem{
+						req: req,
+					}
+					queueItems[req.ident] = qitem
+					heap.Push(pending, qitem)
+					pendingCond.Broadcast()
+				}()
+			}
+		}
+	}()
+	go func() { // "wake" goroutine
+		defer l.Lock().Unlock()
+		for {
+
+			for !stopped && (active >= concurrency || pending.Len() == 0) {
+				pendingCond.Wait()
+			}
+			if stopped {
+				return
+			}
+			if pending.Len() <= 0 {
+				return
+			}
+			active++
+			next := heap.Pop(pending).(*stepQueueHeapItem).req
+			delete(queueItems, next.ident)
+
+			next.wakeup <- func() {
+				defer l.Lock().Unlock()
+				active--
+				pendingCond.Broadcast()
+			}
+		}
+	}()
+
+	done = func() {
+		close(q.stop)
+	}
+	return done
+}
+
+type StepCompletedFunc func()
+
+func (q *stepQueue) sendAndWaitForWakeup(ident interface{}, targetDate time.Time) StepCompletedFunc {
+	req := stepQueueRec{
+		ident,
+		targetDate,
+		make(chan StepCompletedFunc),
+	}
+	q.reqs <- req
+	return <-req.wakeup
+}
+
+// Wait for the ident with targetDate to be selected to run.
+func (q *stepQueue) WaitReady(ident interface{}, targetDate time.Time) StepCompletedFunc {
+	if targetDate.IsZero() {
+		panic("targetDate of zero is reserved for marking Done")
+	}
+	return q.sendAndWaitForWakeup(ident, targetDate)
+}
--- a/replication/driver/replication_stepqueue_test.go
+++ b/replication/driver/replication_stepqueue_test.go
@ -0,0 +1,166 @@
+package driver
+
+import (
+	"fmt"
+	"math"
+	"sort"
+	"sync"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/montanaflynn/stats"
+	"github.com/stretchr/testify/assert"
+)
+
+func TestPqNotconcurrent(t *testing.T) {
+
+	var ctr uint32
+	q := newStepQueue()
+	var wg sync.WaitGroup
+	wg.Add(3)
+	go func() {
+		defer wg.Done()
+		defer q.WaitReady("1", time.Unix(1, 0))()
+		ret := atomic.AddUint32(&ctr, 1)
+		assert.Equal(t, uint32(1), ret)
+	}()
+	go func() {
+		defer wg.Done()
+		defer q.WaitReady("2", time.Unix(2, 0))()
+		ret := atomic.AddUint32(&ctr, 1)
+		assert.Equal(t, uint32(2), ret)
+	}()
+	go func() {
+		defer wg.Done()
+		defer q.WaitReady("3", time.Unix(3, 0))()
+		ret := atomic.AddUint32(&ctr, 1)
+		assert.Equal(t, uint32(3), ret)
+	}()
+
+	time.Sleep(1 * time.Second)
+	defer q.Start(1)()
+	wg.Wait()
+
+}
+
+type record struct {
+	fs        int
+	step      int
+	globalCtr uint32
+	wakeAt    time.Duration // relative to begin
+}
+
+func (r record) String() string {
+	return fmt.Sprintf("fs %08d step %08d globalCtr %08d wakeAt %2.8f", r.fs, r.step, r.globalCtr, r.wakeAt.Seconds())
+}
+
+// This tests uses stepPq concurrently, simulating the following scenario:
+// Given a number of filesystems F, each filesystem has N steps to take.
+// The number of concurrent steps is limited to C.
+// The target date for each step is the step number N.
+// Hence, there are always F filesystems runnable (calling WaitReady)
+// The priority queue prioritizes steps with lower target data (= lower step number).
+// Hence, all steps with lower numbers should be woken up before steps with higher numbers.
+// However, scheduling is not 100% deterministic (runtime, OS scheduler, etc).
+// Hence, perform some statistics on the wakeup times and assert that the mean wakeup
+// times for each step are close together.
+func TestPqConcurrent(t *testing.T) {
+
+	q := newStepQueue()
+	var wg sync.WaitGroup
+	filesystems := 100
+	stepsPerFS := 20
+	sleepTimePerStep := 50 * time.Millisecond
+	wg.Add(filesystems)
+	var globalCtr uint32
+
+	begin := time.Now()
+	records := make(chan []record, filesystems)
+	for fs := 0; fs < filesystems; fs++ {
+		go func(fs int) {
+			defer wg.Done()
+			recs := make([]record, 0)
+			for step := 0; step < stepsPerFS; step++ {
+				pos := atomic.AddUint32(&globalCtr, 1)
+				t := time.Unix(int64(step), 0)
+				done := q.WaitReady(fs, t)
+				wakeAt := time.Now().Sub(begin)
+				time.Sleep(sleepTimePerStep)
+				done()
+				recs = append(recs, record{fs, step, pos, wakeAt})
+			}
+			records <- recs
+		}(fs)
+	}
+	concurrency := 5
+	defer q.Start(concurrency)()
+	wg.Wait()
+	close(records)
+	t.Logf("loop done")
+
+	flattenedRecs := make([]record, 0)
+	for recs := range records {
+		flattenedRecs = append(flattenedRecs, recs...)
+	}
+
+	sort.Slice(flattenedRecs, func(i, j int) bool {
+		return flattenedRecs[i].globalCtr < flattenedRecs[j].globalCtr
+	})
+
+	wakeTimesByStep := map[int][]float64{}
+	for _, rec := range flattenedRecs {
+		wakeTimes, ok := wakeTimesByStep[rec.step]
+		if !ok {
+			wakeTimes = []float64{}
+		}
+		wakeTimes = append(wakeTimes, rec.wakeAt.Seconds())
+		wakeTimesByStep[rec.step] = wakeTimes
+	}
+
+	meansByStepId := make([]float64, stepsPerFS)
+	interQuartileRangesByStepIdx := make([]float64, stepsPerFS)
+	for step := 0; step < stepsPerFS; step++ {
+		t.Logf("step %d", step)
+		mean, _ := stats.Mean(wakeTimesByStep[step])
+		meansByStepId[step] = mean
+		t.Logf("\tmean: %v", mean)
+		median, _ := stats.Median(wakeTimesByStep[step])
+		t.Logf("\tmedian: %v", median)
+		midhinge, _ := stats.Midhinge(wakeTimesByStep[step])
+		t.Logf("\tmidhinge: %v", midhinge)
+		min, _ := stats.Min(wakeTimesByStep[step])
+		t.Logf("\tmin: %v", min)
+		max, _ := stats.Max(wakeTimesByStep[step])
+		t.Logf("\tmax: %v", max)
+		quartiles, _ := stats.Quartile(wakeTimesByStep[step])
+		t.Logf("\t%#v", quartiles)
+		interQuartileRange, _ := stats.InterQuartileRange(wakeTimesByStep[step])
+		t.Logf("\tinter-quartile range: %v", interQuartileRange)
+		interQuartileRangesByStepIdx[step] = interQuartileRange
+	}
+
+	iqrMean, _ := stats.Mean(interQuartileRangesByStepIdx)
+	t.Logf("inter-quartile-range mean: %v", iqrMean)
+	iqrDev, _ := stats.StandardDeviation(interQuartileRangesByStepIdx)
+	t.Logf("inter-quartile-range deviation: %v", iqrDev)
+
+	// each step should have the same "distribution" (=~ "spread")
+	assert.True(t, iqrDev < 0.01)
+
+	minTimeForAllStepsWithIdxI := sleepTimePerStep.Seconds() * float64(filesystems) / float64(concurrency)
+	t.Logf("minTimeForAllStepsWithIdxI = %11.8f", minTimeForAllStepsWithIdxI)
+	for i, mean := range meansByStepId {
+		// we can't just do (i + 0.5) * minTimeforAllStepsWithIdxI
+		// because this doesn't account for drift
+		idealMean := 0.5 * minTimeForAllStepsWithIdxI
+		if i > 0 {
+			previousMean := meansByStepId[i-1]
+			idealMean = previousMean + minTimeForAllStepsWithIdxI
+		}
+		deltaFromIdeal := idealMean - mean
+		t.Logf("step %02d delta from ideal mean wake time: %11.8f - %11.8f = %11.8f", i, idealMean, mean, deltaFromIdeal)
+		assert.True(t, math.Abs(deltaFromIdeal) < 0.05)
+	}
+
+}
--- a/replication/fsrep/fsfsm.go
+++ b/replication/fsrep/fsfsm.go
@ -1,557 +0,0 @@
-// Package fsrep implements replication of a single file system with existing versions
-// from a sender to a receiver.
-package fsrep
-
-import (
-	"context"
-	"errors"
-	"fmt"
-	"net"
-	"sync"
-	"time"
-
-	"github.com/prometheus/client_golang/prometheus"
-	"github.com/zrepl/zrepl/logger"
-	"github.com/zrepl/zrepl/replication/pdu"
-	"github.com/zrepl/zrepl/util/bytecounter"
-	"github.com/zrepl/zrepl/util/watchdog"
-	"github.com/zrepl/zrepl/zfs"
-)
-
-type contextKey int
-
-const (
-	contextKeyLogger contextKey = iota
-)
-
-type Logger = logger.Logger
-
-func WithLogger(ctx context.Context, log Logger) context.Context {
-	return context.WithValue(ctx, contextKeyLogger, log)
-}
-
-func getLogger(ctx context.Context) Logger {
-	l, ok := ctx.Value(contextKeyLogger).(Logger)
-	if !ok {
-		l = logger.NewNullLogger()
-	}
-	return l
-}
-
-// A Sender is usually part of a github.com/zrepl/zrepl/replication.Endpoint.
-type Sender interface {
-	// If a non-nil io.ReadCloser is returned, it is guaranteed to be closed before
-	// any next call to the parent github.com/zrepl/zrepl/replication.Endpoint.
-	// If the send request is for dry run the io.ReadCloser will be nil
-	Send(ctx context.Context, r *pdu.SendReq) (*pdu.SendRes, zfs.StreamCopier, error)
-	ReplicationCursor(ctx context.Context, req *pdu.ReplicationCursorReq) (*pdu.ReplicationCursorRes, error)
-}
-
-// A Sender is usually part of a github.com/zrepl/zrepl/replication.Endpoint.
-type Receiver interface {
-	// Receive sends r and sendStream (the latter containing a ZFS send stream)
-	// to the parent github.com/zrepl/zrepl/replication.Endpoint.
-	Receive(ctx context.Context, req *pdu.ReceiveReq, receive zfs.StreamCopier) (*pdu.ReceiveRes, error)
-}
-
-type StepReport struct {
-	From, To string
-	Status   StepState
-	Problem  string
-	Bytes    int64
-	ExpectedBytes int64 // 0 means no size estimate possible
-}
-
-type Report struct {
-	Filesystem         string
-	Status             string
-	Problem            string
-	Completed, Pending []*StepReport
-}
-
-//go:generate enumer -type=State
-type State uint
-
-const (
-	Ready State = 1 << iota
-	Completed
-)
-
-type Error interface {
-	error
-	Temporary() bool
-	ContextErr() bool
-	LocalToFS() bool
-}
-
-type Replication struct {
-	promBytesReplicated prometheus.Counter
-
-	fs                 string
-
-	// lock protects all fields below it in this struct, but not the data behind pointers
-	lock               sync.Mutex
-	state              State
-	err                Error
-	completed, pending []*ReplicationStep
-}
-
-func (f *Replication) State() State {
-	f.lock.Lock()
-	defer f.lock.Unlock()
-	return f.state
-}
-
-func (f *Replication) FS() string { return f.fs }
-
-// returns zero value time.Time{} if no more pending steps
-func (f *Replication) NextStepDate() time.Time {
-	if len(f.pending) == 0 {
-		return time.Time{}
-	}
-	return f.pending[0].to.SnapshotTime()
-}
-
-func (f *Replication) Err() Error {
-	f.lock.Lock()
-	defer f.lock.Unlock()
-	return f.err
-}
-
-func (f *Replication) CanRetry() bool {
-	f.lock.Lock()
-	defer f.lock.Unlock()
-	if f.state == Completed {
-		return false
-	}
-	if f.state != Ready {
-		panic(fmt.Sprintf("implementation error: %v", f.state))
-	}
-	if f.err == nil {
-		return true
-	}
-	return f.err.Temporary()
-}
-
-func (f *Replication) UpdateSizeEsitmate(ctx context.Context, sender Sender) error {
-	f.lock.Lock()
-	defer f.lock.Unlock()
-	for _, e := range f.pending {
-		if err := e.updateSizeEstimate(ctx, sender); err != nil {
-			return err
-		}
-	}
-	return nil
-}
-
-type ReplicationBuilder struct {
-	r *Replication
-}
-
-func BuildReplication(fs string, promBytesReplicated prometheus.Counter) *ReplicationBuilder {
-	return &ReplicationBuilder{&Replication{fs: fs, promBytesReplicated: promBytesReplicated}}
-}
-
-func (b *ReplicationBuilder) AddStep(from, to FilesystemVersion) *ReplicationBuilder {
-	step := &ReplicationStep{
-		state:  StepReplicationReady,
-		parent: b.r,
-		from:   from,
-		to:     to,
-	}
-	b.r.pending = append(b.r.pending, step)
-	return b
-}
-
-func (b *ReplicationBuilder) Done() (r *Replication) {
-	if len(b.r.pending) > 0 {
-		b.r.state = Ready
-	} else {
-		b.r.state = Completed
-	}
-	r = b.r
-	b.r = nil
-	return r
-}
-
-type ReplicationConflictError struct {
-	Err error
-}
-
-func (e *ReplicationConflictError) Timeout() bool { return false }
-
-func (e *ReplicationConflictError) Temporary() bool { return false }
-
-func (e *ReplicationConflictError) Error() string { return fmt.Sprintf("permanent error: %s", e.Err.Error()) }
-
-func (e *ReplicationConflictError) LocalToFS() bool { return true }
-
-func (e *ReplicationConflictError) ContextErr() bool { return false }
-
-func NewReplicationConflictError(fs string, err error) *Replication {
-	return &Replication{
-		state: Completed,
-		fs:    fs,
-		err:   &ReplicationConflictError{Err: err},
-	}
-}
-
-//go:generate enumer -type=StepState
-type StepState uint
-
-const (
-	StepReplicationReady StepState = 1 << iota
-	StepMarkReplicatedReady
-	StepCompleted
-)
-
-func (s StepState) IsTerminal() bool { return s == StepCompleted }
-
-type FilesystemVersion interface {
-	SnapshotTime() time.Time
-	GetName() string // name without @ or #
-	RelName() string // name with @ or #
-}
-
-type ReplicationStep struct {
-	// only protects state, err
-	// from, to and parent are assumed to be immutable
-	lock sync.Mutex
-
-	state    StepState
-	from, to FilesystemVersion
-	parent   *Replication
-
-	// both retry and permanent error
-	err error
-
-	byteCounter  bytecounter.StreamCopier
-	expectedSize int64 // 0 means no size estimate present / possible
-}
-
-func (f *Replication) Retry(ctx context.Context, ka *watchdog.KeepAlive, sender Sender, receiver Receiver) Error {
-
-	var u updater = func(fu func(*Replication)) State {
-		f.lock.Lock()
-		defer f.lock.Unlock()
-		if fu != nil {
-			fu(f)
-		}
-		return f.state
-	}
-
-
-	var current *ReplicationStep
-	pre := u(nil)
-	getLogger(ctx).WithField("fsrep_state", pre).Debug("begin fsrep.Retry")
-	defer func() {
-		post := u(nil)
-		getLogger(ctx).WithField("fsrep_transition", post).Debug("end fsrep.Retry")
-	}()
-
-	st := u(func(f *Replication) {
-		if len(f.pending) == 0 {
-			f.state = Completed
-			return
-		}
-		current = f.pending[0]
-	})
-	if st == Completed {
-		return nil
-	}
-	if st != Ready {
-		panic(fmt.Sprintf("implementation error: %v", st))
-	}
-
-	stepCtx := WithLogger(ctx, getLogger(ctx).WithField("step", current))
-	getLogger(stepCtx).Debug("take step")
-	err := current.Retry(stepCtx, ka, sender, receiver)
-	if err != nil {
-		getLogger(stepCtx).WithError(err).Error("step could not be completed")
-	}
-
-	u(func(fsr *Replication) {
-		if err != nil {
-			f.err = &StepError{stepStr: current.String(), err: err}
-			return
-		}
-		if err == nil && current.state != StepCompleted {
-			panic(fmt.Sprintf("implementation error: %v", current.state))
-		}
-		f.err = nil
-		f.completed = append(f.completed, current)
-		f.pending = f.pending[1:]
-		if len(f.pending) > 0 {
-			f.state = Ready
-		} else {
-			f.state = Completed
-		}
-	})
-	var retErr Error = nil
-	u(func(fsr *Replication) {
-		retErr = fsr.err
-	})
-	return retErr
-}
-
-type updater func(func(fsr *Replication)) State
-
-type state func(ctx context.Context, ka *watchdog.KeepAlive, sender Sender, receiver Receiver, u updater) state
-
-type StepError struct {
-	stepStr string
-	err     error
-}
-
-var _ Error = &StepError{}
-
-func (e StepError) Error() string {
-	if e.LocalToFS() {
-		return fmt.Sprintf("step %s failed: %s", e.stepStr, e.err)
-	}
-	return e.err.Error()
-}
-
-func (e StepError) Timeout() bool {
-	if neterr, ok := e.err.(net.Error); ok {
-		return neterr.Timeout()
-	}
-	return false
-}
-
-func (e StepError) Temporary() bool {
-	if neterr, ok := e.err.(net.Error); ok {
-		return neterr.Temporary()
-	}
-	return false
-}
-
-func (e StepError) LocalToFS() bool {
-	if _, ok := e.err.(net.Error); ok {
-		return false
-	}
-	return true // conservative approximation: we'd like to check for specific errors returned over RPC here...
-}
-
-func (e StepError) ContextErr() bool {
-	switch e.err {
-	case context.Canceled:
-		return true
-	case context.DeadlineExceeded:
-		return true
-	}
-	return false
-}
-
-func (fsr *Replication) Report() *Report {
-	fsr.lock.Lock()
-	defer fsr.lock.Unlock()
-
-	rep := Report{
-		Filesystem: fsr.fs,
-		Status:     fsr.state.String(),
-	}
-
-	if fsr.err != nil && fsr.err.LocalToFS() {
-		rep.Problem = fsr.err.Error()
-	}
-
-	rep.Completed = make([]*StepReport, len(fsr.completed))
-	for i := range fsr.completed {
-		rep.Completed[i] = fsr.completed[i].Report()
-	}
-	rep.Pending = make([]*StepReport, len(fsr.pending))
-	for i := range fsr.pending {
-		rep.Pending[i] = fsr.pending[i].Report()
-	}
-
-	return &rep
-}
-
-func (s *ReplicationStep) Retry(ctx context.Context, ka *watchdog.KeepAlive, sender Sender, receiver Receiver) error {
-	switch s.state {
-	case StepReplicationReady:
-		return s.doReplication(ctx, ka, sender, receiver)
-	case StepMarkReplicatedReady:
-		return s.doMarkReplicated(ctx, ka, sender)
-	case StepCompleted:
-		return nil
-	}
-	panic(fmt.Sprintf("implementation error: %v", s.state))
-}
-
-func (s *ReplicationStep) Error() error {
-	if s.state & (StepReplicationReady|StepMarkReplicatedReady) != 0 {
-		return s.err
-	}
-	return nil
-}
-
-func (s *ReplicationStep) doReplication(ctx context.Context, ka *watchdog.KeepAlive, sender Sender, receiver Receiver) error {
-
-	if s.state != StepReplicationReady {
-		panic(fmt.Sprintf("implementation error: %v", s.state))
-	}
-
-	fs := s.parent.fs
-
-	log := getLogger(ctx)
-	sr := s.buildSendRequest(false)
-
-	log.Debug("initiate send request")
-	sres, sstreamCopier, err := sender.Send(ctx, sr)
-	if err != nil {
-		log.WithError(err).Error("send request failed")
-		return err
-	}
-	if sstreamCopier == nil {
-		err := errors.New("send request did not return a stream, broken endpoint implementation")
-		return err
-	}
-	defer sstreamCopier.Close()
-
-	// Install a byte counter to track progress + for status report
-	s.byteCounter = bytecounter.NewStreamCopier(sstreamCopier)
-	byteCounterStopProgress := make(chan struct{})
-	defer close(byteCounterStopProgress)
-	go func() {
-		var lastCount int64
-		t := time.NewTicker(1 * time.Second)
-		defer t.Stop()
-		for {
-			select {
-			case <-byteCounterStopProgress:
-				return
-			case <-t.C:
-				newCount := s.byteCounter.Count()
-				if lastCount != newCount {
-					ka.MadeProgress()
-				} else {
-					lastCount = newCount
-				}
-			}
-		}
-	}()
-	defer func() {
-		s.parent.promBytesReplicated.Add(float64(s.byteCounter.Count()))
-	}()
-
-	rr := &pdu.ReceiveReq{
-		Filesystem:       fs,
-		ClearResumeToken: !sres.UsedResumeToken,
-	}
-	log.Debug("initiate receive request")
-	_, err = receiver.Receive(ctx, rr, s.byteCounter)
-	if err != nil {
-		log.
-			WithError(err).
-			WithField("errType", fmt.Sprintf("%T", err)).
-			Error("receive request failed (might also be error on sender)")
-		// This failure could be due to
-		// 	- an unexpected exit of ZFS on the sending side
-		//  - an unexpected exit of ZFS on the receiving side
-		//  - a connectivity issue
-		return err
-	}
-	log.Debug("receive finished")
-	ka.MadeProgress()
-
-	s.state = StepMarkReplicatedReady
-	return s.doMarkReplicated(ctx, ka, sender)
-
-}
-
-func (s *ReplicationStep) doMarkReplicated(ctx context.Context, ka *watchdog.KeepAlive, sender Sender) error {
-
-	if s.state != StepMarkReplicatedReady {
-		panic(fmt.Sprintf("implementation error: %v", s.state))
-	}
-
-	log := getLogger(ctx)
-
-	log.Debug("advance replication cursor")
-	req := &pdu.ReplicationCursorReq{
-		Filesystem: s.parent.fs,
-		Op: &pdu.ReplicationCursorReq_Set{
-			Set: &pdu.ReplicationCursorReq_SetOp{
-				Snapshot:   s.to.GetName(),
-			},
-		},
-	}
-	_, err := sender.ReplicationCursor(ctx, req)
-	if err != nil {
-		log.WithError(err).Error("error advancing replication cursor")
-		return err
-	}
-	ka.MadeProgress()
-
-	s.state = StepCompleted
-	return err
-}
-
-func (s *ReplicationStep) updateSizeEstimate(ctx context.Context, sender Sender) error {
-
-	log := getLogger(ctx)
-
-	sr := s.buildSendRequest(true)
-
-	log.Debug("initiate dry run send request")
-	sres, _, err := sender.Send(ctx, sr)
-	if err != nil {
-		log.WithError(err).Error("dry run send request failed")
-		return err
-	}
-	s.expectedSize = sres.ExpectedSize
-	return nil
-}
-
-func (s *ReplicationStep) buildSendRequest(dryRun bool) (sr *pdu.SendReq) {
-	fs := s.parent.fs
-	if s.from == nil {
-		sr = &pdu.SendReq{
-			Filesystem: fs,
-			To:         s.to.RelName(),
-			DryRun:     dryRun,
-		}
-	} else {
-		sr = &pdu.SendReq{
-			Filesystem: fs,
-			From:       s.from.RelName(),
-			To:         s.to.RelName(),
-			DryRun:     dryRun,
-		}
-	}
-	return sr
-}
-
-func (s *ReplicationStep) String() string {
-	if s.from == nil { // FIXME: ZFS semantics are that to is nil on non-incremental send
-		return fmt.Sprintf("%s%s (full)", s.parent.fs, s.to.RelName())
-	} else {
-		return fmt.Sprintf("%s(%s => %s)", s.parent.fs, s.from.RelName(), s.to.RelName())
-	}
-}
-
-func (s *ReplicationStep) Report() *StepReport {
-	var from string // FIXME follow same convention as ZFS: to should be nil on full send
-	if s.from != nil {
-		from = s.from.RelName()
-	}
-	bytes := int64(0)
-	if s.byteCounter != nil {
-		bytes = s.byteCounter.Count()
-	}
-	problem := ""
-	if s.err != nil {
-		problem = s.err.Error()
-	}
-	rep := StepReport{
-		From:   from,
-		To:     s.to.RelName(),
-		Status: s.state,
-		Problem: problem,
-		Bytes:  bytes,
-		ExpectedBytes: s.expectedSize,
-	}
-	return &rep
-}
--- a/replication/fsrep/state_enumer.go
+++ b/replication/fsrep/state_enumer.go
@ -1,50 +0,0 @@
-// Code generated by "enumer -type=State"; DO NOT EDIT.
-
-package fsrep
-
-import (
-	"fmt"
-)
-
-const _StateName = "ReadyCompleted"
-
-var _StateIndex = [...]uint8{0, 5, 14}
-
-func (i State) String() string {
-	i -= 1
-	if i >= State(len(_StateIndex)-1) {
-		return fmt.Sprintf("State(%d)", i+1)
-	}
-	return _StateName[_StateIndex[i]:_StateIndex[i+1]]
-}
-
-var _StateValues = []State{1, 2}
-
-var _StateNameToValueMap = map[string]State{
-	_StateName[0:5]:  1,
-	_StateName[5:14]: 2,
-}
-
-// StateString retrieves an enum value from the enum constants string name.
-// Throws an error if the param is not part of the enum.
-func StateString(s string) (State, error) {
-	if val, ok := _StateNameToValueMap[s]; ok {
-		return val, nil
-	}
-	return 0, fmt.Errorf("%s does not belong to State values", s)
-}
-
-// StateValues returns all values of the enum
-func StateValues() []State {
-	return _StateValues
-}
-
-// IsAState returns "true" if the value is listed in the enum definition. "false" otherwise
-func (i State) IsAState() bool {
-	for _, v := range _StateValues {
-		if i == v {
-			return true
-		}
-	}
-	return false
-}
--- a/replication/fsrep/stepstate_enumer.go
+++ b/replication/fsrep/stepstate_enumer.go
@ -1,61 +0,0 @@
-// Code generated by "enumer -type=StepState"; DO NOT EDIT.
-
-package fsrep
-
-import (
-	"fmt"
-)
-
-const (
-	_StepStateName_0 = "StepReplicationReadyStepMarkReplicatedReady"
-	_StepStateName_1 = "StepCompleted"
-)
-
-var (
-	_StepStateIndex_0 = [...]uint8{0, 20, 43}
-	_StepStateIndex_1 = [...]uint8{0, 13}
-)
-
-func (i StepState) String() string {
-	switch {
-	case 1 <= i && i <= 2:
-		i -= 1
-		return _StepStateName_0[_StepStateIndex_0[i]:_StepStateIndex_0[i+1]]
-	case i == 4:
-		return _StepStateName_1
-	default:
-		return fmt.Sprintf("StepState(%d)", i)
-	}
-}
-
-var _StepStateValues = []StepState{1, 2, 4}
-
-var _StepStateNameToValueMap = map[string]StepState{
-	_StepStateName_0[0:20]:  1,
-	_StepStateName_0[20:43]: 2,
-	_StepStateName_1[0:13]:  4,
-}
-
-// StepStateString retrieves an enum value from the enum constants string name.
-// Throws an error if the param is not part of the enum.
-func StepStateString(s string) (StepState, error) {
-	if val, ok := _StepStateNameToValueMap[s]; ok {
-		return val, nil
-	}
-	return 0, fmt.Errorf("%s does not belong to StepState values", s)
-}
-
-// StepStateValues returns all values of the enum
-func StepStateValues() []StepState {
-	return _StepStateValues
-}
-
-// IsAStepState returns "true" if the value is listed in the enum definition. "false" otherwise
-func (i StepState) IsAStepState() bool {
-	for _, v := range _StepStateValues {
-		if i == v {
-			return true
-		}
-	}
-	return false
-}
--- a/replication/internal/diff/diff.go
+++ b/replication/internal/diff/diff.go
@ -3,7 +3,7 @@ package mainfsm
 import (
 	"sort"

-	. "github.com/zrepl/zrepl/replication/pdu"
+	. "github.com/zrepl/zrepl/replication/logic/pdu"
 )

 type ConflictNoCommonAncestor struct {
--- a/replication/logic/pdu/pdu.pb.go
+++ b/replication/logic/pdu/pdu.pb.go
--- a/replication/logic/pdu/pdu.proto
+++ b/replication/logic/pdu/pdu.proto
--- a/replication/logic/pdu/pdu_extras.go
+++ b/replication/logic/pdu/pdu_extras.go
--- a/replication/logic/pdu/pdu_test.go
+++ b/replication/logic/pdu/pdu_test.go
--- a/replication/logic/replication_logic.go
+++ b/replication/logic/replication_logic.go
@ -0,0 +1,430 @@
+package logic
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"sync"
+	"time"
+
+	"github.com/prometheus/client_golang/prometheus"
+
+	"github.com/zrepl/zrepl/replication/driver"
+	. "github.com/zrepl/zrepl/replication/logic/diff"
+	"github.com/zrepl/zrepl/replication/logic/pdu"
+	"github.com/zrepl/zrepl/replication/report"
+	"github.com/zrepl/zrepl/util/bytecounter"
+	"github.com/zrepl/zrepl/zfs"
+)
+
+// Endpoint represents one side of the replication.
+//
+// An endpoint is either in Sender or Receiver mode, represented by the correspondingly
+// named interfaces defined in this package.
+type Endpoint interface {
+	// Does not include placeholder filesystems
+	ListFilesystems(ctx context.Context, req *pdu.ListFilesystemReq) (*pdu.ListFilesystemRes, error)
+	ListFilesystemVersions(ctx context.Context, req *pdu.ListFilesystemVersionsReq) (*pdu.ListFilesystemVersionsRes, error)
+	DestroySnapshots(ctx context.Context, req *pdu.DestroySnapshotsReq) (*pdu.DestroySnapshotsRes, error)
+}
+
+type Sender interface {
+	Endpoint
+	// If a non-nil io.ReadCloser is returned, it is guaranteed to be closed before
+	// any next call to the parent github.com/zrepl/zrepl/replication.Endpoint.
+	// If the send request is for dry run the io.ReadCloser will be nil
+	Send(ctx context.Context, r *pdu.SendReq) (*pdu.SendRes, zfs.StreamCopier, error)
+	ReplicationCursor(ctx context.Context, req *pdu.ReplicationCursorReq) (*pdu.ReplicationCursorRes, error)
+}
+
+type Receiver interface {
+	Endpoint
+	// Receive sends r and sendStream (the latter containing a ZFS send stream)
+	// to the parent github.com/zrepl/zrepl/replication.Endpoint.
+	Receive(ctx context.Context, req *pdu.ReceiveReq, receive zfs.StreamCopier) (*pdu.ReceiveRes, error)
+}
+
+type Planner struct {
+	sender   Sender
+	receiver Receiver
+
+	promSecsPerState    *prometheus.HistogramVec // labels: state
+	promBytesReplicated *prometheus.CounterVec   // labels: filesystem
+}
+
+func (p *Planner) Plan(ctx context.Context) ([]driver.FS, error) {
+	fss, err := p.doPlanning(ctx)
+	if err != nil {
+		return nil, err
+	}
+	dfss := make([]driver.FS, len(fss))
+	for i := range dfss {
+		dfss[i] = fss[i]
+	}
+	return dfss, nil
+}
+
+type Filesystem struct {
+	sender   Sender
+	receiver Receiver
+
+	Path                string             // compat
+	receiverFSExists    bool               // compat
+	promBytesReplicated prometheus.Counter // compat
+}
+
+func (f *Filesystem) PlanFS(ctx context.Context) ([]driver.Step, error) {
+	steps, err := f.doPlanning(ctx)
+	if err != nil {
+		return nil, err
+	}
+	dsteps := make([]driver.Step, len(steps))
+	for i := range dsteps {
+		dsteps[i] = steps[i]
+	}
+	return dsteps, nil
+}
+func (f *Filesystem) ReportInfo() *report.FilesystemInfo {
+	return &report.FilesystemInfo{Name: f.Path} // FIXME compat name
+}
+
+type Step struct {
+	sender   Sender
+	receiver Receiver
+
+	parent   *Filesystem
+	from, to *pdu.FilesystemVersion // compat
+
+	byteCounter  bytecounter.StreamCopier
+	expectedSize int64 // 0 means no size estimate present / possible
+}
+
+func (s *Step) TargetDate() time.Time {
+	return s.to.SnapshotTime() // FIXME compat name
+}
+
+func (s *Step) Step(ctx context.Context) error {
+	return s.doReplication(ctx)
+}
+
+func (s *Step) ReportInfo() *report.StepInfo {
+	var byteCounter int64
+	if s.byteCounter != nil {
+		byteCounter = s.byteCounter.Count()
+	}
+	return &report.StepInfo{
+		From:            s.from.RelName(),
+		To:              s.to.RelName(),
+		BytesExpected:   s.expectedSize,
+		BytesReplicated: byteCounter,
+	}
+}
+
+func NewPlanner(secsPerState *prometheus.HistogramVec, bytesReplicated *prometheus.CounterVec, sender Sender, receiver Receiver) *Planner {
+	return &Planner{
+		sender:              sender,
+		receiver:            receiver,
+		promSecsPerState:    secsPerState,
+		promBytesReplicated: bytesReplicated,
+	}
+}
+func resolveConflict(conflict error) (path []*pdu.FilesystemVersion, msg string) {
+	if noCommonAncestor, ok := conflict.(*ConflictNoCommonAncestor); ok {
+		if len(noCommonAncestor.SortedReceiverVersions) == 0 {
+			// TODO this is hard-coded replication policy: most recent snapshot as source
+			var mostRecentSnap *pdu.FilesystemVersion
+			for n := len(noCommonAncestor.SortedSenderVersions) - 1; n >= 0; n-- {
+				if noCommonAncestor.SortedSenderVersions[n].Type == pdu.FilesystemVersion_Snapshot {
+					mostRecentSnap = noCommonAncestor.SortedSenderVersions[n]
+					break
+				}
+			}
+			if mostRecentSnap == nil {
+				return nil, "no snapshots available on sender side"
+			}
+			return []*pdu.FilesystemVersion{mostRecentSnap}, fmt.Sprintf("start replication at most recent snapshot %s", mostRecentSnap.RelName())
+		}
+	}
+	return nil, "no automated way to handle conflict type"
+}
+
+func (p *Planner) doPlanning(ctx context.Context) ([]*Filesystem, error) {
+
+	log := getLogger(ctx)
+
+	log.Info("start planning")
+
+	slfssres, err := p.sender.ListFilesystems(ctx, &pdu.ListFilesystemReq{})
+	if err != nil {
+		log.WithError(err).WithField("errType", fmt.Sprintf("%T", err)).Error("error listing sender filesystems")
+		return nil, err
+	}
+	sfss := slfssres.GetFilesystems()
+	// no progress here since we could run in a live-lock on connectivity issues
+
+	rlfssres, err := p.receiver.ListFilesystems(ctx, &pdu.ListFilesystemReq{})
+	if err != nil {
+		log.WithError(err).WithField("errType", fmt.Sprintf("%T", err)).Error("error listing receiver filesystems")
+		return nil, err
+	}
+	rfss := rlfssres.GetFilesystems()
+
+	q := make([]*Filesystem, 0, len(sfss))
+	for _, fs := range sfss {
+
+		receiverFSExists := false
+		for _, rfs := range rfss {
+			if rfs.Path == fs.Path {
+				receiverFSExists = true
+			}
+		}
+
+		ctr := p.promBytesReplicated.WithLabelValues(fs.Path)
+
+		q = append(q, &Filesystem{
+			sender:              p.sender,
+			receiver:            p.receiver,
+			Path:                fs.Path,
+			receiverFSExists:    receiverFSExists,
+			promBytesReplicated: ctr,
+		})
+	}
+
+	return q, nil
+}
+
+func (fs *Filesystem) doPlanning(ctx context.Context) ([]*Step, error) {
+
+	log := getLogger(ctx).WithField("filesystem", fs.Path)
+
+	log.Debug("assessing filesystem")
+
+	sfsvsres, err := fs.sender.ListFilesystemVersions(ctx, &pdu.ListFilesystemVersionsReq{Filesystem: fs.Path})
+	if err != nil {
+		log.WithError(err).Error("cannot get remote filesystem versions")
+		return nil, err
+	}
+	sfsvs := sfsvsres.GetVersions()
+
+	if len(sfsvs) < 1 {
+		err := errors.New("sender does not have any versions")
+		log.Error(err.Error())
+		return nil, err
+	}
+
+	var rfsvs []*pdu.FilesystemVersion
+	if fs.receiverFSExists {
+		rfsvsres, err := fs.receiver.ListFilesystemVersions(ctx, &pdu.ListFilesystemVersionsReq{Filesystem: fs.Path})
+		if err != nil {
+			log.WithError(err).Error("receiver error")
+			return nil, err
+		}
+		rfsvs = rfsvsres.GetVersions()
+	} else {
+		rfsvs = []*pdu.FilesystemVersion{}
+	}
+
+	path, conflict := IncrementalPath(rfsvs, sfsvs)
+	if conflict != nil {
+		var msg string
+		path, msg = resolveConflict(conflict) // no shadowing allowed!
+		if path != nil {
+			log.WithField("conflict", conflict).Info("conflict")
+			log.WithField("resolution", msg).Info("automatically resolved")
+		} else {
+			log.WithField("conflict", conflict).Error("conflict")
+			log.WithField("problem", msg).Error("cannot resolve conflict")
+		}
+	}
+	if path == nil {
+		return nil, conflict
+	}
+
+	steps := make([]*Step, 0, len(path))
+	// FIXME unify struct declarations => initializer?
+	if len(path) == 1 {
+		steps = append(steps, &Step{
+			parent:   fs,
+			sender:   fs.sender,
+			receiver: fs.receiver,
+			from:     nil,
+			to:       path[0],
+		})
+	} else {
+		for i := 0; i < len(path)-1; i++ {
+			steps = append(steps, &Step{
+				parent:   fs,
+				sender:   fs.sender,
+				receiver: fs.receiver,
+				from:     path[i],
+				to:       path[i+1],
+			})
+		}
+	}
+
+	log.Debug("compute send size estimate")
+	errs := make(chan error, len(steps))
+	var wg sync.WaitGroup
+	fanOutCtx, fanOutCancel := context.WithCancel(ctx)
+	defer fanOutCancel()
+	for _, step := range steps {
+		wg.Add(1)
+		go func(step *Step) {
+			defer wg.Done()
+			err := step.updateSizeEstimate(fanOutCtx)
+			if err != nil {
+				log.WithError(err).WithField("step", step).Error("error computing size estimate")
+				fanOutCancel()
+			}
+			errs <- err
+		}(step)
+	}
+	wg.Wait()
+	close(errs)
+	var significantErr error = nil
+	for err := range errs {
+		if err != nil {
+			if significantErr == nil || significantErr == context.Canceled {
+				significantErr = err
+			}
+		}
+	}
+	if significantErr != nil {
+		return nil, significantErr
+	}
+
+	log.Debug("filesystem planning finished")
+	return steps, nil
+}
+
+// type FilesystemsReplicationFailedError struct {
+// 	FilesystemsWithError []*fsrep.Replication
+// }
+
+// func (e FilesystemsReplicationFailedError) Error() string {
+// 	allSame := true
+// 	lastErr := e.FilesystemsWithError[0].Err().Error()
+// 	for _, fs := range e.FilesystemsWithError {
+// 		fsErr := fs.Err().Error()
+// 		allSame = allSame && lastErr == fsErr
+// 	}
+
+// 	fsstr := "multiple filesystems"
+// 	if len(e.FilesystemsWithError) == 1 {
+// 		fsstr = fmt.Sprintf("filesystem %s", e.FilesystemsWithError[0].FS())
+// 	}
+// 	errorStr := lastErr
+// 	if !allSame {
+// 		errorStr = "multiple different errors"
+// 	}
+// 	return fmt.Sprintf("%s could not be replicated: %s", fsstr, errorStr)
+// }
+
+func (s *Step) updateSizeEstimate(ctx context.Context) error {
+
+	log := getLogger(ctx)
+
+	sr := s.buildSendRequest(true)
+
+	log.Debug("initiate dry run send request")
+	sres, _, err := s.sender.Send(ctx, sr)
+	if err != nil {
+		log.WithError(err).Error("dry run send request failed")
+		return err
+	}
+	s.expectedSize = sres.ExpectedSize
+	return nil
+}
+
+func (s *Step) buildSendRequest(dryRun bool) (sr *pdu.SendReq) {
+	fs := s.parent.Path
+	if s.from == nil {
+		sr = &pdu.SendReq{
+			Filesystem: fs,
+			To:         s.to.RelName(),
+			DryRun:     dryRun,
+		}
+	} else {
+		sr = &pdu.SendReq{
+			Filesystem: fs,
+			From:       s.from.RelName(),
+			To:         s.to.RelName(),
+			DryRun:     dryRun,
+		}
+	}
+	return sr
+}
+
+func (s *Step) doReplication(ctx context.Context) error {
+
+	fs := s.parent.Path
+
+	log := getLogger(ctx)
+	sr := s.buildSendRequest(false)
+
+	log.Debug("initiate send request")
+	sres, sstreamCopier, err := s.sender.Send(ctx, sr)
+	if err != nil {
+		log.WithError(err).Error("send request failed")
+		return err
+	}
+	if sstreamCopier == nil {
+		err := errors.New("send request did not return a stream, broken endpoint implementation")
+		return err
+	}
+	defer sstreamCopier.Close()
+
+	// Install a byte counter to track progress + for status report
+	s.byteCounter = bytecounter.NewStreamCopier(sstreamCopier)
+	defer func() {
+		s.parent.promBytesReplicated.Add(float64(s.byteCounter.Count()))
+	}()
+
+	rr := &pdu.ReceiveReq{
+		Filesystem:       fs,
+		ClearResumeToken: !sres.UsedResumeToken,
+	}
+	log.Debug("initiate receive request")
+	_, err = s.receiver.Receive(ctx, rr, s.byteCounter)
+	if err != nil {
+		log.
+			WithError(err).
+			WithField("errType", fmt.Sprintf("%T", err)).
+			Error("receive request failed (might also be error on sender)")
+		// This failure could be due to
+		// 	- an unexpected exit of ZFS on the sending side
+		//  - an unexpected exit of ZFS on the receiving side
+		//  - a connectivity issue
+		return err
+	}
+	log.Debug("receive finished")
+
+	log.Debug("advance replication cursor")
+	req := &pdu.ReplicationCursorReq{
+		Filesystem: fs,
+		Op: &pdu.ReplicationCursorReq_Set{
+			Set: &pdu.ReplicationCursorReq_SetOp{
+				Snapshot: s.to.GetName(),
+			},
+		},
+	}
+	_, err = s.sender.ReplicationCursor(ctx, req)
+	if err != nil {
+		log.WithError(err).Error("error advancing replication cursor")
+		// If this fails and replication planning restarts, the diff algorithm will find
+		// that cursor out of place. This is not a problem because then, it would just use another FS
+		// However, we FIXME have no means to just update the cursor in a
+		// second replication attempt right after this one where we don't have new snaps yet
+		return err
+	}
+
+	return err
+}
+
+func (s *Step) String() string {
+	if s.from == nil { // FIXME: ZFS semantics are that to is nil on non-incremental send
+		return fmt.Sprintf("%s%s (full)", s.parent.Path, s.to.RelName())
+	} else {
+		return fmt.Sprintf("%s(%s => %s)", s.parent.Path, s.from.RelName(), s.to.RelName())
+	}
+}
--- a/replication/logic/replication_logic_context.go
+++ b/replication/logic/replication_logic_context.go
@ -1,9 +1,9 @@
-package replication
+package logic

 import (
 	"context"
+
 	"github.com/zrepl/zrepl/logger"
-	"github.com/zrepl/zrepl/replication/fsrep"
 )

 type contextKey int
@ -16,7 +16,6 @@ type Logger = logger.Logger

 func WithLogger(ctx context.Context, l Logger) context.Context {
 	ctx = context.WithValue(ctx, contextKeyLog, l)
-	ctx = fsrep.WithLogger(ctx, l)
 	return ctx
 }

--- a/replication/mainfsm.go
+++ b/replication/mainfsm.go
@ -1,560 +0,0 @@
-// Package replication implements replication of filesystems with existing
-// versions (snapshots) from a sender to a receiver.
-package replication
-
-import (
-	"context"
-	"errors"
-	"fmt"
-	"github.com/prometheus/client_golang/prometheus"
-	"github.com/zrepl/zrepl/daemon/job/wakeup"
-	"github.com/zrepl/zrepl/util/envconst"
-	"github.com/zrepl/zrepl/util/watchdog"
-	"math/bits"
-	"net"
-	"sort"
-	"sync"
-	"time"
-
-	"github.com/zrepl/zrepl/replication/fsrep"
-	. "github.com/zrepl/zrepl/replication/internal/diff"
-	"github.com/zrepl/zrepl/replication/pdu"
-)
-
-//go:generate enumer -type=State
-type State uint
-
-const (
-	Planning State = 1 << iota
-	PlanningError
-	Working
-	WorkingWait
-	Completed
-	PermanentError
-)
-
-func (s State) rsf() state {
-	idx := bits.TrailingZeros(uint(s))
-	if idx == bits.UintSize {
-		panic(s) // invalid value
-	}
-	m := []state{
-		statePlanning,
-		statePlanningError,
-		stateWorking,
-		stateWorkingWait,
-		nil,
-		nil,
-	}
-	return m[idx]
-}
-
-func (s State) IsTerminal() bool {
-	return s.rsf() == nil
-}
-
-// Replication implements the replication of multiple file systems from a Sender to a Receiver.
-//
-// It is a state machine that is driven by the Drive method
-// and provides asynchronous reporting via the Report method (i.e. from another goroutine).
-type Replication struct {
-	// not protected by lock
-	promSecsPerState *prometheus.HistogramVec // labels: state
-	promBytesReplicated *prometheus.CounterVec // labels: filesystem
-
-	Progress watchdog.KeepAlive
-
-	// lock protects all fields of this struct (but not the fields behind pointers!)
-	lock sync.Mutex
-
-	state State
-
-	// Working, WorkingWait, Completed, ContextDone
-	queue     []*fsrep.Replication
-	completed []*fsrep.Replication
-	active    *fsrep.Replication // == queue[0] or nil, unlike in Report
-
-	// for PlanningError, WorkingWait and ContextError and Completed
-	err error
-
-	// PlanningError, WorkingWait
-	sleepUntil time.Time
-}
-
-type Report struct {
-	Status    string
-	Problem   string
-	SleepUntil time.Time
-	Completed []*fsrep.Report
-	Pending   []*fsrep.Report
-	Active    *fsrep.Report // not contained in Pending, unlike in struct Replication
-}
-
-func NewReplication(secsPerState *prometheus.HistogramVec, bytesReplicated *prometheus.CounterVec) *Replication {
-	r := Replication{
-		promSecsPerState: secsPerState,
-		promBytesReplicated: bytesReplicated,
-		state:            Planning,
-	}
-	return &r
-}
-
-// Endpoint represents one side of the replication.
-//
-// An endpoint is either in Sender or Receiver mode, represented by the correspondingly
-// named interfaces defined in this package.
-type Endpoint interface {
-	// Does not include placeholder filesystems
-	ListFilesystems(ctx context.Context, req *pdu.ListFilesystemReq) (*pdu.ListFilesystemRes, error)
-	ListFilesystemVersions(ctx context.Context, req *pdu.ListFilesystemVersionsReq) (*pdu.ListFilesystemVersionsRes, error)
-	DestroySnapshots(ctx context.Context, req *pdu.DestroySnapshotsReq) (*pdu.DestroySnapshotsRes, error)
-}
-
-type Sender interface {
-	Endpoint
-	fsrep.Sender
-}
-
-type Receiver interface {
-	Endpoint
-	fsrep.Receiver
-}
-
-type FilteredError struct{ fs string }
-
-func NewFilteredError(fs string) *FilteredError {
-	return &FilteredError{fs}
-}
-
-func (f FilteredError) Error() string { return "endpoint does not allow access to filesystem " + f.fs }
-
-type updater func(func(*Replication)) (newState State)
-type state func(ctx context.Context, ka *watchdog.KeepAlive, sender Sender, receiver Receiver, u updater) state
-
-// Drive starts the state machine and returns only after replication has finished (with or without errors).
-// The Logger in ctx is used for both debug and error logging, but is not guaranteed to be stable
-// or end-user friendly.
-// User-facing replication progress reports and can be obtained using the Report method,
-// whose output will not change after Drive returns.
-//
-// FIXME: Drive may be only called once per instance of Replication
-func (r *Replication) Drive(ctx context.Context, sender Sender, receiver Receiver) {
-
-	var u updater = func(f func(*Replication)) State {
-		r.lock.Lock()
-		defer r.lock.Unlock()
-		if f != nil {
-			f(r)
-		}
-		return r.state
-	}
-
-	var s state = statePlanning
-	var pre, post State
-	for s != nil {
-		preTime := time.Now()
-		pre = u(nil)
-		s = s(ctx, &r.Progress, sender, receiver, u)
-		delta := time.Now().Sub(preTime)
-		r.promSecsPerState.WithLabelValues(pre.String()).Observe(delta.Seconds())
-		post = u(nil)
-		getLogger(ctx).
-			WithField("transition", fmt.Sprintf("%s => %s", pre, post)).
-			WithField("duration", delta).
-			Debug("main state transition")
-		if post == Working && pre != post {
-			getLogger(ctx).Info("start working")
-		}
-	}
-
-	getLogger(ctx).
-		WithField("final_state", post).
-		Debug("main final state")
-}
-
-func resolveConflict(conflict error) (path []*pdu.FilesystemVersion, msg string) {
-	if noCommonAncestor, ok := conflict.(*ConflictNoCommonAncestor); ok {
-		if len(noCommonAncestor.SortedReceiverVersions) == 0 {
-			// TODO this is hard-coded replication policy: most recent snapshot as source
-			var mostRecentSnap *pdu.FilesystemVersion
-			for n := len(noCommonAncestor.SortedSenderVersions) - 1; n >= 0; n-- {
-				if noCommonAncestor.SortedSenderVersions[n].Type == pdu.FilesystemVersion_Snapshot {
-					mostRecentSnap = noCommonAncestor.SortedSenderVersions[n]
-					break
-				}
-			}
-			if mostRecentSnap == nil {
-				return nil, "no snapshots available on sender side"
-			}
-			return []*pdu.FilesystemVersion{mostRecentSnap}, fmt.Sprintf("start replication at most recent snapshot %s", mostRecentSnap.RelName())
-		}
-	}
-	return nil, "no automated way to handle conflict type"
-}
-
-var RetryInterval = envconst.Duration("ZREPL_REPLICATION_RETRY_INTERVAL", 10 * time.Second)
-
-type Error interface {
-	error
-	Temporary() bool
-}
-
-var _ Error = fsrep.Error(nil)
-var _ Error = net.Error(nil)
-
-func isPermanent(err error) bool {
-	if e, ok := err.(Error); ok {
-		return !e.Temporary()
-	}
-	return true
-}
-
-func statePlanning(ctx context.Context, ka *watchdog.KeepAlive, sender Sender, receiver Receiver, u updater) state {
-
-	log := getLogger(ctx)
-
-	log.Info("start planning")
-
-	handlePlanningError := func(err error) state {
-		return u(func(r *Replication) {
-			ge := GlobalError{Err: err, Temporary: !isPermanent(err)}
-			log.WithError(ge).Error("encountered global error while planning replication")
-			r.err = ge
-			if !ge.Temporary {
-				r.state = PermanentError
-			} else {
-				r.sleepUntil = time.Now().Add(RetryInterval)
-				r.state = PlanningError
-			}
-		}).rsf()
-	}
-
-	slfssres, err := sender.ListFilesystems(ctx, &pdu.ListFilesystemReq{})
-	if err != nil {
-		log.WithError(err).WithField("errType", fmt.Sprintf("%T", err)).Error("error listing sender filesystems")
-		return handlePlanningError(err)
-	}
-	sfss := slfssres.GetFilesystems()
-	// no progress here since we could run in a live-lock on connectivity issues
-
-	rlfssres, err := receiver.ListFilesystems(ctx, &pdu.ListFilesystemReq{})
-	if err != nil {
-		log.WithError(err).WithField("errType", fmt.Sprintf("%T", err)).Error("error listing receiver filesystems")
-		return handlePlanningError(err)
-	}
-	rfss := rlfssres.GetFilesystems()
-	ka.MadeProgress() // for both sender and receiver
-
-	q := make([]*fsrep.Replication, 0, len(sfss))
-	mainlog := log
-	for _, fs := range sfss {
-
-		log := mainlog.WithField("filesystem", fs.Path)
-
-		log.Debug("assessing filesystem")
-
-		sfsvsres, err := sender.ListFilesystemVersions(ctx, &pdu.ListFilesystemVersionsReq{Filesystem: fs.Path})
-		if err != nil {
-			log.WithError(err).Error("cannot get remote filesystem versions")
-			return handlePlanningError(err)
-		}
-		sfsvs := sfsvsres.GetVersions()
-		ka.MadeProgress()
-
-		if len(sfsvs) < 1 {
-			err := errors.New("sender does not have any versions")
-			log.Error(err.Error())
-			q = append(q, fsrep.NewReplicationConflictError(fs.Path, err))
-			continue
-		}
-
-		receiverFSExists := false
-		for _, rfs := range rfss {
-			if rfs.Path == fs.Path {
-				receiverFSExists = true
-			}
-		}
-
-		var rfsvs []*pdu.FilesystemVersion
-		if receiverFSExists {
-			rfsvsres, err := receiver.ListFilesystemVersions(ctx, &pdu.ListFilesystemVersionsReq{Filesystem: fs.Path})
-			if err != nil {
-				if _, ok := err.(*FilteredError); ok {
-					log.Info("receiver ignores filesystem")
-					continue
-				}
-				log.WithError(err).Error("receiver error")
-				return handlePlanningError(err)
-			}
-			rfsvs = rfsvsres.GetVersions()
-		} else {
-			rfsvs = []*pdu.FilesystemVersion{}
-		}
-		ka.MadeProgress()
-
-		path, conflict := IncrementalPath(rfsvs, sfsvs)
-		if conflict != nil {
-			var msg string
-			path, msg = resolveConflict(conflict) // no shadowing allowed!
-			if path != nil {
-				log.WithField("conflict", conflict).Info("conflict")
-				log.WithField("resolution", msg).Info("automatically resolved")
-			} else {
-				log.WithField("conflict", conflict).Error("conflict")
-				log.WithField("problem", msg).Error("cannot resolve conflict")
-			}
-		}
-		ka.MadeProgress()
-		if path == nil {
-			q = append(q, fsrep.NewReplicationConflictError(fs.Path, conflict))
-			continue
-		}
-
-		var promBytesReplicated *prometheus.CounterVec
-		u(func(replication *Replication) { // FIXME args struct like in pruner (also use for sender and receiver)
-			promBytesReplicated = replication.promBytesReplicated
-		})
-		fsrfsm := fsrep.BuildReplication(fs.Path, promBytesReplicated.WithLabelValues(fs.Path))
-		if len(path) == 1 {
-			fsrfsm.AddStep(nil, path[0])
-		} else {
-			for i := 0; i < len(path)-1; i++ {
-				fsrfsm.AddStep(path[i], path[i+1])
-			}
-		}
-		qitem := fsrfsm.Done()
-		ka.MadeProgress()
-
-		log.Debug("compute send size estimate")
-		if err = qitem.UpdateSizeEsitmate(ctx, sender); err != nil {
-			log.WithError(err).Error("error computing size estimate")
-			return handlePlanningError(err)
-		}
-		ka.MadeProgress()
-
-		q = append(q, qitem)
-	}
-
-	ka.MadeProgress()
-
-	return u(func(r *Replication) {
-		r.completed = nil
-		r.queue = q
-		r.err = nil
-		r.state = Working
-	}).rsf()
-}
-
-func statePlanningError(ctx context.Context, ka *watchdog.KeepAlive, sender Sender, receiver Receiver, u updater) state {
-	var sleepUntil time.Time
-	u(func(r *Replication) {
-		sleepUntil = r.sleepUntil
-	})
-	t := time.NewTimer(sleepUntil.Sub(time.Now()))
-	getLogger(ctx).WithField("until", sleepUntil).Info("retry wait after planning error")
-	defer t.Stop()
-	select {
-	case <-ctx.Done():
-		return u(func(r *Replication) {
-			r.state = PermanentError
-			r.err = ctx.Err()
-		}).rsf()
-	case <-t.C:
-	case <-wakeup.Wait(ctx):
-	}
-	return u(func(r *Replication) {
-		r.state = Planning
-	}).rsf()
-}
-
-type GlobalError struct {
-	Err       error
-	Temporary bool
-}
-
-func (e GlobalError) Error() string {
-	errClass := "temporary"
-	if !e.Temporary {
-		errClass = "permanent"
-	}
-	return fmt.Sprintf("%s global error: %s", errClass, e.Err)
-}
-
-type FilesystemsReplicationFailedError struct {
-	FilesystemsWithError []*fsrep.Replication
-}
-
-func (e FilesystemsReplicationFailedError) Error() string {
-	allSame := true
-	lastErr := e.FilesystemsWithError[0].Err().Error()
-	for _, fs := range e.FilesystemsWithError {
-		fsErr := fs.Err().Error()
-		allSame = allSame && lastErr == fsErr
-	}
-
-	fsstr := "multiple filesystems"
-	if len(e.FilesystemsWithError) == 1 {
-		fsstr = fmt.Sprintf("filesystem %s", e.FilesystemsWithError[0].FS())
-	}
-	errorStr := lastErr
-	if !allSame {
-		errorStr = "multiple different errors"
-	}
-	return fmt.Sprintf("%s could not be replicated: %s", fsstr, errorStr)
-}
-
-func stateWorking(ctx context.Context, ka *watchdog.KeepAlive, sender Sender, receiver Receiver, u updater) state {
-
-	var active *fsrep.Replication
-	rsfNext := u(func(r *Replication) {
-
-		r.err = nil
-
-		newq := make([]*fsrep.Replication, 0, len(r.queue))
-		for i := range r.queue {
-			if r.queue[i].CanRetry() {
-				newq = append(newq, r.queue[i])
-			} else {
-				r.completed = append(r.completed, r.queue[i])
-			}
-		}
-		sort.SliceStable(newq, func(i, j int) bool {
-			return newq[i].NextStepDate().Before(newq[j].NextStepDate())
-		})
-		r.queue = newq
-
-		if len(r.queue) == 0 {
-			r.state = Completed
-			fsWithErr := FilesystemsReplicationFailedError{ // prepare it
-				FilesystemsWithError: make([]*fsrep.Replication, 0, len(r.completed)),
-			}
-			for _, fs := range r.completed {
-				if fs.CanRetry() {
-					panic(fmt.Sprintf("implementation error: completed contains retryable FS %s %#v",
-						fs.FS(), fs.Err()))
-				}
-				if fs.Err() != nil {
-					fsWithErr.FilesystemsWithError = append(fsWithErr.FilesystemsWithError, fs)
-				}
-			}
-			if len(fsWithErr.FilesystemsWithError) > 0 {
-				r.err = fsWithErr
-				r.state = PermanentError
-			}
-			return
-		}
-
-		active =  r.queue[0] // do not dequeue: if it's done, it will be sorted the next time we check for more work
-		r.active = active
-	}).rsf()
-
-	if active == nil {
-		return rsfNext
-	}
-
-	activeCtx := fsrep.WithLogger(ctx, getLogger(ctx).WithField("fs", active.FS()))
-	err := active.Retry(activeCtx, ka, sender, receiver)
-	u(func(r *Replication) {
-		r.active = nil
-	}).rsf()
-
-	if err != nil {
-		if err.ContextErr() && ctx.Err() != nil {
-			getLogger(ctx).WithError(err).
-				Info("filesystem replication was cancelled")
-			u(func(r*Replication) {
-				r.err = GlobalError{Err: err, Temporary: false}
-				r.state = PermanentError
-			})
-		} else if err.LocalToFS() {
-			getLogger(ctx).WithError(err).
-				Error("filesystem replication encountered a filesystem-specific error")
-			// we stay in this state and let the queuing logic above de-prioritize this failing FS
-		} else if err.Temporary() {
-			getLogger(ctx).WithError(err).
-				Error("filesystem encountered a non-filesystem-specific temporary error, enter retry-wait")
-			u(func(r *Replication) {
-				r.err = GlobalError{Err: err, Temporary: true}
-				r.sleepUntil = time.Now().Add(RetryInterval)
-				r.state = WorkingWait
-			}).rsf()
-		} else {
-			getLogger(ctx).WithError(err).
-				Error("encountered a permanent non-filesystem-specific error")
-			u(func(r *Replication) {
-				r.err = GlobalError{Err: err, Temporary: false}
-				r.state = PermanentError
-			}).rsf()
-		}
-	}
-
-	return u(nil).rsf()
-}
-
-func stateWorkingWait(ctx context.Context, ka *watchdog.KeepAlive, sender Sender, receiver Receiver, u updater) state {
-	var sleepUntil time.Time
-	u(func(r *Replication) {
-		sleepUntil = r.sleepUntil
-	})
-	t := time.NewTimer(RetryInterval)
-	getLogger(ctx).WithField("until", sleepUntil).Info("retry wait after error")
-	defer t.Stop()
-	select {
-	case <-ctx.Done():
-		return u(func(r *Replication) {
-			r.state = PermanentError
-			r.err = ctx.Err()
-		}).rsf()
-
-	case <-t.C:
-	case <-wakeup.Wait(ctx):
-	}
-	return u(func(r *Replication) {
-		r.state = Working
-	}).rsf()
-}
-
-// Report provides a summary of the progress of the Replication,
-// i.e., a condensed dump of the internal state machine.
-// Report is safe to be called asynchronously while Drive is running.
-func (r *Replication) Report() *Report {
-	r.lock.Lock()
-	defer r.lock.Unlock()
-
-	rep := Report{
-		Status: r.state.String(),
-		SleepUntil: r.sleepUntil,
-	}
-
-	if r.err != nil {
-		rep.Problem = r.err.Error()
-	}
-
-	if r.state&(Planning|PlanningError) != 0 {
-		return &rep
-	}
-
-	rep.Pending = make([]*fsrep.Report, 0, len(r.queue))
-	rep.Completed = make([]*fsrep.Report, 0, len(r.completed)) // room for active (potentially)
-
-	// since r.active == r.queue[0], do not contain it in pending output
-	pending := r.queue
-	if r.active != nil {
-		rep.Active = r.active.Report()
-		pending = r.queue[1:]
-	}
-	for _, fsr := range pending {
-		rep.Pending= append(rep.Pending, fsr.Report())
-	}
-	for _, fsr := range r.completed {
-		rep.Completed = append(rep.Completed, fsr.Report())
-	}
-
-	return &rep
-}
-
-func (r *Replication) State() State {
-	r.lock.Lock()
-	defer r.lock.Unlock()
-	return r.state
-}
--- a/replication/replication.go
+++ b/replication/replication.go
@ -0,0 +1,13 @@
+// Package replication implements replication of filesystems with existing
+// versions (snapshots) from a sender to a receiver.
+package replication
+
+import (
+	"context"
+
+	"github.com/zrepl/zrepl/replication/driver"
+)
+
+func Do(ctx context.Context, planner driver.Planner) (driver.ReportFunc, driver.WaitFunc) {
+	return driver.Do(ctx, planner)
+}
--- a/replication/report/replication_report.go
+++ b/replication/report/replication_report.go
@ -0,0 +1,150 @@
+package report
+
+import (
+	"encoding/json"
+	"time"
+)
+
+type Report struct {
+	StartAt, FinishAt time.Time
+	Attempts          []*AttemptReport
+}
+
+var _, _ = json.Marshal(&Report{})
+
+type TimedError struct {
+	Err  string
+	Time time.Time
+}
+
+func NewTimedError(err string, t time.Time) *TimedError {
+	if err == "" {
+		panic("error must be empty")
+	}
+	if t.IsZero() {
+		panic("t must be non-zero")
+	}
+	return &TimedError{err, t}
+}
+
+func (s *TimedError) Error() string {
+	return s.Err
+}
+
+var _, _ = json.Marshal(&TimedError{})
+
+type AttemptReport struct {
+	State             AttemptState
+	StartAt, FinishAt time.Time
+	PlanError         *TimedError
+	Filesystems       []*FilesystemReport
+}
+
+type AttemptState string
+
+const (
+	AttemptPlanning      AttemptState = "planning"
+	AttemptPlanningError AttemptState = "planning-error"
+	AttemptFanOutFSs     AttemptState = "fan-out-filesystems"
+	AttemptFanOutError   AttemptState = "filesystem-error"
+	AttemptDone          AttemptState = "done"
+)
+
+type FilesystemState string
+
+const (
+	FilesystemPlanning        FilesystemState = "planning"
+	FilesystemPlanningErrored FilesystemState = "planning-error"
+	FilesystemStepping        FilesystemState = "stepping"
+	FilesystemSteppingErrored FilesystemState = "step-error"
+	FilesystemDone            FilesystemState = "done"
+)
+
+type FilesystemReport struct {
+	Info *FilesystemInfo
+
+	State FilesystemState
+
+	// Valid in State = FilesystemPlanningErrored
+	PlanError *TimedError
+	// Valid in State = FilesystemSteppingErrored
+	StepError *TimedError
+
+	// Valid in State = FilesystemStepping
+	CurrentStep int
+	Steps       []*StepReport
+}
+
+type FilesystemInfo struct {
+	Name string
+}
+
+type StepReport struct {
+	Info *StepInfo
+}
+
+type StepInfo struct {
+	From, To        string
+	BytesExpected   int64
+	BytesReplicated int64
+}
+
+func (a *AttemptReport) BytesSum() (expected, replicated int64) {
+	for _, fs := range a.Filesystems {
+		e, r := fs.BytesSum()
+		expected += e
+		replicated += r
+	}
+	return expected, replicated
+}
+
+func (f *FilesystemReport) BytesSum() (expected, replicated int64) {
+	for _, step := range f.Steps {
+		expected += step.Info.BytesExpected
+		replicated += step.Info.BytesReplicated
+	}
+	return
+}
+
+func (f *AttemptReport) FilesystemsByState() map[FilesystemState][]*FilesystemReport {
+	r := make(map[FilesystemState][]*FilesystemReport, 4)
+	for _, fs := range f.Filesystems {
+		l := r[fs.State]
+		l = append(l, fs)
+		r[fs.State] = l
+	}
+	return r
+}
+
+func (f *FilesystemReport) Error() *TimedError {
+	switch f.State {
+	case FilesystemPlanningErrored:
+		return f.PlanError
+	case FilesystemSteppingErrored:
+		return f.StepError
+	}
+	return nil
+}
+
+// may return nil
+func (f *FilesystemReport) NextStep() *StepReport {
+	switch f.State {
+	case FilesystemDone:
+		return nil
+	case FilesystemPlanningErrored:
+		return nil
+	case FilesystemSteppingErrored:
+		return nil
+	case FilesystemPlanning:
+		return nil
+	case FilesystemStepping:
+		// invariant is that this is always correct
+		// TODO what about 0-length Steps but short intermediary state?
+		return f.Steps[f.CurrentStep]
+	}
+	panic("unreachable")
+}
+
+func (f *StepReport) IsIncremental() bool {
+	return f.Info.From != "" // FIXME change to ZFS semantics (To != "")
+}
--- a/replication/state_enumer.go
+++ b/replication/state_enumer.go
@ -1,76 +0,0 @@
-// Code generated by "enumer -type=State"; DO NOT EDIT.
-
-package replication
-
-import (
-	"fmt"
-)
-
-const (
-	_StateName_0 = "PlanningPlanningError"
-	_StateName_1 = "Working"
-	_StateName_2 = "WorkingWait"
-	_StateName_3 = "Completed"
-	_StateName_4 = "PermanentError"
-)
-
-var (
-	_StateIndex_0 = [...]uint8{0, 8, 21}
-	_StateIndex_1 = [...]uint8{0, 7}
-	_StateIndex_2 = [...]uint8{0, 11}
-	_StateIndex_3 = [...]uint8{0, 9}
-	_StateIndex_4 = [...]uint8{0, 14}
-)
-
-func (i State) String() string {
-	switch {
-	case 1 <= i && i <= 2:
-		i -= 1
-		return _StateName_0[_StateIndex_0[i]:_StateIndex_0[i+1]]
-	case i == 4:
-		return _StateName_1
-	case i == 8:
-		return _StateName_2
-	case i == 16:
-		return _StateName_3
-	case i == 32:
-		return _StateName_4
-	default:
-		return fmt.Sprintf("State(%d)", i)
-	}
-}
-
-var _StateValues = []State{1, 2, 4, 8, 16, 32}
-
-var _StateNameToValueMap = map[string]State{
-	_StateName_0[0:8]:  1,
-	_StateName_0[8:21]: 2,
-	_StateName_1[0:7]:  4,
-	_StateName_2[0:11]: 8,
-	_StateName_3[0:9]:  16,
-	_StateName_4[0:14]: 32,
-}
-
-// StateString retrieves an enum value from the enum constants string name.
-// Throws an error if the param is not part of the enum.
-func StateString(s string) (State, error) {
-	if val, ok := _StateNameToValueMap[s]; ok {
-		return val, nil
-	}
-	return 0, fmt.Errorf("%s does not belong to State values", s)
-}
-
-// StateValues returns all values of the enum
-func StateValues() []State {
-	return _StateValues
-}
-
-// IsAState returns "true" if the value is listed in the enum definition. "false" otherwise
-func (i State) IsAState() bool {
-	for _, v := range _StateValues {
-		if i == v {
-			return true
-		}
-	}
-	return false
-}
--- a/rpc/dataconn/dataconn_client.go
+++ b/rpc/dataconn/dataconn_client.go
@ -7,7 +7,7 @@ import (
 	"strings"

 	"github.com/golang/protobuf/proto"
-	"github.com/zrepl/zrepl/replication/pdu"
+	"github.com/zrepl/zrepl/replication/logic/pdu"
 	"github.com/zrepl/zrepl/rpc/dataconn/stream"
 	"github.com/zrepl/zrepl/transport"
 	"github.com/zrepl/zrepl/zfs"
--- a/rpc/dataconn/dataconn_server.go
+++ b/rpc/dataconn/dataconn_server.go
@ -7,7 +7,7 @@ import (

 	"github.com/golang/protobuf/proto"
 	"github.com/zrepl/zrepl/logger"
-	"github.com/zrepl/zrepl/replication/pdu"
+	"github.com/zrepl/zrepl/replication/logic/pdu"
 	"github.com/zrepl/zrepl/rpc/dataconn/stream"
 	"github.com/zrepl/zrepl/transport"
 	"github.com/zrepl/zrepl/zfs"
--- a/rpc/dataconn/microbenchmark/microbenchmark.go
+++ b/rpc/dataconn/microbenchmark/microbenchmark.go
@ -24,7 +24,7 @@ import (
 	"github.com/pkg/profile"

 	"github.com/zrepl/zrepl/logger"
-	"github.com/zrepl/zrepl/replication/pdu"
+	"github.com/zrepl/zrepl/replication/logic/pdu"
 	"github.com/zrepl/zrepl/rpc/dataconn"
 	"github.com/zrepl/zrepl/rpc/dataconn/timeoutconn"
 	"github.com/zrepl/zrepl/transport"
--- a/rpc/grpcclientidentity/grpchelper/authlistener_grpc_adaptor_wrapper.go
+++ b/rpc/grpcclientidentity/grpchelper/authlistener_grpc_adaptor_wrapper.go
@ -36,7 +36,7 @@ func ClientConn(cn transport.Connecter, log Logger) *grpc.ClientConn {
 	})
 	dialerOption := grpc.WithDialer(grpcclientidentity.NewDialer(log, cn))
 	cred := grpc.WithTransportCredentials(grpcclientidentity.NewTransportCredentials(log))
-	ctx, cancel := context.WithTimeout(context.TODO(), 5*time.Second)
+	ctx, cancel := context.WithTimeout(context.TODO(), 5*time.Second) // FIXME constant
 	defer cancel()
 	cc, err := grpc.DialContext(ctx, "doesn't matter done by dialer", dialerOption, cred, ka)
 	if err != nil {
--- a/rpc/rpc_client.go
+++ b/rpc/rpc_client.go
@ -7,8 +7,8 @@ import (

 	"google.golang.org/grpc"

-	"github.com/zrepl/zrepl/replication"
-	"github.com/zrepl/zrepl/replication/pdu"
+	"github.com/zrepl/zrepl/replication/logic"
+	"github.com/zrepl/zrepl/replication/logic/pdu"
 	"github.com/zrepl/zrepl/rpc/dataconn"
 	"github.com/zrepl/zrepl/rpc/grpcclientidentity/grpchelper"
 	"github.com/zrepl/zrepl/rpc/versionhandshake"
@ -26,9 +26,9 @@ type Client struct {
 	loggers       Loggers
 }

-var _ replication.Endpoint = &Client{}
-var _ replication.Sender = &Client{}
-var _ replication.Receiver = &Client{}
+var _ logic.Endpoint = &Client{}
+var _ logic.Sender = &Client{}
+var _ logic.Receiver = &Client{}

 type DialContextFunc = func(ctx context.Context, network string, addr string) (net.Conn, error)

--- a/rpc/rpc_server.go
+++ b/rpc/rpc_server.go
@ -7,7 +7,7 @@ import (
 	"google.golang.org/grpc"

 	"github.com/zrepl/zrepl/endpoint"
-	"github.com/zrepl/zrepl/replication/pdu"
+	"github.com/zrepl/zrepl/replication/logic/pdu"
 	"github.com/zrepl/zrepl/rpc/dataconn"
 	"github.com/zrepl/zrepl/rpc/grpcclientidentity"
 	"github.com/zrepl/zrepl/rpc/netadaptor"
--- a/util/chainlock/chainlock.go
+++ b/util/chainlock/chainlock.go
@ -0,0 +1,42 @@
+// package chainlock implements a mutex whose Lock and Unlock
+// methods return the lock itself, to enable chaining.
+//
+// Intended Usage
+//
+//   defer s.lock().unlock()
+//   // drop lock while waiting for wait group
+//   func() {
+//	     defer a.l.Unlock().Lock()
+//	     fssesDone.Wait()
+//	 }()
+//
+package chainlock
+
+import "sync"
+
+type L struct {
+	mtx sync.Mutex
+}
+
+func New() *L {
+	return &L{}
+}
+
+func (l *L) Lock() *L {
+	l.mtx.Lock()
+	return l
+}
+
+func (l *L) Unlock() *L {
+	l.mtx.Unlock()
+	return l
+}
+
+func (l *L) NewCond() *sync.Cond {
+	return sync.NewCond(&l.mtx)
+}
+
+func (l *L) DropWhile(f func()) {
+	defer l.Unlock().Lock()
+	f()
+}