[#288] option to disable step holds for incremental sends

This is a stop-gap solution until we rewrite the pruner to support
rules for removing step holds.

Note that disabling step holds for incremental sends does not affect
zrepl's guarantee that incremental replication is always possible:

Suppose you yank the external drive during an incremental @from -> @to step:

* restarting that step or future incrementals @from -> @to_later will be possible
  because the replication cursor bookmark points to @from until the step is complete
  (see the sketch below)
* resuming @from -> @to will work as long as the pruner on your internal pool doesn't come around to destroy @to.
    * in that case, the replication algorithm should determine that the resumable state
      on the receiving side is useless because @to no longer exists on the sending side,
      and consequently clear it and restart an incremental step @from -> @to_later
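A compressed, runnable illustration of the invariant the first bullet relies on; all names below are hypothetical stand-ins for illustration, not zrepl's actual API:

package main

import "fmt"

// cursor models the replication cursor bookmark on the sending side
var cursor = "@from"

func sendAndRecv(interrupted bool) error {
	if interrupted {
		return fmt.Errorf("external drive yanked mid-step")
	}
	return nil
}

// replicateStep advances the cursor only after the step completes, so an
// interrupted step always leaves @from as a valid incremental base
func replicateStep(to string, interrupted bool) {
	if err := sendAndRecv(interrupted); err != nil {
		fmt.Println("step failed; cursor still at", cursor)
		return
	}
	cursor = to
	fmt.Println("step done; cursor now at", cursor)
}

func main() {
	replicateStep("@to", true)        // yanked drive: cursor stays at @from
	replicateStep("@to_later", false) // later retry: cursor moves forward
}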

refs #288
Christian Schwarz 2020-06-01 14:39:59 +02:00
parent b6b2d7b615
commit 5b564a3e28
8 changed files with 87 additions and 46 deletions


@@ -77,6 +77,11 @@ type SnapJob struct {
type SendOptions struct {
Encrypted bool `yaml:"encrypted"`
StepHolds SendOptionsStepHolds `yaml:"step_holds,optional"`
}
type SendOptionsStepHolds struct {
DisableIncremental bool `yaml:"disable_incremental,optional"`
}
var _ yaml.Defaulter = (*SendOptions)(nil)
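
The new knob is decoded from the job's send: section. A minimal standalone sketch of the same shape, using upstream gopkg.in/yaml.v3 purely for illustration (an assumption: zrepl actually uses its own yaml fork, which is where the ,optional tags and the yaml.Defaulter assertion above come from):

package main

import (
	"fmt"

	"gopkg.in/yaml.v3" // illustration only; zrepl uses its own yaml fork
)

// mirrors of the structs above, with plain tags for this sketch
type sendOptions struct {
	Encrypted bool                 `yaml:"encrypted"`
	StepHolds sendOptionsStepHolds `yaml:"step_holds"`
}

type sendOptionsStepHolds struct {
	DisableIncremental bool `yaml:"disable_incremental"`
}

func main() {
	doc := []byte("encrypted: true\nstep_holds:\n  disable_incremental: true\n")
	var o sendOptions
	if err := yaml.Unmarshal(doc, &o); err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", o) // => {Encrypted:true StepHolds:{DisableIncremental:true}}
}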


@@ -154,6 +154,7 @@ func modePushFromConfig(g *config.Global, in *config.PushJob, jobID endpoint.Job
m.senderConfig = &endpoint.SenderConfig{
FSF: fsf,
Encrypt: &zfs.NilBool{B: in.Send.Encrypted},
DisableIncrementalStepHolds: in.Send.StepHolds.DisableIncremental,
JobID: jobID,
}
m.plannerPolicy = &logic.PlannerPolicy{


@@ -81,6 +81,7 @@ func modeSourceFromConfig(g *config.Global, in *config.SourceJob, jobID endpoint
m.senderConfig = &endpoint.SenderConfig{
FSF: fsf,
Encrypt: &zfs.NilBool{B: in.Send.Encrypted},
DisableIncrementalStepHolds: in.Send.StepHolds.DisableIncremental,
JobID: jobID,
}


@@ -175,6 +175,8 @@ func (j *SnapJob) doPrune(ctx context.Context) {
FSF: j.fsfilter,
// FIXME encryption setting is irrelevant for SnapJob because the endpoint is only used as pruner.Target
Encrypt: &zfs.NilBool{B: true},
// FIXME DisableIncrementalStepHolds setting is irrelevant for SnapJob because the endpoint is only used as pruner.Target
DisableIncrementalStepHolds: false,
})
j.pruner = j.prunerFactory.BuildLocalPruner(ctx, sender, alwaysUpToDateReplicationCursorHistory{sender})
log.Info("start pruning")


@@ -16,6 +16,8 @@ Send Options
filesystems: ...
send:
  encrypted: true
  step_holds:
    disable_incremental: false
...
:ref:`Source<job-source>` and :ref:`push<job-push>` jobs have an optional ``send`` configuration section.
@@ -34,6 +36,17 @@ Filesystems matched by ``filesystems`` that are not encrypted are not sent and w
If ``encryption=false``, zrepl expects that filesystems matching ``filesystems`` are not encrypted or have loaded encryption keys.
``step_holds.disable_incremental`` option
-----------------------------------------
The ``step_holds.disable_incremental`` option controls whether the creation of :ref:`step holds <step-holds-and-bookmarks>` is disabled for incremental replication.
The default value is ``false``.
Disabling step holds has the disadvantage that steps :ref:`might not be resumable <step-holds-and-bookmarks>` if interrupted.
However, the advantage of this flag, and its :issue:`reason for existence <288>`, is that it allows the pruner to delete snapshots of interrupted replication steps.
This is useful if replication happens so rarely (or fails so frequently) that the amount of disk space exclusively referenced by the step's snapshots becomes intolerable.
.. _job-recv-options:
Recv Options


@@ -23,6 +23,7 @@ import (
type SenderConfig struct {
FSF zfs.DatasetFilter
Encrypt *zfs.NilBool
DisableIncrementalStepHolds bool
JobID JobID
}
@@ -41,6 +42,7 @@ func (c *SenderConfig) Validate() error {
type Sender struct {
FSFilter zfs.DatasetFilter
encrypt *zfs.NilBool
disableIncrementalStepHolds bool
jobId JobID
}
@@ -51,6 +53,7 @@ func NewSender(conf SenderConfig) *Sender {
return &Sender{
FSFilter: conf.FSF,
encrypt: conf.Encrypt,
disableIncrementalStepHolds: conf.DisableIncrementalStepHolds,
jobId: conf.JobID,
}
}
@@ -221,9 +224,11 @@ func (s *Sender) Send(ctx context.Context, r *pdu.SendReq) (*pdu.SendRes, io.Rea
}
}
takeStepHolds := sendArgs.FromVersion == nil || !s.disableIncrementalStepHolds
var fromHold, toHold Abstraction
// make sure `From` doesn't go away in order to make this step resumable
if sendArgs.From != nil {
if sendArgs.From != nil && takeStepHolds {
fromHold, err = HoldStep(ctx, sendArgs.FS, *sendArgs.FromVersion, s.jobId) // no shadow
if err == zfs.ErrBookmarkCloningNotSupported {
getLogger(ctx).Debug("not creating step bookmark because ZFS does not support it")
@@ -232,11 +237,13 @@ func (s *Sender) Send(ctx context.Context, r *pdu.SendReq) (*pdu.SendRes, io.Rea
return nil, nil, errors.Wrapf(err, "cannot hold `from` version %q before starting send", *sendArgs.FromVersion)
}
}
if takeStepHolds {
// make sure `To` doesn't go away in order to make this step resumable
toHold, err = HoldStep(ctx, sendArgs.FS, sendArgs.ToVersion, s.jobId)
if err != nil {
return nil, nil, errors.Wrapf(err, "cannot hold `to` version %q before starting send", sendArgs.ToVersion)
}
}
// cleanup the mess that _this function_ might have created in prior failed attempts:
//
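
The hold logic above reduces to a single predicate: full sends (no FromVersion) always get step holds, while incrementals get them only as long as the new option stays false. A standalone restatement of that predicate (not the actual zrepl code):

package main

import "fmt"

// takeStepHolds restates the predicate from the diff above:
// sendArgs.FromVersion == nil || !s.disableIncrementalStepHolds
func takeStepHolds(isFullSend, disableIncrementalStepHolds bool) bool {
	return isFullSend || !disableIncrementalStepHolds
}

func main() {
	fmt.Println(takeStepHolds(true, true))   // full send: true (option ignored)
	fmt.Println(takeStepHolds(false, false)) // incremental, default: true
	fmt.Println(takeStepHolds(false, true))  // incremental, disabled: false
}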
@@ -254,11 +261,11 @@
//
// Note further that a resuming send, due to the idempotent nature of func CreateReplicationCursor and HoldStep,
// will never lose its step holds because we just (idempotently re-)created them above, before attempting the cleanup.
liveAbs := []Abstraction{fromHold, toHold, fromReplicationCursor}
func() {
ctx, endSpan := trace.WithSpan(ctx, "cleanup-stale-abstractions")
defer endSpan()
liveAbs := []Abstraction{fromHold, toHold, fromReplicationCursor}
keep := func(a Abstraction) (keep bool) {
keep = false
for _, k := range liveAbs {
@@ -283,13 +290,11 @@
}
sendAbstractionsCacheSingleton.TryBatchDestroy(ctx, s.jobId, sendArgs.FS, keep, check)
}()
if fromHold != nil {
sendAbstractionsCacheSingleton.Put(fromHold)
// now add the newly created abstractions to the cleaned-up cache
for _, a := range liveAbs {
if a != nil {
sendAbstractionsCacheSingleton.Put(a)
}
sendAbstractionsCacheSingleton.Put(toHold)
if fromReplicationCursor != nil {
sendAbstractionsCacheSingleton.Put(fromReplicationCursor)
}
sendStream, err := zfs.ZFSSend(ctx, sendArgs)
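
The refactor above gathers all potentially-nil live abstractions into one liveAbs slice and uses it twice: as the keep filter for TryBatchDestroy and for re-populating the cache. A reduced sketch of that keep-filter pattern, with a hypothetical string-typed stand-in for Abstraction:

package main

import "fmt"

// abstraction is a hypothetical stand-in for endpoint.Abstraction
type abstraction string

// keepFilter mimics the keep closure above: only members of live survive
// the batch destroy; nil-ish (empty) slots in live are skipped
func keepFilter(live []abstraction) func(abstraction) bool {
	return func(a abstraction) bool {
		for _, k := range live {
			if k != "" && a == k {
				return true
			}
		}
		return false
	}
}

func main() {
	// as in the diff: fromHold may be absent (e.g. full send), hence the empty slot
	live := []abstraction{"", "toHold", "fromReplicationCursor"}
	keep := keepFilter(live)
	fmt.Println(keep("toHold"))    // true: still needed by this step
	fmt.Println(keep("staleHold")) // false: eligible for batch destroy
}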


@@ -19,7 +19,8 @@ var Cases = []Case{BatchDestroy,
ReplicationIncrementalCleansUpStaleAbstractionsWithCacheOnSecondReplication,
ReplicationIncrementalCleansUpStaleAbstractionsWithoutCacheOnSecondReplication,
ReplicationIncrementalIsPossibleIfCommonSnapshotIsDestroyed,
ReplicationIsResumableFullSend,
ReplicationIsResumableFullSend__DisableIncrementalStepHolds_False,
ReplicationIsResumableFullSend__DisableIncrementalStepHolds_True,
ResumableRecvAndTokenHandling,
ResumeTokenParsing,
SendArgsValidationEncryptedSendOfUnencryptedDatasetForbidden,


@@ -30,6 +30,7 @@ type replicationInvocation struct {
sfs string
rfsRoot string
interceptSender func(e *endpoint.Sender) logic.Sender
disableIncrementalStepHolds bool
}
func (i replicationInvocation) Do(ctx *platformtest.Context) *report.Report {
@@ -44,6 +45,7 @@ func (i replicationInvocation) Do(ctx *platformtest.Context) *report.Report {
sender := i.interceptSender(endpoint.NewSender(endpoint.SenderConfig{
FSF: sfilter.AsFilter(),
Encrypt: &zfs.NilBool{B: false},
DisableIncrementalStepHolds: i.disableIncrementalStepHolds,
JobID: i.sjid,
}))
receiver := endpoint.NewReceiver(endpoint.ReceiverConfig{
@@ -90,6 +92,7 @@ func ReplicationIncrementalIsPossibleIfCommonSnapshotIsDestroyed(ctx *platformte
rjid: rjid,
sfs: sfs,
rfsRoot: rfsRoot,
disableIncrementalStepHolds: false,
}
rfs := rep.ReceiveSideFilesystem()
@@ -153,6 +156,7 @@ func implReplicationIncrementalCleansUpStaleAbstractions(ctx *platformtest.Conte
rjid: rjid,
sfs: sfs,
rfsRoot: rfsRoot,
disableIncrementalStepHolds: false,
}
rfs := rep.ReceiveSideFilesystem()
@@ -322,7 +326,15 @@ func (s *PartialSender) Send(ctx context.Context, r *pdu.SendReq) (r1 *pdu.SendR
return r1, r2, r3
}
func ReplicationIsResumableFullSend(ctx *platformtest.Context) {
func ReplicationIsResumableFullSend__DisableIncrementalStepHolds_False(ctx *platformtest.Context) {
implReplicationIsResumableFullSend(ctx, false)
}
func ReplicationIsResumableFullSend__DisableIncrementalStepHolds_True(ctx *platformtest.Context) {
implReplicationIsResumableFullSend(ctx, true)
}
func implReplicationIsResumableFullSend(ctx *platformtest.Context, disableIncrementalStepHolds bool) {
platformtest.Run(ctx, platformtest.PanicErr, ctx.RootDataset, `
CREATEROOT
@@ -353,6 +365,7 @@ func ReplicationIsResumableFullSend(ctx *platformtest.Context) {
interceptSender: func(e *endpoint.Sender) logic.Sender {
return &PartialSender{Sender: e, failAfterByteCount: 1 << 20}
},
disableIncrementalStepHolds: disableIncrementalStepHolds,
}
rfs := rep.ReceiveSideFilesystem()
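
The test rename above is plain parametrization: one shared implementation, plus two named wrappers so that each value of the new flag shows up as its own entry in the Cases list. A minimal standalone version of the pattern (hypothetical names, not the platformtest harness):

package main

import "fmt"

// implResumableFullSend is the shared body, parametrized over the new flag
func implResumableFullSend(disableIncrementalStepHolds bool) {
	fmt.Println("disableIncrementalStepHolds =", disableIncrementalStepHolds)
}

// two named wrappers, one per flag value, mirroring the Cases entries
func ResumableFullSend_False() { implResumableFullSend(false) }
func ResumableFullSend_True()  { implResumableFullSend(true) }

func main() {
	ResumableFullSend_False()
	ResumableFullSend_True()
}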