bookmarking: prune policy for bookmarks

refs #34
This commit is contained in:
Christian Schwarz 2018-02-17 20:48:31 +01:00
parent 8e34843eb1
commit aa92261ea7
15 changed files with 149 additions and 48 deletions

View File

@ -50,7 +50,7 @@ func (a *IntervalAutosnap) findSyncPoint(fss []*zfs.DatasetPath) (syncPoint time
l := a.task.Log().WithField(logFSField, d.ToString())
fsvs, err := zfs.ZFSListFilesystemVersions(d, NewTypedPrefixFilter(a.Prefix, zfs.Snapshot))
fsvs, err := zfs.ZFSListFilesystemVersions(d, NewPrefixFilter(a.Prefix))
if err != nil {
l.WithError(err).Error("cannot list filesystem versions")
continue

View File

@ -59,6 +59,8 @@ type SSHStdinServerConnectDescr struct {
}
type PrunePolicy interface {
// Prune filters versions and decide which to keep and which to remove.
// Prune **does not** implement the actual removal of the versions.
Prune(fs *zfs.DatasetPath, versions []zfs.FilesystemVersion) (keep, remove []zfs.FilesystemVersion, err error)
}

View File

@ -63,11 +63,11 @@ func parseLocalJob(c JobParsingContext, name string, i map[string]interface{}) (
return
}
if j.PruneLHS, err = parsePrunePolicy(asMap.PruneLHS); err != nil {
if j.PruneLHS, err = parsePrunePolicy(asMap.PruneLHS, true); err != nil {
err = errors.Wrap(err, "cannot parse 'prune_lhs'")
return
}
if j.PruneRHS, err = parsePrunePolicy(asMap.PruneRHS); err != nil {
if j.PruneRHS, err = parsePrunePolicy(asMap.PruneRHS, false); err != nil {
err = errors.Wrap(err, "cannot parse 'prune_rhs'")
return
}

View File

@ -77,7 +77,7 @@ func parsePullJob(c JobParsingContext, name string, i map[string]interface{}) (j
return
}
if j.Prune, err = parsePrunePolicy(asMap.Prune); err != nil {
if j.Prune, err = parsePrunePolicy(asMap.Prune, false); err != nil {
err = errors.Wrap(err, "cannot parse prune policy")
return
}

View File

@ -59,7 +59,7 @@ func parseSourceJob(c JobParsingContext, name string, i map[string]interface{})
return
}
if j.Prune, err = parsePrunePolicy(asMap.Prune); err != nil {
if j.Prune, err = parsePrunePolicy(asMap.Prune, true); err != nil {
err = errors.Wrap(err, "cannot parse 'prune'")
return
}

View File

@ -220,7 +220,7 @@ err:
return
}
func parsePrunePolicy(v map[string]interface{}) (p PrunePolicy, err error) {
func parsePrunePolicy(v map[string]interface{}, willSeeBookmarks bool) (p PrunePolicy, err error) {
policyName, err := extractStringField(v, "policy", true)
if err != nil {
@ -229,14 +229,13 @@ func parsePrunePolicy(v map[string]interface{}) (p PrunePolicy, err error) {
switch policyName {
case "grid":
return parseGridPrunePolicy(v)
return parseGridPrunePolicy(v, willSeeBookmarks)
case "noprune":
return NoPrunePolicy{}, nil
default:
err = errors.Errorf("unknown policy '%s'", policyName)
return
}
}
func parseAuthenticatedChannelListenerFactory(c JobParsingContext, v map[string]interface{}) (p AuthenticatedChannelListenerFactory, err error) {

View File

@ -6,6 +6,7 @@ import (
"github.com/pkg/errors"
"github.com/zrepl/zrepl/util"
"github.com/zrepl/zrepl/zfs"
"math"
"regexp"
"sort"
"strconv"
@ -15,8 +16,11 @@ import (
type GridPrunePolicy struct {
RetentionGrid *util.RetentionGrid
MaxBookmarks int
}
const GridPrunePolicyMaxBookmarksKeepAll = -1
type retentionGridAdaptor struct {
zfs.FilesystemVersion
}
@ -29,12 +33,27 @@ func (a retentionGridAdaptor) LessThan(b util.RetentionGridEntry) bool {
return a.CreateTXG < b.(retentionGridAdaptor).CreateTXG
}
// Prune filters snapshots with the retention grid.
// Bookmarks are deleted such that KeepBookmarks are kept in the end.
// The oldest bookmarks are removed first.
func (p *GridPrunePolicy) Prune(_ *zfs.DatasetPath, versions []zfs.FilesystemVersion) (keep, remove []zfs.FilesystemVersion, err error) {
skeep, sremove := p.pruneSnapshots(versions)
keep, remove = p.pruneBookmarks(skeep)
remove = append(remove, sremove...)
return keep, remove, nil
}
func (p *GridPrunePolicy) pruneSnapshots(versions []zfs.FilesystemVersion) (keep, remove []zfs.FilesystemVersion) {
// Build adaptors for retention grid
adaptors := make([]util.RetentionGridEntry, len(versions))
keep = []zfs.FilesystemVersion{}
adaptors := make([]util.RetentionGridEntry, 0)
for fsv := range versions {
adaptors[fsv] = retentionGridAdaptor{versions[fsv]}
if versions[fsv].Type != zfs.Snapshot {
keep = append(keep, versions[fsv])
continue
}
adaptors = append(adaptors, retentionGridAdaptor{versions[fsv]})
}
sort.SliceStable(adaptors, func(i, j int) bool {
@ -46,9 +65,8 @@ func (p *GridPrunePolicy) Prune(_ *zfs.DatasetPath, versions []zfs.FilesystemVer
keepa, removea := p.RetentionGrid.FitEntries(now, adaptors)
// Revert adaptors
keep = make([]zfs.FilesystemVersion, len(keepa))
for i := range keepa {
keep[i] = keepa[i].(retentionGridAdaptor).FilesystemVersion
keep = append(keep, keepa[i].(retentionGridAdaptor).FilesystemVersion)
}
remove = make([]zfs.FilesystemVersion, len(removea))
for i := range removea {
@ -58,20 +76,60 @@ func (p *GridPrunePolicy) Prune(_ *zfs.DatasetPath, versions []zfs.FilesystemVer
}
func parseGridPrunePolicy(e map[string]interface{}) (p *GridPrunePolicy, err error) {
func (p *GridPrunePolicy) pruneBookmarks(versions []zfs.FilesystemVersion) (keep, remove []zfs.FilesystemVersion) {
var i struct {
Grid string
if p.MaxBookmarks == GridPrunePolicyMaxBookmarksKeepAll {
return versions, []zfs.FilesystemVersion{}
}
if err = mapstructure.Decode(e, &i); err != nil {
keep = []zfs.FilesystemVersion{}
bookmarks := make([]zfs.FilesystemVersion, 0)
for fsv := range versions {
if versions[fsv].Type != zfs.Bookmark {
keep = append(keep, versions[fsv])
continue
}
bookmarks = append(bookmarks, versions[fsv])
}
if len(bookmarks) == 0 {
return keep, []zfs.FilesystemVersion{}
}
if len(bookmarks) < p.MaxBookmarks {
keep = append(keep, bookmarks...)
return keep, []zfs.FilesystemVersion{}
}
// NOTE: sorting descending by descending by createtxg <=> sorting ascending wrt creation time
sort.SliceStable(bookmarks, func(i, j int) bool {
return (bookmarks[i].CreateTXG > bookmarks[j].CreateTXG)
})
keep = append(keep, bookmarks[:p.MaxBookmarks]...)
remove = bookmarks[p.MaxBookmarks:]
return keep, remove
}
func parseGridPrunePolicy(e map[string]interface{}, willSeeBookmarks bool) (p *GridPrunePolicy, err error) {
const KeepBookmarksAllString = "all"
var i struct {
Grid string
KeepBookmarks string `mapstructure:"keep_bookmarks"`
}
dec, err := mapstructure.NewDecoder(&mapstructure.DecoderConfig{Result: &i, WeaklyTypedInput: true})
if err != nil {
err = errors.Wrap(err, "mapstructure error")
return
}
if err = dec.Decode(e); err != nil {
err = errors.Wrapf(err, "mapstructure error")
return
}
p = &GridPrunePolicy{}
// Parse grid policy
// Parse grid
intervals, err := parseRetentionGridIntervalsString(i.Grid)
if err != nil {
err = fmt.Errorf("cannot parse retention grid: %s", err)
@ -97,9 +155,22 @@ func parseGridPrunePolicy(e map[string]interface{}) (p *GridPrunePolicy, err err
lastDuration = intervals[i].Length
}
p.RetentionGrid = util.NewRetentionGrid(intervals)
return
// Parse KeepBookmarks
keepBookmarks := 0
if i.KeepBookmarks == KeepBookmarksAllString || (i.KeepBookmarks == "" && !willSeeBookmarks) {
keepBookmarks = GridPrunePolicyMaxBookmarksKeepAll
} else {
i, err := strconv.ParseInt(i.KeepBookmarks, 10, 32)
if err != nil || i <= 0 || i > math.MaxInt32 {
return nil, errors.Errorf("keep_bookmarks must be positive integer or 'all'")
}
keepBookmarks = int(i)
}
return &GridPrunePolicy{
util.NewRetentionGrid(intervals),
keepBookmarks,
}, nil
}
var retentionStringIntervalRegex *regexp.Regexp = regexp.MustCompile(`^\s*(\d+)\s*x\s*([^\(]+)\s*(\((.*)\))?\s*$`)

View File

@ -43,9 +43,8 @@ func (p *Pruner) filterVersions(fs *zfs.DatasetPath) (fsversions []zfs.Filesyste
defer p.task.Finish()
log := p.task.Log().WithField(logFSField, fs.ToString())
// only prune snapshots, bookmarks are kept forever
snapshotFilter := NewTypedPrefixFilter(p.SnapshotPrefix, zfs.Snapshot)
fsversions, err := zfs.ZFSListFilesystemVersions(fs, snapshotFilter)
filter := NewPrefixFilter(p.SnapshotPrefix)
fsversions, err := zfs.ZFSListFilesystemVersions(fs, filter)
if err != nil {
log.WithError(err).Error("error listing filesytem versions")
return nil, true

View File

@ -19,6 +19,7 @@ jobs:
prune_lhs:
policy: grid
grid: 1x1h(keep=all)
keep_bookmarks: all
# follow a grandfathering scheme for filesystems on the right-hand-side of the mapping
prune_rhs:

View File

@ -37,9 +37,11 @@ jobs:
interval: 10m
# keep a one day window 10m interval snapshots in case pull doesn't work (link down, etc)
# (we cannot keep more than one day because this host will run out of disk space)
# keep 1 hour of snapshots (6 at 10m interval)
# and one day of bookmarks in case pull doesn't work (link down, etc)
# => keep_bookmarks = 24h / interval = 24h / 10m = 144
prune:
policy: grid
grid: 1x1d(keep=all)
grid: 1x1h(keep=all)
keep_bookmarks: 144

View File

@ -30,4 +30,4 @@ jobs:
prune:
policy: grid
grid: 1x10s(keep=all)
keep_bookmarks: all

View File

@ -1,5 +1,7 @@
.. |break_config| replace:: **[BREAK]**
.. |break| replace:: **[BREAK]**
.. |bugfix| replace:: [BUG]
.. |feature| replace:: [FEATURE]
Changelog
=========
@ -7,6 +9,16 @@ Changelog
The changelog summarized bugfixes that are deemed relevant for users.
Developers should consult the git commit log or GitHub issue tracker.
0.0.3
-----
* |break_config| |feature| :issue:`34`: automatic bookmarking of snapshots
* Snapshots are automatically bookmarked and pruning of bookmarks **must** be configured.
* This breaks existing configuration: ``grid`` :ref:`prune policy <prune-retention-grid>` specifications require the new ``keep_bookmarks`` parameter.
* Make sure to understand the meaning bookmarks have for :ref:`maximum replication downtime <replication-downtime>`.
* Example: :sampleconf:`pullbackup/productionhost.yml`
0.0.2
-----

View File

@ -48,14 +48,14 @@ Example: :sampleconf:`pullbackup/productionhost.yml`.
* - ``interval``
- snapshotting interval
* - ``prune``
- |prune| policy for filesytems in ``filesystems`` with prefix ``snapshot_prefix``
- |prune| for versions of filesytems in ``filesystems``, versions prefixed with ``snapshot_prefix``
- Snapshotting Task (every ``interval``, |patient|)
- A snapshot of filesystems matched by ``filesystems`` is taken every ``interval`` with prefix ``snapshot_prefix``.
- A bookmark of that snapshot is created with the same name.
- The ``prune`` policy is triggered on filesystems matched by ``filesystems`` with snapshots matched by ``snapshot_prefix``.
- The ``prune`` policy is evaluated for versions of filesystems matched by ``filesystems``, versions prefixed with ``snapshot_prefix``.
- Serve Task
@ -65,12 +65,6 @@ A source job is the counterpart to a :ref:`job-pull`.
Make sure you read the |prune| policy documentation.
Note that zrepl does not prune bookmarks due to the following reason:
a pull job may stop replication due to link failure, misconfiguration or administrative action.
The source prune policy will eventually destroy the last common snapshot between source and pull job.
Without bookmarks, the prune policy would need to perform full replication again.
With bookmarks, we can resume incremental replication, only losing the snapshots pruned since the outage.
.. _job-pull:
Pull Job
@ -99,7 +93,7 @@ Example: :sampleconf:`pullbackup/backuphost.yml`
* - ``snapshot_prefix``
- prefix snapshots must match to be considered for replication & pruning
* - ``prune``
- |prune| policy for local filesystems reachable by ``mapping``
- |prune| policy for versions of filesystems of local filesystems reachable by ``mapping``, versions prefixed with ``snapshot_prefix``
* Main Task (every ``interval``, |patient|)
@ -112,10 +106,11 @@ Example: :sampleconf:`pullbackup/backuphost.yml`
#. If the local target filesystem does not exist, ``initial_repl_policy`` is used.
#. On conflicts, an error is logged but replication of other filesystems with mapping continues.
#. The ``prune`` policy is triggered for all *target filesystems*
#. The ``prune`` policy is evaluated for all *target filesystems*
A pull job is the counterpart to a :ref:`job-source`.
Make sure you read the |prune| policy documentation.
.. _job-local:
@ -163,8 +158,6 @@ Example: :sampleconf:`localbackup/host1.yml`
#. The ``prune_rhs`` policy is triggered for all *target filesystems*
A local job is combination of source & pull job executed on the same machine.
Note that while snapshots are pruned, bookmarks are not pruned and kept around forever.
Refer to the comments on :ref:`source job <job-source>` for the reasoning behind this.
Terminology
-----------
@ -188,3 +181,7 @@ patient task
* waits for the last invocation to finish
* logs a warning with the effective task duration
* immediately starts a new invocation of the task
filesystem version
A snapshot or a bookmark.

View File

@ -3,9 +3,9 @@
Pruning Policies
================
In zrepl, *pruning* means *destroying snapshots by some policy*.
In zrepl, *pruning* means *destroying filesystem versions by some policy* where filesystem versions are bookmarks and snapshots.
A *pruning policy* takes a list of snapshots and -- for each snapshot -- decides whether it should be kept or destroyed.
A *pruning policy* takes a list of filesystem versions and decides for each whether it should be kept or destroyed.
The job context defines which snapshots are even considered for pruning, for example through the ``snapshot_prefix`` variable.
Check the respective :ref:`job definition <job>` for details.
@ -25,6 +25,7 @@ Retention Grid
jobs:
- name: pull_app-srv
type: pull
...
prune:
policy: grid
@ -34,6 +35,15 @@ Retention Grid
└─ 24 adjacent one-hour intervals
- name: pull_backup
type: source
interval: 10m
prune:
policy: grid
grid: 1x1d(keep=all)
keep_bookmarks: 144
The retention grid can be thought of as a time-based sieve:
The ``grid`` field specifies a list of adjacent time intervals:
the left edge of the leftmost (first) interval is the ``creation`` date of the youngest snapshot.
@ -43,6 +53,11 @@ Each interval carries a maximum number of snapshots to keep.
It is secified via ``(keep=N)``, where ``N`` is either ``all`` (all snapshots are kept) or a positive integer.
The default value is **1**.
Bookmarks are not affected by the above.
Instead, the ``keep_bookmarks`` field specifies the number of bookmarks to be kept per filesystem.
You only need to specify ``keep_bookmarks`` at the source-side of a replication setup since the destination side does not receive bookmarks.
You can specify ``all`` as a value to keep all bookmarks, but be warned that you should install some other way to prune unneeded ones then (see below).
The following procedure happens during pruning:
#. The list of snapshots eligible for pruning is sorted by ``creation``
@ -54,14 +69,16 @@ The following procedure happens during pruning:
#. the contained snapshot list is sorted by creation.
#. snapshots from the list, oldest first, are destroyed until the specified ``keep`` count is reached.
#. all remaining snapshots on the list are kept.
#. The list of bookmarks eligible for pruning is sorted by ``createtxg`` and the most recent ``keep_bookmarks`` bookmarks are kept.
.. _replication-downtime:
.. ATTENTION::
The configuration of the first interval (``1x1h(keep=all)`` in the example) determines the **maximum allowable replication lag** because the source and destination pruning policies do not coordinate:
if replication does not work for whatever reason, source will continue to execute the prune policy.
Eventually, source destroys a snapshot that has never been replicated to destination, degrading the temporal resolution of your backup.
Be aware that ``keep_bookmarks x interval`` (interval of the job level) controls the **maximum allowable replication downtime** between source and destination.
If replication does not work for whatever reason, source and destination will eventually run out of sync because the source will continue pruning snapshots.
The only recovery in that case is full replication, which may not always be viable due to disk space or traffic constraints.
Thus, **always** configure the first interval to ``1x?(keep=all)``, substituting ``?`` with the maximum time replication may fail due to downtimes, maintenance, connectivity issues, etc.
.. We intentionally do not mention that bookmarks are used to bridge the gap between source and dest that are out of sync snapshot-wise. This is an implementation detail.
Further note that while bookmarks consume a constant amount of disk space, listing them requires temporary dynamic **kernel memory** proportional to the number of bookmarks.
Thus, do not use ``all`` or an inappropriately high value without good reason.

View File

@ -119,6 +119,7 @@ We define a corresponding **source job** named ``pull_backup`` in the |mainconfi
prune:
policy: grid
grid: 1x1d(keep=all)
keep_bookmarks: 144
The ``serve`` section corresponds to the ``connect`` section in the configuration of ``backup-srv``.