Mirror of https://github.com/zrepl/zrepl.git (synced 2024-12-22 23:20:51 +01:00)
commit aa92261ea7 (parent 8e34843eb1)
@@ -50,7 +50,7 @@ func (a *IntervalAutosnap) findSyncPoint(fss []*zfs.DatasetPath) (syncPoint time
         l := a.task.Log().WithField(logFSField, d.ToString())
 
-        fsvs, err := zfs.ZFSListFilesystemVersions(d, NewTypedPrefixFilter(a.Prefix, zfs.Snapshot))
+        fsvs, err := zfs.ZFSListFilesystemVersions(d, NewPrefixFilter(a.Prefix))
         if err != nil {
             l.WithError(err).Error("cannot list filesystem versions")
             continue
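The hunk above replaces the snapshot-only filter with a plain prefix filter. A self-contained sketch of the assumed semantics (stand-in types, not zrepl's actual NewPrefixFilter/NewTypedPrefixFilter implementations): the typed filter matched snapshots only, while a prefix filter also matches bookmarks, which is required once bookmarks take part in pruning.

package main

import (
    "fmt"
    "strings"
)

// Stand-in version record; zrepl's zfs.FilesystemVersion carries more fields.
type version struct {
    Name string
    Type string // "snapshot" or "bookmark"
}

// prefixFilter mirrors what NewPrefixFilter is assumed to do: match any version type.
func prefixFilter(v version, prefix string) bool {
    return strings.HasPrefix(v.Name, prefix)
}

// typedPrefixFilter mirrors the old NewTypedPrefixFilter: match one version type only.
func typedPrefixFilter(v version, prefix, typ string) bool {
    return v.Type == typ && strings.HasPrefix(v.Name, prefix)
}

func main() {
    b := version{Name: "zrepl_2017-09-01", Type: "bookmark"}
    fmt.Println(prefixFilter(b, "zrepl_"))                  // true: bookmarks are now listed
    fmt.Println(typedPrefixFilter(b, "zrepl_", "snapshot")) // false: the old behaviour skipped them
}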
@@ -59,6 +59,8 @@ type SSHStdinServerConnectDescr struct {
 }
 
 type PrunePolicy interface {
+    // Prune filters versions and decide which to keep and which to remove.
+    // Prune **does not** implement the actual removal of the versions.
     Prune(fs *zfs.DatasetPath, versions []zfs.FilesystemVersion) (keep, remove []zfs.FilesystemVersion, err error)
 }
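The PrunePolicy interface above only decides which versions to keep; destroying the remove set stays with the caller. A minimal, self-contained sketch of that split, using local stand-in types rather than zrepl's zfs package:

package main

import "fmt"

// Local stand-ins mirroring the shapes used by the interface; illustration only.
type DatasetPath string
type FilesystemVersion struct{ Name string }

type PrunePolicy interface {
    Prune(fs DatasetPath, versions []FilesystemVersion) (keep, remove []FilesystemVersion, err error)
}

// keepNone is a toy policy that removes everything, just to drive the sketch.
type keepNone struct{}

func (keepNone) Prune(fs DatasetPath, vs []FilesystemVersion) (keep, remove []FilesystemVersion, err error) {
    return nil, vs, nil
}

func main() {
    var p PrunePolicy = keepNone{}
    _, remove, err := p.Prune("pool/backup", []FilesystemVersion{{"zrepl_a"}, {"zrepl_b"}})
    if err != nil {
        panic(err)
    }
    for _, v := range remove {
        fmt.Println("caller would destroy", v.Name) // the policy decides, the caller destroys
    }
}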
@@ -63,11 +63,11 @@ func parseLocalJob(c JobParsingContext, name string, i map[string]interface{}) (
         return
     }
 
-    if j.PruneLHS, err = parsePrunePolicy(asMap.PruneLHS); err != nil {
+    if j.PruneLHS, err = parsePrunePolicy(asMap.PruneLHS, true); err != nil {
         err = errors.Wrap(err, "cannot parse 'prune_lhs'")
         return
     }
-    if j.PruneRHS, err = parsePrunePolicy(asMap.PruneRHS); err != nil {
+    if j.PruneRHS, err = parsePrunePolicy(asMap.PruneRHS, false); err != nil {
         err = errors.Wrap(err, "cannot parse 'prune_rhs'")
         return
     }
@@ -77,7 +77,7 @@ func parsePullJob(c JobParsingContext, name string, i map[string]interface{}) (j
         return
     }
 
-    if j.Prune, err = parsePrunePolicy(asMap.Prune); err != nil {
+    if j.Prune, err = parsePrunePolicy(asMap.Prune, false); err != nil {
         err = errors.Wrap(err, "cannot parse prune policy")
         return
     }
@@ -59,7 +59,7 @@ func parseSourceJob(c JobParsingContext, name string, i map[string]interface{})
         return
     }
 
-    if j.Prune, err = parsePrunePolicy(asMap.Prune); err != nil {
+    if j.Prune, err = parsePrunePolicy(asMap.Prune, true); err != nil {
         err = errors.Wrap(err, "cannot parse 'prune'")
         return
     }
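Taken together, the parser changes above thread a new boolean through to parsePrunePolicy. The summary below is an inference from these call sites, not an authoritative statement: the flag appears to indicate whether the policy will ever see bookmarks, i.e. whether keep_bookmarks must be configured explicitly.

package main

import "fmt"

func main() {
    // Inferred from the parse*Job calls in this commit: which prune policies
    // will see bookmarks on the filesystems they operate on.
    willSeeBookmarks := map[string]bool{
        "local job, prune_lhs": true,  // the snapshotting side creates bookmarks
        "local job, prune_rhs": false, // the receiving side never gets bookmarks
        "pull job, prune":      false,
        "source job, prune":    true,
    }
    for side, b := range willSeeBookmarks {
        fmt.Printf("%-22s willSeeBookmarks=%v\n", side, b)
    }
}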
@@ -220,7 +220,7 @@ err:
     return
 }
 
-func parsePrunePolicy(v map[string]interface{}) (p PrunePolicy, err error) {
+func parsePrunePolicy(v map[string]interface{}, willSeeBookmarks bool) (p PrunePolicy, err error) {
 
     policyName, err := extractStringField(v, "policy", true)
     if err != nil {
@@ -229,14 +229,13 @@ func parsePrunePolicy(v map[string]interface{}) (p PrunePolicy, err error) {
 
     switch policyName {
     case "grid":
-        return parseGridPrunePolicy(v)
+        return parseGridPrunePolicy(v, willSeeBookmarks)
     case "noprune":
         return NoPrunePolicy{}, nil
     default:
         err = errors.Errorf("unknown policy '%s'", policyName)
         return
     }
 
 }
 
 func parseAuthenticatedChannelListenerFactory(c JobParsingContext, v map[string]interface{}) (p AuthenticatedChannelListenerFactory, err error) {
@@ -6,6 +6,7 @@ import (
     "github.com/pkg/errors"
     "github.com/zrepl/zrepl/util"
     "github.com/zrepl/zrepl/zfs"
+    "math"
     "regexp"
     "sort"
     "strconv"
@@ -15,8 +16,11 @@ import (
 
 type GridPrunePolicy struct {
     RetentionGrid *util.RetentionGrid
+    MaxBookmarks  int
 }
 
+const GridPrunePolicyMaxBookmarksKeepAll = -1
+
 type retentionGridAdaptor struct {
     zfs.FilesystemVersion
 }
@@ -29,12 +33,27 @@ func (a retentionGridAdaptor) LessThan(b util.RetentionGridEntry) bool {
     return a.CreateTXG < b.(retentionGridAdaptor).CreateTXG
 }
 
+// Prune filters snapshots with the retention grid.
+// Bookmarks are deleted such that KeepBookmarks are kept in the end.
+// The oldest bookmarks are removed first.
 func (p *GridPrunePolicy) Prune(_ *zfs.DatasetPath, versions []zfs.FilesystemVersion) (keep, remove []zfs.FilesystemVersion, err error) {
+    skeep, sremove := p.pruneSnapshots(versions)
+    keep, remove = p.pruneBookmarks(skeep)
+    remove = append(remove, sremove...)
+    return keep, remove, nil
+}
+
+func (p *GridPrunePolicy) pruneSnapshots(versions []zfs.FilesystemVersion) (keep, remove []zfs.FilesystemVersion) {
 
     // Build adaptors for retention grid
-    adaptors := make([]util.RetentionGridEntry, len(versions))
+    keep = []zfs.FilesystemVersion{}
+    adaptors := make([]util.RetentionGridEntry, 0)
     for fsv := range versions {
-        adaptors[fsv] = retentionGridAdaptor{versions[fsv]}
+        if versions[fsv].Type != zfs.Snapshot {
+            keep = append(keep, versions[fsv])
+            continue
+        }
+        adaptors = append(adaptors, retentionGridAdaptor{versions[fsv]})
     }
 
     sort.SliceStable(adaptors, func(i, j int) bool {
@@ -46,9 +65,8 @@ func (p *GridPrunePolicy) Prune(_ *zfs.DatasetPath, versions []zfs.FilesystemVer
     keepa, removea := p.RetentionGrid.FitEntries(now, adaptors)
 
     // Revert adaptors
-    keep = make([]zfs.FilesystemVersion, len(keepa))
     for i := range keepa {
-        keep[i] = keepa[i].(retentionGridAdaptor).FilesystemVersion
+        keep = append(keep, keepa[i].(retentionGridAdaptor).FilesystemVersion)
     }
     remove = make([]zfs.FilesystemVersion, len(removea))
     for i := range removea {
@@ -58,20 +76,60 @@ func (p *GridPrunePolicy) Prune(_ *zfs.DatasetPath, versions []zfs.FilesystemVer
 
 }
 
-func parseGridPrunePolicy(e map[string]interface{}) (p *GridPrunePolicy, err error) {
+func (p *GridPrunePolicy) pruneBookmarks(versions []zfs.FilesystemVersion) (keep, remove []zfs.FilesystemVersion) {
 
-    var i struct {
-        Grid string
+    if p.MaxBookmarks == GridPrunePolicyMaxBookmarksKeepAll {
+        return versions, []zfs.FilesystemVersion{}
     }
 
-    if err = mapstructure.Decode(e, &i); err != nil {
+    keep = []zfs.FilesystemVersion{}
+    bookmarks := make([]zfs.FilesystemVersion, 0)
+    for fsv := range versions {
+        if versions[fsv].Type != zfs.Bookmark {
+            keep = append(keep, versions[fsv])
+            continue
+        }
+        bookmarks = append(bookmarks, versions[fsv])
+    }
+
+    if len(bookmarks) == 0 {
+        return keep, []zfs.FilesystemVersion{}
+    }
+    if len(bookmarks) < p.MaxBookmarks {
+        keep = append(keep, bookmarks...)
+        return keep, []zfs.FilesystemVersion{}
+    }
+
+    // NOTE: sorting descending by createtxg <=> sorting ascending wrt creation time
+    sort.SliceStable(bookmarks, func(i, j int) bool {
+        return (bookmarks[i].CreateTXG > bookmarks[j].CreateTXG)
+    })
+
+    keep = append(keep, bookmarks[:p.MaxBookmarks]...)
+    remove = bookmarks[p.MaxBookmarks:]
+
+    return keep, remove
+}
+
+func parseGridPrunePolicy(e map[string]interface{}, willSeeBookmarks bool) (p *GridPrunePolicy, err error) {
+
+    const KeepBookmarksAllString = "all"
+    var i struct {
+        Grid          string
+        KeepBookmarks string `mapstructure:"keep_bookmarks"`
+    }
+
+    dec, err := mapstructure.NewDecoder(&mapstructure.DecoderConfig{Result: &i, WeaklyTypedInput: true})
+    if err != nil {
+        err = errors.Wrap(err, "mapstructure error")
+        return
+    }
+    if err = dec.Decode(e); err != nil {
         err = errors.Wrapf(err, "mapstructure error")
         return
     }
 
-    p = &GridPrunePolicy{}
-
-    // Parse grid policy
+    // Parse grid
     intervals, err := parseRetentionGridIntervalsString(i.Grid)
     if err != nil {
         err = fmt.Errorf("cannot parse retention grid: %s", err)
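A toy run of the bookmark selection implemented above, with plain values standing in for zfs.FilesystemVersion: sort descending by createtxg so the newest bookmarks come first, keep MaxBookmarks of them, remove the rest.

package main

import (
    "fmt"
    "sort"
)

func main() {
    type bm struct {
        Name      string
        CreateTXG uint64
    }
    bookmarks := []bm{{"zrepl_old", 10}, {"zrepl_new", 30}, {"zrepl_mid", 20}}
    maxBookmarks := 2

    // newest (highest createtxg) first, mirroring pruneBookmarks
    sort.SliceStable(bookmarks, func(i, j int) bool {
        return bookmarks[i].CreateTXG > bookmarks[j].CreateTXG
    })
    fmt.Println("keep:  ", bookmarks[:maxBookmarks]) // zrepl_new, zrepl_mid
    fmt.Println("remove:", bookmarks[maxBookmarks:]) // zrepl_old
}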
@@ -97,9 +155,22 @@ func parseGridPrunePolicy(e map[string]interface{}) (p *GridPrunePolicy, err err
         lastDuration = intervals[i].Length
 
     }
-    p.RetentionGrid = util.NewRetentionGrid(intervals)
 
-    return
+    // Parse KeepBookmarks
+    keepBookmarks := 0
+    if i.KeepBookmarks == KeepBookmarksAllString || (i.KeepBookmarks == "" && !willSeeBookmarks) {
+        keepBookmarks = GridPrunePolicyMaxBookmarksKeepAll
+    } else {
+        i, err := strconv.ParseInt(i.KeepBookmarks, 10, 32)
+        if err != nil || i <= 0 || i > math.MaxInt32 {
+            return nil, errors.Errorf("keep_bookmarks must be positive integer or 'all'")
+        }
+        keepBookmarks = int(i)
+    }
+
+    return &GridPrunePolicy{
+        util.NewRetentionGrid(intervals),
+        keepBookmarks,
+    }, nil
 }
 
 var retentionStringIntervalRegex *regexp.Regexp = regexp.MustCompile(`^\s*(\d+)\s*x\s*([^\(]+)\s*(\((.*)\))?\s*$`)
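For illustration only, a standalone re-implementation of the keep_bookmarks decision above (parseKeepBookmarks is a hypothetical helper; the real logic lives inline in parseGridPrunePolicy): 'all', or an empty value on a side that never sees bookmarks, keeps everything; a positive integer sets a limit; anything else is rejected.

package main

import (
    "fmt"
    "math"
    "strconv"
)

// Hypothetical helper mirroring the inline logic in parseGridPrunePolicy.
func parseKeepBookmarks(s string, willSeeBookmarks bool) (int, error) {
    if s == "all" || (s == "" && !willSeeBookmarks) {
        return -1, nil // GridPrunePolicyMaxBookmarksKeepAll
    }
    n, err := strconv.ParseInt(s, 10, 32)
    if err != nil || n <= 0 || n > math.MaxInt32 {
        return 0, fmt.Errorf("keep_bookmarks must be positive integer or 'all'")
    }
    return int(n), nil
}

func main() {
    fmt.Println(parseKeepBookmarks("all", true)) // -1 <nil>: keep every bookmark
    fmt.Println(parseKeepBookmarks("144", true)) // 144 <nil>
    fmt.Println(parseKeepBookmarks("", false))   // -1 <nil>: pull side never sees bookmarks
    fmt.Println(parseKeepBookmarks("", true))    // error: source side must set a value
}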
@@ -43,9 +43,8 @@ func (p *Pruner) filterVersions(fs *zfs.DatasetPath) (fsversions []zfs.Filesyste
     defer p.task.Finish()
     log := p.task.Log().WithField(logFSField, fs.ToString())
 
-    // only prune snapshots, bookmarks are kept forever
-    snapshotFilter := NewTypedPrefixFilter(p.SnapshotPrefix, zfs.Snapshot)
-    fsversions, err := zfs.ZFSListFilesystemVersions(fs, snapshotFilter)
+    filter := NewPrefixFilter(p.SnapshotPrefix)
+    fsversions, err := zfs.ZFSListFilesystemVersions(fs, filter)
     if err != nil {
         log.WithError(err).Error("error listing filesytem versions")
         return nil, true
@@ -19,6 +19,7 @@ jobs:
     prune_lhs:
       policy: grid
       grid: 1x1h(keep=all)
+      keep_bookmarks: all
 
     # follow a grandfathering scheme for filesystems on the right-hand-side of the mapping
     prune_rhs:
@@ -37,9 +37,11 @@ jobs:
   interval: 10m
 
 
-  # keep a one day window 10m interval snapshots in case pull doesn't work (link down, etc)
-  # (we cannot keep more than one day because this host will run out of disk space)
+  # keep 1 hour of snapshots (6 at 10m interval)
+  # and one day of bookmarks in case pull doesn't work (link down, etc)
+  # => keep_bookmarks = 24h / interval = 24h / 10m = 144
   prune:
     policy: grid
-    grid: 1x1d(keep=all)
+    grid: 1x1h(keep=all)
+    keep_bookmarks: 144
@@ -30,4 +30,4 @@ jobs:
   prune:
     policy: grid
     grid: 1x10s(keep=all)
-
+    keep_bookmarks: all
@@ -1,5 +1,7 @@
+.. |break_config| replace:: **[BREAK]**
 .. |break| replace:: **[BREAK]**
 .. |bugfix| replace:: [BUG]
+.. |feature| replace:: [FEATURE]
 
 Changelog
 =========
@@ -7,6 +9,16 @@ Changelog
 The changelog summarized bugfixes that are deemed relevant for users.
 Developers should consult the git commit log or GitHub issue tracker.
 
+0.0.3
+-----
+
+* |break_config| |feature| :issue:`34`: automatic bookmarking of snapshots
+
+  * Snapshots are automatically bookmarked and pruning of bookmarks **must** be configured.
+  * This breaks existing configuration: ``grid`` :ref:`prune policy <prune-retention-grid>` specifications require the new ``keep_bookmarks`` parameter.
+  * Make sure to understand the meaning bookmarks have for :ref:`maximum replication downtime <replication-downtime>`.
+  * Example: :sampleconf:`pullbackup/productionhost.yml`
+
 0.0.2
 -----
@@ -48,14 +48,14 @@ Example: :sampleconf:`pullbackup/productionhost.yml`.
    * - ``interval``
      - snapshotting interval
    * - ``prune``
-     - |prune| policy for filesytems in ``filesystems`` with prefix ``snapshot_prefix``
+     - |prune| for versions of filesytems in ``filesystems``, versions prefixed with ``snapshot_prefix``
 
 
 - Snapshotting Task (every ``interval``, |patient|)
 
   - A snapshot of filesystems matched by ``filesystems`` is taken every ``interval`` with prefix ``snapshot_prefix``.
-  - The ``prune`` policy is triggered on filesystems matched by ``filesystems`` with snapshots matched by ``snapshot_prefix``.
+  - A bookmark of that snapshot is created with the same name.
+  - The ``prune`` policy is evaluated for versions of filesystems matched by ``filesystems``, versions prefixed with ``snapshot_prefix``.
 
 - Serve Task
@@ -65,12 +65,6 @@ A source job is the counterpart to a :ref:`job-pull`.
 
 Make sure you read the |prune| policy documentation.
 
-Note that zrepl does not prune bookmarks due to the following reason:
-a pull job may stop replication due to link failure, misconfiguration or administrative action.
-The source prune policy will eventually destroy the last common snapshot between source and pull job.
-Without bookmarks, the prune policy would need to perform full replication again.
-With bookmarks, we can resume incremental replication, only losing the snapshots pruned since the outage.
-
 .. _job-pull:
 
 Pull Job
@@ -99,7 +93,7 @@ Example: :sampleconf:`pullbackup/backuphost.yml`
    * - ``snapshot_prefix``
      - prefix snapshots must match to be considered for replication & pruning
    * - ``prune``
-     - |prune| policy for local filesystems reachable by ``mapping``
+     - |prune| policy for versions of filesystems of local filesystems reachable by ``mapping``, versions prefixed with ``snapshot_prefix``
 
 * Main Task (every ``interval``, |patient|)
@@ -112,10 +106,11 @@ Example: :sampleconf:`pullbackup/backuphost.yml`
 #. If the local target filesystem does not exist, ``initial_repl_policy`` is used.
 #. On conflicts, an error is logged but replication of other filesystems with mapping continues.
 
-#. The ``prune`` policy is triggered for all *target filesystems*
+#. The ``prune`` policy is evaluated for all *target filesystems*
 
 A pull job is the counterpart to a :ref:`job-source`.
 
+Make sure you read the |prune| policy documentation.
+
 .. _job-local:
@@ -163,8 +158,6 @@ Example: :sampleconf:`localbackup/host1.yml`
 #. The ``prune_rhs`` policy is triggered for all *target filesystems*
 
 A local job is combination of source & pull job executed on the same machine.
-Note that while snapshots are pruned, bookmarks are not pruned and kept around forever.
-Refer to the comments on :ref:`source job <job-source>` for the reasoning behind this.
 
 Terminology
 -----------
@@ -188,3 +181,7 @@ patient task
     * waits for the last invocation to finish
     * logs a warning with the effective task duration
     * immediately starts a new invocation of the task
+
+filesystem version
+
+    A snapshot or a bookmark.
@@ -3,9 +3,9 @@
 Pruning Policies
 ================
 
-In zrepl, *pruning* means *destroying snapshots by some policy*.
+In zrepl, *pruning* means *destroying filesystem versions by some policy* where filesystem versions are bookmarks and snapshots.
 
-A *pruning policy* takes a list of snapshots and -- for each snapshot -- decides whether it should be kept or destroyed.
+A *pruning policy* takes a list of filesystem versions and decides for each whether it should be kept or destroyed.
 
 The job context defines which snapshots are even considered for pruning, for example through the ``snapshot_prefix`` variable.
 Check the respective :ref:`job definition <job>` for details.
|
||||
|
||||
jobs:
|
||||
- name: pull_app-srv
|
||||
type: pull
|
||||
...
|
||||
prune:
|
||||
policy: grid
|
||||
@ -34,6 +35,15 @@ Retention Grid
|
||||
│
|
||||
└─ 24 adjacent one-hour intervals
|
||||
|
||||
- name: pull_backup
|
||||
type: source
|
||||
interval: 10m
|
||||
prune:
|
||||
policy: grid
|
||||
grid: 1x1d(keep=all)
|
||||
keep_bookmarks: 144
|
||||
|
||||
|
||||
The retention grid can be thought of as a time-based sieve:
|
||||
The ``grid`` field specifies a list of adjacent time intervals:
|
||||
the left edge of the leftmost (first) interval is the ``creation`` date of the youngest snapshot.
|
||||
@ -43,6 +53,11 @@ Each interval carries a maximum number of snapshots to keep.
|
||||
It is secified via ``(keep=N)``, where ``N`` is either ``all`` (all snapshots are kept) or a positive integer.
|
||||
The default value is **1**.
|
||||
|
||||
Bookmarks are not affected by the above.
|
||||
Instead, the ``keep_bookmarks`` field specifies the number of bookmarks to be kept per filesystem.
|
||||
You only need to specify ``keep_bookmarks`` at the source-side of a replication setup since the destination side does not receive bookmarks.
|
||||
You can specify ``all`` as a value to keep all bookmarks, but be warned that you should install some other way to prune unneeded ones then (see below).
|
||||
|
||||
The following procedure happens during pruning:
|
||||
|
||||
#. The list of snapshots eligible for pruning is sorted by ``creation``
|
||||
@ -54,14 +69,16 @@ The following procedure happens during pruning:
|
||||
#. the contained snapshot list is sorted by creation.
|
||||
#. snapshots from the list, oldest first, are destroyed until the specified ``keep`` count is reached.
|
||||
#. all remaining snapshots on the list are kept.
|
||||
#. The list of bookmarks eligible for pruning is sorted by ``createtxg`` and the most recent ``keep_bookmarks`` bookmarks are kept.
|
||||
|
||||
.. _replication-downtime:
|
||||
|
||||
.. ATTENTION::
|
||||
|
||||
The configuration of the first interval (``1x1h(keep=all)`` in the example) determines the **maximum allowable replication lag** because the source and destination pruning policies do not coordinate:
|
||||
if replication does not work for whatever reason, source will continue to execute the prune policy.
|
||||
Eventually, source destroys a snapshot that has never been replicated to destination, degrading the temporal resolution of your backup.
|
||||
Be aware that ``keep_bookmarks x interval`` (interval of the job level) controls the **maximum allowable replication downtime** between source and destination.
|
||||
If replication does not work for whatever reason, source and destination will eventually run out of sync because the source will continue pruning snapshots.
|
||||
The only recovery in that case is full replication, which may not always be viable due to disk space or traffic constraints.
|
||||
|
||||
Thus, **always** configure the first interval to ``1x?(keep=all)``, substituting ``?`` with the maximum time replication may fail due to downtimes, maintenance, connectivity issues, etc.
|
||||
|
||||
.. We intentionally do not mention that bookmarks are used to bridge the gap between source and dest that are out of sync snapshot-wise. This is an implementation detail.
|
||||
Further note that while bookmarks consume a constant amount of disk space, listing them requires temporary dynamic **kernel memory** proportional to the number of bookmarks.
|
||||
Thus, do not use ``all`` or an inappropriately high value without good reason.
|
||||
|
||||
|
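A worked example of the rule stated in the attention block, using the numbers from the sample configuration (144 bookmarks at a 10-minute snapshot interval):

package main

import (
    "fmt"
    "time"
)

func main() {
    // maximum tolerable replication downtime = keep_bookmarks x snapshot interval
    interval := 10 * time.Minute
    keepBookmarks := 144
    fmt.Println(time.Duration(keepBookmarks) * interval) // 24h0m0s
}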
@@ -119,6 +119,7 @@ We define a corresponding **source job** named ``pull_backup`` in the |mainconfi
     prune:
       policy: grid
       grid: 1x1d(keep=all)
+      keep_bookmarks: 144
 
 
 The ``serve`` section corresponds to the ``connect`` section in the configuration of ``backup-srv``.