mirror of
https://github.com/zrepl/zrepl.git
synced 2025-01-18 20:28:22 +01:00
b2c6e51a43
This was merged to master prematurely as the job components are not decoupled well enough
for these signals to be useful yet.
This reverts commit 2c8c2cfa14.
closes #452
192 lines
5.5 KiB
Go
192 lines
5.5 KiB
Go
package job
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"sort"
|
|
"sync"
|
|
|
|
"github.com/pkg/errors"
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
|
|
"github.com/zrepl/zrepl/daemon/logging/trace"
|
|
"github.com/zrepl/zrepl/util/nodefault"
|
|
|
|
"github.com/zrepl/zrepl/config"
|
|
"github.com/zrepl/zrepl/daemon/filters"
|
|
"github.com/zrepl/zrepl/daemon/job/wakeup"
|
|
"github.com/zrepl/zrepl/daemon/pruner"
|
|
"github.com/zrepl/zrepl/daemon/snapper"
|
|
"github.com/zrepl/zrepl/endpoint"
|
|
"github.com/zrepl/zrepl/replication/logic/pdu"
|
|
"github.com/zrepl/zrepl/zfs"
|
|
)
|
|
|
|
type SnapJob struct {
|
|
name endpoint.JobID
|
|
fsfilter zfs.DatasetFilter
|
|
snapper *snapper.PeriodicOrManual
|
|
|
|
prunerFactory *pruner.LocalPrunerFactory
|
|
|
|
promPruneSecs *prometheus.HistogramVec // labels: prune_side
|
|
|
|
prunerMtx sync.Mutex
|
|
pruner *pruner.Pruner
|
|
}
|
|
|
|
// Name returns the string form of the job's ID.
func (j *SnapJob) Name() string {
	return j.name.String()
}
|
|
|
|
// Type reports the job type, which is always TypeSnap for a SnapJob.
func (j *SnapJob) Type() Type {
	return TypeSnap
}
|
|
|
|
func snapJobFromConfig(g *config.Global, in *config.SnapJob) (j *SnapJob, err error) {
|
|
j = &SnapJob{}
|
|
fsf, err := filters.DatasetMapFilterFromConfig(in.Filesystems)
|
|
if err != nil {
|
|
return nil, errors.Wrap(err, "cannot build filesystem filter")
|
|
}
|
|
j.fsfilter = fsf
|
|
|
|
if j.snapper, err = snapper.FromConfig(g, fsf, in.Snapshotting); err != nil {
|
|
return nil, errors.Wrap(err, "cannot build snapper")
|
|
}
|
|
j.name, err = endpoint.MakeJobID(in.Name)
|
|
if err != nil {
|
|
return nil, errors.Wrap(err, "invalid job name")
|
|
}
|
|
j.promPruneSecs = prometheus.NewHistogramVec(prometheus.HistogramOpts{
|
|
Namespace: "zrepl",
|
|
Subsystem: "pruning",
|
|
Name: "time",
|
|
Help: "seconds spent in pruner",
|
|
ConstLabels: prometheus.Labels{"zrepl_job": j.name.String()},
|
|
}, []string{"prune_side"})
|
|
j.prunerFactory, err = pruner.NewLocalPrunerFactory(in.Pruning, j.promPruneSecs)
|
|
if err != nil {
|
|
return nil, errors.Wrap(err, "cannot build snapjob pruning rules")
|
|
}
|
|
return j, nil
|
|
}
|
|
|
|
func (j *SnapJob) RegisterMetrics(registerer prometheus.Registerer) {
|
|
registerer.MustRegister(j.promPruneSecs)
|
|
}
|
|
|
|
type SnapJobStatus struct {
|
|
Pruning *pruner.Report
|
|
Snapshotting *snapper.Report // may be nil
|
|
}
|
|
|
|
func (j *SnapJob) Status() *Status {
|
|
s := &SnapJobStatus{}
|
|
t := j.Type()
|
|
j.prunerMtx.Lock()
|
|
if j.pruner != nil {
|
|
s.Pruning = j.pruner.Report()
|
|
}
|
|
j.prunerMtx.Unlock()
|
|
s.Snapshotting = j.snapper.Report()
|
|
return &Status{Type: t, JobSpecific: s}
|
|
}
|
|
|
|
func (j *SnapJob) OwnedDatasetSubtreeRoot() (rfs *zfs.DatasetPath, ok bool) {
|
|
return nil, false
|
|
}
|
|
|
|
// SenderConfig always returns nil for a SnapJob.
func (j *SnapJob) SenderConfig() *endpoint.SenderConfig {
	return nil
}
|
|
|
|
func (j *SnapJob) Run(ctx context.Context) {
|
|
ctx, endTask := trace.WithTaskAndSpan(ctx, "snap-job", j.Name())
|
|
defer endTask()
|
|
log := GetLogger(ctx)
|
|
|
|
defer log.Info("job exiting")
|
|
|
|
periodicDone := make(chan struct{})
|
|
ctx, cancel := context.WithCancel(ctx)
|
|
defer cancel()
|
|
periodicCtx, endTask := trace.WithTask(ctx, "snapshotting")
|
|
defer endTask()
|
|
go j.snapper.Run(periodicCtx, periodicDone)
|
|
|
|
invocationCount := 0
|
|
outer:
|
|
for {
|
|
log.Info("wait for wakeups")
|
|
select {
|
|
case <-ctx.Done():
|
|
log.WithError(ctx.Err()).Info("context")
|
|
break outer
|
|
|
|
case <-wakeup.Wait(ctx):
|
|
case <-periodicDone:
|
|
}
|
|
invocationCount++
|
|
|
|
invocationCtx, endSpan := trace.WithSpan(ctx, fmt.Sprintf("invocation-%d", invocationCount))
|
|
j.doPrune(invocationCtx)
|
|
endSpan()
|
|
}
|
|
}
|
|
|
|
// Adaptor that implements pruner.History around a pruner.Target.
|
|
// The ReplicationCursor method is Get-op only and always returns
|
|
// the filesystem's most recent version's GUID.
|
|
//
|
|
// TODO:
|
|
// This is a work-around for the current package daemon/pruner
|
|
// and package pruning.Snapshot limitation: they require the
|
|
// `Replicated` getter method be present, but obviously,
|
|
// a local job like SnapJob can't deliver on that.
|
|
// But the pruner.Pruner gives up on an FS if no replication
|
|
// cursor is present, which is why this pruner returns the
|
|
// most recent filesystem version.
|
|
type alwaysUpToDateReplicationCursorHistory struct {
|
|
// the Target passed as Target to BuildLocalPruner
|
|
target pruner.Target
|
|
}
|
|
|
|
var _ pruner.History = (*alwaysUpToDateReplicationCursorHistory)(nil)
|
|
|
|
func (h alwaysUpToDateReplicationCursorHistory) ReplicationCursor(ctx context.Context, req *pdu.ReplicationCursorReq) (*pdu.ReplicationCursorRes, error) {
|
|
fsvReq := &pdu.ListFilesystemVersionsReq{
|
|
Filesystem: req.GetFilesystem(),
|
|
}
|
|
res, err := h.target.ListFilesystemVersions(ctx, fsvReq)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
fsvs := res.GetVersions()
|
|
if len(fsvs) <= 0 {
|
|
return &pdu.ReplicationCursorRes{Result: &pdu.ReplicationCursorRes_Notexist{Notexist: true}}, nil
|
|
}
|
|
// always return must recent version
|
|
sort.Slice(fsvs, func(i, j int) bool {
|
|
return fsvs[i].CreateTXG < fsvs[j].CreateTXG
|
|
})
|
|
mostRecent := fsvs[len(fsvs)-1]
|
|
return &pdu.ReplicationCursorRes{Result: &pdu.ReplicationCursorRes_Guid{Guid: mostRecent.GetGuid()}}, nil
|
|
}
|
|
|
|
func (h alwaysUpToDateReplicationCursorHistory) ListFilesystems(ctx context.Context, req *pdu.ListFilesystemReq) (*pdu.ListFilesystemRes, error) {
|
|
return h.target.ListFilesystems(ctx, req)
|
|
}
|
|
|
|
func (j *SnapJob) doPrune(ctx context.Context) {
|
|
ctx, endSpan := trace.WithSpan(ctx, "snap-job-do-prune")
|
|
defer endSpan()
|
|
log := GetLogger(ctx)
|
|
sender := endpoint.NewSender(endpoint.SenderConfig{
|
|
JobID: j.name,
|
|
FSF: j.fsfilter,
|
|
// FIXME encryption setting is irrelevant for SnapJob because the endpoint is only used as pruner.Target
|
|
Encrypt: &nodefault.Bool{B: true},
|
|
})
|
|
j.prunerMtx.Lock()
|
|
j.pruner = j.prunerFactory.BuildLocalPruner(ctx, sender, alwaysUpToDateReplicationCursorHistory{sender})
|
|
j.prunerMtx.Unlock()
|
|
log.Info("start pruning")
|
|
j.pruner.Prune()
|
|
log.Info("finished pruning")
|
|
}
|