zrepl/daemon/job/snapjob.go
Christian Schwarz 845195b7ed bandwidth limiting: fix crash with SnapJob
zrepl daemon panics when the snap job triggers

fixup for f5f269bfd5 (bandwidth limiting)
fixes #521

Oct 01 16:14:56 cstp zrepl[56563]: panic: invalid config`BandwidthLimit` field invalid: BucketCapacity must not be zero
Oct 01 16:14:56 cstp zrepl[56563]:         panic: end span: span still has active child spans
Oct 01 16:14:56 cstp zrepl[56563]: goroutine 38 [running]:
Oct 01 16:14:56 cstp zrepl[56563]: github.com/zrepl/zrepl/daemon/logging/trace.WithSpan.func2()
Oct 01 16:14:56 cstp zrepl[56563]:         /home/cs/zrepl/zrepl/daemon/logging/trace/trace.go:341 +0x2ea
Oct 01 16:14:56 cstp zrepl[56563]: github.com/zrepl/zrepl/daemon/logging/trace.WithTaskAndSpan.func1()
Oct 01 16:14:56 cstp zrepl[56563]:         /home/cs/zrepl/zrepl/daemon/logging/trace/trace_convenience.go:40 +0x2e
Oct 01 16:14:56 cstp zrepl[56563]: panic(0xcee9c0, 0xc000676730)
Oct 01 16:14:56 cstp zrepl[56563]:         /home/cs/go1.16.6/src/runtime/panic.go:965 +0x1b9
Oct 01 16:14:56 cstp zrepl[56563]: github.com/zrepl/zrepl/endpoint.NewSender(0xf5bbc0, 0xc0003840c0, 0xc0000b2c90, 0x4, 0xc0002c5958, 0x0, 0x0, 0x0, 0xc000068cf8)
Oct 01 16:14:56 cstp zrepl[56563]:         /home/cs/zrepl/zrepl/endpoint/endpoint.go:68 +0x1ec
Oct 01 16:14:56 cstp zrepl[56563]: github.com/zrepl/zrepl/daemon/job.(*SnapJob).doPrune(0xc00039e000, 0xf6e3b8, 0xc0006541b0)
Oct 01 16:14:56 cstp zrepl[56563]:         /home/cs/zrepl/zrepl/daemon/job/snapjob.go:179 +0x198
Oct 01 16:14:56 cstp zrepl[56563]: github.com/zrepl/zrepl/daemon/job.(*SnapJob).Run(0xc00039e000, 0xf6e3b8, 0xc0001d83c0)
Oct 01 16:14:56 cstp zrepl[56563]:         /home/cs/zrepl/zrepl/daemon/job/snapjob.go:127 +0x329
Oct 01 16:14:56 cstp zrepl[56563]: github.com/zrepl/zrepl/daemon.(*jobs).start.func1(0xc0006a4100, 0xf6e3b8, 0xc00022a0f0, 0xf72d18, 0xc00039e000)
Oct 01 16:14:56 cstp zrepl[56563]:         /home/cs/zrepl/zrepl/daemon/daemon.go:255 +0x15b
Oct 01 16:14:56 cstp zrepl[56563]: created by github.com/zrepl/zrepl/daemon.(*jobs).start
Oct 01 16:14:56 cstp zrepl[56563]:         /home/cs/zrepl/zrepl/daemon/daemon.go:251 +0x425
Oct 01 16:14:56 cstp systemd[1]: zrepl.service: Main process exited, code=exited, status=2/INVALIDARGUMENT
Oct 01 16:14:56 cstp systemd[1]: zrepl.service: Failed with result 'exit-code'.
2021-10-09 15:52:38 +02:00

196 lines
5.7 KiB
Go

package job
import (
"context"
"fmt"
"sort"
"sync"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"github.com/zrepl/zrepl/daemon/logging/trace"
"github.com/zrepl/zrepl/util/bandwidthlimit"
"github.com/zrepl/zrepl/util/nodefault"
"github.com/zrepl/zrepl/config"
"github.com/zrepl/zrepl/daemon/filters"
"github.com/zrepl/zrepl/daemon/job/wakeup"
"github.com/zrepl/zrepl/daemon/pruner"
"github.com/zrepl/zrepl/daemon/snapper"
"github.com/zrepl/zrepl/endpoint"
"github.com/zrepl/zrepl/replication/logic/pdu"
"github.com/zrepl/zrepl/zfs"
)
// SnapJob is the job type that runs a snapper and, after each wakeup or
// snapper signal, prunes locally (see Run and doPrune).
type SnapJob struct {
// name is the job's ID, derived from the configured job name.
name endpoint.JobID
// fsfilter selects the filesystems the snapper and the pruning target operate on.
fsfilter zfs.DatasetFilter
// snapper is started in its own goroutine by Run.
snapper *snapper.PeriodicOrManual
// prunerFactory builds a fresh local pruner for every doPrune invocation.
prunerFactory *pruner.LocalPrunerFactory
promPruneSecs *prometheus.HistogramVec // labels: prune_side
// prunerMtx guards assignments to pruner in doPrune and its use in Status.
prunerMtx sync.Mutex
// pruner is the pruner of the most recent invocation; nil until the first doPrune.
pruner *pruner.Pruner
}
// Name returns the job's ID rendered as a string.
func (j *SnapJob) Name() string { return j.name.String() }
// Type reports the job type, which is always TypeSnap for a SnapJob.
func (j *SnapJob) Type() Type { return TypeSnap }
// snapJobFromConfig builds a SnapJob from the global configuration g and the
// job-specific configuration in.
//
// The pieces are constructed in a fixed order (filesystem filter, snapper,
// job ID, prune-time histogram, pruner factory); the first step that fails
// aborts construction and its error is returned wrapped with a message that
// names the step. On error the returned job is nil.
func snapJobFromConfig(g *config.Global, in *config.SnapJob) (j *SnapJob, err error) {
	fsf, err := filters.DatasetMapFilterFromConfig(in.Filesystems)
	if err != nil {
		return nil, errors.Wrap(err, "cannot build filesystem filter")
	}

	snap, err := snapper.FromConfig(g, fsf, in.Snapshotting)
	if err != nil {
		return nil, errors.Wrap(err, "cannot build snapper")
	}

	jobID, err := endpoint.MakeJobID(in.Name)
	if err != nil {
		return nil, errors.Wrap(err, "invalid job name")
	}

	// The histogram carries the job name as a constant label so that
	// several jobs can share the same metric name.
	pruneSecs := prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace:   "zrepl",
			Subsystem:   "pruning",
			Name:        "time",
			Help:        "seconds spent in pruner",
			ConstLabels: prometheus.Labels{"zrepl_job": jobID.String()},
		},
		[]string{"prune_side"},
	)

	factory, err := pruner.NewLocalPrunerFactory(in.Pruning, pruneSecs)
	if err != nil {
		return nil, errors.Wrap(err, "cannot build snapjob pruning rules")
	}

	return &SnapJob{
		name:          jobID,
		fsfilter:      fsf,
		snapper:       snap,
		prunerFactory: factory,
		promPruneSecs: pruneSecs,
	}, nil
}
// RegisterMetrics registers the job's prune-time histogram with registerer
// using MustRegister.
func (j *SnapJob) RegisterMetrics(registerer prometheus.Registerer) {
registerer.MustRegister(j.promPruneSecs)
}
// SnapJobStatus is the job-specific part of the Status reported by a SnapJob.
type SnapJobStatus struct {
// Pruning is the report of the most recently built pruner;
// Status leaves it nil while the job has not built a pruner yet.
Pruning *pruner.Report
Snapshotting *snapper.Report // may be nil
}
// Status returns a snapshot of the job's current state: the report of the
// most recently built pruner (nil if none has been built yet) and the
// snapper's report.
func (j *SnapJob) Status() *Status {
	var pruning *pruner.Report

	// j.pruner is replaced by doPrune, so read it and take its report under the lock.
	j.prunerMtx.Lock()
	if p := j.pruner; p != nil {
		pruning = p.Report()
	}
	j.prunerMtx.Unlock()

	return &Status{
		Type: j.Type(),
		JobSpecific: &SnapJobStatus{
			Pruning:      pruning,
			Snapshotting: j.snapper.Report(),
		},
	}
}
// OwnedDatasetSubtreeRoot always returns (nil, false): a SnapJob reports no
// owned dataset subtree root.
func (j *SnapJob) OwnedDatasetSubtreeRoot() (rfs *zfs.DatasetPath, ok bool) {
return nil, false
}
// SenderConfig always returns nil for a SnapJob.
func (j *SnapJob) SenderConfig() *endpoint.SenderConfig { return nil }
// Run is the job's main loop. It starts the snapper in a separate goroutine
// and then, until ctx is cancelled, waits for either an explicit wakeup or a
// signal on the snapper's done channel and runs one pruning invocation for
// each such event.
//
// Every invocation gets its own trace span named "invocation-<n>". The span
// is ended via defer so that it is closed even if doPrune panics; otherwise
// the deferred endTask of the enclosing "snap-job" span would panic a second
// time because it still has an active child span.
func (j *SnapJob) Run(ctx context.Context) {
	ctx, endTask := trace.WithTaskAndSpan(ctx, "snap-job", j.Name())
	defer endTask()
	log := GetLogger(ctx)

	defer log.Info("job exiting")

	periodicDone := make(chan struct{})
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()
	// Note: endTask is re-assigned here; the defer above already captured
	// the previous function value, so both tasks are ended on return.
	periodicCtx, endTask := trace.WithTask(ctx, "snapshotting")
	defer endTask()
	go j.snapper.Run(periodicCtx, periodicDone)

	invocationCount := 0
outer:
	for {
		log.Info("wait for wakeups")
		select {
		case <-ctx.Done():
			log.WithError(ctx.Err()).Info("context")
			break outer

		case <-wakeup.Wait(ctx):
		case <-periodicDone:
		}
		invocationCount++

		// Run the invocation in its own function so the deferred endSpan
		// fires at the end of this iteration, not at the end of Run.
		func() {
			invocationCtx, endSpan := trace.WithSpan(ctx, fmt.Sprintf("invocation-%d", invocationCount))
			defer endSpan()
			j.doPrune(invocationCtx)
		}()
	}
}
// alwaysUpToDateReplicationCursorHistory is an adaptor that implements
// pruner.History around a pruner.Target.
// The ReplicationCursor method is Get-op only and always returns
// the GUID of the filesystem's most recent version.
//
// TODO:
// This is a work-around for a limitation of the current package
// daemon/pruner and package pruning.Snapshot: they require the
// `Replicated` getter method to be present, but obviously,
// a local job like SnapJob can't deliver on that.
// But the pruner.Pruner gives up on an FS if no replication
// cursor is present, which is why this adaptor returns the
// most recent filesystem version.
type alwaysUpToDateReplicationCursorHistory struct {
// target is the same pruner.Target that is passed as target to
// BuildLocalPruner; all calls are answered by querying it.
target pruner.Target
}
// Compile-time check that alwaysUpToDateReplicationCursorHistory implements pruner.History.
var _ pruner.History = (*alwaysUpToDateReplicationCursorHistory)(nil)
// ReplicationCursor answers a replication cursor request by listing the
// versions of the requested filesystem on the wrapped target and returning
// the GUID of the version with the highest CreateTXG.
//
// If the filesystem has no versions, the result is Notexist. An error from
// listing the versions is returned unchanged.
func (h alwaysUpToDateReplicationCursorHistory) ReplicationCursor(ctx context.Context, req *pdu.ReplicationCursorReq) (*pdu.ReplicationCursorRes, error) {
	listRes, err := h.target.ListFilesystemVersions(ctx, &pdu.ListFilesystemVersionsReq{
		Filesystem: req.GetFilesystem(),
	})
	if err != nil {
		return nil, err
	}

	versions := listRes.GetVersions()
	if len(versions) == 0 {
		notExist := &pdu.ReplicationCursorRes_Notexist{Notexist: true}
		return &pdu.ReplicationCursorRes{Result: notExist}, nil
	}

	// Always return the most recent version: order ascending by CreateTXG
	// so that it ends up as the last element.
	byCreateTXG := func(a, b int) bool {
		return versions[a].CreateTXG < versions[b].CreateTXG
	}
	sort.Slice(versions, byCreateTXG)

	newest := versions[len(versions)-1]
	guid := &pdu.ReplicationCursorRes_Guid{Guid: newest.GetGuid()}
	return &pdu.ReplicationCursorRes{Result: guid}, nil
}
// ListFilesystems forwards the request to the wrapped target and returns its
// response and error unchanged.
func (h alwaysUpToDateReplicationCursorHistory) ListFilesystems(ctx context.Context, req *pdu.ListFilesystemReq) (*pdu.ListFilesystemRes, error) {
return h.target.ListFilesystems(ctx, req)
}
// doPrune performs one local pruning pass inside its own trace span.
//
// It creates a sender endpoint that serves as the pruning target, wraps it
// in alwaysUpToDateReplicationCursorHistory to serve as the pruning history,
// builds a new pruner from the job's factory, publishes that pruner in
// j.pruner (under prunerMtx, so Status can report on it), and runs it.
func (j *SnapJob) doPrune(ctx context.Context) {
	ctx, endSpan := trace.WithSpan(ctx, "snap-job-do-prune")
	defer endSpan()
	logger := GetLogger(ctx)

	senderCfg := endpoint.SenderConfig{
		JobID: j.name,
		FSF:   j.fsfilter,
		// FIXME the following config fields are irrelevant for SnapJob
		// because the endpoint is only used as pruner.Target.
		// However, the implementation requires them to be set.
		Encrypt:        &nodefault.Bool{B: true},
		BandwidthLimit: bandwidthlimit.NoLimitConfig(),
	}
	target := endpoint.NewSender(senderCfg)
	history := alwaysUpToDateReplicationCursorHistory{target}

	j.prunerMtx.Lock()
	p := j.prunerFactory.BuildLocalPruner(ctx, target, history)
	j.pruner = p
	j.prunerMtx.Unlock()

	logger.Info("start pruning")
	p.Prune()
	logger.Info("finished pruning")
}