package job import ( "context" "fmt" "sort" "sync" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "github.com/zrepl/zrepl/daemon/logging/trace" "github.com/zrepl/zrepl/util/bandwidthlimit" "github.com/zrepl/zrepl/util/nodefault" "github.com/zrepl/zrepl/config" "github.com/zrepl/zrepl/daemon/filters" "github.com/zrepl/zrepl/daemon/job/wakeup" "github.com/zrepl/zrepl/daemon/pruner" "github.com/zrepl/zrepl/daemon/snapper" "github.com/zrepl/zrepl/endpoint" "github.com/zrepl/zrepl/replication/logic/pdu" "github.com/zrepl/zrepl/zfs" ) type SnapJob struct { name endpoint.JobID fsfilter zfs.DatasetFilter snapper snapper.Snapper prunerFactory *pruner.LocalPrunerFactory promPruneSecs *prometheus.HistogramVec // labels: prune_side prunerMtx sync.Mutex pruner *pruner.Pruner } func (j *SnapJob) Name() string { return j.name.String() } func (j *SnapJob) Type() Type { return TypeSnap } func snapJobFromConfig(g *config.Global, in *config.SnapJob) (j *SnapJob, err error) { j = &SnapJob{} fsf, err := filters.DatasetMapFilterFromConfig(in.Filesystems) if err != nil { return nil, errors.Wrap(err, "cannot build filesystem filter") } j.fsfilter = fsf if j.snapper, err = snapper.FromConfig(g, fsf, in.Snapshotting); err != nil { return nil, errors.Wrap(err, "cannot build snapper") } j.name, err = endpoint.MakeJobID(in.Name) if err != nil { return nil, errors.Wrap(err, "invalid job name") } j.promPruneSecs = prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: "zrepl", Subsystem: "pruning", Name: "time", Help: "seconds spent in pruner", ConstLabels: prometheus.Labels{"zrepl_job": j.name.String()}, }, []string{"prune_side"}) j.prunerFactory, err = pruner.NewLocalPrunerFactory(in.Pruning, j.promPruneSecs) if err != nil { return nil, errors.Wrap(err, "cannot build snapjob pruning rules") } return j, nil } func (j *SnapJob) RegisterMetrics(registerer prometheus.Registerer) { registerer.MustRegister(j.promPruneSecs) } type SnapJobStatus struct { Pruning *pruner.Report Snapshotting *snapper.Report // may be nil } func (j *SnapJob) Status() *Status { s := &SnapJobStatus{} t := j.Type() j.prunerMtx.Lock() if j.pruner != nil { s.Pruning = j.pruner.Report() } j.prunerMtx.Unlock() r := j.snapper.Report() s.Snapshotting = &r return &Status{Type: t, JobSpecific: s} } func (j *SnapJob) OwnedDatasetSubtreeRoot() (rfs *zfs.DatasetPath, ok bool) { return nil, false } func (j *SnapJob) SenderConfig() *endpoint.SenderConfig { return nil } func (j *SnapJob) Run(ctx context.Context) { ctx, endTask := trace.WithTaskAndSpan(ctx, "snap-job", j.Name()) defer endTask() log := GetLogger(ctx) defer log.Info("job exiting") periodicDone := make(chan struct{}) ctx, cancel := context.WithCancel(ctx) defer cancel() periodicCtx, endTask := trace.WithTask(ctx, "snapshotting") defer endTask() go j.snapper.Run(periodicCtx, periodicDone) invocationCount := 0 outer: for { log.Info("wait for wakeups") select { case <-ctx.Done(): log.WithError(ctx.Err()).Info("context") break outer case <-wakeup.Wait(ctx): case <-periodicDone: } invocationCount++ invocationCtx, endSpan := trace.WithSpan(ctx, fmt.Sprintf("invocation-%d", invocationCount)) j.doPrune(invocationCtx) endSpan() } } // Adaptor that implements pruner.History around a pruner.Target. // The ReplicationCursor method is Get-op only and always returns // the filesystem's most recent version's GUID. // // TODO: // This is a work-around for the current package daemon/pruner // and package pruning.Snapshot limitation: they require the // `Replicated` getter method be present, but obviously, // a local job like SnapJob can't deliver on that. // But the pruner.Pruner gives up on an FS if no replication // cursor is present, which is why this pruner returns the // most recent filesystem version. type alwaysUpToDateReplicationCursorHistory struct { // the Target passed as Target to BuildLocalPruner target pruner.Target } var _ pruner.Sender = (*alwaysUpToDateReplicationCursorHistory)(nil) func (h alwaysUpToDateReplicationCursorHistory) ReplicationCursor(ctx context.Context, req *pdu.ReplicationCursorReq) (*pdu.ReplicationCursorRes, error) { fsvReq := &pdu.ListFilesystemVersionsReq{ Filesystem: req.GetFilesystem(), } res, err := h.target.ListFilesystemVersions(ctx, fsvReq) if err != nil { return nil, err } fsvs := res.GetVersions() if len(fsvs) <= 0 { return &pdu.ReplicationCursorRes{Result: &pdu.ReplicationCursorRes_Notexist{Notexist: true}}, nil } // always return must recent version sort.Slice(fsvs, func(i, j int) bool { return fsvs[i].CreateTXG < fsvs[j].CreateTXG }) mostRecent := fsvs[len(fsvs)-1] return &pdu.ReplicationCursorRes{Result: &pdu.ReplicationCursorRes_Guid{Guid: mostRecent.GetGuid()}}, nil } func (h alwaysUpToDateReplicationCursorHistory) ListFilesystems(ctx context.Context, req *pdu.ListFilesystemReq) (*pdu.ListFilesystemRes, error) { return h.target.ListFilesystems(ctx, req) } func (j *SnapJob) doPrune(ctx context.Context) { ctx, endSpan := trace.WithSpan(ctx, "snap-job-do-prune") defer endSpan() log := GetLogger(ctx) sender := endpoint.NewSender(endpoint.SenderConfig{ JobID: j.name, FSF: j.fsfilter, // FIXME the following config fields are irrelevant for SnapJob // because the endpoint is only used as pruner.Target. // However, the implementation requires them to be set. Encrypt: &nodefault.Bool{B: true}, BandwidthLimit: bandwidthlimit.NoLimitConfig(), }) j.prunerMtx.Lock() j.pruner = j.prunerFactory.BuildLocalPruner(ctx, sender, alwaysUpToDateReplicationCursorHistory{sender}) j.prunerMtx.Unlock() log.Info("start pruning") j.pruner.Prune() log.Info("finished pruning") }