Ugly but working initial snapjob implementation

InsanePrawn 2018-11-20 19:30:15 +01:00
parent 7ab51fad0d
commit 3d2688e959
6 changed files with 455 additions and 0 deletions

View File

@@ -33,6 +33,7 @@ type JobEnum struct {
 func (j JobEnum) Name() string {
 	var name string
 	switch v := j.Ret.(type) {
+	case *SnapJob: name = v.Name
 	case *PushJob: name = v.Name
 	case *SinkJob: name = v.Name
 	case *PullJob: name = v.Name
@@ -58,6 +59,15 @@ type PassiveJob struct {
 	Debug JobDebugSettings `yaml:"debug,optional"`
 }
+type SnapJob struct {
+	Type string `yaml:"type"`
+	Name string `yaml:"name"`
+	Pruning PruningLocal `yaml:"pruning"`
+	Debug JobDebugSettings `yaml:"debug,optional"`
+	Snapshotting SnapshottingEnum `yaml:"snapshotting"`
+	Filesystems FilesystemsFilter `yaml:"filesystems"`
+}
 type PushJob struct {
 	ActiveJob `yaml:",inline"`
 	Snapshotting SnapshottingEnum `yaml:"snapshotting"`
@@ -340,6 +350,7 @@ func enumUnmarshal(u func(interface{}, bool) error, types map[string]interface{}
 func (t *JobEnum) UnmarshalYAML(u func(interface{}, bool) error) (err error) {
 	t.Ret, err = enumUnmarshal(u, map[string]interface{}{
+		"snap": &SnapJob{},
 		"push": &PushJob{},
 		"sink": &SinkJob{},
 		"pull": &PullJob{},

14
config/samples/snap.yml Normal file
View File

@@ -0,0 +1,14 @@
jobs:
- name: snapjob
  type: snap
  filesystems: {
    "tank/frequently_changed<": true,
  }
  snapshotting:
    type: periodic
    interval: 2m
    prefix: zrepl_snapjob_
  pruning:
    keep:
    - type: last_n
      count: 60
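For readers who want to see how a stanza like this maps onto a Go struct, here is a minimal, self-contained sketch. It is illustrative only: it uses the stock gopkg.in/yaml.v2 package and a made-up sampleSnapJob struct with simplified field types, not zrepl's own config package (which routes type: snap to &SnapJob{} via the JobEnum unmarshalling added above).

// Hypothetical example, not part of the commit: decode a snap-job stanza
// similar to config/samples/snap.yml using gopkg.in/yaml.v2.
package main

import (
	"fmt"

	yaml "gopkg.in/yaml.v2"
)

// sampleSnapJob is an assumed, simplified stand-in for config.SnapJob.
type sampleSnapJob struct {
	Name         string          `yaml:"name"`
	Type         string          `yaml:"type"`
	Filesystems  map[string]bool `yaml:"filesystems"`
	Snapshotting struct {
		Type     string `yaml:"type"`
		Interval string `yaml:"interval"`
		Prefix   string `yaml:"prefix"`
	} `yaml:"snapshotting"`
	Pruning struct {
		Keep []struct {
			Type  string `yaml:"type"`
			Count int    `yaml:"count"`
		} `yaml:"keep"`
	} `yaml:"pruning"`
}

func main() {
	doc := `
name: snapjob
type: snap
filesystems: { "tank/frequently_changed<": true }
snapshotting:
  type: periodic
  interval: 2m
  prefix: zrepl_snapjob_
pruning:
  keep:
  - type: last_n
    count: 60
`
	var j sampleSnapJob
	if err := yaml.Unmarshal([]byte(doc), &j); err != nil {
		panic(err)
	}
	// Prints the decoded job name, snapshot prefix and keep count,
	// which is enough to sanity-check the shape of the sample.
	fmt.Printf("job %q: snapshot prefix %q, keep last %d\n",
		j.Name, j.Snapshotting.Prefix, j.Pruning.Keep[0].Count)
}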

View File

@@ -45,6 +45,15 @@ func buildJob(c *config.Global, in config.JobEnum) (j Job, err error) {
 		if err != nil {
 			return cannotBuildJob(err, v.Name)
 		}
+	case *config.SnapJob:
+		m, err := modeSnapFromConfig(c, v)
+		if err != nil {
+			return cannotBuildJob(err, v.Name)
+		}
+		j, err = snap_activeSide(c, v, m)
+		if err != nil {
+			return cannotBuildJob(err, v.Name)
+		}
 	case *config.PushJob:
 		m, err := modePushFromConfig(c, v)
 		if err != nil {

View File

@@ -39,6 +39,7 @@ type Type string
 const (
 	TypeInternal Type = "internal"
+	TypeSnap     Type = "snap"
 	TypePush     Type = "push"
 	TypeSink     Type = "sink"
 	TypePull     Type = "pull"
@@ -84,6 +85,8 @@ func (s *Status) UnmarshalJSON(in []byte) (err error) {
 		return fmt.Errorf("field '%s', not found", key)
 	}
 	switch s.Type {
+	case TypeSnap:
+		fallthrough
 	case TypePull: fallthrough
 	case TypePush:
 		var st ActiveSideStatus

379
daemon/job/snapjob.go Normal file
View File

@@ -0,0 +1,379 @@
package job
import (
"context"
"github.com/pkg/errors"
// "github.com/problame/go-streamrpc"
"github.com/prometheus/client_golang/prometheus"
"github.com/zrepl/zrepl/config"
"github.com/zrepl/zrepl/daemon/filters"
"github.com/zrepl/zrepl/daemon/job/reset"
"github.com/zrepl/zrepl/daemon/job/wakeup"
"github.com/zrepl/zrepl/daemon/logging"
"github.com/zrepl/zrepl/daemon/pruner"
//"github.com/zrepl/zrepl/pruning"
"github.com/zrepl/zrepl/daemon/snapper"
"github.com/zrepl/zrepl/daemon/transport/connecter"
"github.com/zrepl/zrepl/endpoint"
// "github.com/zrepl/zrepl/replication"
"github.com/zrepl/zrepl/util/envconst"
"github.com/zrepl/zrepl/zfs"
"sync"
"time"
)
type snap_ActiveSide struct {
mode snap_activeMode
name string
clientFactory *connecter.ClientFactory
prunerFactory *pruner.SinglePrunerFactory
//promRepStateSecs *prometheus.HistogramVec // labels: state
promPruneSecs *prometheus.HistogramVec // labels: prune_side
//promBytesReplicated *prometheus.CounterVec // labels: filesystem
tasksMtx sync.Mutex
tasks snap_activeSideTasks
}
type snap_activeSideTasks struct {
state ActiveSideState
// valid for state ActiveSideReplicating, ActiveSidePruneSender, ActiveSidePruneReceiver, ActiveSideDone
//replication *replication.Replication
//replicationCancel context.CancelFunc
// valid for state ActiveSidePruneSender, ActiveSidePruneReceiver, ActiveSideDone
//prunerSender, prunerReceiver *pruner.Pruner
pruner *pruner.Pruner
// valid for state ActiveSidePruneReceiver, ActiveSideDone
prunerCancel context.CancelFunc
}
func (a *snap_ActiveSide) Name() string { return a.name }
func (a *snap_ActiveSide) GetPruner(ctx context.Context, sender *endpoint.Sender) *pruner.Pruner {
//p := &pruner.Pruner{ args: pruner.args{ctx, WithLogger(ctx, GetLogger(ctx).WithField("prune_side", "sender")), sender, sender, a.rules, envconst.Duration("ZREPL_PRUNER_RETRY_INTERVAL", 10 * time.Second), true, a.promPruneSecs.WithLabelValues("sender")}, state: pruner.State.Plan,}
p := a.prunerFactory.BuildSinglePruner(ctx,sender,sender)
/*if err != nil {
return nil, errors.Wrap(err, "cannot build receiver pruning rules")
}*/
return p
}
func (a *snap_ActiveSide) updateTasks(u func(*snap_activeSideTasks)) snap_activeSideTasks {
a.tasksMtx.Lock()
defer a.tasksMtx.Unlock()
var copy snap_activeSideTasks
copy = a.tasks
if u == nil {
return copy
}
u(&copy)
a.tasks = copy
return copy
}
type snap_activeMode interface {
Type() Type
RunPeriodic(ctx context.Context, wakeUpCommon chan<- struct{})
//FSFilter() endpoint.FSFilter
FSFilter() zfs.DatasetFilter
}
type modeSnap struct {
// fsfilter endpoint.FSFilter
fsfilter zfs.DatasetFilter
snapper *snapper.PeriodicOrManual
}
func (m *modeSnap) Type() Type { return TypeSnap }
func (m *modeSnap) RunPeriodic(ctx context.Context, wakeUpCommon chan<- struct{}) {
m.snapper.Run(ctx, wakeUpCommon)
}
func (m *modeSnap) FSFilter() zfs.DatasetFilter {
return m.fsfilter
}
func modeSnapFromConfig(g *config.Global, in *config.SnapJob) (*modeSnap, error) {
m := &modeSnap{}
fsf, err := filters.DatasetMapFilterFromConfig(in.Filesystems)
if err != nil {
return nil, errors.Wrap(err, "cannot build filesystem filter")
}
m.fsfilter = fsf
if m.snapper, err = snapper.FromConfig(g, fsf, in.Snapshotting); err != nil {
return nil, errors.Wrap(err, "cannot build snapper")
}
return m, nil
}
func snap_activeSide(g *config.Global, in *config.SnapJob, mode *modeSnap) (j *snap_ActiveSide, err error) {
j = &snap_ActiveSide{mode: mode}
j.name = in.Name
/*** j.promRepStateSecs = prometheus.NewHistogramVec(prometheus.HistogramOpts{
Namespace: "zrepl",
Subsystem: "replication",
Name: "state_time",
Help: "seconds spent during replication",
ConstLabels: prometheus.Labels{"zrepl_job":j.name},
}, []string{"state"})
j.promBytesReplicated = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: "zrepl",
Subsystem: "replication",
Name: "bytes_replicated",
Help: "number of bytes replicated from sender to receiver per filesystem",
ConstLabels: prometheus.Labels{"zrepl_job":j.name},
}, []string{"filesystem"})
j.clientFactory, err = connecter.FromConfig(g, in.Connect)
if err != nil {
return nil, errors.Wrap(err, "cannot build client")
}
***/
j.promPruneSecs = prometheus.NewHistogramVec(prometheus.HistogramOpts{
Namespace: "zrepl",
Subsystem: "pruning",
Name: "time",
Help: "seconds spent in pruner",
ConstLabels: prometheus.Labels{"zrepl_job":j.name},
}, []string{"prune_side"})
j.prunerFactory, err = pruner.NewSinglePrunerFactory(in.Pruning, j.promPruneSecs)
//j.rules = pruning.RulesFromConfig(in.Keep)
if err != nil {
return nil, errors.Wrap(err, "cannot build snapjob pruning rules")
}
return j, nil
}
func (j *snap_ActiveSide) RegisterMetrics(registerer prometheus.Registerer) {
registerer.MustRegister(j.promPruneSecs)
}
/*func (j *ActiveSide) Name() string { return j.name }
type ActiveSideStatus struct {
Replication *replication.Report
PruningSender, PruningReceiver *pruner.Report
}
*/
func (j *snap_ActiveSide) Status() *Status {
tasks := j.updateTasks(nil)
s := &ActiveSideStatus{}
t := j.mode.Type()
/*if tasks.replication != nil {
s.Replication = tasks.replication.Report()
}
if tasks.prunerSender != nil {*/
if tasks.pruner != nil {
s.PruningSender = tasks.pruner.Report()
}
/*if tasks.prunerReceiver != nil {
s.PruningReceiver = tasks.prunerReceiver.Report()
}*/
return &Status{Type: t, JobSpecific: s}
}
func (j *snap_ActiveSide) Run(ctx context.Context) {
log := GetLogger(ctx)
ctx = logging.WithSubsystemLoggers(ctx, log)
defer log.Info("job exiting")
periodicDone := make(chan struct{})
ctx, cancel := context.WithCancel(ctx)
defer cancel()
go j.mode.RunPeriodic(ctx, periodicDone)
invocationCount := 0
outer:
for {
log.Info("wait for wakeups")
select {
case <-ctx.Done():
log.WithError(ctx.Err()).Info("context")
break outer
case <-wakeup.Wait(ctx):
case <-periodicDone:
}
invocationCount++
invLog := log.WithField("invocation", invocationCount)
j.do(WithLogger(ctx, invLog))
}
}
func (j *snap_ActiveSide) do(ctx context.Context) {
log := GetLogger(ctx)
ctx = logging.WithSubsystemLoggers(ctx, log)
// allow cancellation of an invocation (this function)
ctx, cancelThisRun := context.WithCancel(ctx)
defer cancelThisRun()
go func() {
select {
case <-reset.Wait(ctx):
log.Info("reset received, cancelling current invocation")
cancelThisRun()
case <-ctx.Done():
}
}()
// The code after this watchdog goroutine is sequential and, for this snap-only job, transitions the state from
// ActiveSidePruneSender -> ActiveSideDone (there is no replication or receiver-side pruning here).
// If the sequential task 'gets stuck' (livelock, no progress), the watchdog will eventually
// cancel its context.
// If the task is written to support context cancellation, it will return immediately (in permanent error state),
// and the sequential code transitions to the next state.
go func() {
wdto := envconst.Duration("ZREPL_JOB_WATCHDOG_TIMEOUT", 10*time.Minute)
jitter := envconst.Duration("ZREPL_JOB_WATCHDOG_JITTER", 1*time.Second)
// shadowing!
log := log.WithField("watchdog_timeout", wdto.String())
log.Debug("starting watchdog")
defer log.Debug("watchdog stopped")
t := time.NewTicker(wdto)
defer t.Stop()
for {
select {
case <-ctx.Done():
return
case <-t.C: // fall
}
j.updateTasks(func(tasks *snap_activeSideTasks) {
// Since cancelling a task will cause the sequential code to transition to the next state immediately,
// we cannot check for its progress right then (no fallthrough).
// Instead, we return (not continue because we are in a closure) and give the new state another
// ZREPL_JOB_WATCHDOG_TIMEOUT interval to try make some progress.
log.WithField("state", tasks.state).Debug("watchdog firing")
const WATCHDOG_ENVCONST_NOTICE = " (adjust ZREPL_JOB_WATCHDOG_TIMEOUT env variable if inappropriate)"
switch tasks.state {
/*case ActiveSideReplicating:
log.WithField("replication_progress", tasks.replication.Progress.String()).
Debug("check replication progress")
if tasks.replication.Progress.CheckTimeout(wdto, jitter) {
log.Error("replication did not make progress, cancelling" + WATCHDOG_ENVCONST_NOTICE)
tasks.replicationCancel()
return
}*/
case ActiveSidePruneSender:
log.WithField("prune_sender_progress", "TEST DEBUG 123").
Debug("check pruner_sender progress")
if tasks.pruner.Progress.CheckTimeout(wdto, jitter) {
log.Error("pruner_sender did not make progress, cancelling" + WATCHDOG_ENVCONST_NOTICE)
tasks.prunerCancel()
return
}
/*case ActiveSidePruneReceiver:
log.WithField("prune_receiver_progress", tasks.replication.Progress.String()).
Debug("check pruner_receiver progress")
if tasks.prunerReceiver.Progress.CheckTimeout(wdto, jitter) {
log.Error("pruner_receiver did not make progress, cancelling" + WATCHDOG_ENVCONST_NOTICE)
tasks.prunerReceiverCancel()
return
}*/
case ActiveSideDone:
// ignore, ctx will be Done() in a few milliseconds and the watchdog will exit
default:
log.WithField("state", tasks.state).
Error("watchdog implementation error: unknown active side state")
}
})
}
}()
/* client, err := j.clientFactory.NewClient()
if err != nil {
log.WithError(err).Error("factory cannot instantiate streamrpc client")
}
defer client.Close(ctx)
sender, receiver, err := j.mode.SenderReceiver(client)
{
select {
case <-ctx.Done():
return
default:
}
ctx, repCancel := context.WithCancel(ctx)
tasks := j.updateTasks(func(tasks *activeSideTasks) {
// reset it
*tasks = activeSideTasks{}
tasks.replicationCancel = repCancel
tasks.replication = replication.NewReplication(j.promRepStateSecs, j.promBytesReplicated)
tasks.state = ActiveSideReplicating
})
log.Info("start replication")
tasks.replication.Drive(ctx, sender, receiver)
repCancel() // always cancel to free up context resources
}
{
select {
case <-ctx.Done():
return
default:
}
*/
ctx, localCancel := context.WithCancel(ctx)
sender := endpoint.NewSender(j.mode.FSFilter())
tasks := j.updateTasks(func(tasks *snap_activeSideTasks) {
tasks.pruner = j.GetPruner(ctx, sender)
tasks.prunerCancel = localCancel
tasks.state = ActiveSidePruneSender
})
log.Info("start pruning sender")
tasks.pruner.Prune()
log.Info("finished pruning sender")
localCancel()
// }
/* {
select {
case <-ctx.Done():
return
default:
}
ctx, receiverCancel := context.WithCancel(ctx)
tasks := j.updateTasks(func(tasks *activeSideTasks) {
tasks.prunerReceiver = j.prunerFactory.BuildReceiverPruner(ctx, receiver, sender)
tasks.prunerReceiverCancel = receiverCancel
tasks.state = ActiveSidePruneReceiver
})
log.Info("start pruning receiver")
tasks.prunerReceiver.Prune()
log.Info("finished pruning receiver")
receiverCancel()
}
*/
j.updateTasks(func(tasks *snap_activeSideTasks) {
tasks.state = ActiveSideDone
})
}

View File

@@ -17,6 +17,7 @@ import (
 	"strings"
 	"sync"
 	"time"
+	// "github.com/zrepl/zrepl/replication"
 )
 // Try to keep it compatible with gitub.com/zrepl/zrepl/replication.Endpoint
@@ -82,6 +83,13 @@ type PrunerFactory struct {
 	promPruneSecs *prometheus.HistogramVec
 }
+type SinglePrunerFactory struct {
+	keepRules []pruning.KeepRule
+	retryWait time.Duration
+	considerSnapAtCursorReplicated bool
+	promPruneSecs *prometheus.HistogramVec
+}
 func checkContainsKeep1(rules []pruning.KeepRule) error {
 	if len(rules) == 0 {
 		return nil //No keep rules means keep all - ok
@@ -95,6 +103,21 @@ func checkContainsKeep1(rules []pruning.KeepRule) error {
 	return errors.New("sender keep rules must contain last_n or be empty so that the last snapshot is definitely kept")
 }
+func NewSinglePrunerFactory(in config.PruningLocal, promPruneSecs *prometheus.HistogramVec) (*SinglePrunerFactory, error) {
+	rules, err := pruning.RulesFromConfig(in.Keep)
+	if err != nil {
+		return nil, errors.Wrap(err, "cannot build pruning rules")
+	}
+	considerSnapAtCursorReplicated := false
+	f := &SinglePrunerFactory{
+		keepRules: rules,
+		retryWait: envconst.Duration("ZREPL_PRUNER_RETRY_INTERVAL", 10 * time.Second),
+		considerSnapAtCursorReplicated: considerSnapAtCursorReplicated,
+		promPruneSecs: promPruneSecs,
+	}
+	return f, nil
+}
 func NewPrunerFactory(in config.PruningSenderReceiver, promPruneSecs *prometheus.HistogramVec) (*PrunerFactory, error) {
 	keepRulesReceiver, err := pruning.RulesFromConfig(in.KeepReceiver)
 	if err != nil {
@@ -156,6 +179,22 @@ func (f *PrunerFactory) BuildReceiverPruner(ctx context.Context, target Target,
 	return p
 }
+func (f *SinglePrunerFactory) BuildSinglePruner(ctx context.Context, target Target, receiver History) *Pruner {
+	p := &Pruner{
+		args: args{
+			WithLogger(ctx, GetLogger(ctx).WithField("prune_side", "sender")),
+			target,
+			receiver,
+			f.keepRules,
+			f.retryWait,
+			f.considerSnapAtCursorReplicated,
+			f.promPruneSecs.WithLabelValues("sender"),
+		},
+		state: Plan,
+	}
+	return p
+}
 //go:generate enumer -type=State
 type State int