2018-08-27 22:21:45 +02:00
|
|
|
package job
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
|
|
|
"github.com/pkg/errors"
|
2018-09-23 23:04:31 +02:00
|
|
|
"github.com/problame/go-streamrpc"
|
2018-09-08 07:03:41 +02:00
|
|
|
"github.com/prometheus/client_golang/prometheus"
|
2018-08-27 22:21:45 +02:00
|
|
|
"github.com/zrepl/zrepl/config"
|
2018-10-12 20:50:30 +02:00
|
|
|
"github.com/zrepl/zrepl/daemon/job/reset"
|
2018-10-12 12:44:40 +02:00
|
|
|
"github.com/zrepl/zrepl/daemon/job/wakeup"
|
2018-10-11 21:20:55 +02:00
|
|
|
"github.com/zrepl/zrepl/daemon/transport/connecter"
|
2018-08-27 22:21:45 +02:00
|
|
|
"github.com/zrepl/zrepl/daemon/filters"
|
2018-08-30 11:52:05 +02:00
|
|
|
"github.com/zrepl/zrepl/daemon/pruner"
|
2018-08-27 22:21:45 +02:00
|
|
|
"github.com/zrepl/zrepl/endpoint"
|
|
|
|
"github.com/zrepl/zrepl/replication"
|
2018-10-19 16:27:05 +02:00
|
|
|
"github.com/zrepl/zrepl/util/envconst"
|
|
|
|
"github.com/zrepl/zrepl/util/watchdog"
|
2018-09-24 12:31:29 +02:00
|
|
|
"github.com/zrepl/zrepl/zfs"
|
2018-08-27 22:21:45 +02:00
|
|
|
"sync"
|
2018-08-31 16:26:11 +02:00
|
|
|
"github.com/zrepl/zrepl/daemon/logging"
|
2018-09-04 23:46:02 +02:00
|
|
|
"github.com/zrepl/zrepl/daemon/snapper"
|
2018-09-24 12:31:29 +02:00
|
|
|
"time"
|
2018-08-27 22:21:45 +02:00
|
|
|
)
|
|
|
|
|
2018-09-23 23:04:31 +02:00
|
|
|
// ActiveSide is the active side (the side that initiates the connection) of a
// replication setup, shared by push and pull jobs; the direction-specific
// behavior is supplied by mode.
type ActiveSide struct {
	mode activeMode
	name string
	clientFactory *connecter.ClientFactory

	// builds the sender- and receiver-side pruners for each invocation
	prunerFactory *pruner.PrunerFactory

	promRepStateSecs *prometheus.HistogramVec // labels: state
	promPruneSecs *prometheus.HistogramVec // labels: prune_side
	promBytesReplicated *prometheus.CounterVec // labels: filesystem

	// tasksMtx guards tasks; all access goes through updateTasks
	tasksMtx sync.Mutex
	tasks activeSideTasks
}
|
|
|
|
|
|
|
|
// activeSideTasks is the mutable per-invocation state of an ActiveSide: the
// replication and pruner instances of the current (or last) invocation,
// together with their cancel functions. Access only via ActiveSide.updateTasks.
type activeSideTasks struct {
	replication *replication.Replication
	replicationCancel context.CancelFunc
	prunerSender, prunerReceiver *pruner.Pruner
	prunerSenderCancel, prunerReceiverCancel context.CancelFunc
}
|
|
|
|
|
|
|
|
func (a *ActiveSide) updateTasks(u func(*activeSideTasks)) activeSideTasks {
|
|
|
|
a.tasksMtx.Lock()
|
2018-10-12 22:15:07 +02:00
|
|
|
defer a.tasksMtx.Unlock()
|
2018-09-24 17:48:45 +02:00
|
|
|
var copy activeSideTasks
|
|
|
|
copy = a.tasks
|
|
|
|
if u == nil {
|
|
|
|
return copy
|
|
|
|
}
|
|
|
|
u(©)
|
|
|
|
a.tasks = copy
|
|
|
|
return copy
|
2018-08-27 22:21:45 +02:00
|
|
|
}
|
|
|
|
|
2018-09-23 23:04:31 +02:00
|
|
|
// activeMode abstracts the differences between push and pull jobs.
type activeMode interface {
	// SenderReceiver builds the replication endpoints on top of the given
	// streamrpc client.
	SenderReceiver(client *streamrpc.Client) (replication.Sender, replication.Receiver, error)
	// Type reports whether this mode is push or pull.
	Type() Type
	// RunPeriodic signals wakeUpCommon whenever a new invocation should
	// start, until ctx is done.
	RunPeriodic(ctx context.Context, wakeUpCommon chan<- struct{})
}
|
|
|
|
|
|
|
|
// modePush holds the push-specific job state: which local filesystems are
// sent, and the snapper that takes the snapshots to be replicated.
type modePush struct {
	fsfilter endpoint.FSFilter
	snapper *snapper.PeriodicOrManual
}
|
|
|
|
|
|
|
|
func (m *modePush) SenderReceiver(client *streamrpc.Client) (replication.Sender, replication.Receiver, error) {
|
|
|
|
sender := endpoint.NewSender(m.fsfilter)
|
|
|
|
receiver := endpoint.NewRemote(client)
|
|
|
|
return sender, receiver, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Type reports this mode as a push job.
func (m *modePush) Type() Type {
	return TypePush
}
|
|
|
|
|
|
|
|
func (m *modePush) RunPeriodic(ctx context.Context, wakeUpCommon chan <- struct{}) {
|
|
|
|
m.snapper.Run(ctx, wakeUpCommon)
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
func modePushFromConfig(g *config.Global, in *config.PushJob) (*modePush, error) {
|
|
|
|
m := &modePush{}
|
|
|
|
fsf, err := filters.DatasetMapFilterFromConfig(in.Filesystems)
|
|
|
|
if err != nil {
|
|
|
|
return nil, errors.Wrap(err, "cannnot build filesystem filter")
|
|
|
|
}
|
|
|
|
m.fsfilter = fsf
|
|
|
|
|
2018-10-11 15:22:52 +02:00
|
|
|
if m.snapper, err = snapper.FromConfig(g, fsf, in.Snapshotting); err != nil {
|
2018-09-23 23:04:31 +02:00
|
|
|
return nil, errors.Wrap(err, "cannot build snapper")
|
|
|
|
}
|
|
|
|
|
|
|
|
return m, nil
|
|
|
|
}
|
|
|
|
|
2018-09-24 12:31:29 +02:00
|
|
|
// modePull holds the pull-specific job state: the local root filesystem that
// received datasets are placed under, and the fixed interval at which
// replication is triggered.
type modePull struct {
	rootFS *zfs.DatasetPath
	interval time.Duration
}
|
|
|
|
|
|
|
|
func (m *modePull) SenderReceiver(client *streamrpc.Client) (replication.Sender, replication.Receiver, error) {
|
|
|
|
sender := endpoint.NewRemote(client)
|
2018-10-11 18:00:23 +02:00
|
|
|
receiver, err := endpoint.NewReceiver(m.rootFS)
|
2018-09-24 12:31:29 +02:00
|
|
|
return sender, receiver, err
|
|
|
|
}
|
|
|
|
|
|
|
|
// Type reports this mode as a pull job.
func (*modePull) Type() Type {
	return TypePull
}
|
|
|
|
|
|
|
|
func (m *modePull) RunPeriodic(ctx context.Context, wakeUpCommon chan<- struct{}) {
|
|
|
|
t := time.NewTicker(m.interval)
|
|
|
|
defer t.Stop()
|
|
|
|
for {
|
|
|
|
select {
|
|
|
|
case <-t.C:
|
|
|
|
select {
|
|
|
|
case wakeUpCommon <- struct{}{}:
|
|
|
|
default:
|
|
|
|
GetLogger(ctx).
|
|
|
|
WithField("pull_interval", m.interval).
|
|
|
|
Warn("pull job took longer than pull interval")
|
|
|
|
wakeUpCommon <- struct{}{} // block anyways, to queue up the wakeup
|
|
|
|
}
|
|
|
|
case <-ctx.Done():
|
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func modePullFromConfig(g *config.Global, in *config.PullJob) (m *modePull, err error) {
|
|
|
|
m = &modePull{}
|
|
|
|
if in.Interval <= 0 {
|
|
|
|
return nil, errors.New("interval must be positive")
|
|
|
|
}
|
|
|
|
m.interval = in.Interval
|
|
|
|
|
2018-10-11 18:00:23 +02:00
|
|
|
m.rootFS, err = zfs.NewDatasetPath(in.RootFS)
|
2018-09-24 12:31:29 +02:00
|
|
|
if err != nil {
|
2018-10-11 18:00:23 +02:00
|
|
|
return nil, errors.New("RootFS is not a valid zfs filesystem path")
|
2018-09-24 12:31:29 +02:00
|
|
|
}
|
2018-10-11 18:00:23 +02:00
|
|
|
if m.rootFS.Length() <= 0 {
|
|
|
|
return nil, errors.New("RootFS must not be empty") // duplicates error check of receiver
|
2018-09-24 12:31:29 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
return m, nil
|
|
|
|
}
|
|
|
|
|
2018-09-23 23:04:31 +02:00
|
|
|
func activeSide(g *config.Global, in *config.ActiveJob, mode activeMode) (j *ActiveSide, err error) {
|
2018-08-27 22:21:45 +02:00
|
|
|
|
2018-09-23 23:04:31 +02:00
|
|
|
j = &ActiveSide{mode: mode}
|
2018-08-27 22:21:45 +02:00
|
|
|
j.name = in.Name
|
2018-09-08 07:03:41 +02:00
|
|
|
j.promRepStateSecs = prometheus.NewHistogramVec(prometheus.HistogramOpts{
|
|
|
|
Namespace: "zrepl",
|
|
|
|
Subsystem: "replication",
|
|
|
|
Name: "state_time",
|
|
|
|
Help: "seconds spent during replication",
|
|
|
|
ConstLabels: prometheus.Labels{"zrepl_job":j.name},
|
|
|
|
}, []string{"state"})
|
|
|
|
j.promBytesReplicated = prometheus.NewCounterVec(prometheus.CounterOpts{
|
|
|
|
Namespace: "zrepl",
|
|
|
|
Subsystem: "replication",
|
|
|
|
Name: "bytes_replicated",
|
|
|
|
Help: "number of bytes replicated from sender to receiver per filesystem",
|
|
|
|
ConstLabels: prometheus.Labels{"zrepl_job":j.name},
|
|
|
|
}, []string{"filesystem"})
|
2018-08-27 22:21:45 +02:00
|
|
|
|
2018-09-04 23:44:45 +02:00
|
|
|
j.clientFactory, err = connecter.FromConfig(g, in.Connect)
|
2018-08-31 21:51:44 +02:00
|
|
|
if err != nil {
|
|
|
|
return nil, errors.Wrap(err, "cannot build client")
|
|
|
|
}
|
2018-08-27 22:21:45 +02:00
|
|
|
|
2018-09-08 07:03:41 +02:00
|
|
|
j.promPruneSecs = prometheus.NewHistogramVec(prometheus.HistogramOpts{
|
|
|
|
Namespace: "zrepl",
|
|
|
|
Subsystem: "pruning",
|
|
|
|
Name: "time",
|
|
|
|
Help: "seconds spent in pruner",
|
|
|
|
ConstLabels: prometheus.Labels{"zrepl_job":j.name},
|
|
|
|
}, []string{"prune_side"})
|
|
|
|
j.prunerFactory, err = pruner.NewPrunerFactory(in.Pruning, j.promPruneSecs)
|
2018-08-30 11:52:05 +02:00
|
|
|
if err != nil {
|
2018-08-30 17:40:45 +02:00
|
|
|
return nil, err
|
2018-08-30 11:52:05 +02:00
|
|
|
}
|
|
|
|
|
2018-08-27 22:21:45 +02:00
|
|
|
return j, nil
|
|
|
|
}
|
|
|
|
|
2018-09-23 23:04:31 +02:00
|
|
|
func (j *ActiveSide) RegisterMetrics(registerer prometheus.Registerer) {
|
2018-09-08 07:03:41 +02:00
|
|
|
registerer.MustRegister(j.promRepStateSecs)
|
|
|
|
registerer.MustRegister(j.promPruneSecs)
|
|
|
|
registerer.MustRegister(j.promBytesReplicated)
|
|
|
|
}
|
|
|
|
|
2018-09-23 23:04:31 +02:00
|
|
|
// Name returns the job name as configured by the user.
func (j *ActiveSide) Name() string {
	return j.name
}
|
2018-08-27 22:21:45 +02:00
|
|
|
|
2018-09-23 23:04:31 +02:00
|
|
|
// ActiveSideStatus is the job-specific part of Status for active-side jobs.
// A nil field means the corresponding sub-task has not run yet in the current
// invocation.
type ActiveSideStatus struct {
	Replication *replication.Report
	PruningSender, PruningReceiver *pruner.Report
}
|
|
|
|
|
2018-09-23 23:04:31 +02:00
|
|
|
func (j *ActiveSide) Status() *Status {
|
2018-09-24 17:48:45 +02:00
|
|
|
tasks := j.updateTasks(nil)
|
|
|
|
|
2018-09-23 23:04:31 +02:00
|
|
|
s := &ActiveSideStatus{}
|
|
|
|
t := j.mode.Type()
|
2018-09-24 19:22:44 +02:00
|
|
|
if tasks.replication != nil {
|
|
|
|
s.Replication = tasks.replication.Report()
|
|
|
|
}
|
|
|
|
if tasks.prunerSender != nil {
|
|
|
|
s.PruningSender = tasks.prunerSender.Report()
|
|
|
|
}
|
|
|
|
if tasks.prunerReceiver != nil {
|
|
|
|
s.PruningReceiver = tasks.prunerReceiver.Report()
|
2018-08-29 19:18:54 +02:00
|
|
|
}
|
2018-09-23 23:04:31 +02:00
|
|
|
return &Status{Type: t, JobSpecific: s}
|
2018-08-27 22:21:45 +02:00
|
|
|
}
|
|
|
|
|
2018-09-23 23:04:31 +02:00
|
|
|
func (j *ActiveSide) Run(ctx context.Context) {
|
2018-08-27 22:21:45 +02:00
|
|
|
log := GetLogger(ctx)
|
2018-09-23 23:04:31 +02:00
|
|
|
ctx = logging.WithSubsystemLoggers(ctx, log)
|
2018-08-27 22:21:45 +02:00
|
|
|
|
|
|
|
defer log.Info("job exiting")
|
|
|
|
|
2018-09-23 23:04:31 +02:00
|
|
|
periodicDone := make(chan struct{})
|
|
|
|
ctx, cancel := context.WithCancel(ctx)
|
|
|
|
defer cancel()
|
|
|
|
go j.mode.RunPeriodic(ctx, periodicDone)
|
2018-09-04 23:46:02 +02:00
|
|
|
|
2018-08-27 22:21:45 +02:00
|
|
|
invocationCount := 0
|
|
|
|
outer:
|
|
|
|
for {
|
2018-08-31 16:26:11 +02:00
|
|
|
log.Info("wait for wakeups")
|
2018-08-27 22:21:45 +02:00
|
|
|
select {
|
|
|
|
case <-ctx.Done():
|
|
|
|
log.WithError(ctx.Err()).Info("context")
|
|
|
|
break outer
|
2018-09-04 23:46:02 +02:00
|
|
|
|
2018-10-12 12:44:40 +02:00
|
|
|
case <-wakeup.Wait(ctx):
|
2018-09-23 23:04:31 +02:00
|
|
|
case <-periodicDone:
|
2018-08-27 22:21:45 +02:00
|
|
|
}
|
2018-09-04 23:46:02 +02:00
|
|
|
invocationCount++
|
|
|
|
invLog := log.WithField("invocation", invocationCount)
|
2018-10-19 09:35:30 +02:00
|
|
|
j.do(WithLogger(ctx, invLog))
|
2018-08-27 22:21:45 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-10-19 09:35:30 +02:00
|
|
|
func (j *ActiveSide) do(ctx context.Context) {
|
2018-08-27 22:21:45 +02:00
|
|
|
|
|
|
|
log := GetLogger(ctx)
|
2018-08-31 16:26:11 +02:00
|
|
|
ctx = logging.WithSubsystemLoggers(ctx, log)
|
2018-08-27 22:21:45 +02:00
|
|
|
|
2018-10-12 20:50:30 +02:00
|
|
|
// allow cancellation of an invocation (this function)
|
|
|
|
ctx, cancelThisRun := context.WithCancel(ctx)
|
|
|
|
defer cancelThisRun()
|
|
|
|
runDone := make(chan struct{})
|
|
|
|
defer close(runDone)
|
|
|
|
go func() {
|
|
|
|
select {
|
|
|
|
case <-runDone:
|
|
|
|
case <-reset.Wait(ctx):
|
|
|
|
log.Info("reset received, cancelling current invocation")
|
|
|
|
cancelThisRun()
|
|
|
|
case <-ctx.Done():
|
|
|
|
}
|
|
|
|
}()
|
|
|
|
|
2018-10-19 16:27:05 +02:00
|
|
|
// watchdog
|
|
|
|
go func() {
|
|
|
|
// if no progress after 1 minute, kill the task
|
|
|
|
wdto := envconst.Duration("ZREPL_JOB_WATCHDOG_TIMEOUT", 1*time.Minute)
|
|
|
|
log.WithField("watchdog_timeout", wdto.String()).Debug("starting watchdog")
|
|
|
|
|
|
|
|
t := time.NewTicker(wdto)
|
|
|
|
defer t.Stop()
|
|
|
|
|
|
|
|
var (
|
|
|
|
rep, prunerSender, prunerReceiver watchdog.Progress
|
|
|
|
)
|
|
|
|
for {
|
|
|
|
select {
|
|
|
|
case <-runDone:
|
|
|
|
return
|
|
|
|
case <-ctx.Done():
|
|
|
|
return
|
|
|
|
case <-t.C: // fall
|
|
|
|
}
|
|
|
|
|
|
|
|
log := log.WithField("watchdog_timeout", wdto.String()) // shadowing!
|
|
|
|
|
|
|
|
j.updateTasks(func(tasks *activeSideTasks) {
|
|
|
|
if tasks.replication != nil &&
|
|
|
|
!tasks.replication.Progress.ExpectProgress(&rep) &&
|
|
|
|
!tasks.replication.State().IsTerminal() {
|
|
|
|
log.Error("replication did not make progress, cancelling")
|
|
|
|
tasks.replicationCancel()
|
|
|
|
}
|
|
|
|
if tasks.prunerSender != nil &&
|
|
|
|
!tasks.prunerSender.Progress.ExpectProgress(&prunerSender) &&
|
|
|
|
!tasks.prunerSender.State().IsTerminal() {
|
|
|
|
log.Error("pruner:sender did not make progress, cancelling")
|
|
|
|
tasks.prunerSenderCancel()
|
|
|
|
}
|
|
|
|
if tasks.prunerReceiver != nil &&
|
|
|
|
!tasks.prunerReceiver.Progress.ExpectProgress(&prunerReceiver) &&
|
|
|
|
!tasks.prunerReceiver.State().IsTerminal() {
|
|
|
|
log.Error("pruner:receiver did not make progress, cancelling")
|
|
|
|
tasks.prunerReceiverCancel()
|
|
|
|
}
|
|
|
|
})
|
|
|
|
log.WithField("replication_progress", rep.String()).
|
|
|
|
WithField("pruner_sender_progress", prunerSender.String()).
|
|
|
|
WithField("pruner_receiver_progress", prunerReceiver.String()).
|
|
|
|
Debug("watchdog did run")
|
|
|
|
|
|
|
|
}
|
|
|
|
}()
|
|
|
|
|
2018-08-31 21:51:44 +02:00
|
|
|
client, err := j.clientFactory.NewClient()
|
2018-08-27 22:21:45 +02:00
|
|
|
if err != nil {
|
2018-08-31 21:51:44 +02:00
|
|
|
log.WithError(err).Error("factory cannot instantiate streamrpc client")
|
2018-08-27 22:21:45 +02:00
|
|
|
}
|
2018-08-31 16:26:11 +02:00
|
|
|
defer client.Close(ctx)
|
2018-08-27 22:21:45 +02:00
|
|
|
|
2018-09-23 23:04:31 +02:00
|
|
|
sender, receiver, err := j.mode.SenderReceiver(client)
|
2018-08-27 22:21:45 +02:00
|
|
|
|
2018-10-19 16:27:05 +02:00
|
|
|
{
|
|
|
|
select {
|
|
|
|
case <-ctx.Done():
|
|
|
|
return
|
|
|
|
default:
|
|
|
|
}
|
|
|
|
ctx, repCancel := context.WithCancel(ctx)
|
|
|
|
tasks := j.updateTasks(func(tasks *activeSideTasks) {
|
|
|
|
// reset it
|
|
|
|
*tasks = activeSideTasks{}
|
|
|
|
tasks.replicationCancel = repCancel
|
|
|
|
tasks.replication = replication.NewReplication(j.promRepStateSecs, j.promBytesReplicated)
|
|
|
|
})
|
|
|
|
log.Info("start replication")
|
|
|
|
tasks.replication.Drive(ctx, sender, receiver)
|
|
|
|
repCancel() // always cancel to free up context resources
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
select {
|
|
|
|
case <-ctx.Done():
|
|
|
|
return
|
|
|
|
default:
|
|
|
|
}
|
|
|
|
ctx, senderCancel := context.WithCancel(ctx)
|
|
|
|
tasks := j.updateTasks(func(tasks *activeSideTasks) {
|
|
|
|
tasks.prunerSender = j.prunerFactory.BuildSenderPruner(ctx, sender, sender)
|
|
|
|
tasks.prunerSenderCancel = senderCancel
|
|
|
|
})
|
|
|
|
log.Info("start pruning sender")
|
|
|
|
tasks.prunerSender.Prune()
|
2018-10-19 15:58:04 +02:00
|
|
|
log.Info("finished pruning sender")
|
2018-10-19 16:27:05 +02:00
|
|
|
senderCancel()
|
|
|
|
}
|
|
|
|
{
|
|
|
|
select {
|
|
|
|
case <-ctx.Done():
|
|
|
|
return
|
|
|
|
default:
|
|
|
|
}
|
|
|
|
ctx, receiverCancel := context.WithCancel(ctx)
|
|
|
|
tasks := j.updateTasks(func(tasks *activeSideTasks) {
|
|
|
|
tasks.prunerReceiver = j.prunerFactory.BuildReceiverPruner(ctx, receiver, sender)
|
|
|
|
tasks.prunerReceiverCancel = receiverCancel
|
|
|
|
})
|
|
|
|
log.Info("start pruning receiver")
|
|
|
|
tasks.prunerReceiver.Prune()
|
2018-10-19 15:58:04 +02:00
|
|
|
log.Info("finished pruning receiver")
|
2018-10-19 16:27:05 +02:00
|
|
|
receiverCancel()
|
|
|
|
}
|
2018-08-27 22:21:45 +02:00
|
|
|
}
|