2018-08-27 19:10:55 +02:00
|
|
|
package daemon
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
|
|
|
"fmt"
|
2021-01-24 12:41:45 +01:00
|
|
|
"math/rand"
|
2019-03-22 19:41:12 +01:00
|
|
|
"os"
|
|
|
|
"os/signal"
|
|
|
|
"strings"
|
|
|
|
"sync"
|
|
|
|
"syscall"
|
|
|
|
"time"
|
|
|
|
|
2018-08-27 22:21:45 +02:00
|
|
|
"github.com/pkg/errors"
|
2018-09-08 07:03:41 +02:00
|
|
|
"github.com/prometheus/client_golang/prometheus"
|
2020-08-31 16:04:00 +02:00
|
|
|
|
2020-04-11 15:49:41 +02:00
|
|
|
"github.com/zrepl/zrepl/daemon/logging/trace"
|
2020-05-10 15:06:44 +02:00
|
|
|
"github.com/zrepl/zrepl/endpoint"
|
2020-06-07 12:24:16 +02:00
|
|
|
"github.com/zrepl/zrepl/util/envconst"
|
2019-03-22 19:41:12 +01:00
|
|
|
|
2018-08-27 22:21:45 +02:00
|
|
|
"github.com/zrepl/zrepl/config"
|
2018-08-27 19:10:55 +02:00
|
|
|
"github.com/zrepl/zrepl/daemon/job"
|
2018-10-12 20:50:30 +02:00
|
|
|
"github.com/zrepl/zrepl/daemon/job/reset"
|
2021-03-23 18:01:12 +01:00
|
|
|
"github.com/zrepl/zrepl/daemon/job/wakeup"
|
2018-08-27 22:21:45 +02:00
|
|
|
"github.com/zrepl/zrepl/daemon/logging"
|
2018-08-27 19:10:55 +02:00
|
|
|
"github.com/zrepl/zrepl/logger"
|
|
|
|
"github.com/zrepl/zrepl/version"
|
2020-03-27 12:35:57 +01:00
|
|
|
"github.com/zrepl/zrepl/zfs/zfscmd"
|
2018-08-27 19:10:55 +02:00
|
|
|
)
|
|
|
|
|
2020-04-11 15:49:41 +02:00
|
|
|
func Run(ctx context.Context, conf *config.Config) error {
|
|
|
|
ctx, cancel := context.WithCancel(ctx)
|
2018-08-27 19:10:55 +02:00
|
|
|
|
|
|
|
defer cancel()
|
|
|
|
sigChan := make(chan os.Signal, 1)
|
|
|
|
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
|
|
|
|
go func() {
|
|
|
|
<-sigChan
|
|
|
|
cancel()
|
|
|
|
}()
|
|
|
|
|
2021-01-24 12:41:45 +01:00
|
|
|
// The math/rand package is used presently for generating trace IDs, we
|
|
|
|
// seed it with the current time and pid so that the IDs are mostly
|
|
|
|
// unique.
|
|
|
|
rand.Seed(time.Now().UnixNano())
|
|
|
|
rand.Seed(int64(os.Getpid()))
|
|
|
|
|
2018-08-31 21:50:59 +02:00
|
|
|
outlets, err := logging.OutletsFromConfig(*conf.Global.Logging)
|
2018-08-27 19:10:55 +02:00
|
|
|
if err != nil {
|
|
|
|
return errors.Wrap(err, "cannot build logging from config")
|
|
|
|
}
|
2020-04-11 15:49:41 +02:00
|
|
|
outlets.Add(newPrometheusLogOutlet(), logger.Debug)
|
2018-08-27 19:10:55 +02:00
|
|
|
|
|
|
|
confJobs, err := job.JobsFromConfig(conf)
|
|
|
|
if err != nil {
|
|
|
|
return errors.Wrap(err, "cannot build jobs from config")
|
|
|
|
}
|
|
|
|
|
|
|
|
log := logger.NewLogger(outlets, 1*time.Second)
|
|
|
|
log.Info(version.NewZreplVersionInformation().String())
|
|
|
|
|
2020-04-11 15:49:41 +02:00
|
|
|
ctx = logging.WithLoggers(ctx, logging.SubsystemLoggersWithUniversalLogger(log))
|
|
|
|
trace.RegisterCallback(trace.Callback{
|
|
|
|
OnBegin: func(ctx context.Context) { logging.GetLogger(ctx, logging.SubsysTraceData).Debug("begin span") },
|
|
|
|
OnEnd: func(ctx context.Context, spanInfo trace.SpanInfo) {
|
|
|
|
logging.
|
|
|
|
GetLogger(ctx, logging.SubsysTraceData).
|
|
|
|
WithField("duration_s", spanInfo.EndedAt().Sub(spanInfo.StartedAt()).Seconds()).
|
|
|
|
Debug("finished span " + spanInfo.TaskAndSpanStack(trace.SpanStackKindAnnotation))
|
|
|
|
},
|
|
|
|
})
|
|
|
|
|
2018-08-27 19:10:55 +02:00
|
|
|
for _, job := range confJobs {
|
|
|
|
if IsInternalJobName(job.Name()) {
|
|
|
|
panic(fmt.Sprintf("internal job name used for config job '%s'", job.Name())) //FIXME
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
jobs := newJobs()
|
|
|
|
|
|
|
|
// start control socket
|
|
|
|
controlJob, err := newControlJob(conf.Global.Control.SockPath, jobs)
|
|
|
|
if err != nil {
|
|
|
|
panic(err) // FIXME
|
|
|
|
}
|
|
|
|
jobs.start(ctx, controlJob, true)
|
|
|
|
|
2018-09-08 07:03:41 +02:00
|
|
|
for i, jc := range conf.Global.Monitoring {
|
|
|
|
var (
|
|
|
|
job job.Job
|
|
|
|
err error
|
|
|
|
)
|
|
|
|
switch v := jc.Ret.(type) {
|
|
|
|
case *config.PrometheusMonitoring:
|
|
|
|
job, err = newPrometheusJobFromConfig(v)
|
|
|
|
default:
|
|
|
|
return errors.Errorf("unknown monitoring job #%d (type %T)", i, v)
|
|
|
|
}
|
|
|
|
if err != nil {
|
2019-12-11 18:00:00 +01:00
|
|
|
return errors.Wrapf(err, "cannot build monitoring job #%d", i)
|
2018-09-08 07:03:41 +02:00
|
|
|
}
|
|
|
|
jobs.start(ctx, job, true)
|
|
|
|
}
|
|
|
|
|
2020-03-27 12:35:57 +01:00
|
|
|
// register global (=non job-local) metrics
|
2020-06-11 16:32:54 +02:00
|
|
|
version.PrometheusRegister(prometheus.DefaultRegisterer)
|
2020-03-27 12:35:57 +01:00
|
|
|
zfscmd.RegisterMetrics(prometheus.DefaultRegisterer)
|
2020-04-11 15:49:41 +02:00
|
|
|
trace.RegisterMetrics(prometheus.DefaultRegisterer)
|
2020-05-10 15:06:44 +02:00
|
|
|
endpoint.RegisterMetrics(prometheus.DefaultRegisterer)
|
2020-03-27 12:35:57 +01:00
|
|
|
|
2018-08-27 19:10:55 +02:00
|
|
|
log.Info("starting daemon")
|
|
|
|
|
|
|
|
// start regular jobs
|
|
|
|
for _, j := range confJobs {
|
|
|
|
jobs.start(ctx, j, false)
|
|
|
|
}
|
|
|
|
|
|
|
|
select {
|
|
|
|
case <-jobs.wait():
|
|
|
|
log.Info("all jobs finished")
|
|
|
|
case <-ctx.Done():
|
|
|
|
log.WithError(ctx.Err()).Info("context finished")
|
|
|
|
}
|
2020-04-11 15:49:41 +02:00
|
|
|
log.Info("waiting for jobs to finish")
|
|
|
|
<-jobs.wait()
|
2018-08-27 19:10:55 +02:00
|
|
|
log.Info("daemon exiting")
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
type jobs struct {
|
|
|
|
wg sync.WaitGroup
|
|
|
|
|
|
|
|
// m protects all fields below it
|
2021-03-23 18:01:12 +01:00
|
|
|
m sync.RWMutex
|
|
|
|
wakeups map[string]wakeup.Func // by Job.Name
|
|
|
|
resets map[string]reset.Func // by Job.Name
|
|
|
|
jobs map[string]job.Job
|
2018-08-27 19:10:55 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
func newJobs() *jobs {
|
|
|
|
return &jobs{
|
2021-03-23 18:01:12 +01:00
|
|
|
wakeups: make(map[string]wakeup.Func),
|
|
|
|
resets: make(map[string]reset.Func),
|
|
|
|
jobs: make(map[string]job.Job),
|
2018-08-27 19:10:55 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (s *jobs) wait() <-chan struct{} {
|
|
|
|
ch := make(chan struct{})
|
|
|
|
go func() {
|
|
|
|
s.wg.Wait()
|
2020-04-11 15:49:41 +02:00
|
|
|
close(ch)
|
2018-08-27 19:10:55 +02:00
|
|
|
}()
|
|
|
|
return ch
|
|
|
|
}
|
|
|
|
|
2020-03-27 12:35:57 +01:00
|
|
|
type Status struct {
|
|
|
|
Jobs map[string]*job.Status
|
|
|
|
Global GlobalStatus
|
|
|
|
}
|
|
|
|
|
|
|
|
type GlobalStatus struct {
|
2021-11-10 18:43:08 +01:00
|
|
|
ZFSCmds *zfscmd.Report
|
|
|
|
Envconst *envconst.Report
|
|
|
|
OsEnviron []string
|
2020-03-27 12:35:57 +01:00
|
|
|
}
|
|
|
|
|
2018-09-23 21:08:03 +02:00
|
|
|
func (s *jobs) status() map[string]*job.Status {
|
2018-08-27 19:10:55 +02:00
|
|
|
s.m.RLock()
|
|
|
|
defer s.m.RUnlock()
|
|
|
|
|
|
|
|
type res struct {
|
|
|
|
name string
|
2018-09-23 21:08:03 +02:00
|
|
|
status *job.Status
|
2018-08-27 19:10:55 +02:00
|
|
|
}
|
|
|
|
var wg sync.WaitGroup
|
|
|
|
c := make(chan res, len(s.jobs))
|
|
|
|
for name, j := range s.jobs {
|
|
|
|
wg.Add(1)
|
|
|
|
go func(name string, j job.Job) {
|
|
|
|
defer wg.Done()
|
|
|
|
c <- res{name: name, status: j.Status()}
|
|
|
|
}(name, j)
|
|
|
|
}
|
|
|
|
wg.Wait()
|
|
|
|
close(c)
|
2018-09-23 21:08:03 +02:00
|
|
|
ret := make(map[string]*job.Status, len(s.jobs))
|
2018-08-27 19:10:55 +02:00
|
|
|
for res := range c {
|
|
|
|
ret[res.name] = res.status
|
|
|
|
}
|
|
|
|
return ret
|
|
|
|
}
|
|
|
|
|
2021-03-23 18:01:12 +01:00
|
|
|
func (s *jobs) wakeup(job string) error {
|
2018-08-27 22:21:45 +02:00
|
|
|
s.m.RLock()
|
|
|
|
defer s.m.RUnlock()
|
|
|
|
|
2021-03-23 18:01:12 +01:00
|
|
|
wu, ok := s.wakeups[job]
|
2018-08-27 22:21:45 +02:00
|
|
|
if !ok {
|
|
|
|
return errors.Errorf("Job %s does not exist", job)
|
|
|
|
}
|
|
|
|
return wu()
|
|
|
|
}
|
|
|
|
|
2018-10-12 20:50:30 +02:00
|
|
|
func (s *jobs) reset(job string) error {
|
|
|
|
s.m.RLock()
|
|
|
|
defer s.m.RUnlock()
|
|
|
|
|
|
|
|
wu, ok := s.resets[job]
|
|
|
|
if !ok {
|
|
|
|
return errors.Errorf("Job %s does not exist", job)
|
|
|
|
}
|
|
|
|
return wu()
|
|
|
|
}
|
|
|
|
|
2018-08-27 19:10:55 +02:00
|
|
|
// Job names used by the daemon itself. Both start with "_", the prefix that
// IsInternalJobName tests for.
const (
	jobNameControl    = "_control"
	jobNamePrometheus = "_prometheus"
)
|
|
|
|
|
|
|
|
// IsInternalJobName reports whether s is a name reserved for internal jobs,
// i.e. whether it begins with an underscore.
func IsInternalJobName(s string) bool {
	const internalPrefix = "_"
	isInternal := strings.HasPrefix(s, internalPrefix)
	return isInternal
}
|
|
|
|
|
|
|
|
func (s *jobs) start(ctx context.Context, j job.Job, internal bool) {
|
|
|
|
s.m.Lock()
|
|
|
|
defer s.m.Unlock()
|
|
|
|
|
2020-04-11 15:49:41 +02:00
|
|
|
ctx = logging.WithInjectedField(ctx, logging.JobField, j.Name())
|
|
|
|
|
2018-08-27 19:10:55 +02:00
|
|
|
jobName := j.Name()
|
|
|
|
if !internal && IsInternalJobName(jobName) {
|
|
|
|
panic(fmt.Sprintf("internal job name used for non-internal job %s", jobName))
|
|
|
|
}
|
|
|
|
if internal && !IsInternalJobName(jobName) {
|
|
|
|
panic(fmt.Sprintf("internal job does not use internal job name %s", jobName))
|
|
|
|
}
|
|
|
|
if _, ok := s.jobs[jobName]; ok {
|
|
|
|
panic(fmt.Sprintf("duplicate job name %s", jobName))
|
|
|
|
}
|
2018-09-08 07:03:41 +02:00
|
|
|
|
|
|
|
j.RegisterMetrics(prometheus.DefaultRegisterer)
|
|
|
|
|
2018-08-27 19:10:55 +02:00
|
|
|
s.jobs[jobName] = j
|
2020-03-27 12:35:57 +01:00
|
|
|
ctx = zfscmd.WithJobID(ctx, j.Name())
|
2021-03-23 18:01:12 +01:00
|
|
|
ctx, wakeup := wakeup.Context(ctx)
|
2018-10-12 20:50:30 +02:00
|
|
|
ctx, resetFunc := reset.Context(ctx)
|
2021-03-23 18:01:12 +01:00
|
|
|
s.wakeups[jobName] = wakeup
|
2018-10-12 20:50:30 +02:00
|
|
|
s.resets[jobName] = resetFunc
|
2018-08-27 19:10:55 +02:00
|
|
|
|
|
|
|
s.wg.Add(1)
|
|
|
|
go func() {
|
|
|
|
defer s.wg.Done()
|
2020-04-11 15:49:41 +02:00
|
|
|
job.GetLogger(ctx).Info("starting job")
|
|
|
|
defer job.GetLogger(ctx).Info("job exited")
|
2018-08-27 19:10:55 +02:00
|
|
|
j.Run(ctx)
|
|
|
|
}()
|
|
|
|
}
|