zrepl/daemon/prometheus.go
Christian Schwarz 17add553d3 WIP runtime-controllable concurrency for replication
Changes done so far:
- signal to route the concurrency request up to the stepQueue
    - pretty hacky, no status reporting yet
- stepQueue upsizing (confirmed that it works, no intermediate commit; a rough sketch of the upsizing idea follows right below)
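
A minimal sketch of what the upsizing side amounts to, assuming a plain
mutex/condition-variable slot pool; the names (resizableSem, SetCapacity)
are illustrative only and not zrepl's actual stepQueue API:

// Illustrative only: a slot pool whose capacity can be raised at runtime.
// zrepl's stepQueue is more involved (it prioritizes steps); this just
// shows why upsizing is the easy direction and downsizing is not.
package stepsketch

import "sync"

type resizableSem struct {
	mu       sync.Mutex
	cond     *sync.Cond
	capacity int // requested concurrency
	active   int // slots currently handed out
}

func newResizableSem(capacity int) *resizableSem {
	s := &resizableSem{capacity: capacity}
	s.cond = sync.NewCond(&s.mu)
	return s
}

// Acquire blocks until a slot is free under the current capacity.
func (s *resizableSem) Acquire() {
	s.mu.Lock()
	defer s.mu.Unlock()
	for s.active >= s.capacity {
		s.cond.Wait()
	}
	s.active++
}

// Release returns a slot and wakes blocked waiters.
func (s *resizableSem) Release() {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.active--
	s.cond.Broadcast()
}

// SetCapacity changes the concurrency limit at runtime. Raising it wakes
// blocked waiters immediately; lowering it only takes effect as active
// holders call Release, i.e. it cannot preempt running steps by itself.
func (s *resizableSem) SetCapacity(n int) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.capacity = n
	s.cond.Broadcast()
}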

Stuck at: stepQueue downsizing
- idea was to have the stepQueue signal to the activated step that it
  should suspend
- ideally, we'd just kill everything associated with the step and track
  it as restartable
    - the driver model doesn't allow for that though within an attempt
    - would need to start a separate run for the step
- less perfect: tell the downsized steps to stop copying, but leave
  all zfs sends + rpc conns open
    - - doesn't give back the resources that a step acquired before
      being selected as downsize victim (open zfs processes + TCP conn,
      some memory in the pipe)
    - - looks weird to the user in ps aux output
    - + re-waking a step is easy: just tell it to proceed with copying
    - (the impl would likely pass a check function down into Step.do
      and have it check that function periodically. the suspend should
      be acknowledged, and stepQueue should only remove the step from
      the active queue _after_ that step has acknowledged that it is
      suspended; see the sketch after these notes)
2020-02-17 17:22:52 +01:00
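
The acknowledged-suspend idea as a rough sketch: the stepQueue passes a check
function down into Step.do, the copy loop polls it between chunks, and the
slot is only reused after the step has acknowledged that it stopped copying.
All names here (suspendCheck, copyWithSuspend) are made up for illustration;
the zfs send and the RPC connection stay open while suspended, which is
exactly the drawback noted above.

// Sketch of the "less perfect" downsizing variant described above.
package stepsketch

import (
	"context"
	"io"
)

// suspendCheck is what the stepQueue would hand down into Step.do.
// If suspend is true, the step must stop copying, call ack once it has
// actually stopped, and then wait on resume before copying again.
type suspendCheck func() (resume <-chan struct{}, ack func(), suspend bool)

// copyWithSuspend copies src to dst chunk by chunk and polls check between
// chunks. Open resources (src, dst) are kept across a suspension.
func copyWithSuspend(ctx context.Context, dst io.Writer, src io.Reader, check suspendCheck) error {
	buf := make([]byte, 1<<20) // 1 MiB chunks define the check granularity
	for {
		if resume, ack, suspend := check(); suspend {
			ack() // stepQueue may now hand the slot to another step
			select {
			case <-resume: // re-woken: just proceed with copying
			case <-ctx.Done():
				return ctx.Err()
			}
		}
		n, err := src.Read(buf)
		if n > 0 {
			if _, werr := dst.Write(buf[:n]); werr != nil {
				return werr
			}
		}
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return err
		}
	}
}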

zrepl/daemon/prometheus.go · 105 lines · 2.6 KiB · Go

package daemon

import (
	"context"
	"net"
	"net/http"

	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"

	"github.com/zrepl/zrepl/config"
	"github.com/zrepl/zrepl/daemon/job"
	"github.com/zrepl/zrepl/endpoint"
	"github.com/zrepl/zrepl/logger"
	"github.com/zrepl/zrepl/rpc/dataconn/frameconn"
	"github.com/zrepl/zrepl/util/tcpsock"
	"github.com/zrepl/zrepl/zfs"
)

// prometheusJob is the internal daemon job that exposes zrepl's metrics
// over HTTP for scraping by Prometheus.
type prometheusJob struct {
	listen   string
	freeBind bool
}

func newPrometheusJobFromConfig(in *config.PrometheusMonitoring) (*prometheusJob, error) {
	if _, _, err := net.SplitHostPort(in.Listen); err != nil {
		return nil, err
	}
	return &prometheusJob{in.Listen, in.ListenFreeBind}, nil
}

var prom struct {
	taskLogEntries *prometheus.CounterVec
}

func init() {
	prom.taskLogEntries = prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: "zrepl",
		Subsystem: "daemon",
		Name:      "log_entries",
		Help:      "number of log entries per job task and level",
	}, []string{"zrepl_job", "level"})
	prometheus.MustRegister(prom.taskLogEntries)
}

func (j *prometheusJob) Name() string { return jobNamePrometheus }

func (j *prometheusJob) Status() *job.Status { return &job.Status{Type: job.TypeInternal} }

func (j *prometheusJob) OwnedDatasetSubtreeRoot() (p *zfs.DatasetPath, ok bool) { return nil, false }

func (j *prometheusJob) SenderConfig() *endpoint.SenderConfig { return nil }

// SetConcurrency is not supported for this internal job.
func (j *prometheusJob) SetConcurrency(concurrency int) error { return errors.Errorf("not supported") }

func (j *prometheusJob) RegisterMetrics(registerer prometheus.Registerer) {}

func (j *prometheusJob) Run(ctx context.Context) {

	if err := zfs.PrometheusRegister(prometheus.DefaultRegisterer); err != nil {
		panic(err)
	}

	if err := frameconn.PrometheusRegister(prometheus.DefaultRegisterer); err != nil {
		panic(err)
	}

	log := job.GetLogger(ctx)

	l, err := tcpsock.Listen(j.listen, j.freeBind)
	if err != nil {
		log.WithError(err).Error("cannot listen")
		return
	}

	// Shut down the listener when the daemon context is canceled.
	go func() {
		<-ctx.Done()
		l.Close()
	}()

	mux := http.NewServeMux()
	mux.Handle("/metrics", promhttp.Handler())

	err = http.Serve(l, mux)
	if err != nil && ctx.Err() == nil {
		log.WithError(err).Error("error while serving")
	}
}

// prometheusJobOutlet counts log entries per job and level in the
// zrepl_daemon_log_entries metric.
type prometheusJobOutlet struct {
	jobName string
}

var _ logger.Outlet = prometheusJobOutlet{}

func newPrometheusLogOutlet(jobName string) prometheusJobOutlet {
	return prometheusJobOutlet{jobName}
}

func (o prometheusJobOutlet) WriteEntry(entry logger.Entry) error {
	prom.taskLogEntries.WithLabelValues(o.jobName, entry.Level.String()).Inc()
	return nil
}