job/active: add "last completed" metric for error reporting

use case:

    So that I can use more resilient alerting rules, such as "the last successful replication completed more than 24h ago".

fixes https://github.com/zrepl/zrepl/issues/516
closes https://github.com/zrepl/zrepl/pull/530
This commit is contained in:
Lapo Luchini 2021-10-20 11:01:17 +02:00 committed by Christian Schwarz
parent 1f0f2f8569
commit c6a9ebc71c

View File

@ -42,6 +42,7 @@ type ActiveSide struct {
promPruneSecs *prometheus.HistogramVec // labels: prune_side
promBytesReplicated *prometheus.CounterVec // labels: filesystem
promReplicationErrors prometheus.Gauge
promLastSuccessful prometheus.Gauge
tasksMtx sync.Mutex
tasks activeSideTasks
@ -321,7 +322,6 @@ func activeSide(g *config.Global, in *config.ActiveJob, configJob interface{}) (
Help: "number of bytes replicated from sender to receiver per filesystem",
ConstLabels: prometheus.Labels{"zrepl_job": j.name.String()},
}, []string{"filesystem"})
j.promReplicationErrors = prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: "zrepl",
Subsystem: "replication",
@ -329,6 +329,13 @@ func activeSide(g *config.Global, in *config.ActiveJob, configJob interface{}) (
Help: "number of filesystems that failed replication in the latest replication attempt, or -1 if the job failed before enumerating the filesystems",
ConstLabels: prometheus.Labels{"zrepl_job": j.name.String()},
})
j.promLastSuccessful = prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: "zrepl",
Subsystem: "replication",
Name: "last_successful",
Help: "timestamp of last successful replication",
ConstLabels: prometheus.Labels{"zrepl_job": j.name.String()},
})
j.connecter, err = fromconfig.ConnecterFromConfig(g, in.Connect)
if err != nil {
@ -360,6 +367,7 @@ func (j *ActiveSide) RegisterMetrics(registerer prometheus.Registerer) {
registerer.MustRegister(j.promPruneSecs)
registerer.MustRegister(j.promBytesReplicated)
registerer.MustRegister(j.promReplicationErrors)
registerer.MustRegister(j.promLastSuccessful)
}
// Name returns the string form of the job's name.
func (j *ActiveSide) Name() string {
	return j.name.String()
}
@ -494,7 +502,11 @@ func (j *ActiveSide) do(ctx context.Context) {
repCancel() // always cancel to free up context resources
replicationReport := j.tasks.replicationReport()
j.promReplicationErrors.Set(float64(replicationReport.GetFailedFilesystemsCountInLatestAttempt()))
var numErrors = replicationReport.GetFailedFilesystemsCountInLatestAttempt()
j.promReplicationErrors.Set(float64(numErrors))
if numErrors == 0 {
j.promLastSuccessful.SetToCurrentTime()
}
endSpan()
}