From c6a9ebc71c5147acc8be46ea9d482bab99a958d6 Mon Sep 17 00:00:00 2001 From: Lapo Luchini Date: Wed, 20 Oct 2021 11:01:17 +0200 Subject: [PATCH] job/active: add "last completed" metric for error reporting use case: So that I can use more resilient alerting, such as "the last successful replication completed more than 24h ago". fixes https://github.com/zrepl/zrepl/issues/516 closes https://github.com/zrepl/zrepl/pull/530 --- daemon/job/active.go | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/daemon/job/active.go b/daemon/job/active.go index cc6cbc1..ebececf 100644 --- a/daemon/job/active.go +++ b/daemon/job/active.go @@ -42,6 +42,7 @@ type ActiveSide struct { promPruneSecs *prometheus.HistogramVec // labels: prune_side promBytesReplicated *prometheus.CounterVec // labels: filesystem promReplicationErrors prometheus.Gauge + promLastSuccessful prometheus.Gauge tasksMtx sync.Mutex tasks activeSideTasks @@ -321,7 +322,6 @@ func activeSide(g *config.Global, in *config.ActiveJob, configJob interface{}) ( Help: "number of bytes replicated from sender to receiver per filesystem", ConstLabels: prometheus.Labels{"zrepl_job": j.name.String()}, }, []string{"filesystem"}) - j.promReplicationErrors = prometheus.NewGauge(prometheus.GaugeOpts{ Namespace: "zrepl", Subsystem: "replication", @@ -329,6 +329,13 @@ func activeSide(g *config.Global, in *config.ActiveJob, configJob interface{}) ( Help: "number of filesystems that failed replication in the latest replication attempt, or -1 if the job failed before enumerating the filesystems", ConstLabels: prometheus.Labels{"zrepl_job": j.name.String()}, }) + j.promLastSuccessful = prometheus.NewGauge(prometheus.GaugeOpts{ + Namespace: "zrepl", + Subsystem: "replication", + Name: "last_successful", + Help: "timestamp of last successful replication", + ConstLabels: prometheus.Labels{"zrepl_job": j.name.String()}, + }) j.connecter, err = fromconfig.ConnecterFromConfig(g, in.Connect) if err != nil { @@ -360,6 +367,7 @@ func 
(j *ActiveSide) RegisterMetrics(registerer prometheus.Registerer) { registerer.MustRegister(j.promPruneSecs) registerer.MustRegister(j.promBytesReplicated) registerer.MustRegister(j.promReplicationErrors) + registerer.MustRegister(j.promLastSuccessful) } func (j *ActiveSide) Name() string { return j.name.String() } @@ -494,7 +502,11 @@ func (j *ActiveSide) do(ctx context.Context) { repCancel() // always cancel to free up context resources replicationReport := j.tasks.replicationReport() - j.promReplicationErrors.Set(float64(replicationReport.GetFailedFilesystemsCountInLatestAttempt())) + var numErrors = replicationReport.GetFailedFilesystemsCountInLatestAttempt() + j.promReplicationErrors.Set(float64(numErrors)) + if numErrors == 0 { + j.promLastSuccessful.SetToCurrentTime() + } endSpan() }