mirror of
https://github.com/zrepl/zrepl.git
synced 2025-01-03 04:48:55 +01:00
job/active: add "last completed" metric for error reporting
Use case: this enables more resilient alerting, such as "the last successful replication completed more than 24h ago". Fixes https://github.com/zrepl/zrepl/issues/516. Closes https://github.com/zrepl/zrepl/pull/530.
This commit is contained in:
parent
1f0f2f8569
commit
c6a9ebc71c
@ -42,6 +42,7 @@ type ActiveSide struct {
|
||||
promPruneSecs *prometheus.HistogramVec // labels: prune_side
|
||||
promBytesReplicated *prometheus.CounterVec // labels: filesystem
|
||||
promReplicationErrors prometheus.Gauge
|
||||
promLastSuccessful prometheus.Gauge
|
||||
|
||||
tasksMtx sync.Mutex
|
||||
tasks activeSideTasks
|
||||
@ -321,7 +322,6 @@ func activeSide(g *config.Global, in *config.ActiveJob, configJob interface{}) (
|
||||
Help: "number of bytes replicated from sender to receiver per filesystem",
|
||||
ConstLabels: prometheus.Labels{"zrepl_job": j.name.String()},
|
||||
}, []string{"filesystem"})
|
||||
|
||||
j.promReplicationErrors = prometheus.NewGauge(prometheus.GaugeOpts{
|
||||
Namespace: "zrepl",
|
||||
Subsystem: "replication",
|
||||
@ -329,6 +329,13 @@ func activeSide(g *config.Global, in *config.ActiveJob, configJob interface{}) (
|
||||
Help: "number of filesystems that failed replication in the latest replication attempt, or -1 if the job failed before enumerating the filesystems",
|
||||
ConstLabels: prometheus.Labels{"zrepl_job": j.name.String()},
|
||||
})
|
||||
j.promLastSuccessful = prometheus.NewGauge(prometheus.GaugeOpts{
|
||||
Namespace: "zrepl",
|
||||
Subsystem: "replication",
|
||||
Name: "last_successful",
|
||||
Help: "timestamp of last successful replication",
|
||||
ConstLabels: prometheus.Labels{"zrepl_job": j.name.String()},
|
||||
})
|
||||
|
||||
j.connecter, err = fromconfig.ConnecterFromConfig(g, in.Connect)
|
||||
if err != nil {
|
||||
@ -360,6 +367,7 @@ func (j *ActiveSide) RegisterMetrics(registerer prometheus.Registerer) {
|
||||
registerer.MustRegister(j.promPruneSecs)
|
||||
registerer.MustRegister(j.promBytesReplicated)
|
||||
registerer.MustRegister(j.promReplicationErrors)
|
||||
registerer.MustRegister(j.promLastSuccessful)
|
||||
}
|
||||
|
||||
// Name returns the string form of the job's name, as produced by
// j.name.String().
func (j *ActiveSide) Name() string {
	return j.name.String()
}
|
||||
@ -494,7 +502,11 @@ func (j *ActiveSide) do(ctx context.Context) {
|
||||
repCancel() // always cancel to free up context resources
|
||||
|
||||
replicationReport := j.tasks.replicationReport()
|
||||
j.promReplicationErrors.Set(float64(replicationReport.GetFailedFilesystemsCountInLatestAttempt()))
|
||||
var numErrors = replicationReport.GetFailedFilesystemsCountInLatestAttempt()
|
||||
j.promReplicationErrors.Set(float64(numErrors))
|
||||
if numErrors == 0 {
|
||||
j.promLastSuccessful.SetToCurrentTime()
|
||||
}
|
||||
|
||||
endSpan()
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user