mirror of
https://github.com/zrepl/zrepl.git
synced 2024-11-24 17:35:01 +01:00
job/active: add "last completed" metric for error reporting
Use case: allow more resilient alerting rules, such as "the last successful replication completed more than 24h ago". Fixes https://github.com/zrepl/zrepl/issues/516 and closes https://github.com/zrepl/zrepl/pull/530
This commit is contained in:
parent
1f0f2f8569
commit
c6a9ebc71c
@ -42,6 +42,7 @@ type ActiveSide struct {
|
|||||||
promPruneSecs *prometheus.HistogramVec // labels: prune_side
|
promPruneSecs *prometheus.HistogramVec // labels: prune_side
|
||||||
promBytesReplicated *prometheus.CounterVec // labels: filesystem
|
promBytesReplicated *prometheus.CounterVec // labels: filesystem
|
||||||
promReplicationErrors prometheus.Gauge
|
promReplicationErrors prometheus.Gauge
|
||||||
|
promLastSuccessful prometheus.Gauge
|
||||||
|
|
||||||
tasksMtx sync.Mutex
|
tasksMtx sync.Mutex
|
||||||
tasks activeSideTasks
|
tasks activeSideTasks
|
||||||
@ -321,7 +322,6 @@ func activeSide(g *config.Global, in *config.ActiveJob, configJob interface{}) (
|
|||||||
Help: "number of bytes replicated from sender to receiver per filesystem",
|
Help: "number of bytes replicated from sender to receiver per filesystem",
|
||||||
ConstLabels: prometheus.Labels{"zrepl_job": j.name.String()},
|
ConstLabels: prometheus.Labels{"zrepl_job": j.name.String()},
|
||||||
}, []string{"filesystem"})
|
}, []string{"filesystem"})
|
||||||
|
|
||||||
j.promReplicationErrors = prometheus.NewGauge(prometheus.GaugeOpts{
|
j.promReplicationErrors = prometheus.NewGauge(prometheus.GaugeOpts{
|
||||||
Namespace: "zrepl",
|
Namespace: "zrepl",
|
||||||
Subsystem: "replication",
|
Subsystem: "replication",
|
||||||
@ -329,6 +329,13 @@ func activeSide(g *config.Global, in *config.ActiveJob, configJob interface{}) (
|
|||||||
Help: "number of filesystems that failed replication in the latest replication attempt, or -1 if the job failed before enumerating the filesystems",
|
Help: "number of filesystems that failed replication in the latest replication attempt, or -1 if the job failed before enumerating the filesystems",
|
||||||
ConstLabels: prometheus.Labels{"zrepl_job": j.name.String()},
|
ConstLabels: prometheus.Labels{"zrepl_job": j.name.String()},
|
||||||
})
|
})
|
||||||
|
j.promLastSuccessful = prometheus.NewGauge(prometheus.GaugeOpts{
|
||||||
|
Namespace: "zrepl",
|
||||||
|
Subsystem: "replication",
|
||||||
|
Name: "last_successful",
|
||||||
|
Help: "timestamp of last successful replication",
|
||||||
|
ConstLabels: prometheus.Labels{"zrepl_job": j.name.String()},
|
||||||
|
})
|
||||||
|
|
||||||
j.connecter, err = fromconfig.ConnecterFromConfig(g, in.Connect)
|
j.connecter, err = fromconfig.ConnecterFromConfig(g, in.Connect)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -360,6 +367,7 @@ func (j *ActiveSide) RegisterMetrics(registerer prometheus.Registerer) {
|
|||||||
registerer.MustRegister(j.promPruneSecs)
|
registerer.MustRegister(j.promPruneSecs)
|
||||||
registerer.MustRegister(j.promBytesReplicated)
|
registerer.MustRegister(j.promBytesReplicated)
|
||||||
registerer.MustRegister(j.promReplicationErrors)
|
registerer.MustRegister(j.promReplicationErrors)
|
||||||
|
registerer.MustRegister(j.promLastSuccessful)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (j *ActiveSide) Name() string { return j.name.String() }
|
func (j *ActiveSide) Name() string { return j.name.String() }
|
||||||
@ -494,7 +502,11 @@ func (j *ActiveSide) do(ctx context.Context) {
|
|||||||
repCancel() // always cancel to free up context resources
|
repCancel() // always cancel to free up context resources
|
||||||
|
|
||||||
replicationReport := j.tasks.replicationReport()
|
replicationReport := j.tasks.replicationReport()
|
||||||
j.promReplicationErrors.Set(float64(replicationReport.GetFailedFilesystemsCountInLatestAttempt()))
|
var numErrors = replicationReport.GetFailedFilesystemsCountInLatestAttempt()
|
||||||
|
j.promReplicationErrors.Set(float64(numErrors))
|
||||||
|
if numErrors == 0 {
|
||||||
|
j.promLastSuccessful.SetToCurrentTime()
|
||||||
|
}
|
||||||
|
|
||||||
endSpan()
|
endSpan()
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user