mirror of
https://github.com/zrepl/zrepl.git
synced 2024-12-22 15:11:16 +01:00
replication: prometheus metric for number of failed replications in last attempt
- package replication: metric - Grafana panel - wiring - changelog Signed-off-by: Christian Schwarz <me@cschwarz.com> closes #341
This commit is contained in:
parent
0ee7a49d31
commit
83fdffbcef
@ -34,9 +34,10 @@ type ActiveSide struct {
|
||||
|
||||
prunerFactory *pruner.PrunerFactory
|
||||
|
||||
promRepStateSecs *prometheus.HistogramVec // labels: state
|
||||
promPruneSecs *prometheus.HistogramVec // labels: prune_side
|
||||
promBytesReplicated *prometheus.CounterVec // labels: filesystem
|
||||
promRepStateSecs *prometheus.HistogramVec // labels: state
|
||||
promPruneSecs *prometheus.HistogramVec // labels: prune_side
|
||||
promBytesReplicated *prometheus.CounterVec // labels: filesystem
|
||||
promReplicationErrors prometheus.Gauge
|
||||
|
||||
tasksMtx sync.Mutex
|
||||
tasks activeSideTasks
|
||||
@ -299,6 +300,14 @@ func activeSide(g *config.Global, in *config.ActiveJob, configJob interface{}) (
|
||||
ConstLabels: prometheus.Labels{"zrepl_job": j.name.String()},
|
||||
}, []string{"filesystem"})
|
||||
|
||||
j.promReplicationErrors = prometheus.NewGauge(prometheus.GaugeOpts{
|
||||
Namespace: "zrepl",
|
||||
Subsystem: "replication",
|
||||
Name: "filesystem_errors",
|
||||
Help: "number of filesystems that failed replication in the latest replication attempt, or -1 if the job failed before enumerating the filesystems",
|
||||
ConstLabels: prometheus.Labels{"zrepl_job": j.name.String()},
|
||||
})
|
||||
|
||||
j.connecter, err = fromconfig.ConnecterFromConfig(g, in.Connect)
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(err, "cannot build client")
|
||||
@ -323,6 +332,7 @@ func (j *ActiveSide) RegisterMetrics(registerer prometheus.Registerer) {
|
||||
registerer.MustRegister(j.promRepStateSecs)
|
||||
registerer.MustRegister(j.promPruneSecs)
|
||||
registerer.MustRegister(j.promBytesReplicated)
|
||||
registerer.MustRegister(j.promReplicationErrors)
|
||||
}
|
||||
|
||||
// Name returns the job's name rendered as a string.
func (j *ActiveSide) Name() string {
	jobName := j.name.String()
	return jobName
}
|
||||
@ -455,6 +465,10 @@ func (j *ActiveSide) do(ctx context.Context) {
|
||||
GetLogger(ctx).Info("start replication")
|
||||
repWait(true) // wait blocking
|
||||
repCancel() // always cancel to free up context resources
|
||||
|
||||
replicationReport := j.tasks.replicationReport()
|
||||
j.promReplicationErrors.Set(float64(replicationReport.GetFailedFilesystemsCountInLatestAttempt()))
|
||||
|
||||
endSpan()
|
||||
}
|
||||
|
||||
|
180
dist/grafana/grafana-prometheus-zrepl.json
vendored
180
dist/grafana/grafana-prometheus-zrepl.json
vendored
@ -69,6 +69,139 @@
|
||||
"title": "Panel Title",
|
||||
"type": "text"
|
||||
},
|
||||
{
|
||||
"cacheTimeout": null,
|
||||
"colorBackground": true,
|
||||
"colorPostfix": false,
|
||||
"colorPrefix": false,
|
||||
"colorValue": false,
|
||||
"colors": [
|
||||
"#bf1b00",
|
||||
"#508642",
|
||||
"#bf1b00"
|
||||
],
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"description": "Number of filesystems that failed replications",
|
||||
"format": "none",
|
||||
"gauge": {
|
||||
"maxValue": 100,
|
||||
"minValue": 0,
|
||||
"show": false,
|
||||
"thresholdLabels": false,
|
||||
"thresholdMarkers": true
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 3,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 10
|
||||
},
|
||||
"id": 50,
|
||||
"interval": null,
|
||||
"links": [],
|
||||
"mappingType": 1,
|
||||
"mappingTypes": [
|
||||
{
|
||||
"name": "value to text",
|
||||
"value": 1
|
||||
},
|
||||
{
|
||||
"name": "range to text",
|
||||
"value": 2
|
||||
}
|
||||
],
|
||||
"maxDataPoints": 100,
|
||||
"nullPointMode": "connected",
|
||||
"nullText": null,
|
||||
"postfix": "",
|
||||
"postfixFontSize": "50%",
|
||||
"prefix": "",
|
||||
"prefixFontSize": "50%",
|
||||
"rangeMaps": [
|
||||
{
|
||||
"from": "",
|
||||
"text": "",
|
||||
"to": ""
|
||||
}
|
||||
],
|
||||
"repeat": "zrepl_job_name",
|
||||
"repeatDirection": "h",
|
||||
"scopedVars": {
|
||||
"zrepl_job_name": {
|
||||
"selected": false,
|
||||
"text": "desktop_to_homesrv",
|
||||
"value": "desktop_to_homesrv"
|
||||
}
|
||||
},
|
||||
"sparkline": {
|
||||
"fillColor": "rgba(31, 118, 189, 0.18)",
|
||||
"full": true,
|
||||
"lineColor": "rgb(31, 120, 193)",
|
||||
"show": true
|
||||
},
|
||||
"tableColumn": "__name__",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "zrepl_replication_filesystem_errors{job=\"$prom_job_name\",zrepl_job=\"$zrepl_job_name\"}",
|
||||
"format": "time_series",
|
||||
"groupBy": [
|
||||
{
|
||||
"params": [
|
||||
"$__interval"
|
||||
],
|
||||
"type": "time"
|
||||
},
|
||||
{
|
||||
"params": [
|
||||
"null"
|
||||
],
|
||||
"type": "fill"
|
||||
}
|
||||
],
|
||||
"instant": true,
|
||||
"interval": "",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "",
|
||||
"orderByTime": "ASC",
|
||||
"policy": "default",
|
||||
"refId": "A",
|
||||
"resultFormat": "time_series",
|
||||
"select": [
|
||||
[
|
||||
{
|
||||
"params": [
|
||||
"value"
|
||||
],
|
||||
"type": "field"
|
||||
},
|
||||
{
|
||||
"params": [],
|
||||
"type": "mean"
|
||||
}
|
||||
]
|
||||
],
|
||||
"tags": []
|
||||
}
|
||||
],
|
||||
"thresholds": "0,1",
|
||||
"title": "Failed replications $zrepl_job_name",
|
||||
"transparent": false,
|
||||
"type": "singlestat",
|
||||
"valueFontSize": "80%",
|
||||
"valueMaps": [
|
||||
{
|
||||
"op": "=",
|
||||
"text": "All failed",
|
||||
"value": "-1"
|
||||
},
|
||||
{
|
||||
"op": "=",
|
||||
"text": "All OK",
|
||||
"value": "0"
|
||||
}
|
||||
],
|
||||
"valueName": "avg"
|
||||
},
|
||||
{
|
||||
"aliasColors": {},
|
||||
"bars": false,
|
||||
@ -87,7 +220,7 @@
|
||||
"h": 4,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 10
|
||||
"y": 13
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 48,
|
||||
@ -181,7 +314,7 @@
|
||||
"h": 5,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 10
|
||||
"y": 13
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 44,
|
||||
@ -273,7 +406,7 @@
|
||||
"h": 5,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 14
|
||||
"y": 17
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 42,
|
||||
@ -373,7 +506,7 @@
|
||||
"h": 4,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 15
|
||||
"y": 18
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 22,
|
||||
@ -465,7 +598,7 @@
|
||||
"h": 5,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 19
|
||||
"y": 22
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 33,
|
||||
@ -573,7 +706,7 @@
|
||||
"h": 5,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 19
|
||||
"y": 22
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 23,
|
||||
@ -665,7 +798,7 @@
|
||||
"h": 5,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 24
|
||||
"y": 27
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 41,
|
||||
@ -758,7 +891,7 @@
|
||||
"h": 5,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 24
|
||||
"y": 27
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 47,
|
||||
@ -850,7 +983,7 @@
|
||||
"h": 5,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 29
|
||||
"y": 32
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 17,
|
||||
@ -943,7 +1076,7 @@
|
||||
"h": 5,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 34
|
||||
"y": 37
|
||||
},
|
||||
"hiddenSeries": false,
|
||||
"id": 19,
|
||||
@ -1044,6 +1177,33 @@
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
},
|
||||
{
|
||||
"allValue": null,
|
||||
"current": {
|
||||
"text": "All",
|
||||
"value": [
|
||||
"$__all"
|
||||
]
|
||||
},
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"definition": "label_values(zrepl_replication_filesystem_errors{job=\"$prom_job_name\"}, zrepl_job)",
|
||||
"hide": 2,
|
||||
"includeAll": true,
|
||||
"label": "Zrepl Job Name",
|
||||
"multi": true,
|
||||
"name": "zrepl_job_name",
|
||||
"options": [],
|
||||
"query": "label_values(zrepl_replication_filesystem_errors{job=\"$prom_job_name\"}, zrepl_job)",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"sort": 1,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
}]
|
||||
},
|
||||
"time": {
|
||||
|
@ -65,6 +65,7 @@ Additional changelog:
|
||||
The migration will ensure that only those old-format cursors are destroyed that have been superseded by new-format cursors.
|
||||
|
||||
* |feature| New option ``listen_freebind`` (tcp, tls, prometheus listener)
|
||||
* |feature| :issue:`341` Prometheus metric for failing replications + corresponding Grafana panel
|
||||
* |feature| :issue:`265` transport/tcp: support for CIDR masks in client IP whitelist
|
||||
* |feature| documented subcommand to generate ``bash`` and ``zsh`` completions
|
||||
* |feature| :issue:`307` ``chrome://trace`` -compatible activity tracing of zrepl daemon activity
|
||||
|
@ -162,3 +162,30 @@ func (f *FilesystemReport) NextStep() *StepReport {
|
||||
func (f *StepReport) IsIncremental() bool {
|
||||
return f.Info.From != ""
|
||||
}
|
||||
|
||||
// Returns, for the latest replication attempt,
|
||||
// 0 if there have not been any replication attempts,
|
||||
// -1 if the replication failed while enumerating file systems
|
||||
// N if N filesystems could not not be replicated successfully
|
||||
func (r *Report) GetFailedFilesystemsCountInLatestAttempt() int {
|
||||
|
||||
if len(r.Attempts) == 0 {
|
||||
return 0
|
||||
}
|
||||
|
||||
a := r.Attempts[len(r.Attempts)-1]
|
||||
switch a.State {
|
||||
case AttemptPlanningError:
|
||||
return -1
|
||||
case AttemptFanOutError:
|
||||
var count int
|
||||
for _, f := range a.Filesystems {
|
||||
if f.Error() != nil {
|
||||
count++
|
||||
}
|
||||
}
|
||||
return count
|
||||
default:
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user