replication: prometheus metric for number of failed replications in last attempt

- package replication: metric
- Grafana panel
- wiring
- changelog

Signed-off-by: Christian Schwarz <me@cschwarz.com>

closes #341
This commit is contained in:
Hans Schulz 2020-08-04 01:17:38 +02:00 committed by Christian Schwarz
parent 0ee7a49d31
commit 83fdffbcef
4 changed files with 215 additions and 13 deletions

View File

@ -37,6 +37,7 @@ type ActiveSide struct {
promRepStateSecs *prometheus.HistogramVec // labels: state
promPruneSecs *prometheus.HistogramVec // labels: prune_side
promBytesReplicated *prometheus.CounterVec // labels: filesystem
promReplicationErrors prometheus.Gauge
tasksMtx sync.Mutex
tasks activeSideTasks
@ -299,6 +300,14 @@ func activeSide(g *config.Global, in *config.ActiveJob, configJob interface{}) (
ConstLabels: prometheus.Labels{"zrepl_job": j.name.String()},
}, []string{"filesystem"})
j.promReplicationErrors = prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: "zrepl",
Subsystem: "replication",
Name: "filesystem_errors",
Help: "number of filesystems that failed replication in the latest replication attempt, or -1 if the job failed before enumerating the filesystems",
ConstLabels: prometheus.Labels{"zrepl_job": j.name.String()},
})
j.connecter, err = fromconfig.ConnecterFromConfig(g, in.Connect)
if err != nil {
return nil, errors.Wrap(err, "cannot build client")
@ -323,6 +332,7 @@ func (j *ActiveSide) RegisterMetrics(registerer prometheus.Registerer) {
registerer.MustRegister(j.promRepStateSecs)
registerer.MustRegister(j.promPruneSecs)
registerer.MustRegister(j.promBytesReplicated)
registerer.MustRegister(j.promReplicationErrors)
}
// Name returns the string form of the job's name.
func (j *ActiveSide) Name() string {
	jobName := j.name.String()
	return jobName
}
@ -455,6 +465,10 @@ func (j *ActiveSide) do(ctx context.Context) {
GetLogger(ctx).Info("start replication")
repWait(true) // wait blocking
repCancel() // always cancel to free up context resources
replicationReport := j.tasks.replicationReport()
j.promReplicationErrors.Set(float64(replicationReport.GetFailedFilesystemsCountInLatestAttempt()))
endSpan()
}

View File

@ -69,6 +69,139 @@
"title": "Panel Title",
"type": "text"
},
{
"cacheTimeout": null,
"colorBackground": true,
"colorPostfix": false,
"colorPrefix": false,
"colorValue": false,
"colors": [
"#bf1b00",
"#508642",
"#bf1b00"
],
"datasource": "${DS_PROMETHEUS}",
"description": "Number of filesystems that failed replications",
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"gridPos": {
"h": 3,
"w": 24,
"x": 0,
"y": 10
},
"id": 50,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "",
"text": "",
"to": ""
}
],
"repeat": "zrepl_job_name",
"repeatDirection": "h",
"scopedVars": {
"zrepl_job_name": {
"selected": false,
"text": "desktop_to_homesrv",
"value": "desktop_to_homesrv"
}
},
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": true,
"lineColor": "rgb(31, 120, 193)",
"show": true
},
"tableColumn": "__name__",
"targets": [
{
"expr": "zrepl_replication_filesystem_errors{job=\"$prom_job_name\",zrepl_job=\"$zrepl_job_name\"}",
"format": "time_series",
"groupBy": [
{
"params": [
"$__interval"
],
"type": "time"
},
{
"params": [
"null"
],
"type": "fill"
}
],
"instant": true,
"interval": "",
"intervalFactor": 1,
"legendFormat": "",
"orderByTime": "ASC",
"policy": "default",
"refId": "A",
"resultFormat": "time_series",
"select": [
[
{
"params": [
"value"
],
"type": "field"
},
{
"params": [],
"type": "mean"
}
]
],
"tags": []
}
],
"thresholds": "0,1",
"title": "Failed replications $zrepl_job_name",
"transparent": false,
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "All failed",
"value": "-1"
},
{
"op": "=",
"text": "All OK",
"value": "0"
}
],
"valueName": "avg"
},
{
"aliasColors": {},
"bars": false,
@ -87,7 +220,7 @@
"h": 4,
"w": 12,
"x": 0,
"y": 10
"y": 13
},
"hiddenSeries": false,
"id": 48,
@ -181,7 +314,7 @@
"h": 5,
"w": 12,
"x": 12,
"y": 10
"y": 13
},
"hiddenSeries": false,
"id": 44,
@ -273,7 +406,7 @@
"h": 5,
"w": 12,
"x": 0,
"y": 14
"y": 17
},
"hiddenSeries": false,
"id": 42,
@ -373,7 +506,7 @@
"h": 4,
"w": 12,
"x": 12,
"y": 15
"y": 18
},
"hiddenSeries": false,
"id": 22,
@ -465,7 +598,7 @@
"h": 5,
"w": 12,
"x": 0,
"y": 19
"y": 22
},
"hiddenSeries": false,
"id": 33,
@ -573,7 +706,7 @@
"h": 5,
"w": 12,
"x": 12,
"y": 19
"y": 22
},
"hiddenSeries": false,
"id": 23,
@ -665,7 +798,7 @@
"h": 5,
"w": 12,
"x": 0,
"y": 24
"y": 27
},
"hiddenSeries": false,
"id": 41,
@ -758,7 +891,7 @@
"h": 5,
"w": 12,
"x": 12,
"y": 24
"y": 27
},
"hiddenSeries": false,
"id": 47,
@ -850,7 +983,7 @@
"h": 5,
"w": 12,
"x": 0,
"y": 29
"y": 32
},
"hiddenSeries": false,
"id": 17,
@ -943,7 +1076,7 @@
"h": 5,
"w": 12,
"x": 0,
"y": 34
"y": 37
},
"hiddenSeries": false,
"id": 19,
@ -1044,6 +1177,33 @@
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": null,
"current": {
"text": "All",
"value": [
"$__all"
]
},
"datasource": "${DS_PROMETHEUS}",
"definition": "label_values(zrepl_replication_filesystem_errors{job=\"$prom_job_name\"}, zrepl_job)",
"hide": 2,
"includeAll": true,
"label": "Zrepl Job Name",
"multi": true,
"name": "zrepl_job_name",
"options": [],
"query": "label_values(zrepl_replication_filesystem_errors{job=\"$prom_job_name\"}, zrepl_job)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
}]
},
"time": {

View File

@ -65,6 +65,7 @@ Additional changelog:
The migration will ensure that only those old-format cursors that have been superseded by new-format cursors are destroyed.
* |feature| New option ``listen_freebind`` (tcp, tls, prometheus listener)
* |feature| :issue:`341` Prometheus metric for failing replications + corresponding Grafana panel
* |feature| :issue:`265` transport/tcp: support for CIDR masks in client IP whitelist
* |feature| documented subcommand to generate ``bash`` and ``zsh`` completions
* |feature| :issue:`307` ``chrome://trace`` -compatible activity tracing of zrepl daemon activity

View File

@ -162,3 +162,30 @@ func (f *FilesystemReport) NextStep() *StepReport {
// IsIncremental reports whether the step's Info.From field is non-empty.
// NOTE(review): presumably an empty From denotes a full (non-incremental)
// step; confirm against the code that populates Info.
func (f *StepReport) IsIncremental() bool {
	from := f.Info.From
	return from != ""
}
// GetFailedFilesystemsCountInLatestAttempt summarizes the most recent entry
// of r.Attempts as a single number:
//
//	 0  if there have not been any replication attempts, or if the latest
//	    attempt is in any state other than the two error states below,
//	-1  if the latest attempt is in state AttemptPlanningError
//	    (the replication failed while enumerating filesystems),
//	 N  if the latest attempt is in state AttemptFanOutError and N of its
//	    filesystems report a non-nil Error().
func (r *Report) GetFailedFilesystemsCountInLatestAttempt() int {
	numAttempts := len(r.Attempts)
	if numAttempts == 0 {
		return 0
	}

	// Only the latest attempt is considered; earlier ones are ignored.
	latest := r.Attempts[numAttempts-1]

	if latest.State == AttemptPlanningError {
		return -1
	}
	if latest.State != AttemptFanOutError {
		return 0
	}

	failed := 0
	for _, fs := range latest.Filesystems {
		if fs.Error() != nil {
			failed++
		}
	}
	return failed
}