mirror of
https://github.com/zrepl/zrepl.git
synced 2024-11-22 08:23:50 +01:00
replication: prometheus metric for number of failed replications in last attempt
- package replication: metric - Grafana panel - wiring - changelog Signed-off-by: Christian Schwarz <me@cschwarz.com> closes #341
This commit is contained in:
parent
0ee7a49d31
commit
83fdffbcef
@ -37,6 +37,7 @@ type ActiveSide struct {
|
|||||||
promRepStateSecs *prometheus.HistogramVec // labels: state
|
promRepStateSecs *prometheus.HistogramVec // labels: state
|
||||||
promPruneSecs *prometheus.HistogramVec // labels: prune_side
|
promPruneSecs *prometheus.HistogramVec // labels: prune_side
|
||||||
promBytesReplicated *prometheus.CounterVec // labels: filesystem
|
promBytesReplicated *prometheus.CounterVec // labels: filesystem
|
||||||
|
promReplicationErrors prometheus.Gauge
|
||||||
|
|
||||||
tasksMtx sync.Mutex
|
tasksMtx sync.Mutex
|
||||||
tasks activeSideTasks
|
tasks activeSideTasks
|
||||||
@ -299,6 +300,14 @@ func activeSide(g *config.Global, in *config.ActiveJob, configJob interface{}) (
|
|||||||
ConstLabels: prometheus.Labels{"zrepl_job": j.name.String()},
|
ConstLabels: prometheus.Labels{"zrepl_job": j.name.String()},
|
||||||
}, []string{"filesystem"})
|
}, []string{"filesystem"})
|
||||||
|
|
||||||
|
j.promReplicationErrors = prometheus.NewGauge(prometheus.GaugeOpts{
|
||||||
|
Namespace: "zrepl",
|
||||||
|
Subsystem: "replication",
|
||||||
|
Name: "filesystem_errors",
|
||||||
|
Help: "number of filesystems that failed replication in the latest replication attempt, or -1 if the job failed before enumerating the filesystems",
|
||||||
|
ConstLabels: prometheus.Labels{"zrepl_job": j.name.String()},
|
||||||
|
})
|
||||||
|
|
||||||
j.connecter, err = fromconfig.ConnecterFromConfig(g, in.Connect)
|
j.connecter, err = fromconfig.ConnecterFromConfig(g, in.Connect)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, errors.Wrap(err, "cannot build client")
|
return nil, errors.Wrap(err, "cannot build client")
|
||||||
@ -323,6 +332,7 @@ func (j *ActiveSide) RegisterMetrics(registerer prometheus.Registerer) {
|
|||||||
registerer.MustRegister(j.promRepStateSecs)
|
registerer.MustRegister(j.promRepStateSecs)
|
||||||
registerer.MustRegister(j.promPruneSecs)
|
registerer.MustRegister(j.promPruneSecs)
|
||||||
registerer.MustRegister(j.promBytesReplicated)
|
registerer.MustRegister(j.promBytesReplicated)
|
||||||
|
registerer.MustRegister(j.promReplicationErrors)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (j *ActiveSide) Name() string { return j.name.String() }
|
func (j *ActiveSide) Name() string { return j.name.String() }
|
||||||
@ -455,6 +465,10 @@ func (j *ActiveSide) do(ctx context.Context) {
|
|||||||
GetLogger(ctx).Info("start replication")
|
GetLogger(ctx).Info("start replication")
|
||||||
repWait(true) // wait blocking
|
repWait(true) // wait blocking
|
||||||
repCancel() // always cancel to free up context resources
|
repCancel() // always cancel to free up context resources
|
||||||
|
|
||||||
|
replicationReport := j.tasks.replicationReport()
|
||||||
|
j.promReplicationErrors.Set(float64(replicationReport.GetFailedFilesystemsCountInLatestAttempt()))
|
||||||
|
|
||||||
endSpan()
|
endSpan()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
180
dist/grafana/grafana-prometheus-zrepl.json
vendored
180
dist/grafana/grafana-prometheus-zrepl.json
vendored
@ -69,6 +69,139 @@
|
|||||||
"title": "Panel Title",
|
"title": "Panel Title",
|
||||||
"type": "text"
|
"type": "text"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cacheTimeout": null,
|
||||||
|
"colorBackground": true,
|
||||||
|
"colorPostfix": false,
|
||||||
|
"colorPrefix": false,
|
||||||
|
"colorValue": false,
|
||||||
|
"colors": [
|
||||||
|
"#bf1b00",
|
||||||
|
"#508642",
|
||||||
|
"#bf1b00"
|
||||||
|
],
|
||||||
|
"datasource": "${DS_PROMETHEUS}",
|
||||||
|
"description": "Number of filesystems that failed replications",
|
||||||
|
"format": "none",
|
||||||
|
"gauge": {
|
||||||
|
"maxValue": 100,
|
||||||
|
"minValue": 0,
|
||||||
|
"show": false,
|
||||||
|
"thresholdLabels": false,
|
||||||
|
"thresholdMarkers": true
|
||||||
|
},
|
||||||
|
"gridPos": {
|
||||||
|
"h": 3,
|
||||||
|
"w": 24,
|
||||||
|
"x": 0,
|
||||||
|
"y": 10
|
||||||
|
},
|
||||||
|
"id": 50,
|
||||||
|
"interval": null,
|
||||||
|
"links": [],
|
||||||
|
"mappingType": 1,
|
||||||
|
"mappingTypes": [
|
||||||
|
{
|
||||||
|
"name": "value to text",
|
||||||
|
"value": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "range to text",
|
||||||
|
"value": 2
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"maxDataPoints": 100,
|
||||||
|
"nullPointMode": "connected",
|
||||||
|
"nullText": null,
|
||||||
|
"postfix": "",
|
||||||
|
"postfixFontSize": "50%",
|
||||||
|
"prefix": "",
|
||||||
|
"prefixFontSize": "50%",
|
||||||
|
"rangeMaps": [
|
||||||
|
{
|
||||||
|
"from": "",
|
||||||
|
"text": "",
|
||||||
|
"to": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"repeat": "zrepl_job_name",
|
||||||
|
"repeatDirection": "h",
|
||||||
|
"scopedVars": {
|
||||||
|
"zrepl_job_name": {
|
||||||
|
"selected": false,
|
||||||
|
"text": "desktop_to_homesrv",
|
||||||
|
"value": "desktop_to_homesrv"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"sparkline": {
|
||||||
|
"fillColor": "rgba(31, 118, 189, 0.18)",
|
||||||
|
"full": true,
|
||||||
|
"lineColor": "rgb(31, 120, 193)",
|
||||||
|
"show": true
|
||||||
|
},
|
||||||
|
"tableColumn": "__name__",
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "zrepl_replication_filesystem_errors{job=\"$prom_job_name\",zrepl_job=\"$zrepl_job_name\"}",
|
||||||
|
"format": "time_series",
|
||||||
|
"groupBy": [
|
||||||
|
{
|
||||||
|
"params": [
|
||||||
|
"$__interval"
|
||||||
|
],
|
||||||
|
"type": "time"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"params": [
|
||||||
|
"null"
|
||||||
|
],
|
||||||
|
"type": "fill"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"instant": true,
|
||||||
|
"interval": "",
|
||||||
|
"intervalFactor": 1,
|
||||||
|
"legendFormat": "",
|
||||||
|
"orderByTime": "ASC",
|
||||||
|
"policy": "default",
|
||||||
|
"refId": "A",
|
||||||
|
"resultFormat": "time_series",
|
||||||
|
"select": [
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"params": [
|
||||||
|
"value"
|
||||||
|
],
|
||||||
|
"type": "field"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"params": [],
|
||||||
|
"type": "mean"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"tags": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"thresholds": "0,1",
|
||||||
|
"title": "Failed replications $zrepl_job_name",
|
||||||
|
"transparent": false,
|
||||||
|
"type": "singlestat",
|
||||||
|
"valueFontSize": "80%",
|
||||||
|
"valueMaps": [
|
||||||
|
{
|
||||||
|
"op": "=",
|
||||||
|
"text": "All failed",
|
||||||
|
"value": "-1"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"op": "=",
|
||||||
|
"text": "All OK",
|
||||||
|
"value": "0"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"valueName": "avg"
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"aliasColors": {},
|
"aliasColors": {},
|
||||||
"bars": false,
|
"bars": false,
|
||||||
@ -87,7 +220,7 @@
|
|||||||
"h": 4,
|
"h": 4,
|
||||||
"w": 12,
|
"w": 12,
|
||||||
"x": 0,
|
"x": 0,
|
||||||
"y": 10
|
"y": 13
|
||||||
},
|
},
|
||||||
"hiddenSeries": false,
|
"hiddenSeries": false,
|
||||||
"id": 48,
|
"id": 48,
|
||||||
@ -181,7 +314,7 @@
|
|||||||
"h": 5,
|
"h": 5,
|
||||||
"w": 12,
|
"w": 12,
|
||||||
"x": 12,
|
"x": 12,
|
||||||
"y": 10
|
"y": 13
|
||||||
},
|
},
|
||||||
"hiddenSeries": false,
|
"hiddenSeries": false,
|
||||||
"id": 44,
|
"id": 44,
|
||||||
@ -273,7 +406,7 @@
|
|||||||
"h": 5,
|
"h": 5,
|
||||||
"w": 12,
|
"w": 12,
|
||||||
"x": 0,
|
"x": 0,
|
||||||
"y": 14
|
"y": 17
|
||||||
},
|
},
|
||||||
"hiddenSeries": false,
|
"hiddenSeries": false,
|
||||||
"id": 42,
|
"id": 42,
|
||||||
@ -373,7 +506,7 @@
|
|||||||
"h": 4,
|
"h": 4,
|
||||||
"w": 12,
|
"w": 12,
|
||||||
"x": 12,
|
"x": 12,
|
||||||
"y": 15
|
"y": 18
|
||||||
},
|
},
|
||||||
"hiddenSeries": false,
|
"hiddenSeries": false,
|
||||||
"id": 22,
|
"id": 22,
|
||||||
@ -465,7 +598,7 @@
|
|||||||
"h": 5,
|
"h": 5,
|
||||||
"w": 12,
|
"w": 12,
|
||||||
"x": 0,
|
"x": 0,
|
||||||
"y": 19
|
"y": 22
|
||||||
},
|
},
|
||||||
"hiddenSeries": false,
|
"hiddenSeries": false,
|
||||||
"id": 33,
|
"id": 33,
|
||||||
@ -573,7 +706,7 @@
|
|||||||
"h": 5,
|
"h": 5,
|
||||||
"w": 12,
|
"w": 12,
|
||||||
"x": 12,
|
"x": 12,
|
||||||
"y": 19
|
"y": 22
|
||||||
},
|
},
|
||||||
"hiddenSeries": false,
|
"hiddenSeries": false,
|
||||||
"id": 23,
|
"id": 23,
|
||||||
@ -665,7 +798,7 @@
|
|||||||
"h": 5,
|
"h": 5,
|
||||||
"w": 12,
|
"w": 12,
|
||||||
"x": 0,
|
"x": 0,
|
||||||
"y": 24
|
"y": 27
|
||||||
},
|
},
|
||||||
"hiddenSeries": false,
|
"hiddenSeries": false,
|
||||||
"id": 41,
|
"id": 41,
|
||||||
@ -758,7 +891,7 @@
|
|||||||
"h": 5,
|
"h": 5,
|
||||||
"w": 12,
|
"w": 12,
|
||||||
"x": 12,
|
"x": 12,
|
||||||
"y": 24
|
"y": 27
|
||||||
},
|
},
|
||||||
"hiddenSeries": false,
|
"hiddenSeries": false,
|
||||||
"id": 47,
|
"id": 47,
|
||||||
@ -850,7 +983,7 @@
|
|||||||
"h": 5,
|
"h": 5,
|
||||||
"w": 12,
|
"w": 12,
|
||||||
"x": 0,
|
"x": 0,
|
||||||
"y": 29
|
"y": 32
|
||||||
},
|
},
|
||||||
"hiddenSeries": false,
|
"hiddenSeries": false,
|
||||||
"id": 17,
|
"id": 17,
|
||||||
@ -943,7 +1076,7 @@
|
|||||||
"h": 5,
|
"h": 5,
|
||||||
"w": 12,
|
"w": 12,
|
||||||
"x": 0,
|
"x": 0,
|
||||||
"y": 34
|
"y": 37
|
||||||
},
|
},
|
||||||
"hiddenSeries": false,
|
"hiddenSeries": false,
|
||||||
"id": 19,
|
"id": 19,
|
||||||
@ -1044,6 +1177,33 @@
|
|||||||
"tagsQuery": "",
|
"tagsQuery": "",
|
||||||
"type": "query",
|
"type": "query",
|
||||||
"useTags": false
|
"useTags": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"allValue": null,
|
||||||
|
"current": {
|
||||||
|
"text": "All",
|
||||||
|
"value": [
|
||||||
|
"$__all"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"datasource": "${DS_PROMETHEUS}",
|
||||||
|
"definition": "label_values(zrepl_replication_filesystem_errors{job=\"$prom_job_name\"}, zrepl_job)",
|
||||||
|
"hide": 2,
|
||||||
|
"includeAll": true,
|
||||||
|
"label": "Zrepl Job Name",
|
||||||
|
"multi": true,
|
||||||
|
"name": "zrepl_job_name",
|
||||||
|
"options": [],
|
||||||
|
"query": "label_values(zrepl_replication_filesystem_errors{job=\"$prom_job_name\"}, zrepl_job)",
|
||||||
|
"refresh": 1,
|
||||||
|
"regex": "",
|
||||||
|
"skipUrlSync": false,
|
||||||
|
"sort": 1,
|
||||||
|
"tagValuesQuery": "",
|
||||||
|
"tags": [],
|
||||||
|
"tagsQuery": "",
|
||||||
|
"type": "query",
|
||||||
|
"useTags": false
|
||||||
}]
|
}]
|
||||||
},
|
},
|
||||||
"time": {
|
"time": {
|
||||||
|
@ -65,6 +65,7 @@ Additional changelog:
|
|||||||
The migration will ensure that only those old-format cursors are destroyed that have been superseded by new-format cursors.
|
The migration will ensure that only those old-format cursors are destroyed that have been superseded by new-format cursors.
|
||||||
|
|
||||||
* |feature| New option ``listen_freebind`` (tcp, tls, prometheus listener)
|
* |feature| New option ``listen_freebind`` (tcp, tls, prometheus listener)
|
||||||
|
* |feature| :issue:`341` Prometheus metric for failing replications + corresponding Grafana panel
|
||||||
* |feature| :issue:`265` transport/tcp: support for CIDR masks in client IP whitelist
|
* |feature| :issue:`265` transport/tcp: support for CIDR masks in client IP whitelist
|
||||||
* |feature| documented subcommand to generate ``bash`` and ``zsh`` completions
|
* |feature| documented subcommand to generate ``bash`` and ``zsh`` completions
|
||||||
* |feature| :issue:`307` ``chrome://trace`` -compatible activity tracing of zrepl daemon activity
|
* |feature| :issue:`307` ``chrome://trace`` -compatible activity tracing of zrepl daemon activity
|
||||||
|
@ -162,3 +162,30 @@ func (f *FilesystemReport) NextStep() *StepReport {
|
|||||||
func (f *StepReport) IsIncremental() bool {
|
func (f *StepReport) IsIncremental() bool {
|
||||||
return f.Info.From != ""
|
return f.Info.From != ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns, for the latest replication attempt,
|
||||||
|
// 0 if there have not been any replication attempts,
|
||||||
|
// -1 if the replication failed while enumerating file systems
|
||||||
|
// N if N filesystems could not not be replicated successfully
|
||||||
|
func (r *Report) GetFailedFilesystemsCountInLatestAttempt() int {
|
||||||
|
|
||||||
|
if len(r.Attempts) == 0 {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
a := r.Attempts[len(r.Attempts)-1]
|
||||||
|
switch a.State {
|
||||||
|
case AttemptPlanningError:
|
||||||
|
return -1
|
||||||
|
case AttemptFanOutError:
|
||||||
|
var count int
|
||||||
|
for _, f := range a.Filesystems {
|
||||||
|
if f.Error() != nil {
|
||||||
|
count++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return count
|
||||||
|
default:
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user