From bc5e1ede04a02b89f81d12f1cfaf01f04f4497e0 Mon Sep 17 00:00:00 2001 From: Goran Mekic <107943856+gmekicaxcient@users.noreply.github.com> Date: Tue, 2 May 2023 22:13:52 +0200 Subject: [PATCH] metric to detect filesystems rules that don't match any local dataset (#653) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR adds a Prometheus counter called `zrepl_zfs_list_unmatched_user_specified_dataset_count`. Monitor for increases of the counter to detect filesystem filter rules that have no effect because they don't match any local filesystem. An example use case for this is the following story: 1. Someone sets up zrepl with `filesystems` filter for `zroot/pg14<`. 2. During the upgrade to Postgres 15, they rename the dataset to `zroot/pg15`, but forget to update the zrepl `filesystems` filter. 3. zrepl will not snapshot / replicate the `zroot/pg15<` datasets. Since `filesystems` rules are always evaluated on the side that has the datasets, we can smuggle this functionality into the `zfs` module's `ZFSList` function that is used by all jobs with a `filesystems` filter. Dashboard changes: - histogram with increase in $__interval, one row per job - table with increase in $__range - explainer text box, so, people know what the previous two are about We had to re-arrange some panels, hence the Git diff isn't great. closes https://github.com/zrepl/zrepl/pull/653 Co-authored-by: Christian Schwarz Co-authored-by: Goran Mekić --- daemon/filters/fsmapfilter.go | 8 + dist/grafana/grafana-prometheus-zrepl.json | 2735 +++++++++++--------- endpoint/endpoint.go | 7 + zfs/mapping.go | 18 +- zfs/prometheus.go | 21 +- zfs/zfscmd/zfscmd_context.go | 4 + 6 files changed, 1536 insertions(+), 1257 deletions(-) diff --git a/daemon/filters/fsmapfilter.go b/daemon/filters/fsmapfilter.go index 9def7b3..b28da20 100644 --- a/daemon/filters/fsmapfilter.go +++ b/daemon/filters/fsmapfilter.go @@ -160,6 +160,14 @@ func (m DatasetMapFilter) Filter(p *zfs.DatasetPath) (pass bool, err error) { return } +func (m DatasetMapFilter) UserSpecifiedDatasets() (datasets zfs.UserSpecifiedDatasetsSet) { + datasets = make(zfs.UserSpecifiedDatasetsSet) + for i := range m.entries { + datasets[m.entries[i].path.ToString()] = true + } + return +} + // Construct a new filter-only DatasetMapFilter from a mapping // The new filter allows exactly those paths that were not forbidden by the mapping. func (m DatasetMapFilter) InvertedFilter() (inv *DatasetMapFilter, err error) { diff --git a/dist/grafana/grafana-prometheus-zrepl.json b/dist/grafana/grafana-prometheus-zrepl.json index ba5ed47..e1564af 100644 --- a/dist/grafana/grafana-prometheus-zrepl.json +++ b/dist/grafana/grafana-prometheus-zrepl.json @@ -1,1282 +1,1513 @@ { - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "Prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__elements": {}, - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "9.3.6" - }, - { - "type": "panel", - "id": "graph", - "name": "Graph (old)", - "version": "" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "stat", - "name": "Stat", - "version": "" - }, - { - "type": "panel", - "id": "text", - "name": "Text", - "version": "" - } - ], - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": { - "type": "datasource", - "uid": "grafana" - }, - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "target": { - "limit": 100, - "matchAny": false, - "tags": [], - "type": "dashboard" - }, - "type": "dashboard" - } - ] + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "9.3.6" }, - "editable": true, - "fiscalYearStartMonth": 0, - "graphTooltip": 1, - "id": null, - "links": [], - "liveNow": false, - "panels": [ + { + "type": "panel", + "id": "graph", + "name": "Graph (old)", + "version": "" + }, + { + "type": "panel", + "id": "heatmap", + "name": "Heatmap", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "table", + "name": "Table", + "version": "" + }, + { + "type": "panel", + "id": "text", + "name": "Text", + "version": "" + } + ], + "annotations": { + "list": [ { + "builtIn": 1, "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "type": "datasource", + "uid": "grafana" }, - "gridPos": { - "h": 10, - "w": 24, - "x": 0, - "y": 0 + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" }, - "id": 35, - "options": { - "code": { - "language": "plaintext", - "showLineNumbers": false, - "showMiniMap": false - }, - "content": "# zrepl Prometheus Metrics\n\nzrepl exposes Prometheus metrics and ships with this Grafana dashboard.\nThe exported metrics are suitable for health checks:\n\n* The log should generally be warning & error-free\n * The `Log Messages that require attention` graph visualizes log message at levels that generally indicate problems.\n* The number of goroutines should not grow unboundedly over time.\n * During replication, the number of goroutines can be way higher than during idle time.\n * If the goroutine count grows with each replication, there is clearly a goroutine leak. Please open a bug report.\n* Memory consumption should not grow unboundedly over time.\n * Note that the Go runtime pre-allocates some of its heap from the OS.\n * zrepl actually uses much less memory than allocated from the OS.\n * Since Go 1.11, Go pre-allocates more aggressively.\n* Monitor that some data is replicated, although that metric does not guarantee that replication was successful.\n\n**In general, note that the exported metrics are not stable unless declared otherwise.**", - "mode": "markdown" - }, - "pluginVersion": "9.3.6", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "refId": "A" - } - ], - "title": "Panel Title", - "type": "text" + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 35, + "options": { + "code": { + "language": "plaintext", + "showLineNumbers": false, + "showMiniMap": false }, - "description": "Number of filesystems that failed replications", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [ - { - "options": { - "0": { - "text": "All OK" - }, - "-1": { - "text": "All failed" - } + "content": "# zrepl Prometheus Metrics\n\nzrepl exposes Prometheus metrics and ships with this Grafana dashboard.\nThe exported metrics are suitable for health checks:\n\n* The log should generally be warning & error-free\n * The `Log Messages that require attention` graph visualizes log message at levels that generally indicate problems.\n* In most setups, there shouldn't be any unmatched filesystem filter rules.\n* The number of goroutines should not grow unboundedly over time.\n * During replication, the number of goroutines can be way higher than during idle time.\n * If the goroutine count grows with each replication, there is clearly a goroutine leak. Please open a bug report.\n* Memory consumption should not grow unboundedly over time.\n * Note that the Go runtime pre-allocates some of its heap from the OS.\n * zrepl actually uses much less memory than allocated from the OS.\n * Since Go 1.11, Go pre-allocates more aggressively.\n* Monitor that some data is replicated, although that metric does not guarantee that replication was successful.\n\n**In general, note that the exported metrics are not stable unless declared otherwise.**", + "mode": "markdown" + }, + "pluginVersion": "9.3.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "refId": "A" + } + ], + "title": "Panel Title", + "type": "text" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Number of filesystems that failed replications", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "0": { + "text": "All OK" }, - "type": "value" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "#bf1b00", - "value": null - }, - { - "color": "#508642", - "value": 0 - }, - { - "color": "#bf1b00", - "value": 1 + "-1": { + "text": "All failed" } - ] - }, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "h": 3, - "w": 24, - "x": 0, - "y": 10 - }, - "id": 50, - "links": [], - "maxDataPoints": 100, - "options": { - "colorMode": "background", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "mean" - ], - "fields": "/^__name__$/", - "values": false - }, - "textMode": "auto" - }, - "pluginVersion": "9.3.6", - "repeat": "zrepl_job_name", - "repeatDirection": "h", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "expr": "zrepl_replication_filesystem_errors{job=\"$prom_job_name\",zrepl_job=\"$zrepl_job_name\"}", - "format": "time_series", - "groupBy": [ + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ { - "params": [ - "$__interval" - ], - "type": "time" + "color": "#bf1b00", + "value": null }, { - "params": [ - "null" - ], - "type": "fill" + "color": "#508642", + "value": 0 + }, + { + "color": "#bf1b00", + "value": 1 } - ], - "instant": true, - "interval": "", - "intervalFactor": 1, - "legendFormat": "", - "orderByTime": "ASC", - "policy": "default", - "refId": "A", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - } - ], - "title": "Failed replications $zrepl_job_name", - "type": "stat" - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "links": [] + ] }, - "overrides": [] + "unit": "none" }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 4, - "w": 12, - "x": 0, - "y": 13 - }, - "hiddenSeries": false, - "id": 48, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 24, + "x": 0, + "y": 10 + }, + "id": 50, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "/^__name__$/", "values": false }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "9.3.6", - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "expr": "sgn(zrepl_start_time{job='$prom_job_name'})", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{instance}}@version={{raw}}", - "refId": "A" - } - ], - "thresholds": [], - "timeRegions": [], - "title": "zrepl Instances Up", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "logBase": 1, - "max": "5", - "min": "0", - "show": true - }, - { - "format": "short", - "logBase": 1, - "show": true - } - ], - "yaxis": { - "align": false - } + "textMode": "auto" }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 5, - "w": 12, - "x": 12, - "y": 13 - }, - "hiddenSeries": false, - "id": 44, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "9.3.6", - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "expr": "zrepl_trace_active_tasks", - "format": "time_series", - "intervalFactor": 1, - "refId": "A" - } - ], - "thresholds": [], - "timeRegions": [], - "title": "active tasks tracked by the zrepl trace module (should not grow unboundedly)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "logBase": 1, - "show": true - }, - { - "format": "short", - "logBase": 1, - "show": true - } - ], - "yaxis": { - "align": false - } - }, - { - "aliasColors": {}, - "bars": true, - "dashLength": 10, - "dashes": false, - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 5, - "w": 12, - "x": 0, - "y": 17 - }, - "hiddenSeries": false, - "id": 42, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": false, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "9.3.6", - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "/replicated bytes in last.*/", - "yaxis": 2 - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "expr": "sum(increase(zrepl_replication_bytes_replicated{job='$prom_job_name'}[1d])) by (zrepl_job)", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "1d", - "intervalFactor": 1, - "legendFormat": "zrepl_job={{zrepl_job}}", - "refId": "B" - } - ], - "thresholds": [], - "timeRegions": [], - "title": "Daily Replication Data Volume", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "logBase": 1, - "min": "0", - "show": true - }, - { - "format": "bytes", - "logBase": 1, - "show": false - } - ], - "yaxis": { - "align": false - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 4, - "w": 12, - "x": 12, - "y": 18 - }, - "hiddenSeries": false, - "id": 22, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "9.3.6", - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "expr": "increase(zrepl_daemon_log_entries{job='$prom_job_name',level=~'warn|error'}[1m])", - "format": "time_series", - "intervalFactor": 1, - "refId": "A" - } - ], - "thresholds": [], - "timeRegions": [], - "title": "Log Messages that require attention", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "logBase": 1, - "min": "0", - "show": true - }, - { - "format": "short", - "logBase": 1, - "show": true - } - ], - "yaxis": { - "align": false - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 5, - "w": 12, - "x": 0, - "y": 22 - }, - "hiddenSeries": false, - "id": 33, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "9.3.6", - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "/replicated bytes in last.*/", - "yaxis": 2 - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "expr": "sum(rate(zrepl_replication_bytes_replicated{job='$prom_job_name'}[10m])) by (zrepl_job)", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "replication data ratein last 10min zrepl_job={{zrepl_job}}te zrepl_job={{zrepl_job}}", - "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "expr": "sum(increase(zrepl_replication_bytes_replicated{job='$prom_job_name'}[10m])) by (zrepl_job)", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "replicated bytes in last 10min zrepl_job={{zrepl_job}}", - "refId": "B" - } - ], - "thresholds": [], - "timeRegions": [], - "title": "Replication Data Rate and Volume(over last 10min)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "Bps", - "logBase": 1, - "show": true - }, - { - "format": "bytes", - "logBase": 1, - "show": true - } - ], - "yaxis": { - "align": false - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 5, - "w": 12, - "x": 12, - "y": 22 - }, - "hiddenSeries": false, - "id": 23, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "9.3.6", - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "expr": "sum(increase(zrepl_daemon_log_entries{job='$prom_job_name',zrepl_job=~\"^[^_].*\"}[1m])) by (instance,zrepl_job)", - "format": "time_series", - "intervalFactor": 1, - "refId": "A" - } - ], - "thresholds": [], - "timeRegions": [], - "title": "Log Activity (without internal jobs)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "logBase": 1, - "min": "0", - "show": true - }, - { - "format": "short", - "logBase": 1, - "show": true - } - ], - "yaxis": { - "align": false - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "custom": {} - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 5, - "w": 12, - "x": 0, - "y": 27 - }, - "hiddenSeries": false, - "id": 41, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "expr": "go_memstats_alloc_bytes{job='$prom_job_name'}", - "format": "time_series", - "intervalFactor": 1, - "refId": "A" - } - ], - "thresholds": [], - "timeRegions": [], - "title": "Go Memory Allocations (should not grow unboundedly)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "logBase": 1, - "show": true - }, - { - "format": "short", - "label": "", - "logBase": 1, - "show": true - } - ], - "yaxis": { - "align": false - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "custom": {} - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 5, - "w": 12, - "x": 12, - "y": 27 - }, - "hiddenSeries": false, - "id": 47, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "expr": "zrepl_endpoint_abstractions_cache_entry_count", - "format": "time_series", - "intervalFactor": 1, - "refId": "A" - } - ], - "thresholds": [], - "timeRegions": [], - "title": "zfs abstractions cache entry count (should not be zero and not grow unboundedly)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "logBase": 1, - "show": true - }, - { - "format": "short", - "logBase": 1, - "show": true - } - ], - "yaxis": { - "align": false - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "custom": {} - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 5, - "w": 12, - "x": 0, - "y": 32 - }, - "hiddenSeries": false, - "id": 17, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "expr": "go_memstats_sys_bytes{job='$prom_job_name'}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "refId": "A" - } - ], - "thresholds": [], - "timeRegions": [], - "title": "Memory Allocated by the Go runtime from the OS (should not grow unboundedly)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "logBase": 1, - "min": "0", - "show": true - }, - { - "format": "short", - "logBase": 1, - "show": true - } - ], - "yaxis": { - "align": false - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "fieldConfig": { - "defaults": { - "custom": {} - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 5, - "w": 12, - "x": 0, - "y": 37 - }, - "hiddenSeries": false, - "id": 19, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "expr": "go_goroutines{job='$prom_job_name'}", - "format": "time_series", - "intervalFactor": 1, - "refId": "A" - } - ], - "thresholds": [], - "timeRegions": [], - "title": "number of goroutines (should not grow unboundedly)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "mode": "time", - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "logBase": 1, - "min": "0", - "show": true - }, - { - "format": "short", - "logBase": 1, - "show": true - } - ], - "yaxis": { - "align": false - } - } - ], - "refresh": "5s", - "schemaVersion": 37, - "style": "dark", - "tags": [], - "templating": { - "list": [ + "pluginVersion": "9.3.6", + "repeat": "zrepl_job_name", + "repeatDirection": "h", + "targets": [ { - "current": {}, "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "definition": "", - "hide": 0, - "includeAll": false, - "label": "Prometheus Job Name", - "multi": false, - "name": "prom_job_name", - "options": [], - "query": { - "query": "label_values(up, job)", - "refId": "Prometheus-prom_job_name-Variable-Query" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 1, - "tagValuesQuery": "", - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "current": {}, - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, - "definition": "label_values(zrepl_replication_filesystem_errors{job=\"$prom_job_name\"}, zrepl_job)", - "hide": 2, - "includeAll": true, - "label": "Zrepl Job Name", - "multi": true, - "name": "zrepl_job_name", - "options": [], - "query": { - "query": "label_values(zrepl_replication_filesystem_errors{job=\"$prom_job_name\"}, zrepl_job)", - "refId": "Prometheus-zrepl_job_name-Variable-Query" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 1, - "tagValuesQuery": "", - "tagsQuery": "", - "type": "query", - "useTags": false + "expr": "zrepl_replication_filesystem_errors{job=\"$prom_job_name\",zrepl_job=\"$zrepl_job_name\"}", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "orderByTime": "ASC", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] } - ] - }, - "time": { - "from": "now-2d", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] + "title": "Failed replications $zrepl_job_name", + "type": "stat" }, - "timezone": "", + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 4, + "w": 12, + "x": 0, + "y": 13 + }, + "hiddenSeries": false, + "id": 48, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "9.3.6", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "sgn(zrepl_start_time{job='$prom_job_name'})", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}@version={{raw}}", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "zrepl Instances Up", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "max": "5", + "min": "0", + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + "y": 13 + }, + "hiddenSeries": false, + "id": 44, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "9.3.6", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "zrepl_trace_active_tasks", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "active tasks tracked by the zrepl trace module (should not grow unboundedly)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "gridPos": { + "h": 3, + "w": 5, + "x": 0, + "y": 17 + }, + "id": 56, + "options": { + "code": { + "language": "plaintext", + "showLineNumbers": false, + "showMiniMap": false + }, + "content": "### Unmatched Filesystems Rules\n\nFilesystem filter rules which mention datasets that didn't exist in the `zfs list` output.", + "mode": "markdown" + }, + "pluginVersion": "9.3.6", + "type": "text" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 7, + "x": 5, + "y": 17 + }, + "id": 52, + "maxDataPoints": 10, + "options": { + "calculate": false, + "cellGap": 1, + "color": { + "exponent": 0.5, + "fill": "dark-red", + "min": 0, + "mode": "opacity", + "reverse": false, + "scale": "linear", + "scheme": "Oranges", + "steps": 2 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": false + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "show": true, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false + } + }, + "pluginVersion": "9.3.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "increase(zrepl_zfs_list_unmatched_user_specified_dataset_count{job=\"$prom_job_name\"}[$__interval])", + "format": "time_series", + "legendFormat": "{{jobid}}", + "range": true, + "refId": "A" + } + ], + "title": "Occurences increase[$__interval]", + "transformations": [], + "type": "heatmap" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 4, + "w": 12, + "x": 12, + "y": 18 + }, + "hiddenSeries": false, + "id": 22, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "9.3.6", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "increase(zrepl_daemon_log_entries{job='$prom_job_name',level=~'warn|error'}[1m])", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Log Messages that require attention", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "min": "0", + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "displayMode": "color-background-solid", + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "dark-red", + "value": 0.01 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "jobid" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "transparent", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 5, + "w": 5, + "x": 0, + "y": 20 + }, + "id": 54, + "maxDataPoints": 20, + "options": { + "footer": { + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "Value" + } + ] + }, + "pluginVersion": "9.3.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "increase(zrepl_zfs_list_unmatched_user_specified_dataset_count{job=\"$prom_job_name\"}[$__range])", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "{{jobid}}", + "range": false, + "refId": "A" + } + ], + "title": "Occurences [Dashboard Range]", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Value": false, + "instance": true, + "job": true + }, + "indexByName": {}, + "renameByName": {} + } + } + ], + "type": "table" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + "y": 22 + }, + "hiddenSeries": false, + "id": 23, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "9.3.6", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "sum(increase(zrepl_daemon_log_entries{job='$prom_job_name',zrepl_job=~\"^[^_].*\"}[1m])) by (instance,zrepl_job)", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Log Activity (without internal jobs)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "min": "0", + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 25 + }, + "hiddenSeries": false, + "id": 42, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": false, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "9.3.6", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/replicated bytes in last.*/", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "sum(increase(zrepl_replication_bytes_replicated{job='$prom_job_name'}[1d])) by (zrepl_job)", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "1d", + "intervalFactor": 1, + "legendFormat": "zrepl_job={{zrepl_job}}", + "refId": "B" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Daily Replication Data Volume", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "min": "0", + "show": true + }, + { + "format": "bytes", + "logBase": 1, + "show": false + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + "y": 27 + }, + "hiddenSeries": false, + "id": 47, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "9.3.6", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "zrepl_endpoint_abstractions_cache_entry_count", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "zfs abstractions cache entry count (should not be zero and not grow unboundedly)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 30 + }, + "hiddenSeries": false, + "id": 33, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "9.3.6", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/replicated bytes in last.*/", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "sum(rate(zrepl_replication_bytes_replicated{job='$prom_job_name'}[10m])) by (zrepl_job)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "replication data ratein last 10min zrepl_job={{zrepl_job}}te zrepl_job={{zrepl_job}}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "sum(increase(zrepl_replication_bytes_replicated{job='$prom_job_name'}[10m])) by (zrepl_job)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "replicated bytes in last 10min zrepl_job={{zrepl_job}}", + "refId": "B" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Replication Data Rate and Volume(over last 10min)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "logBase": 1, + "show": true + }, + { + "format": "bytes", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 35 + }, + "hiddenSeries": false, + "id": 41, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "9.3.6", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "go_memstats_alloc_bytes{job='$prom_job_name'}", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Go Memory Allocations (should not grow unboundedly)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "format": "short", + "label": "", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 40 + }, + "hiddenSeries": false, + "id": 17, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "9.3.6", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "go_memstats_sys_bytes{job='$prom_job_name'}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Memory Allocated by the Go runtime from the OS (should not grow unboundedly)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "min": "0", + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 45 + }, + "hiddenSeries": false, + "id": 19, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "9.3.6", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "expr": "go_goroutines{job='$prom_job_name'}", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "number of goroutines (should not grow unboundedly)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "min": "0", + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + } + ], + "refresh": "5s", + "schemaVersion": 37, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "", + "hide": 0, + "includeAll": false, + "label": "Prometheus Job Name", + "multi": false, + "name": "prom_job_name", + "options": [], + "query": { + "query": "label_values(up, job)", + "refId": "Prometheus-prom_job_name-Variable-Query" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "label_values(zrepl_replication_filesystem_errors{job=\"$prom_job_name\"}, zrepl_job)", + "hide": 2, + "includeAll": true, + "label": "Zrepl Job Name", + "multi": true, + "name": "zrepl_job_name", + "options": [], + "query": { + "query": "label_values(zrepl_replication_filesystem_errors{job=\"$prom_job_name\"}, zrepl_job)", + "refId": "Prometheus-zrepl_job_name-Variable-Query" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-2d", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", "title": "zrepl", "uid": "etQuvBnGz", "version": 1, - "weekStart": "" - } + "weekStart": "" +} diff --git a/endpoint/endpoint.go b/endpoint/endpoint.go index b9c1dbc..dcc9d9f 100644 --- a/endpoint/endpoint.go +++ b/endpoint/endpoint.go @@ -428,6 +428,7 @@ func (p *Sender) Receive(ctx context.Context, r *pdu.ReceiveReq, _ io.ReadCloser type FSFilter interface { // FIXME unused Filter(path *zfs.DatasetPath) (pass bool, err error) + UserSpecifiedDatasets() zfs.UserSpecifiedDatasetsSet } // FIXME: can we get away without error types here? @@ -587,6 +588,12 @@ func (f subroot) Filter(p *zfs.DatasetPath) (pass bool, err error) { return p.HasPrefix(f.localRoot) && !p.Equal(f.localRoot), nil } +func (f subroot) UserSpecifiedDatasets() zfs.UserSpecifiedDatasetsSet { + return zfs.UserSpecifiedDatasetsSet{ + f.localRoot.ToString(): true, + } +} + func (f subroot) MapToLocal(fs string) (*zfs.DatasetPath, error) { p, err := zfs.NewDatasetPath(fs) if err != nil { diff --git a/zfs/mapping.go b/zfs/mapping.go index f414c9d..6be5632 100644 --- a/zfs/mapping.go +++ b/zfs/mapping.go @@ -3,12 +3,20 @@ package zfs import ( "context" "fmt" + + "github.com/zrepl/zrepl/zfs/zfscmd" ) type DatasetFilter interface { Filter(p *DatasetPath) (pass bool, err error) + // The caller owns the returned set. + // Implementations should return a copy. + UserSpecifiedDatasets() UserSpecifiedDatasetsSet } +// A set of dataset names that the user specified in the configuration file. +type UserSpecifiedDatasetsSet map[string]bool + // Returns a DatasetFilter that does not filter (passes all paths) func NoFilter() DatasetFilter { return noFilter{} @@ -18,7 +26,8 @@ type noFilter struct{} var _ DatasetFilter = noFilter{} -func (noFilter) Filter(p *DatasetPath) (pass bool, err error) { return true, nil } +func (noFilter) Filter(p *DatasetPath) (pass bool, err error) { return true, nil } +func (noFilter) UserSpecifiedDatasets() UserSpecifiedDatasetsSet { return nil } func ZFSListMapping(ctx context.Context, filter DatasetFilter) (datasets []*DatasetPath, err error) { res, err := ZFSListMappingProperties(ctx, filter, nil) @@ -61,6 +70,7 @@ func ZFSListMappingProperties(ctx context.Context, filter DatasetFilter, propert go ZFSListChan(ctx, rchan, properties, nil, "-r", "-t", "filesystem,volume") + unmatchedUserSpecifiedDatasets := filter.UserSpecifiedDatasets() datasets = make([]ZFSListMappingPropertiesResult, 0) for r := range rchan { @@ -74,6 +84,8 @@ func ZFSListMappingProperties(ctx context.Context, filter DatasetFilter, propert return } + delete(unmatchedUserSpecifiedDatasets, path.ToString()) + pass, filterErr := filter.Filter(path) if filterErr != nil { return nil, fmt.Errorf("error calling filter: %s", filterErr) @@ -87,5 +99,9 @@ func ZFSListMappingProperties(ctx context.Context, filter DatasetFilter, propert } + jobid := zfscmd.GetJobIDOrDefault(ctx, "__nojobid") + metric := prom.ZFSListUnmatchedUserSpecifiedDatasetCount.WithLabelValues(jobid) + metric.Add(float64(len(unmatchedUserSpecifiedDatasets))) + return } diff --git a/zfs/prometheus.go b/zfs/prometheus.go index 3934418..a1dbf15 100644 --- a/zfs/prometheus.go +++ b/zfs/prometheus.go @@ -3,10 +3,11 @@ package zfs import "github.com/prometheus/client_golang/prometheus" var prom struct { - ZFSListFilesystemVersionDuration *prometheus.HistogramVec - ZFSSnapshotDuration *prometheus.HistogramVec - ZFSBookmarkDuration *prometheus.HistogramVec - ZFSDestroyDuration *prometheus.HistogramVec + ZFSListFilesystemVersionDuration *prometheus.HistogramVec + ZFSSnapshotDuration *prometheus.HistogramVec + ZFSBookmarkDuration *prometheus.HistogramVec + ZFSDestroyDuration *prometheus.HistogramVec + ZFSListUnmatchedUserSpecifiedDatasetCount *prometheus.GaugeVec } func init() { @@ -34,6 +35,15 @@ func init() { Name: "destroy_duration", Help: "Duration it took to destroy a dataset", }, []string{"dataset_type", "filesystem"}) + prom.ZFSListUnmatchedUserSpecifiedDatasetCount = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "zrepl", + Subsystem: "zfs", + Name: "list_unmatched_user_specified_dataset_count", + Help: "When evaluating a DatsetFilter against zfs list output, this counter " + + "is incremented for every DatasetFilter rule that did not match any " + + "filesystem name in the zfs list output. Monitor for increases to detect filesystem " + + "filter rules that have no effect because they don't match any local filesystem.", + }, []string{"jobid"}) } func PrometheusRegister(registry prometheus.Registerer) error { @@ -49,5 +59,8 @@ func PrometheusRegister(registry prometheus.Registerer) error { if err := registry.Register(prom.ZFSDestroyDuration); err != nil { return err } + if err := registry.Register(prom.ZFSListUnmatchedUserSpecifiedDatasetCount); err != nil { + return err + } return nil } diff --git a/zfs/zfscmd/zfscmd_context.go b/zfs/zfscmd/zfscmd_context.go index 2497c87..9238d5d 100644 --- a/zfs/zfscmd/zfscmd_context.go +++ b/zfs/zfscmd/zfscmd_context.go @@ -19,6 +19,10 @@ func WithJobID(ctx context.Context, jobID string) context.Context { return context.WithValue(ctx, contextKeyJobID, jobID) } +func GetJobIDOrDefault(ctx context.Context, def string) string { + return getJobIDOrDefault(ctx, def) +} + func getJobIDOrDefault(ctx context.Context, def string) string { ret, ok := ctx.Value(contextKeyJobID).(string) if !ok {