From 056be1185d17a2b3e27eea23a4dee71386ab1c87 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Sat, 16 Mar 2019 15:48:37 +0100 Subject: [PATCH] dist: add grafana dashboard fixes #116 --- .../grafana/grafana-prometheus-zrepl-0.1.json | 595 ++++++++++++++++++ docs/configuration/monitoring.rst | 14 +- docs/sphinxconf/conf.py | 1 + 3 files changed, 606 insertions(+), 4 deletions(-) create mode 100644 dist/grafana/grafana-prometheus-zrepl-0.1.json diff --git a/dist/grafana/grafana-prometheus-zrepl-0.1.json b/dist/grafana/grafana-prometheus-zrepl-0.1.json new file mode 100644 index 0000000..d0ce1a6 --- /dev/null +++ b/dist/grafana/grafana-prometheus-zrepl-0.1.json @@ -0,0 +1,595 @@ +{ + "annotations": { + "list": [ + { + "$$hashKey": "object:3351", + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 1, + "id": 7, + "iteration": 1552746418826, + "links": [], + "panels": [ + { + "content": "# zrepl Prometheus Metrics\n\nzrepl exposes Prometheus metrics and ships with this Grafana dashboard.\nThe exported metrics are suitable for health checks:\n\n* The log should generally be warning & error-free\n * The `Log Messages that require attention` graph visualizes log message counts indicating problems.\n* The number of goroutines should not grow unboundedly over time.\n * During replication, the number of goroutines can be way higher than during idle time.\n * If the goroutine count grows with each replication, there is clearly a goroutine leak. 
Please open a bug report.\n* The sys memory consumption should not grow unboundedly over time.\n * Note that the Go runtime pre-allocates some of its heap from the OS.\n * zrepl actually uses much less memory than allocated from the OS.\n * Since Go 1.11, Go pre-allocates more aggressively.\n* Monitor that some data is replicated, although that metric does not guarantee that replication was successful.\n\n**In general, note that the exported metrics are not stable unless declared otherwise.**", + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 35, + "mode": "markdown", + "title": "Panel Title", + "type": "text" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 15, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "$$hashKey": "object:3436", + "expr": "up{job='$prom_job_name'}", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "zrepl Instances Up", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": "5", + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + 
"datasource": null, + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 22, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "increase(zrepl_daemon_log_entries{job='$prom_job_name',level=~'warn|error'}[$__interval])", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Log Messages that require attention", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 18 + }, + "id": 33, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/replicated bytes in last.*/", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(zrepl_replication_bytes_replicated{job='$prom_job_name'}[$__interval])) by 
(zrepl_job)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "replication data rate zrepl_job={{zrepl_job}}", + "refId": "A" + }, + { + "expr": "sum(increase(zrepl_replication_bytes_replicated{job='$prom_job_name'}[10m])) by (zrepl_job)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "replicated bytes in last 10min zrepl_job={{zrepl_job}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Replication Data Rate and Volume(integrates last 10min)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 18 + }, + "id": 23, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(zrepl_daemon_log_entries{job='$prom_job_name',zrepl_job=~\"^[^_].*\"}[$__interval])) by (instance,zrepl_job)", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Log Activity (without internal jobs)", + "tooltip": { + "shared": true, + 
"sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 27 + }, + "id": 17, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "$$hashKey": "object:3535", + "expr": "go_memstats_sys_bytes{job='$prom_job_name'}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Memory Allocated by the Go runtime from the OS (should not grow unboundedly)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 27 + }, + "id": 19, + "legend": { + "avg": false, + "current": 
false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_goroutines{job='$prom_job_name'}", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "number of goroutines (should not grow unboundedly)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "refresh": "1m", + "schemaVersion": 16, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": { + "text": "zrepl", + "value": "zrepl" + }, + "datasource": "prometheus", + "hide": 0, + "includeAll": false, + "label": "Prometheus Job Name", + "multi": false, + "name": "prom_job_name", + "options": [], + "query": "label_values(up, job)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-2d", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "zrepl 0.1", + "uid": "xTljn4qmk", + "version": 6 +} \ No newline at end of file diff --git 
a/docs/configuration/monitoring.rst b/docs/configuration/monitoring.rst index 8974ee3..f0e97dc 100644 --- a/docs/configuration/monitoring.rst +++ b/docs/configuration/monitoring.rst @@ -9,14 +9,20 @@ Monitoring endpoints are configured in the ``global.monitoring`` section of the .. _monitoring-prometheus: -Prometheus ----------- +Prometheus & Grafana +-------------------- zrepl can expose `Prometheus metrics <https://prometheus.io/docs/instrumenting/exposition_formats/>`_ via HTTP. The ``listen`` attribute is a `net.Listen <https://golang.org/pkg/net/#Listen>`_ string for tcp, e.g. ``:9091`` or ``127.0.0.1:9091``. - The Prometheues monitoring job appears in the ``zrepl control`` job list and may be specified **at most once**. -At the time of writing, there is no stability guarantee on the exported metrics. + +zrepl also ships with an importable `Grafana <https://grafana.com>`_ dashboard that consumes the Prometheus metrics: +see :repomasterlink:`dist/grafana`. +The dashboard also contains some advice on which metrics are important to monitor. + +.. NOTE:: + + At the time of writing, there is no stability guarantee on the exported metrics. :: diff --git a/docs/sphinxconf/conf.py b/docs/sphinxconf/conf.py index 97f234d..83b4123 100644 --- a/docs/sphinxconf/conf.py +++ b/docs/sphinxconf/conf.py @@ -173,6 +173,7 @@ texinfo_documents = [ # http://www.sphinx-doc.org/en/stable/ext/extlinks.html extlinks = { 'issue':('https://github.com/zrepl/zrepl/issues/%s', 'issue #'), + 'repomasterlink':('https://github.com/zrepl/zrepl/blob/master/%s', ''), 'sampleconf':('https://github.com/zrepl/zrepl/blob/master/config/samples%s', 'config/samples'), 'commit':('https://github.com/zrepl/zrepl/commit/%s', 'commit '), }