diff --git a/dist/grafana/grafana-prometheus-zrepl-0.1.json b/dist/grafana/grafana-prometheus-zrepl-0.1.json
new file mode 100644
index 0000000..d0ce1a6
--- /dev/null
+++ b/dist/grafana/grafana-prometheus-zrepl-0.1.json
@@ -0,0 +1,595 @@
+{
+ "annotations": {
+ "list": [
+ {
+ "$$hashKey": "object:3351",
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": true,
+ "gnetId": null,
+ "graphTooltip": 1,
+ "id": 7,
+ "iteration": 1552746418826,
+ "links": [],
+ "panels": [
+ {
+ "content": "# zrepl Prometheus Metrics\n\nzrepl exposes Prometheus metrics and ships with this Grafana dashboard.\nThe exported metrics are suitable for health checks:\n\n* The log should generally be warning & error-free\n * The `Log Messages that require attention` graph visualizes log message counts indicating problems.\n* The number of goroutines should not grow unboundedly over time.\n * During replication, the number of goroutines can be way higher than during idle time.\n * If the goroutine count grows with each replication, there is clearly a goroutine leak. Please open a bug report.\n* The sys memory consumption should not grow unboundedly over time.\n * Note that the Go runtime pre-allocates some of its heap from the OS.\n * zrepl actually uses much less memory than allocated from the OS.\n * Since Go 1.11, Go pre-allocates more aggressively.\n* Monitor that some data is replicated, although that metric does not guarantee that replication was successful.\n\n**In general, note that the exported metrics are not stable unless declared otherwise.**",
+ "gridPos": {
+ "h": 9,
+ "w": 24,
+ "x": 0,
+ "y": 0
+ },
+ "id": 35,
+ "mode": "markdown",
+ "title": "Panel Title",
+ "type": "text"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 1,
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 0,
+ "y": 9
+ },
+ "id": 15,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "$$hashKey": "object:3436",
+ "expr": "up{job='$prom_job_name'}",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "zrepl Instances Up",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": "5",
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 1,
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 12,
+ "y": 9
+ },
+ "id": 22,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "increase(zrepl_daemon_log_entries{job='$prom_job_name',level=~'warn|error'}[$__interval])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Log Messages that require attention",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 1,
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 0,
+ "y": 18
+ },
+ "id": 33,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/replicated bytes in last.*/",
+ "yaxis": 2
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(rate(zrepl_replication_bytes_replicated{job='$prom_job_name'}[$__interval])) by (zrepl_job)",
+ "format": "time_series",
+ "hide": false,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "replication data rate zrepl_job={{zrepl_job}}",
+ "refId": "A"
+ },
+ {
+ "expr": "sum(increase(zrepl_replication_bytes_replicated{job='$prom_job_name'}[10m])) by (zrepl_job)",
+ "format": "time_series",
+ "hide": false,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "replicated bytes in last 10min zrepl_job={{zrepl_job}}",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Replication Data Rate and Volume(integrates last 10min)",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "Bps",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "bytes",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 1,
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 12,
+ "y": 18
+ },
+ "id": 23,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(increase(zrepl_daemon_log_entries{job='$prom_job_name',zrepl_job=~\"^[^_].*\"}[$__interval])) by (instance,zrepl_job)",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Log Activity (without internal jobs)",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 1,
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 0,
+ "y": 27
+ },
+ "id": 17,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "$$hashKey": "object:3535",
+ "expr": "go_memstats_sys_bytes{job='$prom_job_name'}",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 1,
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Memory Allocated by the Go runtime from the OS (should not grow unboundedly)",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fill": 1,
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 12,
+ "y": 27
+ },
+ "id": 19,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "go_goroutines{job='$prom_job_name'}",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "number of goroutines (should not grow unboundedly)",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ]
+ }
+ ],
+ "refresh": "1m",
+ "schemaVersion": 16,
+ "style": "dark",
+ "tags": [],
+ "templating": {
+ "list": [
+ {
+ "allValue": null,
+ "current": {
+ "text": "zrepl",
+ "value": "zrepl"
+ },
+ "datasource": "prometheus",
+ "hide": 0,
+ "includeAll": false,
+ "label": "Prometheus Job Name",
+ "multi": false,
+ "name": "prom_job_name",
+ "options": [],
+ "query": "label_values(up, job)",
+ "refresh": 1,
+ "regex": "",
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ }
+ ]
+ },
+ "time": {
+ "from": "now-2d",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "",
+ "title": "zrepl 0.1",
+ "uid": "xTljn4qmk",
+ "version": 6
+}
\ No newline at end of file
diff --git a/docs/configuration/monitoring.rst b/docs/configuration/monitoring.rst
index 8974ee3..f0e97dc 100644
--- a/docs/configuration/monitoring.rst
+++ b/docs/configuration/monitoring.rst
@@ -9,14 +9,20 @@ Monitoring endpoints are configured in the ``global.monitoring`` section of the
.. _monitoring-prometheus:
-Prometheus
-----------
+Prometheus & Grafana
+--------------------
zrepl can expose `Prometheus metrics `_ via HTTP.
The ``listen`` attribute is a `net.Listen `_ string for tcp, e.g. ``:9091`` or ``127.0.0.1:9091``.
-
The Prometheues monitoring job appears in the ``zrepl control`` job list and may be specified **at most once**.
-At the time of writing, there is no stability guarantee on the exported metrics.
+
+zrepl also ships with an importable `Grafana `_ dashboard that consumes the Prometheus metrics:
+see :repomasterlink:`dist/grafana`.
+The dashboard also contains some advice on which metrics are important to monitor.
+
+.. NOTE::
+
+ At the time of writing, there is no stability guarantee on the exported metrics.
::
diff --git a/docs/sphinxconf/conf.py b/docs/sphinxconf/conf.py
index 97f234d..83b4123 100644
--- a/docs/sphinxconf/conf.py
+++ b/docs/sphinxconf/conf.py
@@ -173,6 +173,7 @@ texinfo_documents = [
# http://www.sphinx-doc.org/en/stable/ext/extlinks.html
extlinks = {
'issue':('https://github.com/zrepl/zrepl/issues/%s', 'issue #'),
+ 'repomasterlink':('https://github.com/zrepl/zrepl/blob/master/%s', ''),
'sampleconf':('https://github.com/zrepl/zrepl/blob/master/config/samples%s', 'config/samples'),
'commit':('https://github.com/zrepl/zrepl/commit/%s', 'commit '),
}