mirror of
https://github.com/zrepl/zrepl.git
synced 2024-11-22 08:23:50 +01:00
bc5e1ede04
This PR adds a Prometheus counter called `zrepl_zfs_list_unmatched_user_specified_dataset_count`. Monitor for increases of the counter to detect filesystem filter rules that have no effect because they don't match any local filesystem.

An example use case for this is the following story:

1. Someone sets up zrepl with a `filesystems` filter for `zroot/pg14<`.
2. During the upgrade to Postgres 15, they rename the dataset to `zroot/pg15`, but forget to update the zrepl `filesystems` filter.
3. zrepl will not snapshot / replicate the `zroot/pg15<` datasets.

Since `filesystems` rules are always evaluated on the side that has the datasets, we can smuggle this functionality into the `zfs` module's `ZFSList` function that is used by all jobs with a `filesystems` filter.

Dashboard changes:

- histogram with increase in $__interval, one row per job
- table with increase in $__range
- explainer text box, so people know what the previous two are about

We had to re-arrange some panels, hence the Git diff isn't great.

closes https://github.com/zrepl/zrepl/pull/653

Co-authored-by: Christian Schwarz <me@cschwarz.com>
Co-authored-by: Goran Mekić <meka@tilda.center>
67 lines
2.5 KiB
Go
67 lines
2.5 KiB
Go
package zfs
|
|
|
|
import "github.com/prometheus/client_golang/prometheus"
|
|
|
|
var prom struct {
|
|
ZFSListFilesystemVersionDuration *prometheus.HistogramVec
|
|
ZFSSnapshotDuration *prometheus.HistogramVec
|
|
ZFSBookmarkDuration *prometheus.HistogramVec
|
|
ZFSDestroyDuration *prometheus.HistogramVec
|
|
ZFSListUnmatchedUserSpecifiedDatasetCount *prometheus.GaugeVec
|
|
}
|
|
|
|
func init() {
|
|
prom.ZFSListFilesystemVersionDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
|
|
Namespace: "zrepl",
|
|
Subsystem: "zfs",
|
|
Name: "list_filesystem_versions_duration",
|
|
Help: "Seconds it took for listing the versions of a given filesystem",
|
|
}, []string{"filesystem"})
|
|
prom.ZFSSnapshotDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
|
|
Namespace: "zrepl",
|
|
Subsystem: "zfs",
|
|
Name: "snapshot_duration",
|
|
Help: "Seconds it took to create a snapshot a given filesystem",
|
|
}, []string{"filesystem"})
|
|
prom.ZFSBookmarkDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
|
|
Namespace: "zrepl",
|
|
Subsystem: "zfs",
|
|
Name: "bookmark_duration",
|
|
Help: "Duration it took to bookmark a given snapshot",
|
|
}, []string{"filesystem"})
|
|
prom.ZFSDestroyDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
|
|
Namespace: "zrepl",
|
|
Subsystem: "zfs",
|
|
Name: "destroy_duration",
|
|
Help: "Duration it took to destroy a dataset",
|
|
}, []string{"dataset_type", "filesystem"})
|
|
prom.ZFSListUnmatchedUserSpecifiedDatasetCount = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
|
Namespace: "zrepl",
|
|
Subsystem: "zfs",
|
|
Name: "list_unmatched_user_specified_dataset_count",
|
|
Help: "When evaluating a DatsetFilter against zfs list output, this counter " +
|
|
"is incremented for every DatasetFilter rule that did not match any " +
|
|
"filesystem name in the zfs list output. Monitor for increases to detect filesystem " +
|
|
"filter rules that have no effect because they don't match any local filesystem.",
|
|
}, []string{"jobid"})
|
|
}
|
|
|
|
func PrometheusRegister(registry prometheus.Registerer) error {
|
|
if err := registry.Register(prom.ZFSListFilesystemVersionDuration); err != nil {
|
|
return err
|
|
}
|
|
if err := registry.Register(prom.ZFSBookmarkDuration); err != nil {
|
|
return err
|
|
}
|
|
if err := registry.Register(prom.ZFSSnapshotDuration); err != nil {
|
|
return err
|
|
}
|
|
if err := registry.Register(prom.ZFSDestroyDuration); err != nil {
|
|
return err
|
|
}
|
|
if err := registry.Register(prom.ZFSListUnmatchedUserSpecifiedDatasetCount); err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|