mirror of
https://github.com/zrepl/zrepl.git
synced 2024-11-21 16:03:32 +01:00
metric to detect filesystems rules that don't match any local dataset (#653)
This PR adds a Prometheus counter called `zrepl_zfs_list_unmatched_user_specified_dataset_count`. Monitor for increases of the counter to detect filesystem filter rules that have no effect because they don't match any local filesystem. An example use case for this is the following story: 1. Someone sets up zrepl with `filesystems` filter for `zroot/pg14<`. 2. During the upgrade to Postgres 15, they rename the dataset to `zroot/pg15`, but forget to update the zrepl `filesystems` filter. 3. zrepl will not snapshot / replicate the `zroot/pg15<` datasets. Since `filesystems` rules are always evaluated on the side that has the datasets, we can smuggle this functionality into the `zfs` module's `ZFSList` function that is used by all jobs with a `filesystems` filter. Dashboard changes: - histogram with increase in $__interval, one row per job - table with increase in $__range - explainer text box, so, people know what the previous two are about We had to re-arrange some panels, hence the Git diff isn't great. closes https://github.com/zrepl/zrepl/pull/653 Co-authored-by: Christian Schwarz <me@cschwarz.com> Co-authored-by: Goran Mekić <meka@tilda.center>
This commit is contained in:
parent
2b3daaf9f1
commit
bc5e1ede04
@ -160,6 +160,14 @@ func (m DatasetMapFilter) Filter(p *zfs.DatasetPath) (pass bool, err error) {
|
||||
return
|
||||
}
|
||||
|
||||
func (m DatasetMapFilter) UserSpecifiedDatasets() (datasets zfs.UserSpecifiedDatasetsSet) {
|
||||
datasets = make(zfs.UserSpecifiedDatasetsSet)
|
||||
for i := range m.entries {
|
||||
datasets[m.entries[i].path.ToString()] = true
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Construct a new filter-only DatasetMapFilter from a mapping
|
||||
// The new filter allows exactly those paths that were not forbidden by the mapping.
|
||||
func (m DatasetMapFilter) InvertedFilter() (inv *DatasetMapFilter, err error) {
|
||||
|
2735
dist/grafana/grafana-prometheus-zrepl.json
vendored
2735
dist/grafana/grafana-prometheus-zrepl.json
vendored
File diff suppressed because it is too large
Load Diff
@ -428,6 +428,7 @@ func (p *Sender) Receive(ctx context.Context, r *pdu.ReceiveReq, _ io.ReadCloser
|
||||
|
||||
// FSFilter selects which local filesystems (datasets) a job operates on.
type FSFilter interface { // FIXME unused
	// Filter reports whether the dataset at path passes the filter.
	Filter(path *zfs.DatasetPath) (pass bool, err error)
	// UserSpecifiedDatasets returns the dataset names the user specified in
	// the configuration file; used to detect filter rules that match no
	// local filesystem.
	UserSpecifiedDatasets() zfs.UserSpecifiedDatasetsSet
}
|
||||
|
||||
// FIXME: can we get away without error types here?
|
||||
@ -587,6 +588,12 @@ func (f subroot) Filter(p *zfs.DatasetPath) (pass bool, err error) {
|
||||
return p.HasPrefix(f.localRoot) && !p.Equal(f.localRoot), nil
|
||||
}
|
||||
|
||||
func (f subroot) UserSpecifiedDatasets() zfs.UserSpecifiedDatasetsSet {
|
||||
return zfs.UserSpecifiedDatasetsSet{
|
||||
f.localRoot.ToString(): true,
|
||||
}
|
||||
}
|
||||
|
||||
func (f subroot) MapToLocal(fs string) (*zfs.DatasetPath, error) {
|
||||
p, err := zfs.NewDatasetPath(fs)
|
||||
if err != nil {
|
||||
|
@ -3,12 +3,20 @@ package zfs
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
"github.com/zrepl/zrepl/zfs/zfscmd"
|
||||
)
|
||||
|
||||
// DatasetFilter selects which datasets from `zfs list` output are passed
// through (see ZFSListMapping / ZFSListMappingProperties).
type DatasetFilter interface {
	// Filter reports whether dataset path p passes the filter.
	Filter(p *DatasetPath) (pass bool, err error)
	// UserSpecifiedDatasets returns the dataset names the user specified
	// in the configuration file.
	// The caller owns the returned set.
	// Implementations should return a copy.
	UserSpecifiedDatasets() UserSpecifiedDatasetsSet
}
|
||||
|
||||
// UserSpecifiedDatasetsSet is a set of dataset names that the user specified
// in the configuration file. Keys are dataset paths in string form; a key's
// presence (value true) marks membership.
type UserSpecifiedDatasetsSet map[string]bool
|
||||
|
||||
// Returns a DatasetFilter that does not filter (passes all paths)
|
||||
func NoFilter() DatasetFilter {
|
||||
return noFilter{}
|
||||
@ -18,7 +26,8 @@ type noFilter struct{}
|
||||
|
||||
var _ DatasetFilter = noFilter{}
|
||||
|
||||
func (noFilter) Filter(p *DatasetPath) (pass bool, err error) { return true, nil }
|
||||
func (noFilter) Filter(p *DatasetPath) (pass bool, err error) { return true, nil }
|
||||
func (noFilter) UserSpecifiedDatasets() UserSpecifiedDatasetsSet { return nil }
|
||||
|
||||
func ZFSListMapping(ctx context.Context, filter DatasetFilter) (datasets []*DatasetPath, err error) {
|
||||
res, err := ZFSListMappingProperties(ctx, filter, nil)
|
||||
@ -61,6 +70,7 @@ func ZFSListMappingProperties(ctx context.Context, filter DatasetFilter, propert
|
||||
|
||||
go ZFSListChan(ctx, rchan, properties, nil, "-r", "-t", "filesystem,volume")
|
||||
|
||||
unmatchedUserSpecifiedDatasets := filter.UserSpecifiedDatasets()
|
||||
datasets = make([]ZFSListMappingPropertiesResult, 0)
|
||||
for r := range rchan {
|
||||
|
||||
@ -74,6 +84,8 @@ func ZFSListMappingProperties(ctx context.Context, filter DatasetFilter, propert
|
||||
return
|
||||
}
|
||||
|
||||
delete(unmatchedUserSpecifiedDatasets, path.ToString())
|
||||
|
||||
pass, filterErr := filter.Filter(path)
|
||||
if filterErr != nil {
|
||||
return nil, fmt.Errorf("error calling filter: %s", filterErr)
|
||||
@ -87,5 +99,9 @@ func ZFSListMappingProperties(ctx context.Context, filter DatasetFilter, propert
|
||||
|
||||
}
|
||||
|
||||
jobid := zfscmd.GetJobIDOrDefault(ctx, "__nojobid")
|
||||
metric := prom.ZFSListUnmatchedUserSpecifiedDatasetCount.WithLabelValues(jobid)
|
||||
metric.Add(float64(len(unmatchedUserSpecifiedDatasets)))
|
||||
|
||||
return
|
||||
}
|
||||
|
@ -3,10 +3,11 @@ package zfs
|
||||
import "github.com/prometheus/client_golang/prometheus"
|
||||
|
||||
var prom struct {
|
||||
ZFSListFilesystemVersionDuration *prometheus.HistogramVec
|
||||
ZFSSnapshotDuration *prometheus.HistogramVec
|
||||
ZFSBookmarkDuration *prometheus.HistogramVec
|
||||
ZFSDestroyDuration *prometheus.HistogramVec
|
||||
ZFSListFilesystemVersionDuration *prometheus.HistogramVec
|
||||
ZFSSnapshotDuration *prometheus.HistogramVec
|
||||
ZFSBookmarkDuration *prometheus.HistogramVec
|
||||
ZFSDestroyDuration *prometheus.HistogramVec
|
||||
ZFSListUnmatchedUserSpecifiedDatasetCount *prometheus.GaugeVec
|
||||
}
|
||||
|
||||
func init() {
|
||||
@ -34,6 +35,15 @@ func init() {
|
||||
Name: "destroy_duration",
|
||||
Help: "Duration it took to destroy a dataset",
|
||||
}, []string{"dataset_type", "filesystem"})
|
||||
prom.ZFSListUnmatchedUserSpecifiedDatasetCount = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||
Namespace: "zrepl",
|
||||
Subsystem: "zfs",
|
||||
Name: "list_unmatched_user_specified_dataset_count",
|
||||
Help: "When evaluating a DatsetFilter against zfs list output, this counter " +
|
||||
"is incremented for every DatasetFilter rule that did not match any " +
|
||||
"filesystem name in the zfs list output. Monitor for increases to detect filesystem " +
|
||||
"filter rules that have no effect because they don't match any local filesystem.",
|
||||
}, []string{"jobid"})
|
||||
}
|
||||
|
||||
func PrometheusRegister(registry prometheus.Registerer) error {
|
||||
@ -49,5 +59,8 @@ func PrometheusRegister(registry prometheus.Registerer) error {
|
||||
if err := registry.Register(prom.ZFSDestroyDuration); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := registry.Register(prom.ZFSListUnmatchedUserSpecifiedDatasetCount); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
@ -19,6 +19,10 @@ func WithJobID(ctx context.Context, jobID string) context.Context {
|
||||
return context.WithValue(ctx, contextKeyJobID, jobID)
|
||||
}
|
||||
|
||||
// GetJobIDOrDefault returns the job ID stored in ctx (see WithJobID), or def
// if ctx carries none. Exported wrapper around getJobIDOrDefault so other
// packages can read the job ID for metric labels.
func GetJobIDOrDefault(ctx context.Context, def string) string {
	return getJobIDOrDefault(ctx, def)
}
|
||||
|
||||
func getJobIDOrDefault(ctx context.Context, def string) string {
|
||||
ret, ok := ctx.Value(contextKeyJobID).(string)
|
||||
if !ok {
|
||||
|
Loading…
Reference in New Issue
Block a user