From 447e140479753bf69e6cb843b4e95553ef08e6ff Mon Sep 17 00:00:00 2001 From: TwiN Date: Tue, 2 May 2023 22:41:22 -0400 Subject: [PATCH] feat(connectivity): Allow internet connection validation prior to endpoint execution (#461) --- README.md | 23 +++++++++ client/client_test.go | 3 ++ config/config.go | 14 ++++++ config/connectivity/connectivity.go | 53 ++++++++++++++++++++ config/connectivity/connectivity_test.go | 62 ++++++++++++++++++++++++ watchdog/watchdog.go | 20 +++++--- 6 files changed, 167 insertions(+), 8 deletions(-) create mode 100644 config/connectivity/connectivity.go create mode 100644 config/connectivity/connectivity_test.go diff --git a/README.md b/README.md index ae0a4e3b..b8da4cd9 100644 --- a/README.md +++ b/README.md @@ -71,6 +71,7 @@ Have any feedback or questions? [Create a discussion](https://github.com/TwiN/ga - [OIDC](#oidc) - [TLS Encryption](#tls-encryption) - [Metrics](#metrics) + - [Connectivity](#connectivity) - [Remote instances (EXPERIMENTAL)](#remote-instances-experimental) - [Deployment](#deployment) - [Docker](#docker) @@ -1256,6 +1257,28 @@ endpoint on the same port your application is configured to run on (`web.port`). See [examples/docker-compose-grafana-prometheus](.examples/docker-compose-grafana-prometheus) for further documentation as well as an example. +### Connectivity +| Parameter | Description | Default | +|:--------------------------------|:-------------------------------------------|:--------------| +| `connectivity` | Connectivity configuration | `{}` | +| `connectivity.checker` | Connectivity checker configuration | Required `{}` | +| `connectivity.checker.target` | Host to use for validating connectivity | Required `""` | +| `connectivity.checker.interval` | Interval at which to validate connectivity | `1m` | + +While Gatus is used to monitor other services, it is possible for Gatus itself to lose connectivity to the internet. +In order to prevent Gatus from reporting endpoints as unhealthy when Gatus itself is unhealthy, you may configure +Gatus to periodically check for internet connectivity. + +All endpoint executions are skipped while the connectivity checker deems connectivity to be down. + +```yaml +connectivity: + checker: + target: 1.1.1.1:53 + interval: 60s +``` + + ### Remote instances (EXPERIMENTAL) This feature allows you to retrieve endpoint statuses from a remote Gatus instance. diff --git a/client/client_test.go b/client/client_test.go index f6f3a9e4..e11381f1 100644 --- a/client/client_test.go +++ b/client/client_test.go @@ -191,6 +191,9 @@ func TestCanCreateTCPConnection(t *testing.T) { if CanCreateTCPConnection("127.0.0.1", &Config{Timeout: 5 * time.Second}) { t.Error("should've failed, because there's no port in the address") } + if !CanCreateTCPConnection("1.1.1.1:53", &Config{Timeout: 5 * time.Second}) { + t.Error("should've succeeded, because that IP should always™ be up") + } } // This test checks if a HTTP client configured with `configureOAuth2()` automatically diff --git a/config/config.go b/config/config.go index 1f2d2051..cd8d2f2f 100644 --- a/config/config.go +++ b/config/config.go @@ -14,6 +14,7 @@ import ( "github.com/TwiN/gatus/v5/alerting" "github.com/TwiN/gatus/v5/alerting/alert" "github.com/TwiN/gatus/v5/alerting/provider" + "github.com/TwiN/gatus/v5/config/connectivity" "github.com/TwiN/gatus/v5/config/maintenance" "github.com/TwiN/gatus/v5/config/remote" "github.com/TwiN/gatus/v5/config/ui" @@ -91,6 +92,9 @@ type Config struct { // WARNING: This is in ALPHA and may change or be completely removed in the future Remote *remote.Config `yaml:"remote,omitempty"` + // Connectivity is the configuration for connectivity + Connectivity *connectivity.Config `yaml:"connectivity,omitempty"` + configPath string // path to the file or directory from which config was loaded lastFileModTime time.Time // last modification time } @@ -252,10 +256,20 @@ func parseAndValidateConfigBytes(yamlBytes []byte) (config *Config, err error) { if err := validateRemoteConfig(config); err != nil { return nil, err } + if err := validateConnectivityConfig(config); err != nil { + return nil, err + } } return } +func validateConnectivityConfig(config *Config) error { + if config.Connectivity != nil { + return config.Connectivity.ValidateAndSetDefaults() + } + return nil +} + func validateRemoteConfig(config *Config) error { if config.Remote != nil { if err := config.Remote.ValidateAndSetDefaults(); err != nil { diff --git a/config/connectivity/connectivity.go b/config/connectivity/connectivity.go new file mode 100644 index 00000000..fbabec58 --- /dev/null +++ b/config/connectivity/connectivity.go @@ -0,0 +1,53 @@ +package connectivity + +import ( + "errors" + "strings" + "time" + + "github.com/TwiN/gatus/v5/client" +) + +var ( + ErrInvalidInterval = errors.New("connectivity.checker.interval must be 5s or higher") + ErrInvalidDNSTarget = errors.New("connectivity.checker.target must be suffixed with :53") +) + +// Config is the configuration for the connectivity checker. +type Config struct { + Checker *Checker `yaml:"checker,omitempty"` +} + +func (c *Config) ValidateAndSetDefaults() error { + if c.Checker != nil { + if c.Checker.Interval == 0 { + c.Checker.Interval = 60 * time.Second + } else if c.Checker.Interval < 5*time.Second { + return ErrInvalidInterval + } + if !strings.HasSuffix(c.Checker.Target, ":53") { + return ErrInvalidDNSTarget + } + } + return nil +} + +// Checker is the configuration for making sure Gatus has access to the internet. +type Checker struct { + Target string `yaml:"target"` // e.g. 1.1.1.1:53 + Interval time.Duration `yaml:"interval,omitempty"` + + isConnected bool + lastCheck time.Time +} + +func (c Checker) Check() bool { + return client.CanCreateTCPConnection(c.Target, &client.Config{Timeout: 5 * time.Second}) +} + +func (c *Checker) IsConnected() bool { + if now := time.Now(); now.After(c.lastCheck.Add(c.Interval)) { + c.lastCheck, c.isConnected = now, c.Check() + } + return c.isConnected +} diff --git a/config/connectivity/connectivity_test.go b/config/connectivity/connectivity_test.go new file mode 100644 index 00000000..d16520cb --- /dev/null +++ b/config/connectivity/connectivity_test.go @@ -0,0 +1,62 @@ +package connectivity + +import ( + "fmt" + "testing" + "time" +) + +func TestConfig(t *testing.T) { + scenarios := []struct { + name string + cfg *Config + expectedErr error + expectedInterval time.Duration + }{ + { + name: "good-config", + cfg: &Config{Checker: &Checker{Target: "1.1.1.1:53", Interval: 10 * time.Second}}, + expectedInterval: 10 * time.Second, + }, + { + name: "good-config-with-default-interval", + cfg: &Config{Checker: &Checker{Target: "8.8.8.8:53", Interval: 0}}, + expectedInterval: 60 * time.Second, + }, + { + name: "config-with-interval-too-low", + cfg: &Config{Checker: &Checker{Target: "1.1.1.1:53", Interval: 4 * time.Second}}, + expectedErr: ErrInvalidInterval, + }, + { + name: "config-with-invalid-target-due-to-missing-port", + cfg: &Config{Checker: &Checker{Target: "1.1.1.1", Interval: 15 * time.Second}}, + expectedErr: ErrInvalidDNSTarget, + }, + { + name: "config-with-invalid-target-due-to-invalid-dns-port", + cfg: &Config{Checker: &Checker{Target: "1.1.1.1:52", Interval: 15 * time.Second}}, + expectedErr: ErrInvalidDNSTarget, + }, + } + for _, scenario := range scenarios { + t.Run(scenario.name, func(t *testing.T) { + err := scenario.cfg.ValidateAndSetDefaults() + if fmt.Sprintf("%s", err) != fmt.Sprintf("%s", scenario.expectedErr) { + t.Errorf("expected error %v, got %v", scenario.expectedErr, err) + } + if err == nil && scenario.expectedErr == nil { + if scenario.cfg.Checker.Interval != scenario.expectedInterval { + t.Errorf("expected interval %v, got %v", scenario.expectedInterval, scenario.cfg.Checker.Interval) + } + } + }) + } +} + +func TestChecker_IsConnected(t *testing.T) { + checker := &Checker{Target: "1.1.1.1:53", Interval: 10 * time.Second} + if !checker.IsConnected() { + t.Error("expected checker.IsConnected() to be true") + } +} diff --git a/watchdog/watchdog.go b/watchdog/watchdog.go index 8b987247..a3257246 100644 --- a/watchdog/watchdog.go +++ b/watchdog/watchdog.go @@ -8,6 +8,7 @@ import ( "github.com/TwiN/gatus/v5/alerting" "github.com/TwiN/gatus/v5/config" + "github.com/TwiN/gatus/v5/config/connectivity" "github.com/TwiN/gatus/v5/config/maintenance" "github.com/TwiN/gatus/v5/core" "github.com/TwiN/gatus/v5/metrics" @@ -30,15 +31,15 @@ func Monitor(cfg *config.Config) { if endpoint.IsEnabled() { // To prevent multiple requests from running at the same time, we'll wait for a little before each iteration time.Sleep(777 * time.Millisecond) - go monitor(endpoint, cfg.Alerting, cfg.Maintenance, cfg.DisableMonitoringLock, cfg.Metrics, cfg.Debug, ctx) + go monitor(endpoint, cfg.Alerting, cfg.Maintenance, cfg.Connectivity, cfg.DisableMonitoringLock, cfg.Metrics, cfg.Debug, ctx) } } } // monitor a single endpoint in a loop -func monitor(endpoint *core.Endpoint, alertingConfig *alerting.Config, maintenanceConfig *maintenance.Config, disableMonitoringLock, enabledMetrics, debug bool, ctx context.Context) { +func monitor(endpoint *core.Endpoint, alertingConfig *alerting.Config, maintenanceConfig *maintenance.Config, connectivityConfig *connectivity.Config, disableMonitoringLock, enabledMetrics, debug bool, ctx context.Context) { // Run it immediately on start - execute(endpoint, alertingConfig, maintenanceConfig, disableMonitoringLock, enabledMetrics, debug) + execute(endpoint, alertingConfig, maintenanceConfig, connectivityConfig, disableMonitoringLock, enabledMetrics, debug) // Loop for the next executions for { select { @@ -46,16 +47,22 @@ func monitor(endpoint *core.Endpoint, alertingConfig *alerting.Config, maintenan log.Printf("[watchdog][monitor] Canceling current execution of group=%s; endpoint=%s", endpoint.Group, endpoint.Name) return case <-time.After(endpoint.Interval): - execute(endpoint, alertingConfig, maintenanceConfig, disableMonitoringLock, enabledMetrics, debug) + execute(endpoint, alertingConfig, maintenanceConfig, connectivityConfig, disableMonitoringLock, enabledMetrics, debug) } } } -func execute(endpoint *core.Endpoint, alertingConfig *alerting.Config, maintenanceConfig *maintenance.Config, disableMonitoringLock, enabledMetrics, debug bool) { +func execute(endpoint *core.Endpoint, alertingConfig *alerting.Config, maintenanceConfig *maintenance.Config, connectivityConfig *connectivity.Config, disableMonitoringLock, enabledMetrics, debug bool) { if !disableMonitoringLock { // By placing the lock here, we prevent multiple endpoints from being monitored at the exact same time, which // could cause performance issues and return inaccurate results monitoringMutex.Lock() + defer monitoringMutex.Unlock() + } + // If there's a connectivity checker configured, check if Gatus has internet connectivity + if connectivityConfig != nil && connectivityConfig.Checker != nil && !connectivityConfig.Checker.IsConnected() { + log.Println("[watchdog][execute] No connectivity; skipping execution") + return } if debug { log.Printf("[watchdog][execute] Monitoring group=%s; endpoint=%s", endpoint.Group, endpoint.Name) @@ -79,9 +86,6 @@ func execute(endpoint *core.Endpoint, alertingConfig *alerting.Config, maintenan if debug { log.Printf("[watchdog][execute] Waiting for interval=%s before monitoring group=%s endpoint=%s again", endpoint.Interval, endpoint.Group, endpoint.Name) } - if !disableMonitoringLock { - monitoringMutex.Unlock() - } } // UpdateEndpointStatuses updates the slice of endpoint statuses