feat(connectivity): Allow internet connection validation prior to endpoint execution (#461)

This commit is contained in:
TwiN 2023-05-02 22:41:22 -04:00 committed by GitHub
parent 6908199716
commit 447e140479
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 167 additions and 8 deletions

View File

@ -71,6 +71,7 @@ Have any feedback or questions? [Create a discussion](https://github.com/TwiN/ga
- [OIDC](#oidc) - [OIDC](#oidc)
- [TLS Encryption](#tls-encryption) - [TLS Encryption](#tls-encryption)
- [Metrics](#metrics) - [Metrics](#metrics)
- [Connectivity](#connectivity)
- [Remote instances (EXPERIMENTAL)](#remote-instances-experimental) - [Remote instances (EXPERIMENTAL)](#remote-instances-experimental)
- [Deployment](#deployment) - [Deployment](#deployment)
- [Docker](#docker) - [Docker](#docker)
@ -1256,6 +1257,28 @@ endpoint on the same port your application is configured to run on (`web.port`).
See [examples/docker-compose-grafana-prometheus](.examples/docker-compose-grafana-prometheus) for further documentation as well as an example. See [examples/docker-compose-grafana-prometheus](.examples/docker-compose-grafana-prometheus) for further documentation as well as an example.
### Connectivity
| Parameter | Description | Default |
|:--------------------------------|:-------------------------------------------|:--------------|
| `connectivity` | Connectivity configuration | `{}` |
| `connectivity.checker` | Connectivity checker configuration | Required `{}` |
| `connectivity.checker.target` | Host to use for validating connectivity | Required `""` |
| `connectivity.checker.interval` | Interval at which to validate connectivity | `1m` |
While Gatus is used to monitor other services, it is possible for Gatus itself to lose connectivity to the internet.
In order to prevent Gatus from reporting endpoints as unhealthy when Gatus itself is unhealthy, you may configure
Gatus to periodically check for internet connectivity.
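All endpoint executions are skipped while the connectivity checker deems connectivity to be down.
Note that `connectivity.checker.target` must end with `:53` (e.g. `1.1.1.1:53`) and that `connectivity.checker.interval`, when set, must be `5s` or higher.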
```yaml
connectivity:
checker:
target: 1.1.1.1:53
interval: 60s
```
### Remote instances (EXPERIMENTAL) ### Remote instances (EXPERIMENTAL)
This feature allows you to retrieve endpoint statuses from a remote Gatus instance. This feature allows you to retrieve endpoint statuses from a remote Gatus instance.

View File

@ -191,6 +191,9 @@ func TestCanCreateTCPConnection(t *testing.T) {
if CanCreateTCPConnection("127.0.0.1", &Config{Timeout: 5 * time.Second}) { if CanCreateTCPConnection("127.0.0.1", &Config{Timeout: 5 * time.Second}) {
t.Error("should've failed, because there's no port in the address") t.Error("should've failed, because there's no port in the address")
} }
if !CanCreateTCPConnection("1.1.1.1:53", &Config{Timeout: 5 * time.Second}) {
t.Error("should've succeeded, because that IP should always™ be up")
}
} }
// This test checks if a HTTP client configured with `configureOAuth2()` automatically // This test checks if a HTTP client configured with `configureOAuth2()` automatically

View File

@ -14,6 +14,7 @@ import (
"github.com/TwiN/gatus/v5/alerting" "github.com/TwiN/gatus/v5/alerting"
"github.com/TwiN/gatus/v5/alerting/alert" "github.com/TwiN/gatus/v5/alerting/alert"
"github.com/TwiN/gatus/v5/alerting/provider" "github.com/TwiN/gatus/v5/alerting/provider"
"github.com/TwiN/gatus/v5/config/connectivity"
"github.com/TwiN/gatus/v5/config/maintenance" "github.com/TwiN/gatus/v5/config/maintenance"
"github.com/TwiN/gatus/v5/config/remote" "github.com/TwiN/gatus/v5/config/remote"
"github.com/TwiN/gatus/v5/config/ui" "github.com/TwiN/gatus/v5/config/ui"
@ -91,6 +92,9 @@ type Config struct {
// WARNING: This is in ALPHA and may change or be completely removed in the future // WARNING: This is in ALPHA and may change or be completely removed in the future
Remote *remote.Config `yaml:"remote,omitempty"` Remote *remote.Config `yaml:"remote,omitempty"`
// Connectivity is the configuration for connectivity
Connectivity *connectivity.Config `yaml:"connectivity,omitempty"`
configPath string // path to the file or directory from which config was loaded configPath string // path to the file or directory from which config was loaded
lastFileModTime time.Time // last modification time lastFileModTime time.Time // last modification time
} }
@ -252,10 +256,20 @@ func parseAndValidateConfigBytes(yamlBytes []byte) (config *Config, err error) {
if err := validateRemoteConfig(config); err != nil { if err := validateRemoteConfig(config); err != nil {
return nil, err return nil, err
} }
if err := validateConnectivityConfig(config); err != nil {
return nil, err
}
} }
return return
} }
// validateConnectivityConfig validates the optional connectivity section of
// the configuration and fills in its defaults.
//
// A configuration without a connectivity section is accepted as-is; otherwise
// the result of Connectivity.ValidateAndSetDefaults is returned unchanged.
func validateConnectivityConfig(config *Config) error {
	if config.Connectivity == nil {
		// The connectivity section is optional: nothing to validate.
		return nil
	}
	return config.Connectivity.ValidateAndSetDefaults()
}
func validateRemoteConfig(config *Config) error { func validateRemoteConfig(config *Config) error {
if config.Remote != nil { if config.Remote != nil {
if err := config.Remote.ValidateAndSetDefaults(); err != nil { if err := config.Remote.ValidateAndSetDefaults(); err != nil {

View File

@ -0,0 +1,53 @@
package connectivity
import (
"errors"
"strings"
"time"
"github.com/TwiN/gatus/v5/client"
)
// Sentinel errors returned by Config.ValidateAndSetDefaults.
var (
// ErrInvalidInterval is returned when a non-zero checker interval is lower than 5 seconds.
ErrInvalidInterval = errors.New("connectivity.checker.interval must be 5s or higher")
// ErrInvalidDNSTarget is returned when the checker target does not end with ":53".
ErrInvalidDNSTarget = errors.New("connectivity.checker.target must be suffixed with :53")
)
// Config is the configuration for the connectivity checker.
//
// It is the root of the `connectivity` YAML section.
type Config struct {
// Checker holds the connectivity checker settings (YAML key `checker`).
// When nil, ValidateAndSetDefaults has nothing to validate and returns nil.
Checker *Checker `yaml:"checker,omitempty"`
}
// ValidateAndSetDefaults validates the checker configuration, if any, and
// applies default values in place.
//
// Rules applied when a Checker is configured:
//   - an Interval of 0 is replaced by the default of 60 seconds;
//   - a non-zero Interval below 5 seconds yields ErrInvalidInterval;
//   - a Target that does not end with ":53" yields ErrInvalidDNSTarget.
//
// The interval is checked before the target, so a configuration violating
// both rules reports ErrInvalidInterval. A nil Checker is valid.
func (c *Config) ValidateAndSetDefaults() error {
	checker := c.Checker
	if checker == nil {
		return nil
	}
	switch {
	case checker.Interval == 0:
		// Unset interval: fall back to the default.
		checker.Interval = 60 * time.Second
	case checker.Interval < 5*time.Second:
		return ErrInvalidInterval
	}
	if strings.HasSuffix(checker.Target, ":53") {
		return nil
	}
	return ErrInvalidDNSTarget
}
// Checker is the configuration for making sure Gatus has access to the internet.
//
// Besides the user-supplied settings, it carries the cached outcome of the
// most recent connectivity check, which IsConnected reads and updates.
type Checker struct {
// Target is the address probed over TCP by Check; validation requires it to end with ":53".
Target string `yaml:"target"` // e.g. 1.1.1.1:53
// Interval is the minimum time between two actual checks; 0 is replaced by 60s during validation.
Interval time.Duration `yaml:"interval,omitempty"`
// isConnected caches the result of the last call to Check made by IsConnected.
isConnected bool
// lastCheck is the time at which IsConnected last triggered a check (zero value until the first one).
lastCheck time.Time
}
// Check performs a single connectivity probe by attempting to open a TCP
// connection to Target with a 5 second timeout, and reports whether the
// attempt succeeded. It does not read or update the cached state used by
// IsConnected.
func (c Checker) Check() bool {
	probeConfig := &client.Config{Timeout: 5 * time.Second}
	return client.CanCreateTCPConnection(c.Target, probeConfig)
}
// IsConnected reports whether connectivity is currently considered up.
//
// The result of the last probe is cached: a new call to Check is only made
// when more than Interval has elapsed since the previous one (which is always
// the case on the first call, since lastCheck starts at its zero value).
// Otherwise the cached value is returned.
//
// NOTE(review): this method mutates the receiver without any locking of its
// own; it presumably relies on callers serializing access - confirm this
// holds for every caller.
func (c *Checker) IsConnected() bool {
	now := time.Now()
	nextCheckAfter := c.lastCheck.Add(c.Interval)
	if !now.After(nextCheckAfter) {
		// Still within the interval: serve the cached result.
		return c.isConnected
	}
	// Probe first, then record the time sampled before the probe started.
	connected := c.Check()
	c.lastCheck = now
	c.isConnected = connected
	return c.isConnected
}

View File

@ -0,0 +1,62 @@
package connectivity
import (
"fmt"
"testing"
"time"
)
// TestConfig verifies Config.ValidateAndSetDefaults: valid configurations are
// accepted (with the default interval applied when none is set), and invalid
// intervals or targets are rejected with the matching sentinel error.
func TestConfig(t *testing.T) {
	type testCase struct {
		name             string
		cfg              *Config
		expectedErr      error
		expectedInterval time.Duration
	}
	testCases := []testCase{
		{
			name:             "good-config",
			cfg:              &Config{Checker: &Checker{Target: "1.1.1.1:53", Interval: 10 * time.Second}},
			expectedInterval: 10 * time.Second,
		},
		{
			name:             "good-config-with-default-interval",
			cfg:              &Config{Checker: &Checker{Target: "8.8.8.8:53", Interval: 0}},
			expectedInterval: 60 * time.Second,
		},
		{
			name:        "config-with-interval-too-low",
			cfg:         &Config{Checker: &Checker{Target: "1.1.1.1:53", Interval: 4 * time.Second}},
			expectedErr: ErrInvalidInterval,
		},
		{
			name:        "config-with-invalid-target-due-to-missing-port",
			cfg:         &Config{Checker: &Checker{Target: "1.1.1.1", Interval: 15 * time.Second}},
			expectedErr: ErrInvalidDNSTarget,
		},
		{
			name:        "config-with-invalid-target-due-to-invalid-dns-port",
			cfg:         &Config{Checker: &Checker{Target: "1.1.1.1:52", Interval: 15 * time.Second}},
			expectedErr: ErrInvalidDNSTarget,
		},
	}
	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			err := tc.cfg.ValidateAndSetDefaults()
			// Errors are compared through their formatted text, so a nil error
			// only matches a nil expectation.
			gotText, wantText := fmt.Sprintf("%s", err), fmt.Sprintf("%s", tc.expectedErr)
			if gotText != wantText {
				t.Errorf("expected error %v, got %v", tc.expectedErr, err)
			}
			if err != nil || tc.expectedErr != nil {
				// The interval is only meaningful for configurations that validated.
				return
			}
			if actual := tc.cfg.Checker.Interval; actual != tc.expectedInterval {
				t.Errorf("expected interval %v, got %v", tc.expectedInterval, actual)
			}
		})
	}
}
// TestChecker_IsConnected verifies that a freshly created Checker performs a
// check on its first IsConnected call and reports connectivity as up.
//
// NOTE: the checker targets 1.1.1.1:53, so this test needs outbound TCP
// access to that address in order to pass.
func TestChecker_IsConnected(t *testing.T) {
	checker := &Checker{
		Target:   "1.1.1.1:53",
		Interval: 10 * time.Second,
	}
	if connected := checker.IsConnected(); !connected {
		t.Error("expected checker.IsConnected() to be true")
	}
}

View File

@ -8,6 +8,7 @@ import (
"github.com/TwiN/gatus/v5/alerting" "github.com/TwiN/gatus/v5/alerting"
"github.com/TwiN/gatus/v5/config" "github.com/TwiN/gatus/v5/config"
"github.com/TwiN/gatus/v5/config/connectivity"
"github.com/TwiN/gatus/v5/config/maintenance" "github.com/TwiN/gatus/v5/config/maintenance"
"github.com/TwiN/gatus/v5/core" "github.com/TwiN/gatus/v5/core"
"github.com/TwiN/gatus/v5/metrics" "github.com/TwiN/gatus/v5/metrics"
@ -30,15 +31,15 @@ func Monitor(cfg *config.Config) {
if endpoint.IsEnabled() { if endpoint.IsEnabled() {
// To prevent multiple requests from running at the same time, we'll wait for a little before each iteration // To prevent multiple requests from running at the same time, we'll wait for a little before each iteration
time.Sleep(777 * time.Millisecond) time.Sleep(777 * time.Millisecond)
go monitor(endpoint, cfg.Alerting, cfg.Maintenance, cfg.DisableMonitoringLock, cfg.Metrics, cfg.Debug, ctx) go monitor(endpoint, cfg.Alerting, cfg.Maintenance, cfg.Connectivity, cfg.DisableMonitoringLock, cfg.Metrics, cfg.Debug, ctx)
} }
} }
} }
// monitor a single endpoint in a loop // monitor a single endpoint in a loop
func monitor(endpoint *core.Endpoint, alertingConfig *alerting.Config, maintenanceConfig *maintenance.Config, disableMonitoringLock, enabledMetrics, debug bool, ctx context.Context) { func monitor(endpoint *core.Endpoint, alertingConfig *alerting.Config, maintenanceConfig *maintenance.Config, connectivityConfig *connectivity.Config, disableMonitoringLock, enabledMetrics, debug bool, ctx context.Context) {
// Run it immediately on start // Run it immediately on start
execute(endpoint, alertingConfig, maintenanceConfig, disableMonitoringLock, enabledMetrics, debug) execute(endpoint, alertingConfig, maintenanceConfig, connectivityConfig, disableMonitoringLock, enabledMetrics, debug)
// Loop for the next executions // Loop for the next executions
for { for {
select { select {
@ -46,16 +47,22 @@ func monitor(endpoint *core.Endpoint, alertingConfig *alerting.Config, maintenan
log.Printf("[watchdog][monitor] Canceling current execution of group=%s; endpoint=%s", endpoint.Group, endpoint.Name) log.Printf("[watchdog][monitor] Canceling current execution of group=%s; endpoint=%s", endpoint.Group, endpoint.Name)
return return
case <-time.After(endpoint.Interval): case <-time.After(endpoint.Interval):
execute(endpoint, alertingConfig, maintenanceConfig, disableMonitoringLock, enabledMetrics, debug) execute(endpoint, alertingConfig, maintenanceConfig, connectivityConfig, disableMonitoringLock, enabledMetrics, debug)
} }
} }
} }
func execute(endpoint *core.Endpoint, alertingConfig *alerting.Config, maintenanceConfig *maintenance.Config, disableMonitoringLock, enabledMetrics, debug bool) { func execute(endpoint *core.Endpoint, alertingConfig *alerting.Config, maintenanceConfig *maintenance.Config, connectivityConfig *connectivity.Config, disableMonitoringLock, enabledMetrics, debug bool) {
if !disableMonitoringLock { if !disableMonitoringLock {
// By placing the lock here, we prevent multiple endpoints from being monitored at the exact same time, which // By placing the lock here, we prevent multiple endpoints from being monitored at the exact same time, which
// could cause performance issues and return inaccurate results // could cause performance issues and return inaccurate results
monitoringMutex.Lock() monitoringMutex.Lock()
defer monitoringMutex.Unlock()
}
// If there's a connectivity checker configured, check if Gatus has internet connectivity
if connectivityConfig != nil && connectivityConfig.Checker != nil && !connectivityConfig.Checker.IsConnected() {
log.Println("[watchdog][execute] No connectivity; skipping execution")
return
} }
if debug { if debug {
log.Printf("[watchdog][execute] Monitoring group=%s; endpoint=%s", endpoint.Group, endpoint.Name) log.Printf("[watchdog][execute] Monitoring group=%s; endpoint=%s", endpoint.Group, endpoint.Name)
@ -79,9 +86,6 @@ func execute(endpoint *core.Endpoint, alertingConfig *alerting.Config, maintenan
if debug { if debug {
log.Printf("[watchdog][execute] Waiting for interval=%s before monitoring group=%s endpoint=%s again", endpoint.Interval, endpoint.Group, endpoint.Name) log.Printf("[watchdog][execute] Waiting for interval=%s before monitoring group=%s endpoint=%s again", endpoint.Interval, endpoint.Group, endpoint.Name)
} }
if !disableMonitoringLock {
monitoringMutex.Unlock()
}
} }
// UpdateEndpointStatuses updates the slice of endpoint statuses // UpdateEndpointStatuses updates the slice of endpoint statuses