Close #74: Add maintenance window

This commit is contained in:
TwinProduction 2021-09-22 00:04:51 -04:00
parent dc173b29bc
commit fa4736c672
6 changed files with 399 additions and 8 deletions

View File

@ -47,6 +47,7 @@ For more details, see [Usage](#usage)
- [Configuring Twilio alerts](#configuring-twilio-alerts)
- [Configuring custom alerts](#configuring-custom-alerts)
- [Setting a default alert](#setting-a-default-alert)
- [Maintenance](#maintenance)
- [Deployment](#deployment)
- [Docker](#docker)
- [Helm Chart](#helm-chart)
@ -736,6 +737,37 @@ services:
- type: pagerduty
```
### Maintenance
If you have maintenance windows, you may not want to be notified by alerts while they are in progress.
To suppress alerts during a recurring time window, use the `maintenance` configuration:
| Parameter | Description | Default |
|:----------------------- |:----------------------------------------------------------------------------- |:--------------- |
| `maintenance.enabled` | Whether the maintenance period is enabled | `true` |
| `maintenance.start` | Time at which the maintenance window starts in `hh:mm` format (e.g. `23:00`) | Required `""` |
| `maintenance.duration` | Duration of the maintenance window (e.g. `1h`, `30m`). Must be greater than `0` and less than `24h` | Required `""` |
| `maintenance.every` | Days on which the maintenance period applies (e.g. `[Monday, Thursday]`). Day names must be written in full and capitalized (`Sunday` through `Saturday`).<br />If left empty, the maintenance window applies every day | `[]` |
**Note that the maintenance configuration uses UTC.**
Here's an example:
```yaml
maintenance:
start: 23:00
duration: 1h
every: [Monday, Thursday]
```
Note that you can also specify each day on separate lines:
```yaml
maintenance:
start: 23:00
duration: 1h
every:
- Monday
- Thursday
```
## Deployment
Many examples can be found in the [examples](examples) folder, but this section will focus on the most popular ways of deploying Gatus.

View File

@ -10,6 +10,7 @@ import (
"github.com/TwinProduction/gatus/alerting"
"github.com/TwinProduction/gatus/alerting/alert"
"github.com/TwinProduction/gatus/alerting/provider"
"github.com/TwinProduction/gatus/config/maintenance"
"github.com/TwinProduction/gatus/core"
"github.com/TwinProduction/gatus/security"
"github.com/TwinProduction/gatus/storage"
@ -82,6 +83,9 @@ type Config struct {
// UI is the configuration for the UI
UI *UIConfig `yaml:"ui"`
// Maintenance is the configuration for creating a maintenance window in which no alerts are sent
Maintenance *maintenance.Config `yaml:"maintenance"`
filePath string // path to the file from which config was loaded from
lastFileModTime time.Time // last modification time
}
@ -172,6 +176,9 @@ func parseAndValidateConfigBytes(yamlBytes []byte) (config *Config, err error) {
if err := validateUIConfig(config); err != nil {
return nil, err
}
if err := validateMaintenanceConfig(config); err != nil {
return nil, err
}
if err := validateStorageConfig(config); err != nil {
return nil, err
}
@ -201,6 +208,17 @@ func validateStorageConfig(config *Config) error {
return nil
}
// validateMaintenanceConfig makes sure config.Maintenance is usable.
//
// When a maintenance section was provided, it is validated (which also sets its defaults) and any
// validation error is returned. When none was provided, the default maintenance configuration is
// installed instead.
func validateMaintenanceConfig(config *Config) error {
	if config.Maintenance != nil {
		return config.Maintenance.ValidateAndSetDefaults()
	}
	config.Maintenance = maintenance.GetDefaultConfig()
	return nil
}
func validateUIConfig(config *Config) error {
if config.UI == nil {
config.UI = GetDefaultUIConfig()

View File

@ -43,6 +43,11 @@ func TestParseAndValidateConfigBytes(t *testing.T) {
config, err := parseAndValidateConfigBytes([]byte(fmt.Sprintf(`
storage:
file: %s
maintenance:
enabled: true
start: 00:00
duration: 4h
every: [Monday, Thursday]
ui:
title: Test
services:
@ -79,6 +84,9 @@ services:
if config.UI == nil || config.UI.Title != "Test" {
t.Error("Expected Config.UI.Title to be Test")
}
if mc := config.Maintenance; mc == nil || mc.Start != "00:00" || !mc.IsEnabled() || mc.Duration != 4*time.Hour || len(mc.Every) != 2 {
t.Error("Expected Config.Maintenance to be configured properly")
}
if len(config.Services) != 3 {
t.Error("Should have returned two services")
}

View File

@ -0,0 +1,133 @@
package maintenance
import (
"errors"
"fmt"
"sort"
"strconv"
"strings"
"time"
)
var (
	// errInvalidMaintenanceStartFormat is returned when the start time is not a valid hh:mm value.
	errInvalidMaintenanceStartFormat = errors.New("invalid maintenance start format: must be hh:mm, between 00:00 and 23:59 inclusively (e.g. 23:00)")

	// errInvalidMaintenanceDuration is returned when the duration is not strictly between 0 and 24h.
	errInvalidMaintenanceDuration = errors.New("invalid maintenance duration: must be bigger than 0 and less than 24h (e.g. 30m)")

	// errInvalidDayName is returned when an entry of the 'every' list is not one of longDayNames.
	errInvalidDayName = fmt.Errorf("invalid value specified for 'every'. supported values are %s", longDayNames)

	// longDayNames lists the supported day names, in the same order and spelling as time.Weekday.
	longDayNames = []string{
		"Sunday",
		"Monday",
		"Tuesday",
		"Wednesday",
		"Thursday",
		"Friday",
		"Saturday",
	}
)
// Config allows for the configuration of a maintenance period.
// During this maintenance period, no alerts will be sent.
//
// The window is evaluated against the current time in UTC.
type Config struct {
	// Enabled is whether the maintenance period is enabled.
	// A nil value is treated as enabled (see IsEnabled).
	Enabled *bool `yaml:"enabled"`

	// Start is the time of day at which the maintenance period starts, in hh:mm format (e.g. 23:00).
	Start string `yaml:"start"`

	// Duration is how long the maintenance period lasts (e.g. 4h).
	// ValidateAndSetDefaults requires it to be greater than 0 and less than 24h.
	Duration time.Duration `yaml:"duration"`

	// Every is a list of days of the week during which maintenance period applies.
	// See longDayNames for list of valid values.
	// Every day if empty.
	Every []string `yaml:"every"`

	// durationToStartFromMidnight is Start expressed as an offset from midnight.
	// It is set by ValidateAndSetDefaults.
	durationToStartFromMidnight time.Duration

	// NOTE(review): timeLocation is not referenced by the methods visible in this file;
	// presumably reserved for time zone support - confirm before relying on it.
	timeLocation *time.Location
}
// GetDefaultConfig returns a new maintenance configuration that is explicitly disabled.
//
// Every call returns a distinct Config with its own Enabled flag, so callers may freely
// modify the result.
func GetDefaultConfig() *Config {
	disabled := false
	cfg := new(Config)
	cfg.Enabled = &disabled
	return cfg
}
// IsEnabled reports whether maintenance is enabled.
//
// An unset (nil) Enabled flag counts as enabled; otherwise the flag's value is returned.
func (c Config) IsEnabled() bool {
	return c.Enabled == nil || *c.Enabled
}
// ValidateAndSetDefaults checks the maintenance configuration and derives the values needed at runtime.
//
// It must be called once before IsUnderMaintenance is used, because it is what populates
// durationToStartFromMidnight from Start.
//
// A nil or disabled configuration is accepted without further checks. Otherwise, an error is returned if
// an entry of Every is not one of longDayNames, if Start cannot be parsed, or if Duration is not strictly
// between 0 and 24h.
func (c *Config) ValidateAndSetDefaults() error {
	if c == nil {
		return nil
	}
	if !c.IsEnabled() {
		// Nothing to validate when maintenance is turned off.
		return nil
	}
	isKnownDay := func(candidate string) bool {
		for i := range longDayNames {
			if longDayNames[i] == candidate {
				return true
			}
		}
		return false
	}
	for _, day := range c.Every {
		if !isKnownDay(day) {
			return errInvalidDayName
		}
	}
	offset, err := hhmmToDuration(c.Start)
	c.durationToStartFromMidnight = offset
	if err != nil {
		return err
	}
	if c.Duration > 0 && c.Duration < 24*time.Hour {
		return nil
	}
	return errInvalidMaintenanceDuration
}
// IsUnderMaintenance checks whether the services that Gatus monitors are within the configured maintenance window.
//
// It always returns false when maintenance is disabled. Otherwise, the current UTC time is compared against
// the maintenance windows derived from durationToStartFromMidnight, Duration and Every, which is why
// ValidateAndSetDefaults must have been called beforehand.
func (c Config) IsUnderMaintenance() bool {
	if !c.IsEnabled() {
		return false
	}
	return isWithinMaintenanceWindow(time.Now(), c.durationToStartFromMidnight, c.Duration, c.Every)
}

// isWithinMaintenanceWindow reports whether now falls within a maintenance window that starts startOffset
// after midnight UTC, lasts for duration, and only starts on the given days (every day if days is empty).
//
// The start of a window is inclusive and its end is exclusive. The weekday restriction applies to the day
// on which the window starts, so a window that crosses midnight remains active on the following day.
func isWithinMaintenanceWindow(now time.Time, startOffset, duration time.Duration, days []string) bool {
	now = now.UTC()
	// sort.SearchStrings requires a sorted slice; work on a sorted copy so that the caller's
	// configuration is left untouched.
	var sortedDays []string
	if len(days) > 0 {
		sortedDays = make([]string, len(days))
		copy(sortedDays, days)
		sort.Strings(sortedDays)
	}
	today := now.Truncate(24 * time.Hour)
	// Only the window that started today or the one that started yesterday can contain now, because
	// a window starts less than 24h after midnight and is expected to last less than 24h.
	for _, startDay := range []time.Time{today, today.Add(-24 * time.Hour)} {
		if len(sortedDays) > 0 && !isDayScheduled(sortedDays, startDay.Weekday().String()) {
			continue
		}
		windowStart := startDay.Add(startOffset)
		windowEnd := windowStart.Add(duration)
		if !now.Before(windowStart) && now.Before(windowEnd) {
			return true
		}
	}
	return false
}

// isDayScheduled reports whether day is present in sortedDays, which must be sorted in increasing order.
func isDayScheduled(sortedDays []string, day string) bool {
	// SearchStrings returns the insertion index, so the element at that index must still be compared.
	i := sort.SearchStrings(sortedDays, day)
	return i < len(sortedDays) && sortedDays[i] == day
}
// hhmmToDuration converts a time of day in hh:mm format (e.g. 23:00) into the duration elapsed since midnight.
//
// It returns errInvalidMaintenanceStartFormat if s is not exactly 5 characters long, if the hours and
// minutes are not separated by a colon, or if the value is not between 00:00 and 23:59 inclusively.
// If the hours or the minutes cannot be parsed as a number, the parsing error is returned as is.
func hhmmToDuration(s string) (time.Duration, error) {
	// Checking the separator prevents values such as "23x00" or "23000" from being accepted.
	if len(s) != 5 || s[2] != ':' {
		return 0, errInvalidMaintenanceStartFormat
	}
	hours, err := extractNumericalValueFromPotentiallyZeroPaddedString(s[:2])
	if err != nil {
		return 0, err
	}
	minutes, err := extractNumericalValueFromPotentiallyZeroPaddedString(s[3:])
	if err != nil {
		return 0, err
	}
	if hours < 0 || hours > 23 || minutes < 0 || minutes > 59 {
		return 0, errInvalidMaintenanceStartFormat
	}
	return time.Duration(hours)*time.Hour + time.Duration(minutes)*time.Minute, nil
}
// extractNumericalValueFromPotentiallyZeroPaddedString parses a non-negative decimal number that may have
// leading zeros (e.g. "07" is 7, "00" and "0" are 0).
//
// Only decimal digits are accepted: an empty string, a sign or any other character results in a
// *strconv.NumError wrapping strconv.ErrSyntax. Values too large for an int result in the error
// returned by strconv.Atoi.
func extractNumericalValueFromPotentiallyZeroPaddedString(s string) (int, error) {
	isNotDigit := func(r rune) bool { return r < '0' || r > '9' }
	if s == "" || strings.IndexFunc(s, isNotDigit) != -1 {
		return 0, &strconv.NumError{Func: "Atoi", Num: s, Err: strconv.ErrSyntax}
	}
	// Atoi parses in base 10, so leading zeros do not need to be stripped.
	return strconv.Atoi(s)
}

View File

@ -0,0 +1,193 @@
package maintenance
import (
"errors"
"fmt"
"strconv"
"testing"
"time"
)
// TestGetDefaultConfig verifies that the default maintenance configuration is disabled.
func TestGetDefaultConfig(t *testing.T) {
	cfg := GetDefaultConfig()
	if enabled := *cfg.Enabled; enabled {
		t.Fatal("expected default config to be disabled by default")
	}
}
// TestConfig_Validate verifies the error returned by ValidateAndSetDefaults for valid and invalid
// maintenance configurations.
func TestConfig_Validate(t *testing.T) {
	yes, no := true, false
	scenarios := []struct {
		name          string
		cfg           *Config
		expectedError error
	}{
		{
			name:          "nil",
			cfg:           nil,
			expectedError: nil,
		},
		{
			name: "disabled",
			cfg: &Config{
				Enabled: &no,
			},
			expectedError: nil,
		},
		{
			name: "invalid-day",
			cfg: &Config{
				Every: []string{"invalid-day"},
			},
			expectedError: errInvalidDayName,
		},
		{
			name: "invalid-day-among-valid-days",
			cfg: &Config{
				Every: []string{"Monday", "invalid-day"},
			},
			expectedError: errInvalidDayName,
		},
		{
			name: "invalid-start-format",
			cfg: &Config{
				Start: "0000",
			},
			expectedError: errInvalidMaintenanceStartFormat,
		},
		{
			name: "invalid-start-hours",
			cfg: &Config{
				Start: "25:00",
			},
			expectedError: errInvalidMaintenanceStartFormat,
		},
		{
			// Uses a well-formed 5 character value so that the minutes, not the length, are rejected.
			name: "invalid-start-minutes",
			cfg: &Config{
				Start: "00:61",
			},
			expectedError: errInvalidMaintenanceStartFormat,
		},
		{
			name: "invalid-start-minutes-non-numerical",
			cfg: &Config{
				Start: "00:zz",
			},
			expectedError: strconv.ErrSyntax,
		},
		{
			name: "invalid-start-hours-non-numerical",
			cfg: &Config{
				Start: "zz:00",
			},
			expectedError: strconv.ErrSyntax,
		},
		{
			name: "invalid-duration",
			cfg: &Config{
				Start:    "23:00",
				Duration: 0,
			},
			expectedError: errInvalidMaintenanceDuration,
		},
		{
			name: "invalid-duration-too-long",
			cfg: &Config{
				Start:    "23:00",
				Duration: 24 * time.Hour,
			},
			expectedError: errInvalidMaintenanceDuration,
		},
		{
			name: "every-day-at-2300",
			cfg: &Config{
				Start:    "23:00",
				Duration: time.Hour,
			},
			expectedError: nil,
		},
		{
			name: "every-monday-at-0000",
			cfg: &Config{
				Start:    "00:00",
				Duration: 30 * time.Minute,
				Every:    []string{"Monday"},
			},
			expectedError: nil,
		},
		{
			name: "every-friday-and-sunday-at-0800-explicitly-enabled",
			cfg: &Config{
				Enabled:  &yes,
				Start:    "08:00",
				Duration: 8 * time.Hour,
				Every:    []string{"Friday", "Sunday"},
			},
			expectedError: nil,
		},
	}
	for _, scenario := range scenarios {
		t.Run(scenario.name, func(t *testing.T) {
			err := scenario.cfg.ValidateAndSetDefaults()
			if !errors.Is(err, scenario.expectedError) {
				t.Errorf("expected %v, got %v", scenario.expectedError, err)
			}
		})
	}
}
// TestConfig_IsUnderMaintenance verifies IsUnderMaintenance against maintenance windows that are
// derived from the current UTC time.
func TestConfig_IsUnderMaintenance(t *testing.T) {
	yes, no := true, false
	now := time.Now().UTC()
	// Start values expressed relative to the current hour.
	startingThisHour := fmt.Sprintf("%02d:00", now.Hour())
	startingFiveHoursAgo := fmt.Sprintf("%02d:00", now.Add(-5*time.Hour).Hour())
	// A weekday on which no maintenance window can currently be active.
	weekdayInTwoDays := now.Add(48 * time.Hour).Weekday().String()
	scenarios := []struct {
		name     string
		cfg      *Config
		expected bool
	}{
		{
			name:     "disabled",
			cfg:      &Config{Enabled: &no},
			expected: false,
		},
		{
			name:     "under-maintenance-explicitly-enabled",
			cfg:      &Config{Enabled: &yes, Start: startingThisHour, Duration: 2 * time.Hour},
			expected: true,
		},
		{
			name:     "under-maintenance",
			cfg:      &Config{Start: startingThisHour, Duration: 2 * time.Hour},
			expected: true,
		},
		{
			name:     "not-under-maintenance",
			cfg:      &Config{Start: startingFiveHoursAgo, Duration: time.Hour},
			expected: false,
		},
		{
			name:     "not-under-maintenance-today",
			cfg:      &Config{Start: startingThisHour, Duration: time.Hour, Every: []string{weekdayInTwoDays}},
			expected: false,
		},
	}
	for _, scenario := range scenarios {
		t.Run(scenario.name, func(t *testing.T) {
			if err := scenario.cfg.ValidateAndSetDefaults(); err != nil {
				t.Fatal("validation shouldn't have returned an error")
			}
			if actual := scenario.cfg.IsUnderMaintenance(); actual != scenario.expected {
				t.Errorf("expected %v, got %v", scenario.expected, actual)
				t.Logf("start=%v; duration=%v; now=%v", scenario.cfg.Start, scenario.cfg.Duration, time.Now().UTC())
			}
		})
	}
}

View File

@ -8,6 +8,7 @@ import (
"github.com/TwinProduction/gatus/alerting"
"github.com/TwinProduction/gatus/config"
"github.com/TwinProduction/gatus/config/maintenance"
"github.com/TwinProduction/gatus/core"
"github.com/TwinProduction/gatus/metric"
"github.com/TwinProduction/gatus/storage"
@ -27,17 +28,17 @@ func Monitor(cfg *config.Config) {
ctx, cancelFunc = context.WithCancel(context.Background())
for _, service := range cfg.Services {
if service.IsEnabled() {
// To prevent multiple requests from running at the same time, we'll wait for a little bit before each iteration
// To prevent multiple requests from running at the same time, we'll wait for a little before each iteration
time.Sleep(1111 * time.Millisecond)
go monitor(service, cfg.Alerting, cfg.DisableMonitoringLock, cfg.Metrics, cfg.Debug, ctx)
go monitor(service, cfg.Alerting, cfg.Maintenance, cfg.DisableMonitoringLock, cfg.Metrics, cfg.Debug, ctx)
}
}
}
// monitor monitors a single service in a loop
func monitor(service *core.Service, alertingConfig *alerting.Config, disableMonitoringLock, enabledMetrics, debug bool, ctx context.Context) {
func monitor(service *core.Service, alertingConfig *alerting.Config, maintenanceConfig *maintenance.Config, disableMonitoringLock, enabledMetrics, debug bool, ctx context.Context) {
// Run it immediately on start
execute(service, alertingConfig, disableMonitoringLock, enabledMetrics, debug)
execute(service, alertingConfig, maintenanceConfig, disableMonitoringLock, enabledMetrics, debug)
// Loop for the next executions
for {
select {
@ -45,12 +46,12 @@ func monitor(service *core.Service, alertingConfig *alerting.Config, disableMoni
log.Printf("[watchdog][monitor] Canceling current execution of group=%s; service=%s", service.Group, service.Name)
return
case <-time.After(service.Interval):
execute(service, alertingConfig, disableMonitoringLock, enabledMetrics, debug)
execute(service, alertingConfig, maintenanceConfig, disableMonitoringLock, enabledMetrics, debug)
}
}
}
func execute(service *core.Service, alertingConfig *alerting.Config, disableMonitoringLock, enabledMetrics, debug bool) {
func execute(service *core.Service, alertingConfig *alerting.Config, maintenanceConfig *maintenance.Config, disableMonitoringLock, enabledMetrics, debug bool) {
if !disableMonitoringLock {
// By placing the lock here, we prevent multiple services from being monitored at the exact same time, which
// could cause performance issues and return inaccurate results
@ -72,7 +73,11 @@ func execute(service *core.Service, alertingConfig *alerting.Config, disableMoni
len(result.Errors),
result.Duration.Round(time.Millisecond),
)
if !maintenanceConfig.IsUnderMaintenance() {
HandleAlerting(service, result, alertingConfig, debug)
} else if debug {
log.Println("[watchdog][execute] Not handling alerting because currently in the maintenance window")
}
if debug {
log.Printf("[watchdog][execute] Waiting for interval=%s before monitoring group=%s service=%s again", service.Interval, service.Group, service.Name)
}
@ -83,7 +88,9 @@ func execute(service *core.Service, alertingConfig *alerting.Config, disableMoni
// UpdateServiceStatuses updates the slice of service statuses
func UpdateServiceStatuses(service *core.Service, result *core.Result) {
storage.Get().Insert(service, result)
if err := storage.Get().Insert(service, result); err != nil {
log.Println("[watchdog][UpdateServiceStatuses] Failed to insert data in storage:", err.Error())
}
}
// Shutdown stops monitoring all services