diff --git a/README.md b/README.md index 98577756..d0fcdccc 100644 --- a/README.md +++ b/README.md @@ -74,36 +74,37 @@ Note that you can also add environment variables in the configuration file (i.e. ### Configuration -| Parameter | Description | Default | -| --------------------------------------------- | -------------------------------------------------------------------------- | -------------- | -| `debug` | Whether to enable debug logs | `false` | -| `metrics` | Whether to expose metrics at /metrics | `false` | -| `services` | List of services to monitor | Required `[]` | -| `services[].name` | Name of the service. Can be anything. | Required `""` | -| `services[].url` | URL to send the request to | Required `""` | -| `services[].conditions` | Conditions used to determine the health of the service | `[]` | -| `services[].interval` | Duration to wait between every status check | `60s` | -| `services[].method` | Request method | `GET` | -| `services[].graphql` | Whether to wrap the body in a query param (`{"query":"$body"}`) | `false` | -| `services[].body` | Request body | `""` | -| `services[].headers` | Request headers | `{}` | -| `services[].alerts[].type` | Type of alert. Valid types: `slack`, `twilio`, `custom` | Required `""` | -| `services[].alerts[].enabled` | Whether to enable the alert | `false` | -| `services[].alerts[].threshold` | Number of failures in a row needed before triggering the alert | `3` | -| `services[].alerts[].description` | Description of the alert. 
Will be included in the alert sent | `""` | -| `services[].alerts[].send-on-resolved` | Whether to send a notification once a triggered alert subsides | `false` | -| `services[].alerts[].success-before-resolved` | Number of successes in a row needed before sending a resolved notification | `2` | -| `alerting` | Configuration for alerting | `{}` | -| `alerting.slack` | Webhook to use for alerts of type `slack` | `""` | -| `alerting.twilio` | Settings for alerts of type `twilio` | `""` | -| `alerting.twilio.sid` | Twilio account SID | Required `""` | -| `alerting.twilio.token` | Twilio auth token | Required `""` | -| `alerting.twilio.from` | Number to send Twilio alerts from | Required `""` | -| `alerting.twilio.to` | Number to send twilio alerts to | Required `""` | -| `alerting.custom` | Configuration for custom actions on failure or alerts | `""` | -| `alerting.custom.url` | Custom alerting request url | `""` | -| `alerting.custom.body` | Custom alerting request body. | `""` | -| `alerting.custom.headers` | Custom alerting request headers | `{}` | +| Parameter | Description | Default | +| ---------------------------------------- | ----------------------------------------------------------------------------- | -------------- | +| `debug` | Whether to enable debug logs | `false` | +| `metrics` | Whether to expose metrics at /metrics | `false` | +| `services` | List of services to monitor | Required `[]` | +| `services[].name` | Name of the service. Can be anything. 
| Required `""` | +| `services[].url` | URL to send the request to | Required `""` | +| `services[].conditions` | Conditions used to determine the health of the service | `[]` | +| `services[].interval` | Duration to wait between every status check | `60s` | +| `services[].method` | Request method | `GET` | +| `services[].graphql` | Whether to wrap the body in a query param (`{"query":"$body"}`) | `false` | +| `services[].body` | Request body | `""` | +| `services[].headers` | Request headers | `{}` | +| `services[].alerts[].type` | Type of alert. Valid types: `slack`, `pagerduty`, `twilio`, `custom` | Required `""` | +| `services[].alerts[].enabled` | Whether to enable the alert | `false` | +| `services[].alerts[].failure-threshold` | Number of failures in a row needed before triggering the alert | `3` | +| `services[].alerts[].success-threshold` | Number of successes in a row before an ongoing incident is marked as resolved | `2` | +| `services[].alerts[].send-on-resolved` | Whether to send a notification once a triggered alert is marked as resolved | `false` | +| `services[].alerts[].description` | Description of the alert. Will be included in the alert sent | `""` | +| `alerting` | Configuration for alerting | `{}` | +| `alerting.slack` | Webhook to use for alerts of type `slack` | `""` | +| `alerting.pagerduty` | PagerDuty Events API v2 integration key. Used for alerts of type `pagerduty` | `""` | +| `alerting.twilio` | Settings for alerts of type `twilio` | `""` | +| `alerting.twilio.sid` | Twilio account SID | Required `""` | +| `alerting.twilio.token` | Twilio auth token | Required `""` | +| `alerting.twilio.from` | Number to send Twilio alerts from | Required `""` | +| `alerting.twilio.to` | Number to send twilio alerts to | Required `""` | +| `alerting.custom` | Configuration for custom actions on failure or alerts | `""` | +| `alerting.custom.url` | Custom alerting request url | `""` | +| `alerting.custom.body` | Custom alerting request body. 
| `""` | +| `alerting.custom.headers` | Custom alerting request headers | `{}` | ### Conditions @@ -144,7 +145,7 @@ services: send-on-resolved: true - type: slack enabled: true - threshold: 5 + failure-threshold: 5 description: "healthcheck failed 5 times in a row" send-on-resolved: true conditions: @@ -175,10 +176,10 @@ services: alerts: - type: pagerduty enabled: true - threshold: 3 + failure-threshold: 3 + success-threshold: 5 description: "healthcheck failed 3 times in a row" send-on-resolved: true - success-before-resolved: 5 conditions: - "[STATUS] == 200" - "[BODY].status == UP" @@ -202,8 +203,9 @@ services: alerts: - type: twilio enabled: true - threshold: 5 + failure-threshold: 5 description: "healthcheck failed 5 times in a row" + send-on-resolved: true conditions: - "[STATUS] == 200" - "[BODY].status == UP" @@ -244,7 +246,8 @@ services: alerts: - type: custom enabled: true - threshold: 10 + failure-threshold: 10 + success-threshold: 3 send-on-resolved: true description: "healthcheck failed 10 times in a row" conditions: diff --git a/alerting/alerting.go b/alerting/alerting.go index 24f20f98..59605456 100644 --- a/alerting/alerting.go +++ b/alerting/alerting.go @@ -26,7 +26,7 @@ func handleAlertsToTrigger(service *core.Service, result *core.Result, cfg *conf service.NumberOfFailuresInARow++ for _, alert := range service.Alerts { // If the alert hasn't been triggered, move to the next one - if !alert.Enabled || alert.Threshold != service.NumberOfFailuresInARow { + if !alert.Enabled || alert.FailureThreshold != service.NumberOfFailuresInARow { continue } if alert.Triggered { @@ -100,7 +100,7 @@ func handleAlertsToTrigger(service *core.Service, result *core.Result, cfg *conf func handleAlertsToResolve(service *core.Service, result *core.Result, cfg *config.Config) { service.NumberOfSuccessesInARow++ for _, alert := range service.Alerts { - if !alert.Enabled || !alert.Triggered || alert.SuccessBeforeResolved > service.NumberOfSuccessesInARow { + if 
!alert.Enabled || !alert.Triggered || alert.SuccessThreshold > service.NumberOfSuccessesInARow { continue } alert.Triggered = false diff --git a/config/config_test.go b/config/config_test.go index 2cb297c3..4a87c176 100644 --- a/config/config_test.go +++ b/config/config_test.go @@ -128,7 +128,7 @@ services: alerts: - type: slack enabled: true - threshold: 7 + failure-threshold: 7 description: "Healthcheck failed 7 times in a row" conditions: - "[STATUS] == 200" @@ -166,8 +166,11 @@ services: if !config.Services[0].Alerts[0].Enabled { t.Error("The alert should've been enabled") } - if config.Services[0].Alerts[0].Threshold != 7 { - t.Errorf("The threshold of the alert should've been %d, but it was %d", 7, config.Services[0].Alerts[0].Threshold) + if config.Services[0].Alerts[0].FailureThreshold != 7 { + t.Errorf("The failure threshold of the alert should've been %d, but it was %d", 7, config.Services[0].Alerts[0].FailureThreshold) + } + if config.Services[0].Alerts[0].SuccessThreshold != 2 { + t.Errorf("The success threshold of the alert should've been %d, but it was %d", 2, config.Services[0].Alerts[0].SuccessThreshold) + } if config.Services[0].Alerts[0].Type != core.SlackAlert { t.Errorf("The type of the alert should've been %s, but it was %s", core.SlackAlert, config.Services[0].Alerts[0].Type) diff --git a/core/alert.go b/core/alert.go index 9a2e585f..705eb515 100644 --- a/core/alert.go +++ b/core/alert.go @@ -8,8 +8,8 @@ type Alert struct { // Enabled defines whether or not the alert is enabled Enabled bool `yaml:"enabled"` - // Threshold is the number of failures in a row needed before triggering the alert - Threshold int `yaml:"threshold"` + // FailureThreshold is the number of failures in a row needed before triggering the alert + FailureThreshold int `yaml:"failure-threshold"` // Description of the alert. Will be included in the alert sent. 
Description string `yaml:"description"` @@ -17,8 +17,8 @@ type Alert struct { // SendOnResolved defines whether to send a second notification when the issue has been resolved SendOnResolved bool `yaml:"send-on-resolved"` - // SuccessBeforeResolved defines whether to send a second notification when the issue has been resolved - SuccessBeforeResolved int `yaml:"success-before-resolved"` + // SuccessThreshold defines how many successful executions must happen in a row before an ongoing incident is marked as resolved + SuccessThreshold int `yaml:"success-threshold"` // ResolveKey is an optional field that is used by some providers (i.e. PagerDuty's dedup_key) to resolve // ongoing/triggered incidents diff --git a/core/alerting.go b/core/alerting.go index 332f89d2..f2328fc2 100644 --- a/core/alerting.go +++ b/core/alerting.go @@ -99,10 +99,10 @@ func CreateSlackCustomAlertProvider(slackWebHookUrl string, service *Service, al var message string var color string if resolved { - message = fmt.Sprintf("An alert for *%s* has been resolved after passing successfully %d time(s) in a row", service.Name, alert.SuccessBeforeResolved) + message = fmt.Sprintf("An alert for *%s* has been resolved after passing successfully %d time(s) in a row", service.Name, alert.SuccessThreshold) color = "#36A64F" } else { - message = fmt.Sprintf("An alert for *%s* has been triggered due to having failed %d time(s) in a row", service.Name, alert.Threshold) + message = fmt.Sprintf("An alert for *%s* has been triggered due to having failed %d time(s) in a row", service.Name, alert.FailureThreshold) color = "#DD0000" } var results string diff --git a/core/service.go b/core/service.go index 52e7aab6..d2adec1a 100644 --- a/core/service.go +++ b/core/service.go @@ -62,11 +62,11 @@ func (service *Service) Validate() { service.Headers = make(map[string]string) } for _, alert := range service.Alerts { - if alert.Threshold <= 0 { - alert.Threshold = 3 + if alert.FailureThreshold <= 0 { + 
alert.FailureThreshold = 3 } - if alert.SuccessBeforeResolved <= 0 { - alert.SuccessBeforeResolved = 2 + if alert.SuccessThreshold <= 0 { + alert.SuccessThreshold = 2 } } if len(service.Url) == 0 { @@ -107,7 +107,7 @@ func (service *Service) GetAlertsTriggered() []Alert { return alerts } for _, alert := range service.Alerts { - if alert.Enabled && alert.Threshold == service.NumberOfFailuresInARow { + if alert.Enabled && alert.FailureThreshold == service.NumberOfFailuresInARow { alerts = append(alerts, *alert) continue }