Rename services[].alerts[].threshold to failure-threshold and services[].alerts[].success-before-resolved to success-threshold
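
For example, a service alert that was previously configured as follows (the alert type and values are illustrative):

    alerts:
      - type: slack
        enabled: true
        threshold: 5
        send-on-resolved: true
        success-before-resolved: 2

would now be configured as:

    alerts:
      - type: slack
        enabled: true
        failure-threshold: 5
        send-on-resolved: true
        success-threshold: 2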

TwinProduction 2020-09-16 20:22:33 -04:00
parent fefc728201
commit c6f11e63e4
6 changed files with 57 additions and 51 deletions

View File

@@ -75,7 +75,7 @@ Note that you can also add environment variables in the configuration file (i.e.
 
 ### Configuration
 | Parameter | Description | Default |
-| --------------------------------------------- | -------------------------------------------------------------------------- | -------------- |
+| ---------------------------------------- | ----------------------------------------------------------------------------- | -------------- |
 | `debug` | Whether to enable debug logs | `false` |
 | `metrics` | Whether to expose metrics at /metrics | `false` |
 | `services` | List of services to monitor | Required `[]` |
@@ -87,14 +87,15 @@ Note that you can also add environment variables in the configuration file (i.e.
 | `services[].graphql` | Whether to wrap the body in a query param (`{"query":"$body"}`) | `false` |
 | `services[].body` | Request body | `""` |
 | `services[].headers` | Request headers | `{}` |
-| `services[].alerts[].type` | Type of alert. Valid types: `slack`, `twilio`, `custom` | Required `""` |
+| `services[].alerts[].type` | Type of alert. Valid types: `slack`, `pagerduty`, `twilio`, `custom` | Required `""` |
 | `services[].alerts[].enabled` | Whether to enable the alert | `false` |
-| `services[].alerts[].threshold` | Number of failures in a row needed before triggering the alert | `3` |
+| `services[].alerts[].failure-threshold` | Number of failures in a row needed before triggering the alert | `3` |
+| `services[].alerts[].success-threshold` | Number of successes in a row before an ongoing incident is marked as resolved | `2` |
+| `services[].alerts[].send-on-resolved` | Whether to send a notification once a triggered alert is marked as resolved | `false` |
 | `services[].alerts[].description` | Description of the alert. Will be included in the alert sent | `""` |
-| `services[].alerts[].send-on-resolved` | Whether to send a notification once a triggered alert subsides | `false` |
-| `services[].alerts[].success-before-resolved` | Number of successes in a row needed before sending a resolved notification | `2` |
 | `alerting` | Configuration for alerting | `{}` |
 | `alerting.slack` | Webhook to use for alerts of type `slack` | `""` |
+| `alerting.pagerduty` | PagerDuty Events API v2 integration key. Used for alerts of type `pagerduty` | `""` |
 | `alerting.twilio` | Settings for alerts of type `twilio` | `""` |
 | `alerting.twilio.sid` | Twilio account SID | Required `""` |
 | `alerting.twilio.token` | Twilio auth token | Required `""` |
@@ -144,7 +145,7 @@ services:
         send-on-resolved: true
       - type: slack
         enabled: true
-        threshold: 5
+        failure-threshold: 5
         description: "healthcheck failed 5 times in a row"
         send-on-resolved: true
     conditions:
@@ -175,10 +176,10 @@ services:
     alerts:
       - type: pagerduty
         enabled: true
-        threshold: 3
+        failure-threshold: 3
+        success-threshold: 5
         description: "healthcheck failed 3 times in a row"
         send-on-resolved: true
-        success-before-resolved: 5
     conditions:
       - "[STATUS] == 200"
       - "[BODY].status == UP"
@@ -202,8 +203,9 @@ services:
     alerts:
      - type: twilio
        enabled: true
-        threshold: 5
+        failure-threshold: 5
        description: "healthcheck failed 5 times in a row"
+        send-on-resolved: true
     conditions:
       - "[STATUS] == 200"
       - "[BODY].status == UP"
@@ -244,7 +246,8 @@ services:
     alerts:
       - type: custom
         enabled: true
-        threshold: 10
+        failure-threshold: 10
+        success-threshold: 3
         send-on-resolved: true
         description: "healthcheck failed 10 times in a row"
     conditions:

View File

@@ -26,7 +26,7 @@ func handleAlertsToTrigger(service *core.Service, result *core.Result, cfg *config.Config) {
 	service.NumberOfFailuresInARow++
 	for _, alert := range service.Alerts {
 		// If the alert is disabled or its failure threshold hasn't been reached, move to the next one
-		if !alert.Enabled || alert.Threshold != service.NumberOfFailuresInARow {
+		if !alert.Enabled || alert.FailureThreshold != service.NumberOfFailuresInARow {
 			continue
 		}
 		if alert.Triggered {
@@ -100,7 +100,7 @@ func handleAlertsToTrigger(service *core.Service, result *core.Result, cfg *config.Config) {
 func handleAlertsToResolve(service *core.Service, result *core.Result, cfg *config.Config) {
 	service.NumberOfSuccessesInARow++
 	for _, alert := range service.Alerts {
-		if !alert.Enabled || !alert.Triggered || alert.SuccessBeforeResolved > service.NumberOfSuccessesInARow {
+		if !alert.Enabled || !alert.Triggered || alert.SuccessThreshold > service.NumberOfSuccessesInARow {
 			continue
 		}
 		alert.Triggered = false

View File

@@ -128,7 +128,7 @@ services:
     alerts:
       - type: slack
         enabled: true
-        threshold: 7
+        failure-threshold: 7
         description: "Healthcheck failed 7 times in a row"
     conditions:
       - "[STATUS] == 200"
@@ -166,8 +166,11 @@ services:
 	if !config.Services[0].Alerts[0].Enabled {
 		t.Error("The alert should've been enabled")
 	}
-	if config.Services[0].Alerts[0].Threshold != 7 {
-		t.Errorf("The threshold of the alert should've been %d, but it was %d", 7, config.Services[0].Alerts[0].Threshold)
+	if config.Services[0].Alerts[0].FailureThreshold != 7 {
+		t.Errorf("The failure threshold of the alert should've been %d, but it was %d", 7, config.Services[0].Alerts[0].FailureThreshold)
 	}
+	if config.Services[0].Alerts[0].SuccessThreshold != 2 {
+		t.Errorf("The success threshold of the alert should've been %d, but it was %d", 2, config.Services[0].Alerts[0].SuccessThreshold)
+	}
 	if config.Services[0].Alerts[0].Type != core.SlackAlert {
 		t.Errorf("The type of the alert should've been %s, but it was %s", core.SlackAlert, config.Services[0].Alerts[0].Type)

View File

@@ -8,8 +8,8 @@ type Alert struct {
 	// Enabled defines whether or not the alert is enabled
 	Enabled bool `yaml:"enabled"`
 
-	// Threshold is the number of failures in a row needed before triggering the alert
-	Threshold int `yaml:"threshold"`
+	// FailureThreshold is the number of failures in a row needed before triggering the alert
+	FailureThreshold int `yaml:"failure-threshold"`
 
 	// Description of the alert. Will be included in the alert sent.
 	Description string `yaml:"description"`
@@ -17,8 +17,8 @@ type Alert struct {
 	// SendOnResolved defines whether to send a second notification when the issue has been resolved
 	SendOnResolved bool `yaml:"send-on-resolved"`
 
-	// SuccessBeforeResolved defines whether to send a second notification when the issue has been resolved
-	SuccessBeforeResolved int `yaml:"success-before-resolved"`
+	// SuccessThreshold defines how many successful executions must happen in a row before an ongoing incident is marked as resolved
+	SuccessThreshold int `yaml:"success-threshold"`
 
 	// ResolveKey is an optional field that is used by some providers (i.e. PagerDuty's dedup_key) to resolve
 	// ongoing/triggered incidents

View File

@@ -99,10 +99,10 @@ func CreateSlackCustomAlertProvider(slackWebHookUrl string, service *Service, al
 	var message string
 	var color string
 	if resolved {
-		message = fmt.Sprintf("An alert for *%s* has been resolved after passing successfully %d time(s) in a row", service.Name, alert.SuccessBeforeResolved)
+		message = fmt.Sprintf("An alert for *%s* has been resolved after passing successfully %d time(s) in a row", service.Name, alert.SuccessThreshold)
 		color = "#36A64F"
 	} else {
-		message = fmt.Sprintf("An alert for *%s* has been triggered due to having failed %d time(s) in a row", service.Name, alert.Threshold)
+		message = fmt.Sprintf("An alert for *%s* has been triggered due to having failed %d time(s) in a row", service.Name, alert.FailureThreshold)
 		color = "#DD0000"
 	}
 	var results string

View File

@@ -62,11 +62,11 @@ func (service *Service) Validate() {
 		service.Headers = make(map[string]string)
 	}
 	for _, alert := range service.Alerts {
-		if alert.Threshold <= 0 {
-			alert.Threshold = 3
+		if alert.FailureThreshold <= 0 {
+			alert.FailureThreshold = 3
 		}
-		if alert.SuccessBeforeResolved <= 0 {
-			alert.SuccessBeforeResolved = 2
+		if alert.SuccessThreshold <= 0 {
+			alert.SuccessThreshold = 2
 		}
 	}
 	if len(service.Url) == 0 {
@@ -107,7 +107,7 @@ func (service *Service) GetAlertsTriggered() []Alert {
 		return alerts
 	}
 	for _, alert := range service.Alerts {
-		if alert.Enabled && alert.Threshold == service.NumberOfFailuresInARow {
+		if alert.Enabled && alert.FailureThreshold == service.NumberOfFailuresInARow {
 			alerts = append(alerts, *alert)
 			continue
 		}