Rename services[].alerts[]'s threshold and success-before-resolved to failure-threshold and success-threshold

This commit is contained in:
TwinProduction 2020-09-16 20:22:33 -04:00
parent fefc728201
commit c6f11e63e4
6 changed files with 57 additions and 51 deletions

View File

@ -74,36 +74,37 @@ Note that you can also add environment variables in the configuration file (i.e.
### Configuration ### Configuration
| Parameter | Description | Default | | Parameter | Description | Default |
| --------------------------------------------- | -------------------------------------------------------------------------- | -------------- | | ---------------------------------------- | ----------------------------------------------------------------------------- | -------------- |
| `debug` | Whether to enable debug logs | `false` | | `debug` | Whether to enable debug logs | `false` |
| `metrics` | Whether to expose metrics at /metrics | `false` | | `metrics` | Whether to expose metrics at /metrics | `false` |
| `services` | List of services to monitor | Required `[]` | | `services` | List of services to monitor | Required `[]` |
| `services[].name` | Name of the service. Can be anything. | Required `""` | | `services[].name` | Name of the service. Can be anything. | Required `""` |
| `services[].url` | URL to send the request to | Required `""` | | `services[].url` | URL to send the request to | Required `""` |
| `services[].conditions` | Conditions used to determine the health of the service | `[]` | | `services[].conditions` | Conditions used to determine the health of the service | `[]` |
| `services[].interval` | Duration to wait between every status check | `60s` | | `services[].interval` | Duration to wait between every status check | `60s` |
| `services[].method` | Request method | `GET` | | `services[].method` | Request method | `GET` |
| `services[].graphql` | Whether to wrap the body in a query param (`{"query":"$body"}`) | `false` | | `services[].graphql` | Whether to wrap the body in a query param (`{"query":"$body"}`) | `false` |
| `services[].body` | Request body | `""` | | `services[].body` | Request body | `""` |
| `services[].headers` | Request headers | `{}` | | `services[].headers` | Request headers | `{}` |
| `services[].alerts[].type` | Type of alert. Valid types: `slack`, `twilio`, `custom` | Required `""` | | `services[].alerts[].type` | Type of alert. Valid types: `slack`, `pagerduty`, `twilio`, `custom` | Required `""` |
| `services[].alerts[].enabled` | Whether to enable the alert | `false` | | `services[].alerts[].enabled` | Whether to enable the alert | `false` |
| `services[].alerts[].threshold` | Number of failures in a row needed before triggering the alert | `3` | | `services[].alerts[].failure-threshold` | Number of failures in a row needed before triggering the alert | `3` |
| `services[].alerts[].description` | Description of the alert. Will be included in the alert sent | `""` | | `services[].alerts[].success-threshold` | Number of successes in a row before an ongoing incident is marked as resolved | `2` |
| `services[].alerts[].send-on-resolved` | Whether to send a notification once a triggered alert subsides | `false` | | `services[].alerts[].send-on-resolved` | Whether to send a notification once a triggered alert is marked as resolved | `false` |
| `services[].alerts[].success-before-resolved` | Number of successes in a row needed before sending a resolved notification | `2` | | `services[].alerts[].description` | Description of the alert. Will be included in the alert sent | `""` |
| `alerting` | Configuration for alerting | `{}` | | `alerting` | Configuration for alerting | `{}` |
| `alerting.slack` | Webhook to use for alerts of type `slack` | `""` | | `alerting.slack` | Webhook to use for alerts of type `slack` | `""` |
| `alerting.twilio` | Settings for alerts of type `twilio` | `""` | | `alerting.pagerduty` | PagerDuty Events API v2 integration key. Used for alerts of type `pagerduty` | `""` |
| `alerting.twilio.sid` | Twilio account SID | Required `""` | | `alerting.twilio` | Settings for alerts of type `twilio` | `""` |
| `alerting.twilio.token` | Twilio auth token | Required `""` | | `alerting.twilio.sid` | Twilio account SID | Required `""` |
| `alerting.twilio.from` | Number to send Twilio alerts from | Required `""` | | `alerting.twilio.token` | Twilio auth token | Required `""` |
| `alerting.twilio.to` | Number to send twilio alerts to | Required `""` | | `alerting.twilio.from` | Number to send Twilio alerts from | Required `""` |
| `alerting.custom` | Configuration for custom actions on failure or alerts | `""` | | `alerting.twilio.to` | Number to send twilio alerts to | Required `""` |
| `alerting.custom.url` | Custom alerting request url | `""` | | `alerting.custom` | Configuration for custom actions on failure or alerts | `""` |
| `alerting.custom.body` | Custom alerting request body. | `""` | | `alerting.custom.url` | Custom alerting request url | `""` |
| `alerting.custom.headers` | Custom alerting request headers | `{}` | | `alerting.custom.body` | Custom alerting request body. | `""` |
| `alerting.custom.headers` | Custom alerting request headers | `{}` |
### Conditions ### Conditions
@ -144,7 +145,7 @@ services:
send-on-resolved: true send-on-resolved: true
- type: slack - type: slack
enabled: true enabled: true
threshold: 5 failure-threshold: 5
description: "healthcheck failed 5 times in a row" description: "healthcheck failed 5 times in a row"
send-on-resolved: true send-on-resolved: true
conditions: conditions:
@ -175,10 +176,10 @@ services:
alerts: alerts:
- type: pagerduty - type: pagerduty
enabled: true enabled: true
threshold: 3 failure-threshold: 3
success-threshold: 5
description: "healthcheck failed 3 times in a row" description: "healthcheck failed 3 times in a row"
send-on-resolved: true send-on-resolved: true
success-before-resolved: 5
conditions: conditions:
- "[STATUS] == 200" - "[STATUS] == 200"
- "[BODY].status == UP" - "[BODY].status == UP"
@ -202,8 +203,9 @@ services:
alerts: alerts:
- type: twilio - type: twilio
enabled: true enabled: true
threshold: 5 failure-threshold: 5
description: "healthcheck failed 5 times in a row" description: "healthcheck failed 5 times in a row"
send-on-resolved: true
conditions: conditions:
- "[STATUS] == 200" - "[STATUS] == 200"
- "[BODY].status == UP" - "[BODY].status == UP"
@ -244,7 +246,8 @@ services:
alerts: alerts:
- type: custom - type: custom
enabled: true enabled: true
threshold: 10 failure-threshold: 10
success-threshold: 3
send-on-resolved: true send-on-resolved: true
description: "healthcheck failed 10 times in a row" description: "healthcheck failed 10 times in a row"
conditions: conditions:

View File

@ -26,7 +26,7 @@ func handleAlertsToTrigger(service *core.Service, result *core.Result, cfg *conf
service.NumberOfFailuresInARow++ service.NumberOfFailuresInARow++
for _, alert := range service.Alerts { for _, alert := range service.Alerts {
// If the alert hasn't been triggered, move to the next one // If the alert hasn't been triggered, move to the next one
if !alert.Enabled || alert.Threshold != service.NumberOfFailuresInARow { if !alert.Enabled || alert.FailureThreshold != service.NumberOfFailuresInARow {
continue continue
} }
if alert.Triggered { if alert.Triggered {
@ -100,7 +100,7 @@ func handleAlertsToTrigger(service *core.Service, result *core.Result, cfg *conf
func handleAlertsToResolve(service *core.Service, result *core.Result, cfg *config.Config) { func handleAlertsToResolve(service *core.Service, result *core.Result, cfg *config.Config) {
service.NumberOfSuccessesInARow++ service.NumberOfSuccessesInARow++
for _, alert := range service.Alerts { for _, alert := range service.Alerts {
if !alert.Enabled || !alert.Triggered || alert.SuccessBeforeResolved > service.NumberOfSuccessesInARow { if !alert.Enabled || !alert.Triggered || alert.SuccessThreshold > service.NumberOfSuccessesInARow {
continue continue
} }
alert.Triggered = false alert.Triggered = false

View File

@ -128,7 +128,7 @@ services:
alerts: alerts:
- type: slack - type: slack
enabled: true enabled: true
threshold: 7 failure-threshold: 7
description: "Healthcheck failed 7 times in a row" description: "Healthcheck failed 7 times in a row"
conditions: conditions:
- "[STATUS] == 200" - "[STATUS] == 200"
@ -166,8 +166,11 @@ services:
if !config.Services[0].Alerts[0].Enabled { if !config.Services[0].Alerts[0].Enabled {
t.Error("The alert should've been enabled") t.Error("The alert should've been enabled")
} }
if config.Services[0].Alerts[0].Threshold != 7 { if config.Services[0].Alerts[0].FailureThreshold != 7 {
t.Errorf("The threshold of the alert should've been %d, but it was %d", 7, config.Services[0].Alerts[0].Threshold) t.Errorf("The failure threshold of the alert should've been %d, but it was %d", 7, config.Services[0].Alerts[0].FailureThreshold)
}
if config.Services[0].Alerts[0].SuccessThreshold != 2 {
t.Errorf("The success threshold of the alert should've been %d, but it was %d", 2, config.Services[0].Alerts[0].SuccessThreshold)
} }
if config.Services[0].Alerts[0].Type != core.SlackAlert { if config.Services[0].Alerts[0].Type != core.SlackAlert {
t.Errorf("The type of the alert should've been %s, but it was %s", core.SlackAlert, config.Services[0].Alerts[0].Type) t.Errorf("The type of the alert should've been %s, but it was %s", core.SlackAlert, config.Services[0].Alerts[0].Type)

View File

@ -8,8 +8,8 @@ type Alert struct {
// Enabled defines whether or not the alert is enabled // Enabled defines whether or not the alert is enabled
Enabled bool `yaml:"enabled"` Enabled bool `yaml:"enabled"`
// Threshold is the number of failures in a row needed before triggering the alert // FailureThreshold is the number of failures in a row needed before triggering the alert
Threshold int `yaml:"threshold"` FailureThreshold int `yaml:"failure-threshold"`
// Description of the alert. Will be included in the alert sent. // Description of the alert. Will be included in the alert sent.
Description string `yaml:"description"` Description string `yaml:"description"`
@ -17,8 +17,8 @@ type Alert struct {
// SendOnResolved defines whether to send a second notification when the issue has been resolved // SendOnResolved defines whether to send a second notification when the issue has been resolved
SendOnResolved bool `yaml:"send-on-resolved"` SendOnResolved bool `yaml:"send-on-resolved"`
// SuccessBeforeResolved defines whether to send a second notification when the issue has been resolved // SuccessThreshold defines how many successful executions must happen in a row before an ongoing incident is marked as resolved
SuccessBeforeResolved int `yaml:"success-before-resolved"` SuccessThreshold int `yaml:"success-threshold"`
// ResolveKey is an optional field that is used by some providers (i.e. PagerDuty's dedup_key) to resolve // ResolveKey is an optional field that is used by some providers (i.e. PagerDuty's dedup_key) to resolve
// ongoing/triggered incidents // ongoing/triggered incidents

View File

@ -99,10 +99,10 @@ func CreateSlackCustomAlertProvider(slackWebHookUrl string, service *Service, al
var message string var message string
var color string var color string
if resolved { if resolved {
message = fmt.Sprintf("An alert for *%s* has been resolved after passing successfully %d time(s) in a row", service.Name, alert.SuccessBeforeResolved) message = fmt.Sprintf("An alert for *%s* has been resolved after passing successfully %d time(s) in a row", service.Name, alert.SuccessThreshold)
color = "#36A64F" color = "#36A64F"
} else { } else {
message = fmt.Sprintf("An alert for *%s* has been triggered due to having failed %d time(s) in a row", service.Name, alert.Threshold) message = fmt.Sprintf("An alert for *%s* has been triggered due to having failed %d time(s) in a row", service.Name, alert.FailureThreshold)
color = "#DD0000" color = "#DD0000"
} }
var results string var results string

View File

@ -62,11 +62,11 @@ func (service *Service) Validate() {
service.Headers = make(map[string]string) service.Headers = make(map[string]string)
} }
for _, alert := range service.Alerts { for _, alert := range service.Alerts {
if alert.Threshold <= 0 { if alert.FailureThreshold <= 0 {
alert.Threshold = 3 alert.FailureThreshold = 3
} }
if alert.SuccessBeforeResolved <= 0 { if alert.SuccessThreshold <= 0 {
alert.SuccessBeforeResolved = 2 alert.SuccessThreshold = 2
} }
} }
if len(service.Url) == 0 { if len(service.Url) == 0 {
@ -107,7 +107,7 @@ func (service *Service) GetAlertsTriggered() []Alert {
return alerts return alerts
} }
for _, alert := range service.Alerts { for _, alert := range service.Alerts {
if alert.Enabled && alert.Threshold == service.NumberOfFailuresInARow { if alert.Enabled && alert.FailureThreshold == service.NumberOfFailuresInARow {
alerts = append(alerts, *alert) alerts = append(alerts, *alert)
continue continue
} }