package watchdog
import (
	"encoding/json"
	"log"

	"github.com/TwinProduction/gatus/v3/alerting"
	"github.com/TwinProduction/gatus/v3/alerting/alert"
	"github.com/TwinProduction/gatus/v3/core"
)

// HandleAlerting takes care of alerts to resolve and alerts to trigger based on result success or failure
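//
// A minimal usage sketch, assuming a hypothetical monitor helper that produces a
// *core.Result for the service and a config value exposing Alerting and Debug fields:
//
//	result := monitor(service)
//	HandleAlerting(service, result, config.Alerting, config.Debug)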
func HandleAlerting(service *core.Service, result *core.Result, alertingConfig *alerting.Config, debug bool) {
	if alertingConfig == nil {
		return
	}
	if result.Success {
		handleAlertsToResolve(service, result, alertingConfig, debug)
	} else {
		handleAlertsToTrigger(service, result, alertingConfig, debug)
	}
}
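
// handleAlertsToTrigger resets the service's success streak, increments its failure streak and
// sends a notification for every enabled alert that has reached its failure threshold and
// hasn't already been triggered.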
func handleAlertsToTrigger(service *core.Service, result *core.Result, alertingConfig *alerting.Config, debug bool) {
	service.NumberOfSuccessesInARow = 0
	service.NumberOfFailuresInARow++
	for _, serviceAlert := range service.Alerts {
		// If the alert isn't enabled or hasn't reached its failure threshold yet, move on to the next one
		if !serviceAlert.IsEnabled() || serviceAlert.FailureThreshold > service.NumberOfFailuresInARow {
			continue
		}
		if serviceAlert.Triggered {
			if debug {
				log.Printf("[watchdog][handleAlertsToTrigger] Alert for service=%s with description='%s' has already been TRIGGERED, skipping", service.Name, serviceAlert.GetDescription())
			}
			continue
		}
		alertProvider := alertingConfig.GetAlertingProviderByAlertType(serviceAlert.Type)
		if alertProvider != nil && alertProvider.IsValid() {
			log.Printf("[watchdog][handleAlertsToTrigger] Sending %s alert because alert for service=%s with description='%s' has been TRIGGERED", serviceAlert.Type, service.Name, serviceAlert.GetDescription())
			customAlertProvider := alertProvider.ToCustomAlertProvider(service, serviceAlert, result, false)
			// TODO: retry on error
			var err error
			// We need to extract the DedupKey from PagerDuty's response
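			// For reference, a successful enqueue response from PagerDuty's Events API v2 is
			// expected to look roughly like the following (dedup_key value is illustrative),
			// which is the shape pagerDutyResponse below is meant to capture:
			//
			//	{"status": "success", "message": "Event processed", "dedup_key": "abc123"}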
			if serviceAlert.Type == alert.TypePagerDuty {
				var body []byte
				if body, err = customAlertProvider.Send(service.Name, serviceAlert.GetDescription(), false); err == nil {
					var response pagerDutyResponse
					if err = json.Unmarshal(body, &response); err != nil {
						log.Printf("[watchdog][handleAlertsToTrigger] Ran into error unmarshaling pagerduty response: %s", err.Error())
					} else {
						serviceAlert.ResolveKey = response.DedupKey
					}
				}
			} else {
				// All other alert types don't need to extract anything from the body, so we can just send the request right away
				_, err = customAlertProvider.Send(service.Name, serviceAlert.GetDescription(), false)
			}
			if err != nil {
				log.Printf("[watchdog][handleAlertsToTrigger] Failed to send an alert for service=%s: %s", service.Name, err.Error())
			} else {
				serviceAlert.Triggered = true
			}
		} else {
			log.Printf("[watchdog][handleAlertsToTrigger] Not sending alert of type=%s despite being TRIGGERED, because the provider wasn't configured properly", serviceAlert.Type)
		}
	}
}
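
// handleAlertsToResolve increments the service's success streak, marks every triggered alert that
// has reached its success threshold as no longer triggered (sending a resolution notification when
// the alert is configured to do so), and finally resets the failure streak.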
func handleAlertsToResolve(service *core.Service, result *core.Result, alertingConfig *alerting.Config, debug bool) {
	service.NumberOfSuccessesInARow++
	for _, serviceAlert := range service.Alerts {
		if !serviceAlert.IsEnabled() || !serviceAlert.Triggered || serviceAlert.SuccessThreshold > service.NumberOfSuccessesInARow {
			continue
		}
		// Even if the alert provider returns an error, we still set the alert's Triggered variable to false.
		// Further explanation can be found on Alert's Triggered field.
		serviceAlert.Triggered = false
		if !serviceAlert.IsSendingOnResolved() {
			continue
		}
		alertProvider := alertingConfig.GetAlertingProviderByAlertType(serviceAlert.Type)
		if alertProvider != nil && alertProvider.IsValid() {
			log.Printf("[watchdog][handleAlertsToResolve] Sending %s alert because alert for service=%s with description='%s' has been RESOLVED", serviceAlert.Type, service.Name, serviceAlert.GetDescription())
			customAlertProvider := alertProvider.ToCustomAlertProvider(service, serviceAlert, result, true)
			// TODO: retry on error
			_, err := customAlertProvider.Send(service.Name, serviceAlert.GetDescription(), true)
			if err != nil {
				log.Printf("[watchdog][handleAlertsToResolve] Failed to send an alert for service=%s: %s", service.Name, err.Error())
			} else {
				if serviceAlert.Type == alert.TypePagerDuty {
					serviceAlert.ResolveKey = ""
				}
			}
		} else {
			log.Printf("[watchdog][handleAlertsToResolve] Not sending alert of type=%s despite being RESOLVED, because the provider wasn't configured properly", serviceAlert.Type)
		}
	}
	service.NumberOfFailuresInARow = 0
}
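
// pagerDutyResponse maps the fields of interest from PagerDuty's Events API response,
// most notably the dedup_key required to later resolve the triggered incident.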
type pagerDutyResponse struct {
	Status   string `json:"status"`
	Message  string `json:"message"`
	DedupKey string `json:"dedup_key"`
}