package watchdog

import (
	"encoding/json"
	"log"

	"github.com/TwiN/gatus/v3/alerting"
	"github.com/TwiN/gatus/v3/alerting/alert"
	"github.com/TwiN/gatus/v3/core"
)
2020-09-19 22:22:12 +02:00
// HandleAlerting takes care of alerts to resolve and alerts to trigger based on result success or failure
2021-10-23 22:47:12 +02:00
func HandleAlerting ( endpoint * core . Endpoint , result * core . Result , alertingConfig * alerting . Config , debug bool ) {
2021-05-19 04:29:15 +02:00
if alertingConfig == nil {
2020-09-17 01:26:19 +02:00
return
}
if result . Success {
2021-10-23 22:47:12 +02:00
handleAlertsToResolve ( endpoint , result , alertingConfig , debug )
2020-09-17 01:26:19 +02:00
} else {
2021-10-23 22:47:12 +02:00
handleAlertsToTrigger ( endpoint , result , alertingConfig , debug )
2020-09-17 01:26:19 +02:00
}
}
2021-10-23 22:47:12 +02:00
func handleAlertsToTrigger ( endpoint * core . Endpoint , result * core . Result , alertingConfig * alerting . Config , debug bool ) {
endpoint . NumberOfSuccessesInARow = 0
endpoint . NumberOfFailuresInARow ++
for _ , endpointAlert := range endpoint . Alerts {
// If the alert hasn't been triggered, move to the next one
if ! endpointAlert . IsEnabled ( ) || endpointAlert . FailureThreshold > endpoint . NumberOfFailuresInARow {
2020-09-17 01:26:19 +02:00
continue
}
2021-10-23 22:47:12 +02:00
if endpointAlert . Triggered {
2021-05-19 04:29:15 +02:00
if debug {
2021-10-23 22:47:12 +02:00
log . Printf ( "[watchdog][handleAlertsToTrigger] Alert for endpoint=%s with description='%s' has already been TRIGGERED, skipping" , endpoint . Name , endpointAlert . GetDescription ( ) )
2020-09-17 01:26:19 +02:00
}
continue
}
2021-10-23 22:47:12 +02:00
alertProvider := alertingConfig . GetAlertingProviderByAlertType ( endpointAlert . Type )
2020-09-26 20:23:43 +02:00
if alertProvider != nil && alertProvider . IsValid ( ) {
2021-10-23 22:47:12 +02:00
log . Printf ( "[watchdog][handleAlertsToTrigger] Sending %s alert because alert for endpoint=%s with description='%s' has been TRIGGERED" , endpointAlert . Type , endpoint . Name , endpointAlert . GetDescription ( ) )
customAlertProvider := alertProvider . ToCustomAlertProvider ( endpoint , endpointAlert , result , false )
2020-09-17 01:26:19 +02:00
// TODO: retry on error
var err error
2020-09-26 20:23:43 +02:00
// We need to extract the DedupKey from PagerDuty's response
2021-10-23 22:47:12 +02:00
if endpointAlert . Type == alert . TypePagerDuty {
2020-09-17 01:26:19 +02:00
var body [ ] byte
2021-10-23 22:47:12 +02:00
if body , err = customAlertProvider . Send ( endpoint . Name , endpointAlert . GetDescription ( ) , false ) ; err == nil {
2020-09-17 01:26:19 +02:00
var response pagerDutyResponse
2021-01-10 05:52:11 +01:00
if err = json . Unmarshal ( body , & response ) ; err != nil {
2021-01-21 22:14:32 +01:00
log . Printf ( "[watchdog][handleAlertsToTrigger] Ran into error unmarshaling pagerduty response: %s" , err . Error ( ) )
2020-09-17 01:26:19 +02:00
} else {
2021-10-23 22:47:12 +02:00
endpointAlert . ResolveKey = response . DedupKey
2020-09-17 01:26:19 +02:00
}
}
} else {
2021-10-23 22:47:12 +02:00
// All other alert types don't need to extract anything from the body, so we can just send the request right away
_ , err = customAlertProvider . Send ( endpoint . Name , endpointAlert . GetDescription ( ) , false )
2020-09-17 01:26:19 +02:00
}
if err != nil {
2021-10-23 22:47:12 +02:00
log . Printf ( "[watchdog][handleAlertsToTrigger] Failed to send an alert for endpoint=%s: %s" , endpoint . Name , err . Error ( ) )
2020-09-17 01:26:19 +02:00
} else {
2021-10-23 22:47:12 +02:00
endpointAlert . Triggered = true
2020-09-17 01:26:19 +02:00
}
2020-09-26 20:23:43 +02:00
} else {
2021-10-23 22:47:12 +02:00
log . Printf ( "[watchdog][handleAlertsToResolve] Not sending alert of type=%s despite being TRIGGERED, because the provider wasn't configured properly" , endpointAlert . Type )
2020-09-17 01:26:19 +02:00
}
}
}
2021-10-23 22:47:12 +02:00
func handleAlertsToResolve ( endpoint * core . Endpoint , result * core . Result , alertingConfig * alerting . Config , debug bool ) {
endpoint . NumberOfSuccessesInARow ++
for _ , endpointAlert := range endpoint . Alerts {
if ! endpointAlert . IsEnabled ( ) || ! endpointAlert . Triggered || endpointAlert . SuccessThreshold > endpoint . NumberOfSuccessesInARow {
2020-09-17 01:26:19 +02:00
continue
}
2021-10-23 22:47:12 +02:00
// Even if the alert provider returns an error, we still set the alert's Triggered variable to false.
2021-01-21 22:14:32 +01:00
// Further explanation can be found on Alert's Triggered field.
2021-10-23 22:47:12 +02:00
endpointAlert . Triggered = false
if ! endpointAlert . IsSendingOnResolved ( ) {
2020-09-17 01:26:19 +02:00
continue
}
2021-10-23 22:47:12 +02:00
alertProvider := alertingConfig . GetAlertingProviderByAlertType ( endpointAlert . Type )
2020-09-26 20:23:43 +02:00
if alertProvider != nil && alertProvider . IsValid ( ) {
2021-10-23 22:47:12 +02:00
log . Printf ( "[watchdog][handleAlertsToResolve] Sending %s alert because alert for endpoint=%s with description='%s' has been RESOLVED" , endpointAlert . Type , endpoint . Name , endpointAlert . GetDescription ( ) )
customAlertProvider := alertProvider . ToCustomAlertProvider ( endpoint , endpointAlert , result , true )
2020-09-17 01:26:19 +02:00
// TODO: retry on error
2021-10-23 22:47:12 +02:00
_ , err := customAlertProvider . Send ( endpoint . Name , endpointAlert . GetDescription ( ) , true )
2020-09-17 01:26:19 +02:00
if err != nil {
2021-10-23 22:47:12 +02:00
log . Printf ( "[watchdog][handleAlertsToResolve] Failed to send an alert for endpoint=%s: %s" , endpoint . Name , err . Error ( ) )
2020-09-17 01:26:19 +02:00
} else {
2021-10-23 22:47:12 +02:00
if endpointAlert . Type == alert . TypePagerDuty {
endpointAlert . ResolveKey = ""
2020-09-17 01:26:19 +02:00
}
}
2020-09-26 20:23:43 +02:00
} else {
2021-10-23 22:47:12 +02:00
log . Printf ( "[watchdog][handleAlertsToResolve] Not sending alert of type=%s despite being RESOLVED, because the provider wasn't configured properly" , endpointAlert . Type )
2020-09-17 01:26:19 +02:00
}
}
2021-10-23 22:47:12 +02:00
endpoint . NumberOfFailuresInARow = 0
2020-09-17 01:26:19 +02:00
}
// pagerDutyResponse is the subset of the response returned by PagerDuty's Events API
// that this package cares about; DedupKey is used to resolve incidents later.
type pagerDutyResponse struct {
	Status   string `json:"status"`
	Message  string `json:"message"`
	DedupKey string `json:"dedup_key"`
}