feat(alerting): Persist triggered alerts across application restart (#764)

* feat(alerting): Persist triggered alerts across application restart

Fixes #679

* test(alerting): Add numerous tests related to alerts
This commit is contained in:
TwiN 2024-05-15 21:29:45 -04:00 committed by GitHub
parent 9d151fcdb4
commit f2c5f5911c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
13 changed files with 822 additions and 72 deletions

View File

@ -1,7 +1,10 @@
package alert
import (
"crypto/sha256"
"encoding/hex"
"errors"
"strconv"
"strings"
)
@ -26,6 +29,9 @@ type Alert struct {
// FailureThreshold is the number of failures in a row needed before triggering the alert
FailureThreshold int `yaml:"failure-threshold"`
// SuccessThreshold defines how many successful executions must happen in a row before an ongoing incident is marked as resolved
SuccessThreshold int `yaml:"success-threshold"`
// Description of the alert. Will be included in the alert sent.
//
// This is a pointer, because it is populated by YAML and we need to know whether it was explicitly set to a value
@ -38,9 +44,6 @@ type Alert struct {
// or not for provider.ParseWithDefaultAlert to work. Use Alert.IsSendingOnResolved() for a non-pointer
SendOnResolved *bool `yaml:"send-on-resolved"`
// SuccessThreshold defines how many successful executions must happen in a row before an ongoing incident is marked as resolved
SuccessThreshold int `yaml:"success-threshold"`
// ResolveKey is an optional field that is used by some providers (i.e. PagerDuty's dedup_key) to resolve
// ongoing/triggered incidents
ResolveKey string `yaml:"-"`
@ -94,3 +97,17 @@ func (alert *Alert) IsSendingOnResolved() bool {
}
return *alert.SendOnResolved
}
// Checksum returns a checksum of the alert
// Used to determine which persisted triggered alert should be deleted on application start
func (alert *Alert) Checksum() string {
hash := sha256.New()
hash.Write([]byte(string(alert.Type) + "_" +
strconv.FormatBool(alert.IsEnabled()) + "_" +
strconv.FormatBool(alert.IsSendingOnResolved()) + "_" +
strconv.Itoa(alert.SuccessThreshold) + "_" +
strconv.Itoa(alert.FailureThreshold) + "_" +
alert.GetDescription()),
)
return hex.EncodeToString(hash.Sum(nil))
}

View File

@ -84,3 +84,109 @@ func TestAlert_IsSendingOnResolved(t *testing.T) {
t.Error("alert.IsSendingOnResolved() should've returned true, because SendOnResolved was set to true")
}
}
func TestAlert_Checksum(t *testing.T) {
description1, description2 := "a", "b"
yes, no := true, false
scenarios := []struct {
name string
alert Alert
expected string
}{
{
name: "barebone",
alert: Alert{
Type: TypeDiscord,
},
expected: "fed0580e44ed5701dbba73afa1f14b2c53ca5a7b8067a860441c212916057fe3",
},
{
name: "with-description-1",
alert: Alert{
Type: TypeDiscord,
Description: &description1,
},
expected: "005f407ebe506e74a4aeb46f74c28b376debead7011e1b085da3840f72ba9707",
},
{
name: "with-description-2",
alert: Alert{
Type: TypeDiscord,
Description: &description2,
},
expected: "3c2c4a9570cdc614006993c21f79a860a7f5afea10cf70d1a79d3c49342ef2c8",
},
{
name: "with-description-2-and-enabled-false",
alert: Alert{
Type: TypeDiscord,
Enabled: &no,
Description: &description2,
},
expected: "837945c2b4cd5e961db3e63e10c348d4f1c3446ba68cf5a48e35a1ae22cf0c22",
},
{
name: "with-description-2-and-enabled-true",
alert: Alert{
Type: TypeDiscord,
Enabled: &yes, // it defaults to true if not set, but just to make sure
Description: &description2,
},
expected: "3c2c4a9570cdc614006993c21f79a860a7f5afea10cf70d1a79d3c49342ef2c8",
},
{
name: "with-description-2-and-enabled-true-and-send-on-resolved-true",
alert: Alert{
Type: TypeDiscord,
Enabled: &yes,
SendOnResolved: &yes,
Description: &description2,
},
expected: "bf1436995a880eb4a352c74c5dfee1f1b5ff6b9fc55aef9bf411b3631adfd80c",
},
{
name: "with-description-2-and-failure-threshold-7",
alert: Alert{
Type: TypeSlack,
FailureThreshold: 7,
Description: &description2,
},
expected: "8bd479e18bda393d4c924f5a0d962e825002168dedaa88b445e435db7bacffd3",
},
{
name: "with-description-2-and-failure-threshold-9",
alert: Alert{
Type: TypeSlack,
FailureThreshold: 9,
Description: &description2,
},
expected: "5abdfce5236e344996d264d526e769c07cb0d3d329a999769a1ff84b157ca6f1",
},
{
name: "with-description-2-and-success-threshold-5",
alert: Alert{
Type: TypeSlack,
SuccessThreshold: 7,
Description: &description2,
},
expected: "c0000e73626b80e212cfc24830de7094568f648e37f3e16f9e68c7f8ef75c34c",
},
{
name: "with-description-2-and-success-threshold-1",
alert: Alert{
Type: TypeSlack,
SuccessThreshold: 1,
Description: &description2,
},
expected: "5c28963b3a76104cfa4a0d79c89dd29ec596c8cfa4b1af210ec83d6d41587b5f",
},
}
for _, scenario := range scenarios {
t.Run(scenario.name, func(t *testing.T) {
scenario.alert.ValidateAndSetDefaults()
if checksum := scenario.alert.Checksum(); checksum != scenario.expected {
t.Errorf("expected checksum %v, got %v", scenario.expected, checksum)
}
})
}
}

View File

@ -64,6 +64,12 @@ func TestCreateExternalEndpointResult(t *testing.T) {
AuthorizationHeaderBearerToken: "Bearer token",
ExpectedCode: 404,
},
{
Name: "bad-success-value",
Path: "/api/v1/endpoints/g_n/external?success=invalid",
AuthorizationHeaderBearerToken: "Bearer token",
ExpectedCode: 400,
},
{
Name: "good-token-success-true",
Path: "/api/v1/endpoints/g_n/external?success=true",

View File

@ -245,16 +245,13 @@ func parseAndValidateConfigBytes(yamlBytes []byte) (config *Config, err error) {
if config == nil || config.Endpoints == nil || len(config.Endpoints) == 0 {
err = ErrNoEndpointInConfig
} else {
validateAlertingConfig(config.Alerting, config.Endpoints, config.Debug)
validateAlertingConfig(config.Alerting, config.Endpoints, config.ExternalEndpoints, config.Debug)
if err := validateSecurityConfig(config); err != nil {
return nil, err
}
if err := validateEndpointsConfig(config); err != nil {
return nil, err
}
if err := validateExternalEndpointsConfig(config); err != nil {
return nil, err
}
if err := validateWebConfig(config); err != nil {
return nil, err
}
@ -338,28 +335,37 @@ func validateWebConfig(config *Config) error {
}
func validateEndpointsConfig(config *Config) error {
duplicateValidationMap := make(map[string]bool)
// Validate endpoints
for _, ep := range config.Endpoints {
if config.Debug {
log.Printf("[config.validateEndpointsConfig] Validating endpoint '%s'", ep.Name)
}
if endpointKey := ep.Key(); duplicateValidationMap[endpointKey] {
return fmt.Errorf("invalid endpoint %s: name and group combination must be unique", ep.Key())
} else {
duplicateValidationMap[endpointKey] = true
}
if err := ep.ValidateAndSetDefaults(); err != nil {
return fmt.Errorf("invalid endpoint %s: %w", ep.DisplayName(), err)
return fmt.Errorf("invalid endpoint %s: %w", ep.Key(), err)
}
}
log.Printf("[config.validateEndpointsConfig] Validated %d endpoints", len(config.Endpoints))
return nil
}
func validateExternalEndpointsConfig(config *Config) error {
// Validate external endpoints
for _, ee := range config.ExternalEndpoints {
if config.Debug {
log.Printf("[config.validateExternalEndpointsConfig] Validating external endpoint '%s'", ee.Name)
log.Printf("[config.validateEndpointsConfig] Validating external endpoint '%s'", ee.Name)
}
if endpointKey := ee.Key(); duplicateValidationMap[endpointKey] {
return fmt.Errorf("invalid external endpoint %s: name and group combination must be unique", ee.Key())
} else {
duplicateValidationMap[endpointKey] = true
}
if err := ee.ValidateAndSetDefaults(); err != nil {
return fmt.Errorf("invalid external endpoint %s: %w", ee.DisplayName(), err)
return fmt.Errorf("invalid external endpoint %s: %w", ee.Key(), err)
}
}
log.Printf("[config.validateExternalEndpointsConfig] Validated %d external endpoints", len(config.ExternalEndpoints))
log.Printf("[config.validateEndpointsConfig] Validated %d external endpoints", len(config.ExternalEndpoints))
return nil
}
@ -382,7 +388,7 @@ func validateSecurityConfig(config *Config) error {
// Note that the alerting configuration has to be validated before the endpoint configuration, because the default alert
// returned by provider.AlertProvider.GetDefaultAlert() must be parsed before endpoint.Endpoint.ValidateAndSetDefaults()
// sets the default alert values when none are set.
func validateAlertingConfig(alertingConfig *alerting.Config, endpoints []*endpoint.Endpoint, debug bool) {
func validateAlertingConfig(alertingConfig *alerting.Config, endpoints []*endpoint.Endpoint, externalEndpoints []*endpoint.ExternalEndpoint, debug bool) {
if alertingConfig == nil {
log.Printf("[config.validateAlertingConfig] Alerting is not configured")
return
@ -391,12 +397,12 @@ func validateAlertingConfig(alertingConfig *alerting.Config, endpoints []*endpoi
alert.TypeAWSSES,
alert.TypeCustom,
alert.TypeDiscord,
alert.TypeEmail,
alert.TypeGitHub,
alert.TypeGitLab,
alert.TypeGoogleChat,
alert.TypeGotify,
alert.TypeJetBrainsSpace,
alert.TypeEmail,
alert.TypeMatrix,
alert.TypeMattermost,
alert.TypeMessagebird,
@ -420,7 +426,17 @@ func validateAlertingConfig(alertingConfig *alerting.Config, endpoints []*endpoi
for alertIndex, endpointAlert := range ep.Alerts {
if alertType == endpointAlert.Type {
if debug {
log.Printf("[config.validateAlertingConfig] Parsing alert %d with provider's default alert for provider=%s in endpoint=%s", alertIndex, alertType, ep.Name)
log.Printf("[config.validateAlertingConfig] Parsing alert %d with default alert for provider=%s in endpoint with key=%s", alertIndex, alertType, ep.Key())
}
provider.ParseWithDefaultAlert(alertProvider.GetDefaultAlert(), endpointAlert)
}
}
}
for _, ee := range externalEndpoints {
for alertIndex, endpointAlert := range ee.Alerts {
if alertType == endpointAlert.Type {
if debug {
log.Printf("[config.validateAlertingConfig] Parsing alert %d with default alert for provider=%s in endpoint with key=%s", alertIndex, alertType, ee.Key())
}
provider.ParseWithDefaultAlert(alertProvider.GetDefaultAlert(), endpointAlert)
}

View File

@ -16,6 +16,7 @@ import (
"github.com/TwiN/gatus/v5/alerting/provider/email"
"github.com/TwiN/gatus/v5/alerting/provider/github"
"github.com/TwiN/gatus/v5/alerting/provider/googlechat"
"github.com/TwiN/gatus/v5/alerting/provider/gotify"
"github.com/TwiN/gatus/v5/alerting/provider/jetbrainsspace"
"github.com/TwiN/gatus/v5/alerting/provider/matrix"
"github.com/TwiN/gatus/v5/alerting/provider/mattermost"
@ -36,6 +37,7 @@ import (
)
func TestLoadConfiguration(t *testing.T) {
yes := true
dir := t.TempDir()
scenarios := []struct {
name string
@ -165,6 +167,8 @@ metrics: true
alerting:
slack:
webhook-url: https://hooks.slack.com/services/xxx/yyy/zzz
default-alert:
enabled: true
endpoints:
- name: example
@ -179,6 +183,12 @@ alerting:
discord:
webhook-url: https://discord.com/api/webhooks/xxx/yyy
external-endpoints:
- name: ext-ep-test
token: "potato"
alerts:
- type: slack
endpoints:
- name: frontend
url: https://example.com
@ -190,7 +200,20 @@ endpoints:
Metrics: true,
Alerting: &alerting.Config{
Discord: &discord.AlertProvider{WebhookURL: "https://discord.com/api/webhooks/xxx/yyy"},
Slack: &slack.AlertProvider{WebhookURL: "https://hooks.slack.com/services/xxx/yyy/zzz"},
Slack: &slack.AlertProvider{WebhookURL: "https://hooks.slack.com/services/xxx/yyy/zzz", DefaultAlert: &alert.Alert{Enabled: &yes}},
},
ExternalEndpoints: []*endpoint.ExternalEndpoint{
{
Name: "ext-ep-test",
Token: "potato",
Alerts: []*alert.Alert{
{
Type: alert.TypeSlack,
FailureThreshold: 3,
SuccessThreshold: 2,
},
},
},
},
Endpoints: []*endpoint.Endpoint{
{
@ -325,10 +348,6 @@ external-endpoints:
- name: ext-ep-test
group: core
token: "potato"
alerts:
- type: discord
description: "healthcheck failed"
send-on-resolved: true
endpoints:
- name: website
@ -382,18 +401,7 @@ endpoints:
if config.ExternalEndpoints[0].Token != "potato" {
t.Errorf("Token should have been %s", "potato")
}
if len(config.ExternalEndpoints[0].Alerts) != 1 {
t.Error("Should have returned one alert")
}
if config.ExternalEndpoints[0].Alerts[0].Type != alert.TypeDiscord {
t.Errorf("Type should have been %s", alert.TypeDiscord)
}
if config.ExternalEndpoints[0].Alerts[0].FailureThreshold != 3 {
t.Errorf("FailureThreshold should have been %d, got %d", 3, config.ExternalEndpoints[0].Alerts[0].FailureThreshold)
}
if config.ExternalEndpoints[0].Alerts[0].SuccessThreshold != 2 {
t.Errorf("SuccessThreshold should have been %d, got %d", 2, config.ExternalEndpoints[0].Alerts[0].SuccessThreshold)
}
if len(config.Endpoints) != 3 {
t.Error("Should have returned two endpoints")
}
@ -439,7 +447,6 @@ endpoints:
if len(config.Endpoints[1].Conditions) != 2 {
t.Errorf("There should have been %d conditions", 2)
}
if config.Endpoints[2].URL != "https://example.com/" {
t.Errorf("URL should have been %s", "https://example.com/")
}
@ -977,6 +984,22 @@ alerting:
enabled: true
failure-threshold: 5
success-threshold: 3
email:
from: "from@example.com"
username: "from@example.com"
password: "hunter2"
host: "mail.example.com"
port: 587
to: "recipient1@example.com,recipient2@example.com"
client:
insecure: false
default-alert:
enabled: true
gotify:
server-url: "https://gotify.example"
token: "**************"
default-alert:
enabled: true
endpoints:
- name: website
@ -993,6 +1016,8 @@ endpoints:
- type: teams
- type: pushover
- type: jetbrainsspace
- type: email
- type: gotify
conditions:
- "[STATUS] == 200"
`))
@ -1107,6 +1132,7 @@ endpoints:
if config.Alerting.JetBrainsSpace == nil || !config.Alerting.JetBrainsSpace.IsValid() {
t.Fatal("JetBrainsSpace alerting config should've been valid")
}
if config.Alerting.JetBrainsSpace.GetDefaultAlert() == nil {
t.Fatal("JetBrainsSpace.GetDefaultAlert() shouldn't have returned nil")
}
@ -1120,6 +1146,50 @@ endpoints:
t.Errorf("JetBrainsSpace webhook should've been %s, but was %s", "baz", config.Alerting.JetBrainsSpace.Token)
}
if config.Alerting.Email == nil || !config.Alerting.Email.IsValid() {
t.Fatal("Email alerting config should've been valid")
}
if config.Alerting.Email.GetDefaultAlert() == nil {
t.Fatal("Email.GetDefaultAlert() shouldn't have returned nil")
}
if config.Alerting.Email.From != "from@example.com" {
t.Errorf("Email from should've been %s, but was %s", "from@example.com", config.Alerting.Email.From)
}
if config.Alerting.Email.Username != "from@example.com" {
t.Errorf("Email username should've been %s, but was %s", "from@example.com", config.Alerting.Email.Username)
}
if config.Alerting.Email.Password != "hunter2" {
t.Errorf("Email password should've been %s, but was %s", "hunter2", config.Alerting.Email.Password)
}
if config.Alerting.Email.Host != "mail.example.com" {
t.Errorf("Email host should've been %s, but was %s", "mail.example.com", config.Alerting.Email.Host)
}
if config.Alerting.Email.Port != 587 {
t.Errorf("Email port should've been %d, but was %d", 587, config.Alerting.Email.Port)
}
if config.Alerting.Email.To != "recipient1@example.com,recipient2@example.com" {
t.Errorf("Email to should've been %s, but was %s", "recipient1@example.com,recipient2@example.com", config.Alerting.Email.To)
}
if config.Alerting.Email.ClientConfig == nil {
t.Fatal("Email client config should've been set")
}
if config.Alerting.Email.ClientConfig.Insecure {
t.Error("Email client config should've been secure")
}
if config.Alerting.Gotify == nil || !config.Alerting.Gotify.IsValid() {
t.Fatal("Gotify alerting config should've been valid")
}
if config.Alerting.Gotify.GetDefaultAlert() == nil {
t.Fatal("Gotify.GetDefaultAlert() shouldn't have returned nil")
}
if config.Alerting.Gotify.ServerURL != "https://gotify.example" {
t.Errorf("Gotify server URL should've been %s, but was %s", "https://gotify.example", config.Alerting.Gotify.ServerURL)
}
if config.Alerting.Gotify.Token != "**************" {
t.Errorf("Gotify token should've been %s, but was %s", "**************", config.Alerting.Gotify.Token)
}
// Endpoints
if len(config.Endpoints) != 1 {
t.Error("There should've been 1 endpoint")
@ -1130,8 +1200,8 @@ endpoints:
if config.Endpoints[0].Interval != 60*time.Second {
t.Errorf("Interval should have been %s, because it is the default value", 60*time.Second)
}
if len(config.Endpoints[0].Alerts) != 10 {
t.Fatal("There should've been 10 alerts configured")
if len(config.Endpoints[0].Alerts) != 12 {
t.Fatalf("There should've been 12 alerts configured, got %d", len(config.Endpoints[0].Alerts))
}
if config.Endpoints[0].Alerts[0].Type != alert.TypeSlack {
@ -1255,10 +1325,36 @@ endpoints:
t.Error("The alert should've been enabled")
}
if config.Endpoints[0].Alerts[9].FailureThreshold != 5 {
t.Errorf("The default failure threshold of the alert should've been %d, but it was %d", 3, config.Endpoints[0].Alerts[9].FailureThreshold)
t.Errorf("The default failure threshold of the alert should've been %d, but it was %d", 5, config.Endpoints[0].Alerts[9].FailureThreshold)
}
if config.Endpoints[0].Alerts[9].SuccessThreshold != 3 {
t.Errorf("The default success threshold of the alert should've been %d, but it was %d", 2, config.Endpoints[0].Alerts[9].SuccessThreshold)
t.Errorf("The default success threshold of the alert should've been %d, but it was %d", 3, config.Endpoints[0].Alerts[9].SuccessThreshold)
}
if config.Endpoints[0].Alerts[10].Type != alert.TypeEmail {
t.Errorf("The type of the alert should've been %s, but it was %s", alert.TypeEmail, config.Endpoints[0].Alerts[10].Type)
}
if !config.Endpoints[0].Alerts[10].IsEnabled() {
t.Error("The alert should've been enabled")
}
if config.Endpoints[0].Alerts[10].FailureThreshold != 3 {
t.Errorf("The default failure threshold of the alert should've been %d, but it was %d", 3, config.Endpoints[0].Alerts[10].FailureThreshold)
}
if config.Endpoints[0].Alerts[10].SuccessThreshold != 2 {
t.Errorf("The default success threshold of the alert should've been %d, but it was %d", 2, config.Endpoints[0].Alerts[10].SuccessThreshold)
}
if config.Endpoints[0].Alerts[11].Type != alert.TypeGotify {
t.Errorf("The type of the alert should've been %s, but it was %s", alert.TypeGotify, config.Endpoints[0].Alerts[11].Type)
}
if !config.Endpoints[0].Alerts[11].IsEnabled() {
t.Error("The alert should've been enabled")
}
if config.Endpoints[0].Alerts[11].FailureThreshold != 3 {
t.Errorf("The default failure threshold of the alert should've been %d, but it was %d", 3, config.Endpoints[0].Alerts[11].FailureThreshold)
}
if config.Endpoints[0].Alerts[11].SuccessThreshold != 2 {
t.Errorf("The default success threshold of the alert should've been %d, but it was %d", 2, config.Endpoints[0].Alerts[11].SuccessThreshold)
}
}
@ -1532,6 +1628,99 @@ endpoints:
}
}
func TestParseAndValidateConfigBytesWithDuplicateEndpointName(t *testing.T) {
scenarios := []struct {
name string
shouldError bool
config string
}{
{
name: "same-name-no-group",
shouldError: true,
config: `
endpoints:
- name: ep1
url: https://twin.sh/health
conditions:
- "[STATUS] == 200"
- name: ep1
url: https://twin.sh/health
conditions:
- "[STATUS] == 200"`,
},
{
name: "same-name-different-group",
shouldError: false,
config: `
endpoints:
- name: ep1
url: https://twin.sh/health
conditions:
- "[STATUS] == 200"
- name: ep1
group: g1
url: https://twin.sh/health
conditions:
- "[STATUS] == 200"`,
},
{
name: "same-name-same-group",
shouldError: true,
config: `
endpoints:
- name: ep1
group: g1
url: https://twin.sh/health
conditions:
- "[STATUS] == 200"
- name: ep1
group: g1
url: https://twin.sh/health
conditions:
- "[STATUS] == 200"`,
},
{
name: "same-name-different-endpoint-type",
shouldError: true,
config: `
external-endpoints:
- name: ep1
token: "12345678"
endpoints:
- name: ep1
url: https://twin.sh/health
conditions:
- "[STATUS] == 200"`,
},
{
name: "same-name-different-group-different-endpoint-type",
shouldError: false,
config: `
external-endpoints:
- name: ep1
group: gr1
token: "12345678"
endpoints:
- name: ep1
url: https://twin.sh/health
conditions:
- "[STATUS] == 200"`,
},
}
for _, scenario := range scenarios {
t.Run(scenario.name, func(t *testing.T) {
_, err := parseAndValidateConfigBytes([]byte(scenario.config))
if scenario.shouldError && err == nil {
t.Error("should've returned an error")
} else if !scenario.shouldError && err != nil {
t.Error("shouldn't have returned an error")
}
})
}
}
func TestParseAndValidateConfigBytesWithInvalidStorageConfig(t *testing.T) {
_, err := parseAndValidateConfigBytes([]byte(`
storage:
@ -1645,7 +1834,7 @@ endpoints:
func TestParseAndValidateConfigBytesWithNoEndpoints(t *testing.T) {
_, err := parseAndValidateConfigBytes([]byte(``))
if err != ErrNoEndpointInConfig {
if !errors.Is(err, ErrNoEndpointInConfig) {
t.Error("The error returned should have been of type ErrNoEndpointInConfig")
}
}
@ -1657,6 +1846,7 @@ func TestGetAlertingProviderByAlertType(t *testing.T) {
Email: &email.AlertProvider{},
GitHub: &github.AlertProvider{},
GoogleChat: &googlechat.AlertProvider{},
Gotify: &gotify.AlertProvider{},
JetBrainsSpace: &jetbrainsspace.AlertProvider{},
Matrix: &matrix.AlertProvider{},
Mattermost: &mattermost.AlertProvider{},
@ -1679,6 +1869,7 @@ func TestGetAlertingProviderByAlertType(t *testing.T) {
{alertType: alert.TypeEmail, expected: alertingConfig.Email},
{alertType: alert.TypeGitHub, expected: alertingConfig.GitHub},
{alertType: alert.TypeGoogleChat, expected: alertingConfig.GoogleChat},
{alertType: alert.TypeGotify, expected: alertingConfig.Gotify},
{alertType: alert.TypeJetBrainsSpace, expected: alertingConfig.JetBrainsSpace},
{alertType: alert.TypeMatrix, expected: alertingConfig.Matrix},
{alertType: alert.TypeMattermost, expected: alertingConfig.Mattermost},

58
main.go
View File

@ -83,13 +83,67 @@ func initializeStorage(cfg *config.Config) {
for _, ep := range cfg.Endpoints {
keys = append(keys, ep.Key())
}
for _, externalEndpoint := range cfg.ExternalEndpoints {
keys = append(keys, externalEndpoint.Key())
for _, ee := range cfg.ExternalEndpoints {
keys = append(keys, ee.Key())
}
numberOfEndpointStatusesDeleted := store.Get().DeleteAllEndpointStatusesNotInKeys(keys)
if numberOfEndpointStatusesDeleted > 0 {
log.Printf("[main.initializeStorage] Deleted %d endpoint statuses because their matching endpoints no longer existed", numberOfEndpointStatusesDeleted)
}
// Clean up the triggered alerts from the storage provider and load valid triggered endpoint alerts
numberOfPersistedTriggeredAlertsLoaded := 0
for _, ep := range cfg.Endpoints {
var checksums []string
for _, alert := range ep.Alerts {
if alert.IsEnabled() {
checksums = append(checksums, alert.Checksum())
}
}
numberOfTriggeredAlertsDeleted := store.Get().DeleteAllTriggeredAlertsNotInChecksumsByEndpoint(ep, checksums)
if cfg.Debug && numberOfTriggeredAlertsDeleted > 0 {
log.Printf("[main.initializeStorage] Deleted %d triggered alerts for endpoint with key=%s because their configurations have been changed or deleted", numberOfTriggeredAlertsDeleted, ep.Key())
}
for _, alert := range ep.Alerts {
exists, resolveKey, numberOfSuccessesInARow, err := store.Get().GetTriggeredEndpointAlert(ep, alert)
if err != nil {
log.Printf("[main.initializeStorage] Failed to get triggered alert for endpoint with key=%s: %s", ep.Key(), err.Error())
continue
}
if exists {
alert.Triggered, alert.ResolveKey = true, resolveKey
ep.NumberOfSuccessesInARow, ep.NumberOfFailuresInARow = numberOfSuccessesInARow, alert.FailureThreshold
numberOfPersistedTriggeredAlertsLoaded++
}
}
}
for _, ee := range cfg.ExternalEndpoints {
var checksums []string
for _, alert := range ee.Alerts {
if alert.IsEnabled() {
checksums = append(checksums, alert.Checksum())
}
}
convertedEndpoint := ee.ToEndpoint()
numberOfTriggeredAlertsDeleted := store.Get().DeleteAllTriggeredAlertsNotInChecksumsByEndpoint(convertedEndpoint, checksums)
if cfg.Debug && numberOfTriggeredAlertsDeleted > 0 {
log.Printf("[main.initializeStorage] Deleted %d triggered alerts for endpoint with key=%s because their configurations have been changed or deleted", numberOfTriggeredAlertsDeleted, ee.Key())
}
for _, alert := range ee.Alerts {
exists, resolveKey, numberOfSuccessesInARow, err := store.Get().GetTriggeredEndpointAlert(convertedEndpoint, alert)
if err != nil {
log.Printf("[main.initializeStorage] Failed to get triggered alert for endpoint with key=%s: %s", ee.Key(), err.Error())
continue
}
if exists {
alert.Triggered, alert.ResolveKey = true, resolveKey
ee.NumberOfSuccessesInARow, ee.NumberOfFailuresInARow = numberOfSuccessesInARow, alert.FailureThreshold
numberOfPersistedTriggeredAlertsLoaded++
}
}
}
if numberOfPersistedTriggeredAlertsLoaded > 0 {
log.Printf("[main.initializeStorage] Loaded %d persisted triggered alerts", numberOfPersistedTriggeredAlertsLoaded)
}
}
func listenToConfigurationFileChanges(cfg *config.Config) {

View File

@ -5,6 +5,7 @@ import (
"sync"
"time"
"github.com/TwiN/gatus/v5/alerting/alert"
"github.com/TwiN/gatus/v5/config/endpoint"
"github.com/TwiN/gatus/v5/storage/store/common"
"github.com/TwiN/gatus/v5/storage/store/common/paging"
@ -174,6 +175,37 @@ func (s *Store) DeleteAllEndpointStatusesNotInKeys(keys []string) int {
return s.cache.DeleteAll(keysToDelete)
}
// GetTriggeredEndpointAlert returns whether the triggered alert for the specified endpoint as well as the necessary information to resolve it
//
// Always returns that the alert does not exist for the in-memory store since it does not support persistence across restarts
func (s *Store) GetTriggeredEndpointAlert(ep *endpoint.Endpoint, alert *alert.Alert) (exists bool, resolveKey string, numberOfSuccessesInARow int, err error) {
return false, "", 0, nil
}
// UpsertTriggeredEndpointAlert inserts/updates a triggered alert for an endpoint
// Used for persistence of triggered alerts across application restarts
//
// Does nothing for the in-memory store since it does not support persistence across restarts
func (s *Store) UpsertTriggeredEndpointAlert(ep *endpoint.Endpoint, triggeredAlert *alert.Alert) error {
return nil
}
// DeleteTriggeredEndpointAlert deletes a triggered alert for an endpoint
//
// Does nothing for the in-memory store since it does not support persistence across restarts
func (s *Store) DeleteTriggeredEndpointAlert(ep *endpoint.Endpoint, triggeredAlert *alert.Alert) error {
return nil
}
// DeleteAllTriggeredAlertsNotInChecksumsByEndpoint removes all triggered alerts owned by an endpoint whose alert
// configurations are not provided in the checksums list.
// This prevents triggered alerts that have been removed or modified from lingering in the database.
//
// Does nothing for the in-memory store since it does not support persistence across restarts
func (s *Store) DeleteAllTriggeredAlertsNotInChecksumsByEndpoint(ep *endpoint.Endpoint, checksums []string) int {
return 0
}
// Clear deletes everything from the store
func (s *Store) Clear() {
s.cache.Clear()

View File

@ -16,7 +16,7 @@ func (s *Store) createPostgresSchema() error {
_, err = s.db.Exec(`
CREATE TABLE IF NOT EXISTS endpoint_events (
endpoint_event_id BIGSERIAL PRIMARY KEY,
endpoint_id INTEGER NOT NULL REFERENCES endpoints(endpoint_id) ON DELETE CASCADE,
endpoint_id BIGINT NOT NULL REFERENCES endpoints(endpoint_id) ON DELETE CASCADE,
event_type TEXT NOT NULL,
event_timestamp TIMESTAMP NOT NULL
)
@ -66,7 +66,20 @@ func (s *Store) createPostgresSchema() error {
UNIQUE(endpoint_id, hour_unix_timestamp)
)
`)
// Silent table modifications
if err != nil {
return err
}
_, err = s.db.Exec(`
CREATE TABLE IF NOT EXISTS endpoint_alerts_triggered (
endpoint_alert_trigger_id BIGSERIAL PRIMARY KEY,
endpoint_id BIGINT NOT NULL REFERENCES endpoints(endpoint_id) ON DELETE CASCADE,
configuration_checksum TEXT NOT NULL,
resolve_key TEXT NOT NULL,
number_of_successes_in_a_row INTEGER NOT NULL,
UNIQUE(endpoint_id, configuration_checksum)
)
`)
// Silent table modifications TODO: Remove this in v6.0.0
_, _ = s.db.Exec(`ALTER TABLE endpoint_results ADD IF NOT EXISTS domain_expiration BIGINT NOT NULL DEFAULT 0`)
return err
}

View File

@ -66,7 +66,20 @@ func (s *Store) createSQLiteSchema() error {
UNIQUE(endpoint_id, hour_unix_timestamp)
)
`)
// Silent table modifications TODO: Remove this
if err != nil {
return err
}
_, err = s.db.Exec(`
CREATE TABLE IF NOT EXISTS endpoint_alerts_triggered (
endpoint_alert_trigger_id INTEGER PRIMARY KEY,
endpoint_id INTEGER NOT NULL REFERENCES endpoints(endpoint_id) ON DELETE CASCADE,
configuration_checksum TEXT NOT NULL,
resolve_key TEXT NOT NULL,
number_of_successes_in_a_row INTEGER NOT NULL,
UNIQUE(endpoint_id, configuration_checksum)
)
`)
// Silent table modifications TODO: Remove this in v6.0.0
_, _ = s.db.Exec(`ALTER TABLE endpoint_results ADD domain_expiration INTEGER NOT NULL DEFAULT 0`)
return err
}

View File

@ -9,6 +9,7 @@ import (
"strings"
"time"
"github.com/TwiN/gatus/v5/alerting/alert"
"github.com/TwiN/gatus/v5/config/endpoint"
"github.com/TwiN/gatus/v5/storage/store/common"
"github.com/TwiN/gatus/v5/storage/store/common/paging"
@ -27,9 +28,9 @@ const (
// for aesthetic purposes, I deemed it wasn't worth the performance impact of yet another one-to-many table.
arraySeparator = "|~|"
uptimeCleanUpThreshold = 10 * 24 * time.Hour // Maximum uptime age before triggering a clean up
eventsCleanUpThreshold = common.MaximumNumberOfEvents + 10 // Maximum number of events before triggering a clean up
resultsCleanUpThreshold = common.MaximumNumberOfResults + 10 // Maximum number of results before triggering a clean up
uptimeCleanUpThreshold = 10 * 24 * time.Hour // Maximum uptime age before triggering a cleanup
eventsCleanUpThreshold = common.MaximumNumberOfEvents + 10 // Maximum number of events before triggering a cleanup
resultsCleanUpThreshold = common.MaximumNumberOfResults + 10 // Maximum number of results before triggering a cleanup
uptimeRetention = 7 * 24 * time.Hour
@ -234,12 +235,12 @@ func (s *Store) Insert(ep *endpoint.Endpoint, result *endpoint.Result) error {
// Endpoint doesn't exist in the database, insert it
if endpointID, err = s.insertEndpoint(tx, ep); err != nil {
_ = tx.Rollback()
log.Printf("[sql.Insert] Failed to create endpoint with group=%s; endpoint=%s: %s", ep.Group, ep.Name, err.Error())
log.Printf("[sql.Insert] Failed to create endpoint with key=%s: %s", ep.Key(), err.Error())
return err
}
} else {
_ = tx.Rollback()
log.Printf("[sql.Insert] Failed to retrieve id of endpoint with group=%s; endpoint=%s: %s", ep.Group, ep.Name, err.Error())
log.Printf("[sql.Insert] Failed to retrieve id of endpoint with key=%s: %s", ep.Key(), err.Error())
return err
}
}
@ -255,7 +256,7 @@ func (s *Store) Insert(ep *endpoint.Endpoint, result *endpoint.Result) error {
numberOfEvents, err := s.getNumberOfEventsByEndpointID(tx, endpointID)
if err != nil {
// Silently fail
log.Printf("[sql.Insert] Failed to retrieve total number of events for group=%s; endpoint=%s: %s", ep.Group, ep.Name, err.Error())
log.Printf("[sql.Insert] Failed to retrieve total number of events for endpoint with key=%s: %s", ep.Key(), err.Error())
}
if numberOfEvents == 0 {
// There's no events yet, which means we need to add the EventStart and the first healthy/unhealthy event
@ -265,18 +266,18 @@ func (s *Store) Insert(ep *endpoint.Endpoint, result *endpoint.Result) error {
})
if err != nil {
// Silently fail
log.Printf("[sql.Insert] Failed to insert event=%s for group=%s; endpoint=%s: %s", endpoint.EventStart, ep.Group, ep.Name, err.Error())
log.Printf("[sql.Insert] Failed to insert event=%s for endpoint with key=%s: %s", endpoint.EventStart, ep.Key(), err.Error())
}
event := endpoint.NewEventFromResult(result)
if err = s.insertEndpointEvent(tx, endpointID, event); err != nil {
// Silently fail
log.Printf("[sql.Insert] Failed to insert event=%s for group=%s; endpoint=%s: %s", event.Type, ep.Group, ep.Name, err.Error())
log.Printf("[sql.Insert] Failed to insert event=%s for endpoint with key=%s: %s", event.Type, ep.Key(), err.Error())
}
} else {
// Get the success value of the previous result
var lastResultSuccess bool
if lastResultSuccess, err = s.getLastEndpointResultSuccessValue(tx, endpointID); err != nil {
log.Printf("[sql.Insert] Failed to retrieve outcome of previous result for group=%s; endpoint=%s: %s", ep.Group, ep.Name, err.Error())
log.Printf("[sql.Insert] Failed to retrieve outcome of previous result for endpoint with key=%s: %s", ep.Key(), err.Error())
} else {
// If we managed to retrieve the outcome of the previous result, we'll compare it with the new result.
// If the final outcome (success or failure) of the previous and the new result aren't the same, it means
@ -286,7 +287,7 @@ func (s *Store) Insert(ep *endpoint.Endpoint, result *endpoint.Result) error {
event := endpoint.NewEventFromResult(result)
if err = s.insertEndpointEvent(tx, endpointID, event); err != nil {
// Silently fail
log.Printf("[sql.Insert] Failed to insert event=%s for group=%s; endpoint=%s: %s", event.Type, ep.Group, ep.Name, err.Error())
log.Printf("[sql.Insert] Failed to insert event=%s for endpoint with key=%s: %s", event.Type, ep.Key(), err.Error())
}
}
}
@ -295,40 +296,40 @@ func (s *Store) Insert(ep *endpoint.Endpoint, result *endpoint.Result) error {
// (since we're only deleting MaximumNumberOfEvents at a time instead of 1)
if numberOfEvents > eventsCleanUpThreshold {
if err = s.deleteOldEndpointEvents(tx, endpointID); err != nil {
log.Printf("[sql.Insert] Failed to delete old events for group=%s; endpoint=%s: %s", ep.Group, ep.Name, err.Error())
log.Printf("[sql.Insert] Failed to delete old events for endpoint with key=%s: %s", ep.Key(), err.Error())
}
}
}
// Second, we need to insert the result.
if err = s.insertEndpointResult(tx, endpointID, result); err != nil {
log.Printf("[sql.Insert] Failed to insert result for group=%s; endpoint=%s: %s", ep.Group, ep.Name, err.Error())
log.Printf("[sql.Insert] Failed to insert result for endpoint with key=%s: %s", ep.Key(), err.Error())
_ = tx.Rollback() // If we can't insert the result, we'll rollback now since there's no point continuing
return err
}
// Clean up old results
numberOfResults, err := s.getNumberOfResultsByEndpointID(tx, endpointID)
if err != nil {
log.Printf("[sql.Insert] Failed to retrieve total number of results for group=%s; endpoint=%s: %s", ep.Group, ep.Name, err.Error())
log.Printf("[sql.Insert] Failed to retrieve total number of results for endpoint with key=%s: %s", ep.Key(), err.Error())
} else {
if numberOfResults > resultsCleanUpThreshold {
if err = s.deleteOldEndpointResults(tx, endpointID); err != nil {
log.Printf("[sql.Insert] Failed to delete old results for group=%s; endpoint=%s: %s", ep.Group, ep.Name, err.Error())
log.Printf("[sql.Insert] Failed to delete old results for endpoint with key=%s: %s", ep.Key(), err.Error())
}
}
}
// Finally, we need to insert the uptime data.
// Because the uptime data significantly outlives the results, we can't rely on the results for determining the uptime
if err = s.updateEndpointUptime(tx, endpointID, result); err != nil {
log.Printf("[sql.Insert] Failed to update uptime for group=%s; endpoint=%s: %s", ep.Group, ep.Name, err.Error())
log.Printf("[sql.Insert] Failed to update uptime for endpoint with key=%s: %s", ep.Key(), err.Error())
}
// Clean up old uptime entries
ageOfOldestUptimeEntry, err := s.getAgeOfOldestEndpointUptimeEntry(tx, endpointID)
if err != nil {
log.Printf("[sql.Insert] Failed to retrieve oldest endpoint uptime entry for group=%s; endpoint=%s: %s", ep.Group, ep.Name, err.Error())
log.Printf("[sql.Insert] Failed to retrieve oldest endpoint uptime entry for endpoint with key=%s: %s", ep.Key(), err.Error())
} else {
if ageOfOldestUptimeEntry > uptimeCleanUpThreshold {
if err = s.deleteOldUptimeEntries(tx, endpointID, time.Now().Add(-(uptimeRetention + time.Hour))); err != nil {
log.Printf("[sql.Insert] Failed to delete old uptime entries for group=%s; endpoint=%s: %s", ep.Group, ep.Name, err.Error())
log.Printf("[sql.Insert] Failed to delete old uptime entries for endpoint with key=%s: %s", ep.Key(), err.Error())
}
}
}
@ -374,6 +375,8 @@ func (s *Store) DeleteAllEndpointStatusesNotInKeys(keys []string) int {
}
if s.writeThroughCache != nil {
// It's easier to just wipe out the entire cache than to try to find all keys that are not in the keys list
// This only happens on start and during tests, so it's fine for us to just clear the cache without worrying
// about performance
_ = s.writeThroughCache.DeleteKeysByPattern("*")
}
// Return number of rows deleted
@ -381,6 +384,111 @@ func (s *Store) DeleteAllEndpointStatusesNotInKeys(keys []string) int {
return int(rowsAffects)
}
// GetTriggeredEndpointAlert returns whether the triggered alert for the specified endpoint as well as the necessary information to resolve it
func (s *Store) GetTriggeredEndpointAlert(ep *endpoint.Endpoint, alert *alert.Alert) (exists bool, resolveKey string, numberOfSuccessesInARow int, err error) {
//log.Printf("[sql.GetTriggeredEndpointAlert] Getting triggered alert with checksum=%s for endpoint with key=%s", alert.Checksum(), ep.Key())
err = s.db.QueryRow(
"SELECT resolve_key, number_of_successes_in_a_row FROM endpoint_alerts_triggered WHERE endpoint_id = (SELECT endpoint_id FROM endpoints WHERE endpoint_key = $1 LIMIT 1) AND configuration_checksum = $2",
ep.Key(),
alert.Checksum(),
).Scan(&resolveKey, &numberOfSuccessesInARow)
if err != nil {
if errors.Is(err, sql.ErrNoRows) {
return false, "", 0, nil
}
return false, "", 0, err
}
return true, resolveKey, numberOfSuccessesInARow, nil
}
// UpsertTriggeredEndpointAlert inserts/updates a triggered alert for an endpoint
// Used for persistence of triggered alerts across application restarts
func (s *Store) UpsertTriggeredEndpointAlert(ep *endpoint.Endpoint, triggeredAlert *alert.Alert) error {
//log.Printf("[sql.UpsertTriggeredEndpointAlert] Upserting triggered alert with checksum=%s for endpoint with key=%s", triggeredAlert.Checksum(), ep.Key())
tx, err := s.db.Begin()
if err != nil {
return err
}
endpointID, err := s.getEndpointID(tx, ep)
if err != nil {
if errors.Is(err, common.ErrEndpointNotFound) {
// Endpoint doesn't exist in the database, insert it
// This shouldn't happen, but we'll handle it anyway
if endpointID, err = s.insertEndpoint(tx, ep); err != nil {
_ = tx.Rollback()
log.Printf("[sql.UpsertTriggeredEndpointAlert] Failed to create endpoint with key=%s: %s", ep.Key(), err.Error())
return err
}
} else {
_ = tx.Rollback()
log.Printf("[sql.UpsertTriggeredEndpointAlert] Failed to retrieve id of endpoint with key=%s: %s", ep.Key(), err.Error())
return err
}
}
_, err = tx.Exec(
`
INSERT INTO endpoint_alerts_triggered (endpoint_id, configuration_checksum, resolve_key, number_of_successes_in_a_row)
VALUES ($1, $2, $3, $4)
ON CONFLICT(endpoint_id, configuration_checksum) DO UPDATE SET
resolve_key = $3,
number_of_successes_in_a_row = $4
`,
endpointID,
triggeredAlert.Checksum(),
triggeredAlert.ResolveKey,
ep.NumberOfSuccessesInARow, // We only persist NumberOfSuccessesInARow, because all alerts in this table are already triggered
)
if err != nil {
_ = tx.Rollback()
log.Printf("[sql.UpsertTriggeredEndpointAlert] Failed to persist triggered alert for endpoint with key=%s: %s", ep.Key(), err.Error())
return err
}
if err = tx.Commit(); err != nil {
_ = tx.Rollback()
}
return nil
}
// DeleteTriggeredEndpointAlert deletes a triggered alert for an endpoint
func (s *Store) DeleteTriggeredEndpointAlert(ep *endpoint.Endpoint, triggeredAlert *alert.Alert) error {
//log.Printf("[sql.DeleteTriggeredEndpointAlert] Deleting triggered alert with checksum=%s for endpoint with key=%s", triggeredAlert.Checksum(), ep.Key())
_, err := s.db.Exec("DELETE FROM endpoint_alerts_triggered WHERE configuration_checksum = $1 AND endpoint_id = (SELECT endpoint_id FROM endpoints WHERE endpoint_key = $2 LIMIT 1)", triggeredAlert.Checksum(), ep.Key())
return err
}
// DeleteAllTriggeredAlertsNotInChecksumsByEndpoint removes all triggered alerts owned by an endpoint whose alert
// configurations are not provided in the checksums list.
// This prevents triggered alerts that have been removed or modified from lingering in the database.
func (s *Store) DeleteAllTriggeredAlertsNotInChecksumsByEndpoint(ep *endpoint.Endpoint, checksums []string) int {
//log.Printf("[sql.DeleteAllTriggeredAlertsNotInChecksumsByEndpoint] Deleting triggered alerts for endpoint with key=%s that do not belong to any of checksums=%v", ep.Key(), checksums)
var err error
var result sql.Result
if len(checksums) == 0 {
// No checksums? Then it means there are no (enabled) alerts configured for that endpoint, so we can get rid of all
// persisted triggered alerts for that endpoint
result, err = s.db.Exec("DELETE FROM endpoint_alerts_triggered WHERE endpoint_id = (SELECT endpoint_id FROM endpoints WHERE endpoint_key = $1 LIMIT 1)", ep.Key())
} else {
args := make([]interface{}, 0, len(checksums)+1)
args = append(args, ep.Key())
query := `DELETE FROM endpoint_alerts_triggered
WHERE endpoint_id = (SELECT endpoint_id FROM endpoints WHERE endpoint_key = $1 LIMIT 1)
AND configuration_checksum NOT IN (`
for i := range checksums {
query += fmt.Sprintf("$%d,", i+2)
args = append(args, checksums[i])
}
query = query[:len(query)-1] + ")" // Remove the last comma and add the closing parenthesis
result, err = s.db.Exec(query, args...)
}
if err != nil {
log.Printf("[sql.DeleteAllTriggeredAlertsNotInChecksumsByEndpoint] Failed to delete rows for endpoint with key=%s that do not belong to any of checksums=%v: %s", ep.Key(), checksums, err.Error())
return 0
}
// Return number of rows deleted
rowsAffects, _ := result.RowsAffected()
return int(rowsAffects)
}
// Clear deletes everything from the store
func (s *Store) Clear() {
_, _ = s.db.Exec("DELETE FROM endpoints")

View File

@ -1,9 +1,11 @@
package sql
import (
"errors"
"testing"
"time"
"github.com/TwiN/gatus/v5/alerting/alert"
"github.com/TwiN/gatus/v5/config/endpoint"
"github.com/TwiN/gatus/v5/storage/store/common"
"github.com/TwiN/gatus/v5/storage/store/common/paging"
@ -81,13 +83,13 @@ var (
)
func TestNewStore(t *testing.T) {
if _, err := NewStore("", "TestNewStore.db", false); err != ErrDatabaseDriverNotSpecified {
if _, err := NewStore("", t.TempDir()+"/TestNewStore.db", false); !errors.Is(err, ErrDatabaseDriverNotSpecified) {
t.Error("expected error due to blank driver parameter")
}
if _, err := NewStore("sqlite", "", false); err != ErrPathNotSpecified {
if _, err := NewStore("sqlite", "", false); !errors.Is(err, ErrPathNotSpecified) {
t.Error("expected error due to blank path parameter")
}
if store, err := NewStore("sqlite", t.TempDir()+"/TestNewStore.db", false); err != nil {
if store, err := NewStore("sqlite", t.TempDir()+"/TestNewStore.db", true); err != nil {
t.Error("shouldn't have returned any error, got", err.Error())
} else {
_ = store.db.Close()
@ -168,6 +170,40 @@ func TestStore_InsertCleansUpEventsAndResultsProperly(t *testing.T) {
store.Clear()
}
func TestStore_InsertWithCaching(t *testing.T) {
store, _ := NewStore("sqlite", t.TempDir()+"/TestStore_InsertWithCaching.db", true)
defer store.Close()
// Add 2 results
store.Insert(&testEndpoint, &testSuccessfulResult)
store.Insert(&testEndpoint, &testSuccessfulResult)
// Verify that they exist
endpointStatuses, _ := store.GetAllEndpointStatuses(paging.NewEndpointStatusParams().WithResults(1, 20))
if numberOfEndpointStatuses := len(endpointStatuses); numberOfEndpointStatuses != 1 {
t.Fatalf("expected 1 EndpointStatus, got %d", numberOfEndpointStatuses)
}
if len(endpointStatuses[0].Results) != 2 {
t.Fatalf("expected 2 results, got %d", len(endpointStatuses[0].Results))
}
// Add 2 more results
store.Insert(&testEndpoint, &testUnsuccessfulResult)
store.Insert(&testEndpoint, &testUnsuccessfulResult)
// Verify that they exist
endpointStatuses, _ = store.GetAllEndpointStatuses(paging.NewEndpointStatusParams().WithResults(1, 20))
if numberOfEndpointStatuses := len(endpointStatuses); numberOfEndpointStatuses != 1 {
t.Fatalf("expected 1 EndpointStatus, got %d", numberOfEndpointStatuses)
}
if len(endpointStatuses[0].Results) != 4 {
t.Fatalf("expected 4 results, got %d", len(endpointStatuses[0].Results))
}
// Clear the store, which should also clear the cache
store.Clear()
// Verify that they no longer exist
endpointStatuses, _ = store.GetAllEndpointStatuses(paging.NewEndpointStatusParams().WithResults(1, 20))
if numberOfEndpointStatuses := len(endpointStatuses); numberOfEndpointStatuses != 0 {
t.Fatalf("expected 0 EndpointStatus, got %d", numberOfEndpointStatuses)
}
}
func TestStore_Persistence(t *testing.T) {
path := t.TempDir() + "/TestStore_Persistence.db"
store, _ := NewStore("sqlite", path, false)
@ -368,10 +404,10 @@ func TestStore_NoRows(t *testing.T) {
defer store.Close()
tx, _ := store.db.Begin()
defer tx.Rollback()
if _, err := store.getLastEndpointResultSuccessValue(tx, 1); err != errNoRowsReturned {
if _, err := store.getLastEndpointResultSuccessValue(tx, 1); !errors.Is(err, errNoRowsReturned) {
t.Errorf("should've %v, got %v", errNoRowsReturned, err)
}
if _, err := store.getAgeOfOldestEndpointUptimeEntry(tx, 1); err != errNoRowsReturned {
if _, err := store.getAgeOfOldestEndpointUptimeEntry(tx, 1); !errors.Is(err, errNoRowsReturned) {
t.Errorf("should've %v, got %v", errNoRowsReturned, err)
}
}
@ -564,3 +600,131 @@ func TestCacheKey(t *testing.T) {
})
}
}
func TestTriggeredEndpointAlertsPersistence(t *testing.T) {
store, _ := NewStore("sqlite", t.TempDir()+"/TestTriggeredEndpointAlertsPersistence.db", false)
defer store.Close()
yes, desc := false, "description"
ep := testEndpoint
ep.NumberOfSuccessesInARow = 0
alrt := &alert.Alert{
Type: alert.TypePagerDuty,
Enabled: &yes,
FailureThreshold: 4,
SuccessThreshold: 2,
Description: &desc,
SendOnResolved: &yes,
Triggered: true,
ResolveKey: "1234567",
}
// Alert just triggered, so NumberOfSuccessesInARow is 0
if err := store.UpsertTriggeredEndpointAlert(&ep, alrt); err != nil {
t.Fatal("expected no error, got", err.Error())
}
exists, resolveKey, numberOfSuccessesInARow, err := store.GetTriggeredEndpointAlert(&ep, alrt)
if err != nil {
t.Fatal("expected no error, got", err.Error())
}
if !exists {
t.Error("expected triggered alert to exist")
}
if resolveKey != alrt.ResolveKey {
t.Errorf("expected resolveKey %s, got %s", alrt.ResolveKey, resolveKey)
}
if numberOfSuccessesInARow != ep.NumberOfSuccessesInARow {
t.Errorf("expected persisted NumberOfSuccessesInARow to be %d, got %d", ep.NumberOfSuccessesInARow, numberOfSuccessesInARow)
}
// Endpoint just had a successful evaluation, so NumberOfSuccessesInARow is now 1
ep.NumberOfSuccessesInARow++
if err := store.UpsertTriggeredEndpointAlert(&ep, alrt); err != nil {
t.Fatal("expected no error, got", err.Error())
}
exists, resolveKey, numberOfSuccessesInARow, err = store.GetTriggeredEndpointAlert(&ep, alrt)
if err != nil {
t.Error("expected no error, got", err.Error())
}
if !exists {
t.Error("expected triggered alert to exist")
}
if resolveKey != alrt.ResolveKey {
t.Errorf("expected resolveKey %s, got %s", alrt.ResolveKey, resolveKey)
}
if numberOfSuccessesInARow != ep.NumberOfSuccessesInARow {
t.Errorf("expected persisted NumberOfSuccessesInARow to be %d, got %d", ep.NumberOfSuccessesInARow, numberOfSuccessesInARow)
}
// Simulate the endpoint having another successful evaluation, which means the alert is now resolved,
// and we should delete the triggered alert from the store
ep.NumberOfSuccessesInARow++
if err := store.DeleteTriggeredEndpointAlert(&ep, alrt); err != nil {
t.Fatal("expected no error, got", err.Error())
}
exists, _, _, err = store.GetTriggeredEndpointAlert(&ep, alrt)
if err != nil {
t.Error("expected no error, got", err.Error())
}
if exists {
t.Error("expected triggered alert to no longer exist as it has been deleted")
}
}
func TestStore_DeleteAllTriggeredAlertsNotInChecksumsByEndpoint(t *testing.T) {
store, _ := NewStore("sqlite", t.TempDir()+"/TestStore_DeleteAllTriggeredAlertsNotInChecksumsByEndpoint.db", false)
defer store.Close()
yes, desc := false, "description"
ep1 := testEndpoint
ep1.Name = "ep1"
ep2 := testEndpoint
ep2.Name = "ep2"
alert1 := alert.Alert{
Type: alert.TypePagerDuty,
Enabled: &yes,
FailureThreshold: 4,
SuccessThreshold: 2,
Description: &desc,
SendOnResolved: &yes,
Triggered: true,
ResolveKey: "1234567",
}
alert2 := alert1
alert2.Type, alert2.ResolveKey = alert.TypeSlack, ""
alert3 := alert2
if err := store.UpsertTriggeredEndpointAlert(&ep1, &alert1); err != nil {
t.Fatal("expected no error, got", err.Error())
}
if err := store.UpsertTriggeredEndpointAlert(&ep1, &alert2); err != nil {
t.Fatal("expected no error, got", err.Error())
}
if err := store.UpsertTriggeredEndpointAlert(&ep2, &alert3); err != nil {
t.Fatal("expected no error, got", err.Error())
}
if exists, _, _, _ := store.GetTriggeredEndpointAlert(&ep1, &alert1); !exists {
t.Error("expected alert1 to have been deleted")
}
if exists, _, _, _ := store.GetTriggeredEndpointAlert(&ep1, &alert2); !exists {
t.Error("expected alert2 to exist for ep1")
}
if exists, _, _, _ := store.GetTriggeredEndpointAlert(&ep2, &alert3); !exists {
t.Error("expected alert3 to exist for ep2")
}
// Now we simulate the alert configuration being updated, and the alert being resolved
if deleted := store.DeleteAllTriggeredAlertsNotInChecksumsByEndpoint(&ep1, []string{alert2.Checksum()}); deleted != 1 {
t.Errorf("expected 1 triggered alert to be deleted, got %d", deleted)
}
if exists, _, _, _ := store.GetTriggeredEndpointAlert(&ep1, &alert1); exists {
t.Error("expected alert1 to have been deleted")
}
if exists, _, _, _ := store.GetTriggeredEndpointAlert(&ep1, &alert2); !exists {
t.Error("expected alert2 to exist for ep1")
}
if exists, _, _, _ := store.GetTriggeredEndpointAlert(&ep2, &alert3); !exists {
t.Error("expected alert3 to exist for ep2")
}
// Now let's just assume all alerts for ep1 were removed
if deleted := store.DeleteAllTriggeredAlertsNotInChecksumsByEndpoint(&ep1, []string{}); deleted != 1 {
t.Errorf("expected 1 triggered alert to be deleted, got %d", deleted)
}
// Make sure the alert for ep2 still exists
if exists, _, _, _ := store.GetTriggeredEndpointAlert(&ep2, &alert3); !exists {
t.Error("expected alert3 to exist for ep2")
}
}

View File

@ -5,6 +5,7 @@ import (
"log"
"time"
"github.com/TwiN/gatus/v5/alerting/alert"
"github.com/TwiN/gatus/v5/config/endpoint"
"github.com/TwiN/gatus/v5/storage"
"github.com/TwiN/gatus/v5/storage/store/common/paging"
@ -41,6 +42,21 @@ type Store interface {
// Used to delete endpoints that have been persisted but are no longer part of the configured endpoints
DeleteAllEndpointStatusesNotInKeys(keys []string) int
// GetTriggeredEndpointAlert returns whether the triggered alert for the specified endpoint as well as the necessary information to resolve it
GetTriggeredEndpointAlert(ep *endpoint.Endpoint, alert *alert.Alert) (exists bool, resolveKey string, numberOfSuccessesInARow int, err error)
// UpsertTriggeredEndpointAlert inserts/updates a triggered alert for an endpoint
// Used for persistence of triggered alerts across application restarts
UpsertTriggeredEndpointAlert(ep *endpoint.Endpoint, triggeredAlert *alert.Alert) error
// DeleteTriggeredEndpointAlert deletes a triggered alert for an endpoint
DeleteTriggeredEndpointAlert(ep *endpoint.Endpoint, triggeredAlert *alert.Alert) error
// DeleteAllTriggeredAlertsNotInChecksumsByEndpoint removes all triggered alerts owned by an endpoint whose alert
// configurations are not provided in the checksums list.
// This prevents triggered alerts that have been removed or modified from lingering in the database.
DeleteAllTriggeredAlertsNotInChecksumsByEndpoint(ep *endpoint.Endpoint, checksums []string) int
// Clear deletes everything from the store
Clear()

View File

@ -7,6 +7,7 @@ import (
"github.com/TwiN/gatus/v5/alerting"
"github.com/TwiN/gatus/v5/config/endpoint"
"github.com/TwiN/gatus/v5/storage/store"
)
// HandleAlerting takes care of alerts to resolve and alerts to trigger based on result success or failure
@ -50,9 +51,12 @@ func handleAlertsToTrigger(ep *endpoint.Endpoint, result *endpoint.Result, alert
log.Printf("[watchdog.handleAlertsToTrigger] Failed to send an alert for endpoint=%s: %s", ep.Name, err.Error())
} else {
endpointAlert.Triggered = true
if err := store.Get().UpsertTriggeredEndpointAlert(ep, endpointAlert); err != nil {
log.Printf("[watchdog.handleAlertsToTrigger] Failed to persist triggered endpoint alert for endpoint with key=%s: %s", ep.Key(), err.Error())
}
}
} else {
log.Printf("[watchdog.handleAlertsToResolve] Not sending alert of type=%s despite being TRIGGERED, because the provider wasn't configured properly", endpointAlert.Type)
log.Printf("[watchdog.handleAlertsToTrigger] Not sending alert of type=%s despite being TRIGGERED, because the provider wasn't configured properly", endpointAlert.Type)
}
}
}
@ -60,21 +64,31 @@ func handleAlertsToTrigger(ep *endpoint.Endpoint, result *endpoint.Result, alert
func handleAlertsToResolve(ep *endpoint.Endpoint, result *endpoint.Result, alertingConfig *alerting.Config, debug bool) {
ep.NumberOfSuccessesInARow++
for _, endpointAlert := range ep.Alerts {
if !endpointAlert.IsEnabled() || !endpointAlert.Triggered || endpointAlert.SuccessThreshold > ep.NumberOfSuccessesInARow {
isStillBelowSuccessThreshold := endpointAlert.SuccessThreshold > ep.NumberOfSuccessesInARow
if isStillBelowSuccessThreshold && endpointAlert.IsEnabled() {
// Persist NumberOfSuccessesInARow
if err := store.Get().UpsertTriggeredEndpointAlert(ep, endpointAlert); err != nil {
log.Printf("[watchdog.handleAlertsToResolve] Failed to update triggered endpoint alert for endpoint with key=%s: %s", ep.Key(), err.Error())
}
}
if !endpointAlert.IsEnabled() || !endpointAlert.Triggered || isStillBelowSuccessThreshold {
continue
}
// Even if the alert provider returns an error, we still set the alert's Triggered variable to false.
// Further explanation can be found on Alert's Triggered field.
endpointAlert.Triggered = false
if err := store.Get().DeleteTriggeredEndpointAlert(ep, endpointAlert); err != nil {
log.Printf("[watchdog.handleAlertsToResolve] Failed to delete persisted triggered endpoint alert for endpoint with key=%s: %s", ep.Key(), err.Error())
}
if !endpointAlert.IsSendingOnResolved() {
continue
}
alertProvider := alertingConfig.GetAlertingProviderByAlertType(endpointAlert.Type)
if alertProvider != nil {
log.Printf("[watchdog.handleAlertsToResolve] Sending %s alert because alert for endpoint=%s with description='%s' has been RESOLVED", endpointAlert.Type, ep.Name, endpointAlert.GetDescription())
log.Printf("[watchdog.handleAlertsToResolve] Sending %s alert because alert for endpoint with key=%s with description='%s' has been RESOLVED", endpointAlert.Type, ep.Key(), endpointAlert.GetDescription())
err := alertProvider.Send(ep, endpointAlert, result, true)
if err != nil {
log.Printf("[watchdog.handleAlertsToResolve] Failed to send an alert for endpoint=%s: %s", ep.Name, err.Error())
log.Printf("[watchdog.handleAlertsToResolve] Failed to send an alert for endpoint with key=%s: %s", ep.Key(), err.Error())
}
} else {
log.Printf("[watchdog.handleAlertsToResolve] Not sending alert of type=%s despite being RESOLVED, because the provider wasn't configured properly", endpointAlert.Type)