From f2c5f5911cd7f6f379de1b478f4bc97dcdbd8f24 Mon Sep 17 00:00:00 2001 From: TwiN Date: Wed, 15 May 2024 21:29:45 -0400 Subject: [PATCH] feat(alerting): Persist triggered alerts across application restart (#764) * feat(alerting): Persist triggered alerts across application restart Fixes #679 * test(alerting): Add numerous tests related to alerts --- alerting/alert/alert.go | 23 ++- alerting/alert/alert_test.go | 106 +++++++++++ api/external_endpoint_test.go | 6 + config/config.go | 46 +++-- config/config_test.go | 237 ++++++++++++++++++++++--- main.go | 58 +++++- storage/store/memory/memory.go | 32 ++++ storage/store/sql/specific_postgres.go | 17 +- storage/store/sql/specific_sqlite.go | 15 +- storage/store/sql/sql.go | 142 +++++++++++++-- storage/store/sql/sql_test.go | 174 +++++++++++++++++- storage/store/store.go | 16 ++ watchdog/alerting.go | 22 ++- 13 files changed, 822 insertions(+), 72 deletions(-) diff --git a/alerting/alert/alert.go b/alerting/alert/alert.go index 4d90ced2..2afcd8b5 100644 --- a/alerting/alert/alert.go +++ b/alerting/alert/alert.go @@ -1,7 +1,10 @@ package alert import ( + "crypto/sha256" + "encoding/hex" "errors" + "strconv" "strings" ) @@ -26,6 +29,9 @@ type Alert struct { // FailureThreshold is the number of failures in a row needed before triggering the alert FailureThreshold int `yaml:"failure-threshold"` + // SuccessThreshold defines how many successful executions must happen in a row before an ongoing incident is marked as resolved + SuccessThreshold int `yaml:"success-threshold"` + // Description of the alert. Will be included in the alert sent. // // This is a pointer, because it is populated by YAML and we need to know whether it was explicitly set to a value @@ -38,9 +44,6 @@ type Alert struct { // or not for provider.ParseWithDefaultAlert to work. Use Alert.IsSendingOnResolved() for a non-pointer SendOnResolved *bool `yaml:"send-on-resolved"` - // SuccessThreshold defines how many successful executions must happen in a row before an ongoing incident is marked as resolved - SuccessThreshold int `yaml:"success-threshold"` - // ResolveKey is an optional field that is used by some providers (i.e. 
PagerDuty's dedup_key) to resolve // ongoing/triggered incidents ResolveKey string `yaml:"-"` @@ -94,3 +97,17 @@ func (alert *Alert) IsSendingOnResolved() bool { } return *alert.SendOnResolved } + +// Checksum returns a checksum of the alert +// Used to determine which persisted triggered alert should be deleted on application start +func (alert *Alert) Checksum() string { + hash := sha256.New() + hash.Write([]byte(string(alert.Type) + "_" + + strconv.FormatBool(alert.IsEnabled()) + "_" + + strconv.FormatBool(alert.IsSendingOnResolved()) + "_" + + strconv.Itoa(alert.SuccessThreshold) + "_" + + strconv.Itoa(alert.FailureThreshold) + "_" + + alert.GetDescription()), + ) + return hex.EncodeToString(hash.Sum(nil)) +} diff --git a/alerting/alert/alert_test.go b/alerting/alert/alert_test.go index 5f51eab1..ffe03eee 100644 --- a/alerting/alert/alert_test.go +++ b/alerting/alert/alert_test.go @@ -84,3 +84,109 @@ func TestAlert_IsSendingOnResolved(t *testing.T) { t.Error("alert.IsSendingOnResolved() should've returned true, because SendOnResolved was set to true") } } + +func TestAlert_Checksum(t *testing.T) { + description1, description2 := "a", "b" + yes, no := true, false + scenarios := []struct { + name string + alert Alert + expected string + }{ + { + name: "barebone", + alert: Alert{ + Type: TypeDiscord, + }, + expected: "fed0580e44ed5701dbba73afa1f14b2c53ca5a7b8067a860441c212916057fe3", + }, + { + name: "with-description-1", + alert: Alert{ + Type: TypeDiscord, + Description: &description1, + }, + expected: "005f407ebe506e74a4aeb46f74c28b376debead7011e1b085da3840f72ba9707", + }, + { + name: "with-description-2", + alert: Alert{ + Type: TypeDiscord, + Description: &description2, + }, + expected: "3c2c4a9570cdc614006993c21f79a860a7f5afea10cf70d1a79d3c49342ef2c8", + }, + { + name: "with-description-2-and-enabled-false", + alert: Alert{ + Type: TypeDiscord, + Enabled: &no, + Description: &description2, + }, + expected: "837945c2b4cd5e961db3e63e10c348d4f1c3446ba68cf5a48e35a1ae22cf0c22", + }, + { + name: "with-description-2-and-enabled-true", + alert: Alert{ + Type: TypeDiscord, + Enabled: &yes, // it defaults to true if not set, but just to make sure + Description: &description2, + }, + expected: "3c2c4a9570cdc614006993c21f79a860a7f5afea10cf70d1a79d3c49342ef2c8", + }, + { + name: "with-description-2-and-enabled-true-and-send-on-resolved-true", + alert: Alert{ + Type: TypeDiscord, + Enabled: &yes, + SendOnResolved: &yes, + Description: &description2, + }, + expected: "bf1436995a880eb4a352c74c5dfee1f1b5ff6b9fc55aef9bf411b3631adfd80c", + }, + { + name: "with-description-2-and-failure-threshold-7", + alert: Alert{ + Type: TypeSlack, + FailureThreshold: 7, + Description: &description2, + }, + expected: "8bd479e18bda393d4c924f5a0d962e825002168dedaa88b445e435db7bacffd3", + }, + { + name: "with-description-2-and-failure-threshold-9", + alert: Alert{ + Type: TypeSlack, + FailureThreshold: 9, + Description: &description2, + }, + expected: "5abdfce5236e344996d264d526e769c07cb0d3d329a999769a1ff84b157ca6f1", + }, + { + name: "with-description-2-and-success-threshold-5", + alert: Alert{ + Type: TypeSlack, + SuccessThreshold: 7, + Description: &description2, + }, + expected: "c0000e73626b80e212cfc24830de7094568f648e37f3e16f9e68c7f8ef75c34c", + }, + { + name: "with-description-2-and-success-threshold-1", + alert: Alert{ + Type: TypeSlack, + SuccessThreshold: 1, + Description: &description2, + }, + expected: "5c28963b3a76104cfa4a0d79c89dd29ec596c8cfa4b1af210ec83d6d41587b5f", + }, + } + for _, scenario := range 
scenarios { + t.Run(scenario.name, func(t *testing.T) { + scenario.alert.ValidateAndSetDefaults() + if checksum := scenario.alert.Checksum(); checksum != scenario.expected { + t.Errorf("expected checksum %v, got %v", scenario.expected, checksum) + } + }) + } +} diff --git a/api/external_endpoint_test.go b/api/external_endpoint_test.go index 60ebcd7b..95fc432a 100644 --- a/api/external_endpoint_test.go +++ b/api/external_endpoint_test.go @@ -64,6 +64,12 @@ func TestCreateExternalEndpointResult(t *testing.T) { AuthorizationHeaderBearerToken: "Bearer token", ExpectedCode: 404, }, + { + Name: "bad-success-value", + Path: "/api/v1/endpoints/g_n/external?success=invalid", + AuthorizationHeaderBearerToken: "Bearer token", + ExpectedCode: 400, + }, { Name: "good-token-success-true", Path: "/api/v1/endpoints/g_n/external?success=true", diff --git a/config/config.go b/config/config.go index 14fde4b0..4de98bd7 100644 --- a/config/config.go +++ b/config/config.go @@ -245,16 +245,13 @@ func parseAndValidateConfigBytes(yamlBytes []byte) (config *Config, err error) { if config == nil || config.Endpoints == nil || len(config.Endpoints) == 0 { err = ErrNoEndpointInConfig } else { - validateAlertingConfig(config.Alerting, config.Endpoints, config.Debug) + validateAlertingConfig(config.Alerting, config.Endpoints, config.ExternalEndpoints, config.Debug) if err := validateSecurityConfig(config); err != nil { return nil, err } if err := validateEndpointsConfig(config); err != nil { return nil, err } - if err := validateExternalEndpointsConfig(config); err != nil { - return nil, err - } if err := validateWebConfig(config); err != nil { return nil, err } @@ -338,28 +335,37 @@ func validateWebConfig(config *Config) error { } func validateEndpointsConfig(config *Config) error { + duplicateValidationMap := make(map[string]bool) + // Validate endpoints for _, ep := range config.Endpoints { if config.Debug { log.Printf("[config.validateEndpointsConfig] Validating endpoint '%s'", ep.Name) } + if endpointKey := ep.Key(); duplicateValidationMap[endpointKey] { + return fmt.Errorf("invalid endpoint %s: name and group combination must be unique", ep.Key()) + } else { + duplicateValidationMap[endpointKey] = true + } if err := ep.ValidateAndSetDefaults(); err != nil { - return fmt.Errorf("invalid endpoint %s: %w", ep.DisplayName(), err) + return fmt.Errorf("invalid endpoint %s: %w", ep.Key(), err) } } log.Printf("[config.validateEndpointsConfig] Validated %d endpoints", len(config.Endpoints)) - return nil -} - -func validateExternalEndpointsConfig(config *Config) error { + // Validate external endpoints for _, ee := range config.ExternalEndpoints { if config.Debug { - log.Printf("[config.validateExternalEndpointsConfig] Validating external endpoint '%s'", ee.Name) + log.Printf("[config.validateEndpointsConfig] Validating external endpoint '%s'", ee.Name) + } + if endpointKey := ee.Key(); duplicateValidationMap[endpointKey] { + return fmt.Errorf("invalid external endpoint %s: name and group combination must be unique", ee.Key()) + } else { + duplicateValidationMap[endpointKey] = true } if err := ee.ValidateAndSetDefaults(); err != nil { - return fmt.Errorf("invalid external endpoint %s: %w", ee.DisplayName(), err) + return fmt.Errorf("invalid external endpoint %s: %w", ee.Key(), err) } } - log.Printf("[config.validateExternalEndpointsConfig] Validated %d external endpoints", len(config.ExternalEndpoints)) + log.Printf("[config.validateEndpointsConfig] Validated %d external endpoints", len(config.ExternalEndpoints)) return nil 
} @@ -382,7 +388,7 @@ func validateSecurityConfig(config *Config) error { // Note that the alerting configuration has to be validated before the endpoint configuration, because the default alert // returned by provider.AlertProvider.GetDefaultAlert() must be parsed before endpoint.Endpoint.ValidateAndSetDefaults() // sets the default alert values when none are set. -func validateAlertingConfig(alertingConfig *alerting.Config, endpoints []*endpoint.Endpoint, debug bool) { +func validateAlertingConfig(alertingConfig *alerting.Config, endpoints []*endpoint.Endpoint, externalEndpoints []*endpoint.ExternalEndpoint, debug bool) { if alertingConfig == nil { log.Printf("[config.validateAlertingConfig] Alerting is not configured") return @@ -391,12 +397,12 @@ func validateAlertingConfig(alertingConfig *alerting.Config, endpoints []*endpoi alert.TypeAWSSES, alert.TypeCustom, alert.TypeDiscord, + alert.TypeEmail, alert.TypeGitHub, alert.TypeGitLab, alert.TypeGoogleChat, alert.TypeGotify, alert.TypeJetBrainsSpace, - alert.TypeEmail, alert.TypeMatrix, alert.TypeMattermost, alert.TypeMessagebird, @@ -420,7 +426,17 @@ func validateAlertingConfig(alertingConfig *alerting.Config, endpoints []*endpoi for alertIndex, endpointAlert := range ep.Alerts { if alertType == endpointAlert.Type { if debug { - log.Printf("[config.validateAlertingConfig] Parsing alert %d with provider's default alert for provider=%s in endpoint=%s", alertIndex, alertType, ep.Name) + log.Printf("[config.validateAlertingConfig] Parsing alert %d with default alert for provider=%s in endpoint with key=%s", alertIndex, alertType, ep.Key()) + } + provider.ParseWithDefaultAlert(alertProvider.GetDefaultAlert(), endpointAlert) + } + } + } + for _, ee := range externalEndpoints { + for alertIndex, endpointAlert := range ee.Alerts { + if alertType == endpointAlert.Type { + if debug { + log.Printf("[config.validateAlertingConfig] Parsing alert %d with default alert for provider=%s in endpoint with key=%s", alertIndex, alertType, ee.Key()) } provider.ParseWithDefaultAlert(alertProvider.GetDefaultAlert(), endpointAlert) } diff --git a/config/config_test.go b/config/config_test.go index f94b170e..9f86d45a 100644 --- a/config/config_test.go +++ b/config/config_test.go @@ -16,6 +16,7 @@ import ( "github.com/TwiN/gatus/v5/alerting/provider/email" "github.com/TwiN/gatus/v5/alerting/provider/github" "github.com/TwiN/gatus/v5/alerting/provider/googlechat" + "github.com/TwiN/gatus/v5/alerting/provider/gotify" "github.com/TwiN/gatus/v5/alerting/provider/jetbrainsspace" "github.com/TwiN/gatus/v5/alerting/provider/matrix" "github.com/TwiN/gatus/v5/alerting/provider/mattermost" @@ -36,6 +37,7 @@ import ( ) func TestLoadConfiguration(t *testing.T) { + yes := true dir := t.TempDir() scenarios := []struct { name string @@ -165,6 +167,8 @@ metrics: true alerting: slack: webhook-url: https://hooks.slack.com/services/xxx/yyy/zzz + default-alert: + enabled: true endpoints: - name: example @@ -179,6 +183,12 @@ alerting: discord: webhook-url: https://discord.com/api/webhooks/xxx/yyy +external-endpoints: + - name: ext-ep-test + token: "potato" + alerts: + - type: slack + endpoints: - name: frontend url: https://example.com @@ -190,7 +200,20 @@ endpoints: Metrics: true, Alerting: &alerting.Config{ Discord: &discord.AlertProvider{WebhookURL: "https://discord.com/api/webhooks/xxx/yyy"}, - Slack: &slack.AlertProvider{WebhookURL: "https://hooks.slack.com/services/xxx/yyy/zzz"}, + Slack: &slack.AlertProvider{WebhookURL: "https://hooks.slack.com/services/xxx/yyy/zzz", 
DefaultAlert: &alert.Alert{Enabled: &yes}}, + }, + ExternalEndpoints: []*endpoint.ExternalEndpoint{ + { + Name: "ext-ep-test", + Token: "potato", + Alerts: []*alert.Alert{ + { + Type: alert.TypeSlack, + FailureThreshold: 3, + SuccessThreshold: 2, + }, + }, + }, }, Endpoints: []*endpoint.Endpoint{ { @@ -325,10 +348,6 @@ external-endpoints: - name: ext-ep-test group: core token: "potato" - alerts: - - type: discord - description: "healthcheck failed" - send-on-resolved: true endpoints: - name: website @@ -382,18 +401,7 @@ endpoints: if config.ExternalEndpoints[0].Token != "potato" { t.Errorf("Token should have been %s", "potato") } - if len(config.ExternalEndpoints[0].Alerts) != 1 { - t.Error("Should have returned one alert") - } - if config.ExternalEndpoints[0].Alerts[0].Type != alert.TypeDiscord { - t.Errorf("Type should have been %s", alert.TypeDiscord) - } - if config.ExternalEndpoints[0].Alerts[0].FailureThreshold != 3 { - t.Errorf("FailureThreshold should have been %d, got %d", 3, config.ExternalEndpoints[0].Alerts[0].FailureThreshold) - } - if config.ExternalEndpoints[0].Alerts[0].SuccessThreshold != 2 { - t.Errorf("SuccessThreshold should have been %d, got %d", 2, config.ExternalEndpoints[0].Alerts[0].SuccessThreshold) - } + if len(config.Endpoints) != 3 { t.Error("Should have returned two endpoints") } @@ -439,7 +447,6 @@ endpoints: if len(config.Endpoints[1].Conditions) != 2 { t.Errorf("There should have been %d conditions", 2) } - if config.Endpoints[2].URL != "https://example.com/" { t.Errorf("URL should have been %s", "https://example.com/") } @@ -977,6 +984,22 @@ alerting: enabled: true failure-threshold: 5 success-threshold: 3 + email: + from: "from@example.com" + username: "from@example.com" + password: "hunter2" + host: "mail.example.com" + port: 587 + to: "recipient1@example.com,recipient2@example.com" + client: + insecure: false + default-alert: + enabled: true + gotify: + server-url: "https://gotify.example" + token: "**************" + default-alert: + enabled: true endpoints: - name: website @@ -993,6 +1016,8 @@ endpoints: - type: teams - type: pushover - type: jetbrainsspace + - type: email + - type: gotify conditions: - "[STATUS] == 200" `)) @@ -1107,6 +1132,7 @@ endpoints: if config.Alerting.JetBrainsSpace == nil || !config.Alerting.JetBrainsSpace.IsValid() { t.Fatal("JetBrainsSpace alerting config should've been valid") } + if config.Alerting.JetBrainsSpace.GetDefaultAlert() == nil { t.Fatal("JetBrainsSpace.GetDefaultAlert() shouldn't have returned nil") } @@ -1120,6 +1146,50 @@ endpoints: t.Errorf("JetBrainsSpace webhook should've been %s, but was %s", "baz", config.Alerting.JetBrainsSpace.Token) } + if config.Alerting.Email == nil || !config.Alerting.Email.IsValid() { + t.Fatal("Email alerting config should've been valid") + } + if config.Alerting.Email.GetDefaultAlert() == nil { + t.Fatal("Email.GetDefaultAlert() shouldn't have returned nil") + } + if config.Alerting.Email.From != "from@example.com" { + t.Errorf("Email from should've been %s, but was %s", "from@example.com", config.Alerting.Email.From) + } + if config.Alerting.Email.Username != "from@example.com" { + t.Errorf("Email username should've been %s, but was %s", "from@example.com", config.Alerting.Email.Username) + } + if config.Alerting.Email.Password != "hunter2" { + t.Errorf("Email password should've been %s, but was %s", "hunter2", config.Alerting.Email.Password) + } + if config.Alerting.Email.Host != "mail.example.com" { + t.Errorf("Email host should've been %s, but was %s", "mail.example.com", 
config.Alerting.Email.Host) + } + if config.Alerting.Email.Port != 587 { + t.Errorf("Email port should've been %d, but was %d", 587, config.Alerting.Email.Port) + } + if config.Alerting.Email.To != "recipient1@example.com,recipient2@example.com" { + t.Errorf("Email to should've been %s, but was %s", "recipient1@example.com,recipient2@example.com", config.Alerting.Email.To) + } + if config.Alerting.Email.ClientConfig == nil { + t.Fatal("Email client config should've been set") + } + if config.Alerting.Email.ClientConfig.Insecure { + t.Error("Email client config should've been secure") + } + + if config.Alerting.Gotify == nil || !config.Alerting.Gotify.IsValid() { + t.Fatal("Gotify alerting config should've been valid") + } + if config.Alerting.Gotify.GetDefaultAlert() == nil { + t.Fatal("Gotify.GetDefaultAlert() shouldn't have returned nil") + } + if config.Alerting.Gotify.ServerURL != "https://gotify.example" { + t.Errorf("Gotify server URL should've been %s, but was %s", "https://gotify.example", config.Alerting.Gotify.ServerURL) + } + if config.Alerting.Gotify.Token != "**************" { + t.Errorf("Gotify token should've been %s, but was %s", "**************", config.Alerting.Gotify.Token) + } + // Endpoints if len(config.Endpoints) != 1 { t.Error("There should've been 1 endpoint") @@ -1130,8 +1200,8 @@ endpoints: if config.Endpoints[0].Interval != 60*time.Second { t.Errorf("Interval should have been %s, because it is the default value", 60*time.Second) } - if len(config.Endpoints[0].Alerts) != 10 { - t.Fatal("There should've been 10 alerts configured") + if len(config.Endpoints[0].Alerts) != 12 { + t.Fatalf("There should've been 12 alerts configured, got %d", len(config.Endpoints[0].Alerts)) } if config.Endpoints[0].Alerts[0].Type != alert.TypeSlack { @@ -1255,10 +1325,36 @@ endpoints: t.Error("The alert should've been enabled") } if config.Endpoints[0].Alerts[9].FailureThreshold != 5 { - t.Errorf("The default failure threshold of the alert should've been %d, but it was %d", 3, config.Endpoints[0].Alerts[9].FailureThreshold) + t.Errorf("The default failure threshold of the alert should've been %d, but it was %d", 5, config.Endpoints[0].Alerts[9].FailureThreshold) } if config.Endpoints[0].Alerts[9].SuccessThreshold != 3 { - t.Errorf("The default success threshold of the alert should've been %d, but it was %d", 2, config.Endpoints[0].Alerts[9].SuccessThreshold) + t.Errorf("The default success threshold of the alert should've been %d, but it was %d", 3, config.Endpoints[0].Alerts[9].SuccessThreshold) + } + + if config.Endpoints[0].Alerts[10].Type != alert.TypeEmail { + t.Errorf("The type of the alert should've been %s, but it was %s", alert.TypeEmail, config.Endpoints[0].Alerts[10].Type) + } + if !config.Endpoints[0].Alerts[10].IsEnabled() { + t.Error("The alert should've been enabled") + } + if config.Endpoints[0].Alerts[10].FailureThreshold != 3 { + t.Errorf("The default failure threshold of the alert should've been %d, but it was %d", 3, config.Endpoints[0].Alerts[10].FailureThreshold) + } + if config.Endpoints[0].Alerts[10].SuccessThreshold != 2 { + t.Errorf("The default success threshold of the alert should've been %d, but it was %d", 2, config.Endpoints[0].Alerts[10].SuccessThreshold) + } + + if config.Endpoints[0].Alerts[11].Type != alert.TypeGotify { + t.Errorf("The type of the alert should've been %s, but it was %s", alert.TypeGotify, config.Endpoints[0].Alerts[11].Type) + } + if !config.Endpoints[0].Alerts[11].IsEnabled() { + t.Error("The alert should've been enabled") + } + if 
config.Endpoints[0].Alerts[11].FailureThreshold != 3 { + t.Errorf("The default failure threshold of the alert should've been %d, but it was %d", 3, config.Endpoints[0].Alerts[11].FailureThreshold) + } + if config.Endpoints[0].Alerts[11].SuccessThreshold != 2 { + t.Errorf("The default success threshold of the alert should've been %d, but it was %d", 2, config.Endpoints[0].Alerts[11].SuccessThreshold) } } @@ -1532,6 +1628,99 @@ endpoints: } } +func TestParseAndValidateConfigBytesWithDuplicateEndpointName(t *testing.T) { + scenarios := []struct { + name string + shouldError bool + config string + }{ + { + name: "same-name-no-group", + shouldError: true, + config: ` +endpoints: + - name: ep1 + url: https://twin.sh/health + conditions: + - "[STATUS] == 200" + - name: ep1 + url: https://twin.sh/health + conditions: + - "[STATUS] == 200"`, + }, + { + name: "same-name-different-group", + shouldError: false, + config: ` +endpoints: + - name: ep1 + url: https://twin.sh/health + conditions: + - "[STATUS] == 200" + - name: ep1 + group: g1 + url: https://twin.sh/health + conditions: + - "[STATUS] == 200"`, + }, + { + name: "same-name-same-group", + shouldError: true, + config: ` +endpoints: + - name: ep1 + group: g1 + url: https://twin.sh/health + conditions: + - "[STATUS] == 200" + - name: ep1 + group: g1 + url: https://twin.sh/health + conditions: + - "[STATUS] == 200"`, + }, + { + name: "same-name-different-endpoint-type", + shouldError: true, + config: ` +external-endpoints: + - name: ep1 + token: "12345678" + +endpoints: + - name: ep1 + url: https://twin.sh/health + conditions: + - "[STATUS] == 200"`, + }, + { + name: "same-name-different-group-different-endpoint-type", + shouldError: false, + config: ` +external-endpoints: + - name: ep1 + group: gr1 + token: "12345678" + +endpoints: + - name: ep1 + url: https://twin.sh/health + conditions: + - "[STATUS] == 200"`, + }, + } + for _, scenario := range scenarios { + t.Run(scenario.name, func(t *testing.T) { + _, err := parseAndValidateConfigBytes([]byte(scenario.config)) + if scenario.shouldError && err == nil { + t.Error("should've returned an error") + } else if !scenario.shouldError && err != nil { + t.Error("shouldn't have returned an error") + } + }) + } +} + func TestParseAndValidateConfigBytesWithInvalidStorageConfig(t *testing.T) { _, err := parseAndValidateConfigBytes([]byte(` storage: @@ -1645,7 +1834,7 @@ endpoints: func TestParseAndValidateConfigBytesWithNoEndpoints(t *testing.T) { _, err := parseAndValidateConfigBytes([]byte(``)) - if err != ErrNoEndpointInConfig { + if !errors.Is(err, ErrNoEndpointInConfig) { t.Error("The error returned should have been of type ErrNoEndpointInConfig") } } @@ -1657,6 +1846,7 @@ func TestGetAlertingProviderByAlertType(t *testing.T) { Email: &email.AlertProvider{}, GitHub: &github.AlertProvider{}, GoogleChat: &googlechat.AlertProvider{}, + Gotify: &gotify.AlertProvider{}, JetBrainsSpace: &jetbrainsspace.AlertProvider{}, Matrix: &matrix.AlertProvider{}, Mattermost: &mattermost.AlertProvider{}, @@ -1679,6 +1869,7 @@ func TestGetAlertingProviderByAlertType(t *testing.T) { {alertType: alert.TypeEmail, expected: alertingConfig.Email}, {alertType: alert.TypeGitHub, expected: alertingConfig.GitHub}, {alertType: alert.TypeGoogleChat, expected: alertingConfig.GoogleChat}, + {alertType: alert.TypeGotify, expected: alertingConfig.Gotify}, {alertType: alert.TypeJetBrainsSpace, expected: alertingConfig.JetBrainsSpace}, {alertType: alert.TypeMatrix, expected: alertingConfig.Matrix}, {alertType: alert.TypeMattermost, 
expected: alertingConfig.Mattermost}, diff --git a/main.go b/main.go index c1890451..31de744c 100644 --- a/main.go +++ b/main.go @@ -83,13 +83,67 @@ func initializeStorage(cfg *config.Config) { for _, ep := range cfg.Endpoints { keys = append(keys, ep.Key()) } - for _, externalEndpoint := range cfg.ExternalEndpoints { - keys = append(keys, externalEndpoint.Key()) + for _, ee := range cfg.ExternalEndpoints { + keys = append(keys, ee.Key()) } numberOfEndpointStatusesDeleted := store.Get().DeleteAllEndpointStatusesNotInKeys(keys) if numberOfEndpointStatusesDeleted > 0 { log.Printf("[main.initializeStorage] Deleted %d endpoint statuses because their matching endpoints no longer existed", numberOfEndpointStatusesDeleted) } + // Clean up the triggered alerts from the storage provider and load valid triggered endpoint alerts + numberOfPersistedTriggeredAlertsLoaded := 0 + for _, ep := range cfg.Endpoints { + var checksums []string + for _, alert := range ep.Alerts { + if alert.IsEnabled() { + checksums = append(checksums, alert.Checksum()) + } + } + numberOfTriggeredAlertsDeleted := store.Get().DeleteAllTriggeredAlertsNotInChecksumsByEndpoint(ep, checksums) + if cfg.Debug && numberOfTriggeredAlertsDeleted > 0 { + log.Printf("[main.initializeStorage] Deleted %d triggered alerts for endpoint with key=%s because their configurations have been changed or deleted", numberOfTriggeredAlertsDeleted, ep.Key()) + } + for _, alert := range ep.Alerts { + exists, resolveKey, numberOfSuccessesInARow, err := store.Get().GetTriggeredEndpointAlert(ep, alert) + if err != nil { + log.Printf("[main.initializeStorage] Failed to get triggered alert for endpoint with key=%s: %s", ep.Key(), err.Error()) + continue + } + if exists { + alert.Triggered, alert.ResolveKey = true, resolveKey + ep.NumberOfSuccessesInARow, ep.NumberOfFailuresInARow = numberOfSuccessesInARow, alert.FailureThreshold + numberOfPersistedTriggeredAlertsLoaded++ + } + } + } + for _, ee := range cfg.ExternalEndpoints { + var checksums []string + for _, alert := range ee.Alerts { + if alert.IsEnabled() { + checksums = append(checksums, alert.Checksum()) + } + } + convertedEndpoint := ee.ToEndpoint() + numberOfTriggeredAlertsDeleted := store.Get().DeleteAllTriggeredAlertsNotInChecksumsByEndpoint(convertedEndpoint, checksums) + if cfg.Debug && numberOfTriggeredAlertsDeleted > 0 { + log.Printf("[main.initializeStorage] Deleted %d triggered alerts for endpoint with key=%s because their configurations have been changed or deleted", numberOfTriggeredAlertsDeleted, ee.Key()) + } + for _, alert := range ee.Alerts { + exists, resolveKey, numberOfSuccessesInARow, err := store.Get().GetTriggeredEndpointAlert(convertedEndpoint, alert) + if err != nil { + log.Printf("[main.initializeStorage] Failed to get triggered alert for endpoint with key=%s: %s", ee.Key(), err.Error()) + continue + } + if exists { + alert.Triggered, alert.ResolveKey = true, resolveKey + ee.NumberOfSuccessesInARow, ee.NumberOfFailuresInARow = numberOfSuccessesInARow, alert.FailureThreshold + numberOfPersistedTriggeredAlertsLoaded++ + } + } + } + if numberOfPersistedTriggeredAlertsLoaded > 0 { + log.Printf("[main.initializeStorage] Loaded %d persisted triggered alerts", numberOfPersistedTriggeredAlertsLoaded) + } } func listenToConfigurationFileChanges(cfg *config.Config) { diff --git a/storage/store/memory/memory.go b/storage/store/memory/memory.go index fac3fd3d..6451381d 100644 --- a/storage/store/memory/memory.go +++ b/storage/store/memory/memory.go @@ -5,6 +5,7 @@ import ( "sync" "time" + 
"github.com/TwiN/gatus/v5/alerting/alert" "github.com/TwiN/gatus/v5/config/endpoint" "github.com/TwiN/gatus/v5/storage/store/common" "github.com/TwiN/gatus/v5/storage/store/common/paging" @@ -174,6 +175,37 @@ func (s *Store) DeleteAllEndpointStatusesNotInKeys(keys []string) int { return s.cache.DeleteAll(keysToDelete) } +// GetTriggeredEndpointAlert returns whether the triggered alert for the specified endpoint as well as the necessary information to resolve it +// +// Always returns that the alert does not exist for the in-memory store since it does not support persistence across restarts +func (s *Store) GetTriggeredEndpointAlert(ep *endpoint.Endpoint, alert *alert.Alert) (exists bool, resolveKey string, numberOfSuccessesInARow int, err error) { + return false, "", 0, nil +} + +// UpsertTriggeredEndpointAlert inserts/updates a triggered alert for an endpoint +// Used for persistence of triggered alerts across application restarts +// +// Does nothing for the in-memory store since it does not support persistence across restarts +func (s *Store) UpsertTriggeredEndpointAlert(ep *endpoint.Endpoint, triggeredAlert *alert.Alert) error { + return nil +} + +// DeleteTriggeredEndpointAlert deletes a triggered alert for an endpoint +// +// Does nothing for the in-memory store since it does not support persistence across restarts +func (s *Store) DeleteTriggeredEndpointAlert(ep *endpoint.Endpoint, triggeredAlert *alert.Alert) error { + return nil +} + +// DeleteAllTriggeredAlertsNotInChecksumsByEndpoint removes all triggered alerts owned by an endpoint whose alert +// configurations are not provided in the checksums list. +// This prevents triggered alerts that have been removed or modified from lingering in the database. +// +// Does nothing for the in-memory store since it does not support persistence across restarts +func (s *Store) DeleteAllTriggeredAlertsNotInChecksumsByEndpoint(ep *endpoint.Endpoint, checksums []string) int { + return 0 +} + // Clear deletes everything from the store func (s *Store) Clear() { s.cache.Clear() diff --git a/storage/store/sql/specific_postgres.go b/storage/store/sql/specific_postgres.go index ea00e839..aa8b5d5f 100644 --- a/storage/store/sql/specific_postgres.go +++ b/storage/store/sql/specific_postgres.go @@ -16,7 +16,7 @@ func (s *Store) createPostgresSchema() error { _, err = s.db.Exec(` CREATE TABLE IF NOT EXISTS endpoint_events ( endpoint_event_id BIGSERIAL PRIMARY KEY, - endpoint_id INTEGER NOT NULL REFERENCES endpoints(endpoint_id) ON DELETE CASCADE, + endpoint_id BIGINT NOT NULL REFERENCES endpoints(endpoint_id) ON DELETE CASCADE, event_type TEXT NOT NULL, event_timestamp TIMESTAMP NOT NULL ) @@ -66,7 +66,20 @@ func (s *Store) createPostgresSchema() error { UNIQUE(endpoint_id, hour_unix_timestamp) ) `) - // Silent table modifications + if err != nil { + return err + } + _, err = s.db.Exec(` + CREATE TABLE IF NOT EXISTS endpoint_alerts_triggered ( + endpoint_alert_trigger_id BIGSERIAL PRIMARY KEY, + endpoint_id BIGINT NOT NULL REFERENCES endpoints(endpoint_id) ON DELETE CASCADE, + configuration_checksum TEXT NOT NULL, + resolve_key TEXT NOT NULL, + number_of_successes_in_a_row INTEGER NOT NULL, + UNIQUE(endpoint_id, configuration_checksum) + ) + `) + // Silent table modifications TODO: Remove this in v6.0.0 _, _ = s.db.Exec(`ALTER TABLE endpoint_results ADD IF NOT EXISTS domain_expiration BIGINT NOT NULL DEFAULT 0`) return err } diff --git a/storage/store/sql/specific_sqlite.go b/storage/store/sql/specific_sqlite.go index 66b6eff2..0fe7fa54 100644 --- 
a/storage/store/sql/specific_sqlite.go +++ b/storage/store/sql/specific_sqlite.go @@ -66,7 +66,20 @@ func (s *Store) createSQLiteSchema() error { UNIQUE(endpoint_id, hour_unix_timestamp) ) `) - // Silent table modifications TODO: Remove this + if err != nil { + return err + } + _, err = s.db.Exec(` + CREATE TABLE IF NOT EXISTS endpoint_alerts_triggered ( + endpoint_alert_trigger_id INTEGER PRIMARY KEY, + endpoint_id INTEGER NOT NULL REFERENCES endpoints(endpoint_id) ON DELETE CASCADE, + configuration_checksum TEXT NOT NULL, + resolve_key TEXT NOT NULL, + number_of_successes_in_a_row INTEGER NOT NULL, + UNIQUE(endpoint_id, configuration_checksum) + ) + `) + // Silent table modifications TODO: Remove this in v6.0.0 _, _ = s.db.Exec(`ALTER TABLE endpoint_results ADD domain_expiration INTEGER NOT NULL DEFAULT 0`) return err } diff --git a/storage/store/sql/sql.go b/storage/store/sql/sql.go index ec25b564..7cfa9b5f 100644 --- a/storage/store/sql/sql.go +++ b/storage/store/sql/sql.go @@ -9,6 +9,7 @@ import ( "strings" "time" + "github.com/TwiN/gatus/v5/alerting/alert" "github.com/TwiN/gatus/v5/config/endpoint" "github.com/TwiN/gatus/v5/storage/store/common" "github.com/TwiN/gatus/v5/storage/store/common/paging" @@ -27,9 +28,9 @@ const ( // for aesthetic purposes, I deemed it wasn't worth the performance impact of yet another one-to-many table. arraySeparator = "|~|" - uptimeCleanUpThreshold = 10 * 24 * time.Hour // Maximum uptime age before triggering a clean up - eventsCleanUpThreshold = common.MaximumNumberOfEvents + 10 // Maximum number of events before triggering a clean up - resultsCleanUpThreshold = common.MaximumNumberOfResults + 10 // Maximum number of results before triggering a clean up + uptimeCleanUpThreshold = 10 * 24 * time.Hour // Maximum uptime age before triggering a cleanup + eventsCleanUpThreshold = common.MaximumNumberOfEvents + 10 // Maximum number of events before triggering a cleanup + resultsCleanUpThreshold = common.MaximumNumberOfResults + 10 // Maximum number of results before triggering a cleanup uptimeRetention = 7 * 24 * time.Hour @@ -234,12 +235,12 @@ func (s *Store) Insert(ep *endpoint.Endpoint, result *endpoint.Result) error { // Endpoint doesn't exist in the database, insert it if endpointID, err = s.insertEndpoint(tx, ep); err != nil { _ = tx.Rollback() - log.Printf("[sql.Insert] Failed to create endpoint with group=%s; endpoint=%s: %s", ep.Group, ep.Name, err.Error()) + log.Printf("[sql.Insert] Failed to create endpoint with key=%s: %s", ep.Key(), err.Error()) return err } } else { _ = tx.Rollback() - log.Printf("[sql.Insert] Failed to retrieve id of endpoint with group=%s; endpoint=%s: %s", ep.Group, ep.Name, err.Error()) + log.Printf("[sql.Insert] Failed to retrieve id of endpoint with key=%s: %s", ep.Key(), err.Error()) return err } } @@ -255,7 +256,7 @@ func (s *Store) Insert(ep *endpoint.Endpoint, result *endpoint.Result) error { numberOfEvents, err := s.getNumberOfEventsByEndpointID(tx, endpointID) if err != nil { // Silently fail - log.Printf("[sql.Insert] Failed to retrieve total number of events for group=%s; endpoint=%s: %s", ep.Group, ep.Name, err.Error()) + log.Printf("[sql.Insert] Failed to retrieve total number of events for endpoint with key=%s: %s", ep.Key(), err.Error()) } if numberOfEvents == 0 { // There's no events yet, which means we need to add the EventStart and the first healthy/unhealthy event @@ -265,18 +266,18 @@ func (s *Store) Insert(ep *endpoint.Endpoint, result *endpoint.Result) error { }) if err != nil { // Silently fail - 
log.Printf("[sql.Insert] Failed to insert event=%s for group=%s; endpoint=%s: %s", endpoint.EventStart, ep.Group, ep.Name, err.Error()) + log.Printf("[sql.Insert] Failed to insert event=%s for endpoint with key=%s: %s", endpoint.EventStart, ep.Key(), err.Error()) } event := endpoint.NewEventFromResult(result) if err = s.insertEndpointEvent(tx, endpointID, event); err != nil { // Silently fail - log.Printf("[sql.Insert] Failed to insert event=%s for group=%s; endpoint=%s: %s", event.Type, ep.Group, ep.Name, err.Error()) + log.Printf("[sql.Insert] Failed to insert event=%s for endpoint with key=%s: %s", event.Type, ep.Key(), err.Error()) } } else { // Get the success value of the previous result var lastResultSuccess bool if lastResultSuccess, err = s.getLastEndpointResultSuccessValue(tx, endpointID); err != nil { - log.Printf("[sql.Insert] Failed to retrieve outcome of previous result for group=%s; endpoint=%s: %s", ep.Group, ep.Name, err.Error()) + log.Printf("[sql.Insert] Failed to retrieve outcome of previous result for endpoint with key=%s: %s", ep.Key(), err.Error()) } else { // If we managed to retrieve the outcome of the previous result, we'll compare it with the new result. // If the final outcome (success or failure) of the previous and the new result aren't the same, it means @@ -286,7 +287,7 @@ func (s *Store) Insert(ep *endpoint.Endpoint, result *endpoint.Result) error { event := endpoint.NewEventFromResult(result) if err = s.insertEndpointEvent(tx, endpointID, event); err != nil { // Silently fail - log.Printf("[sql.Insert] Failed to insert event=%s for group=%s; endpoint=%s: %s", event.Type, ep.Group, ep.Name, err.Error()) + log.Printf("[sql.Insert] Failed to insert event=%s for endpoint with key=%s: %s", event.Type, ep.Key(), err.Error()) } } } @@ -295,40 +296,40 @@ func (s *Store) Insert(ep *endpoint.Endpoint, result *endpoint.Result) error { // (since we're only deleting MaximumNumberOfEvents at a time instead of 1) if numberOfEvents > eventsCleanUpThreshold { if err = s.deleteOldEndpointEvents(tx, endpointID); err != nil { - log.Printf("[sql.Insert] Failed to delete old events for group=%s; endpoint=%s: %s", ep.Group, ep.Name, err.Error()) + log.Printf("[sql.Insert] Failed to delete old events for endpoint with key=%s: %s", ep.Key(), err.Error()) } } } // Second, we need to insert the result. if err = s.insertEndpointResult(tx, endpointID, result); err != nil { - log.Printf("[sql.Insert] Failed to insert result for group=%s; endpoint=%s: %s", ep.Group, ep.Name, err.Error()) + log.Printf("[sql.Insert] Failed to insert result for endpoint with key=%s: %s", ep.Key(), err.Error()) _ = tx.Rollback() // If we can't insert the result, we'll rollback now since there's no point continuing return err } // Clean up old results numberOfResults, err := s.getNumberOfResultsByEndpointID(tx, endpointID) if err != nil { - log.Printf("[sql.Insert] Failed to retrieve total number of results for group=%s; endpoint=%s: %s", ep.Group, ep.Name, err.Error()) + log.Printf("[sql.Insert] Failed to retrieve total number of results for endpoint with key=%s: %s", ep.Key(), err.Error()) } else { if numberOfResults > resultsCleanUpThreshold { if err = s.deleteOldEndpointResults(tx, endpointID); err != nil { - log.Printf("[sql.Insert] Failed to delete old results for group=%s; endpoint=%s: %s", ep.Group, ep.Name, err.Error()) + log.Printf("[sql.Insert] Failed to delete old results for endpoint with key=%s: %s", ep.Key(), err.Error()) } } } // Finally, we need to insert the uptime data. 
// Because the uptime data significantly outlives the results, we can't rely on the results for determining the uptime if err = s.updateEndpointUptime(tx, endpointID, result); err != nil { - log.Printf("[sql.Insert] Failed to update uptime for group=%s; endpoint=%s: %s", ep.Group, ep.Name, err.Error()) + log.Printf("[sql.Insert] Failed to update uptime for endpoint with key=%s: %s", ep.Key(), err.Error()) } // Clean up old uptime entries ageOfOldestUptimeEntry, err := s.getAgeOfOldestEndpointUptimeEntry(tx, endpointID) if err != nil { - log.Printf("[sql.Insert] Failed to retrieve oldest endpoint uptime entry for group=%s; endpoint=%s: %s", ep.Group, ep.Name, err.Error()) + log.Printf("[sql.Insert] Failed to retrieve oldest endpoint uptime entry for endpoint with key=%s: %s", ep.Key(), err.Error()) } else { if ageOfOldestUptimeEntry > uptimeCleanUpThreshold { if err = s.deleteOldUptimeEntries(tx, endpointID, time.Now().Add(-(uptimeRetention + time.Hour))); err != nil { - log.Printf("[sql.Insert] Failed to delete old uptime entries for group=%s; endpoint=%s: %s", ep.Group, ep.Name, err.Error()) + log.Printf("[sql.Insert] Failed to delete old uptime entries for endpoint with key=%s: %s", ep.Key(), err.Error()) } } } @@ -374,6 +375,8 @@ func (s *Store) DeleteAllEndpointStatusesNotInKeys(keys []string) int { } if s.writeThroughCache != nil { // It's easier to just wipe out the entire cache than to try to find all keys that are not in the keys list + // This only happens on start and during tests, so it's fine for us to just clear the cache without worrying + // about performance _ = s.writeThroughCache.DeleteKeysByPattern("*") } // Return number of rows deleted @@ -381,6 +384,111 @@ func (s *Store) DeleteAllEndpointStatusesNotInKeys(keys []string) int { return int(rowsAffects) } +// GetTriggeredEndpointAlert returns whether the triggered alert for the specified endpoint as well as the necessary information to resolve it +func (s *Store) GetTriggeredEndpointAlert(ep *endpoint.Endpoint, alert *alert.Alert) (exists bool, resolveKey string, numberOfSuccessesInARow int, err error) { + //log.Printf("[sql.GetTriggeredEndpointAlert] Getting triggered alert with checksum=%s for endpoint with key=%s", alert.Checksum(), ep.Key()) + err = s.db.QueryRow( + "SELECT resolve_key, number_of_successes_in_a_row FROM endpoint_alerts_triggered WHERE endpoint_id = (SELECT endpoint_id FROM endpoints WHERE endpoint_key = $1 LIMIT 1) AND configuration_checksum = $2", + ep.Key(), + alert.Checksum(), + ).Scan(&resolveKey, &numberOfSuccessesInARow) + if err != nil { + if errors.Is(err, sql.ErrNoRows) { + return false, "", 0, nil + } + return false, "", 0, err + } + return true, resolveKey, numberOfSuccessesInARow, nil +} + +// UpsertTriggeredEndpointAlert inserts/updates a triggered alert for an endpoint +// Used for persistence of triggered alerts across application restarts +func (s *Store) UpsertTriggeredEndpointAlert(ep *endpoint.Endpoint, triggeredAlert *alert.Alert) error { + //log.Printf("[sql.UpsertTriggeredEndpointAlert] Upserting triggered alert with checksum=%s for endpoint with key=%s", triggeredAlert.Checksum(), ep.Key()) + tx, err := s.db.Begin() + if err != nil { + return err + } + endpointID, err := s.getEndpointID(tx, ep) + if err != nil { + if errors.Is(err, common.ErrEndpointNotFound) { + // Endpoint doesn't exist in the database, insert it + // This shouldn't happen, but we'll handle it anyway + if endpointID, err = s.insertEndpoint(tx, ep); err != nil { + _ = tx.Rollback() + 
log.Printf("[sql.UpsertTriggeredEndpointAlert] Failed to create endpoint with key=%s: %s", ep.Key(), err.Error()) + return err + } + } else { + _ = tx.Rollback() + log.Printf("[sql.UpsertTriggeredEndpointAlert] Failed to retrieve id of endpoint with key=%s: %s", ep.Key(), err.Error()) + return err + } + } + _, err = tx.Exec( + ` + INSERT INTO endpoint_alerts_triggered (endpoint_id, configuration_checksum, resolve_key, number_of_successes_in_a_row) + VALUES ($1, $2, $3, $4) + ON CONFLICT(endpoint_id, configuration_checksum) DO UPDATE SET + resolve_key = $3, + number_of_successes_in_a_row = $4 + `, + endpointID, + triggeredAlert.Checksum(), + triggeredAlert.ResolveKey, + ep.NumberOfSuccessesInARow, // We only persist NumberOfSuccessesInARow, because all alerts in this table are already triggered + ) + if err != nil { + _ = tx.Rollback() + log.Printf("[sql.UpsertTriggeredEndpointAlert] Failed to persist triggered alert for endpoint with key=%s: %s", ep.Key(), err.Error()) + return err + } + if err = tx.Commit(); err != nil { + _ = tx.Rollback() + } + return nil +} + +// DeleteTriggeredEndpointAlert deletes a triggered alert for an endpoint +func (s *Store) DeleteTriggeredEndpointAlert(ep *endpoint.Endpoint, triggeredAlert *alert.Alert) error { + //log.Printf("[sql.DeleteTriggeredEndpointAlert] Deleting triggered alert with checksum=%s for endpoint with key=%s", triggeredAlert.Checksum(), ep.Key()) + _, err := s.db.Exec("DELETE FROM endpoint_alerts_triggered WHERE configuration_checksum = $1 AND endpoint_id = (SELECT endpoint_id FROM endpoints WHERE endpoint_key = $2 LIMIT 1)", triggeredAlert.Checksum(), ep.Key()) + return err +} + +// DeleteAllTriggeredAlertsNotInChecksumsByEndpoint removes all triggered alerts owned by an endpoint whose alert +// configurations are not provided in the checksums list. +// This prevents triggered alerts that have been removed or modified from lingering in the database. +func (s *Store) DeleteAllTriggeredAlertsNotInChecksumsByEndpoint(ep *endpoint.Endpoint, checksums []string) int { + //log.Printf("[sql.DeleteAllTriggeredAlertsNotInChecksumsByEndpoint] Deleting triggered alerts for endpoint with key=%s that do not belong to any of checksums=%v", ep.Key(), checksums) + var err error + var result sql.Result + if len(checksums) == 0 { + // No checksums? Then it means there are no (enabled) alerts configured for that endpoint, so we can get rid of all + // persisted triggered alerts for that endpoint + result, err = s.db.Exec("DELETE FROM endpoint_alerts_triggered WHERE endpoint_id = (SELECT endpoint_id FROM endpoints WHERE endpoint_key = $1 LIMIT 1)", ep.Key()) + } else { + args := make([]interface{}, 0, len(checksums)+1) + args = append(args, ep.Key()) + query := `DELETE FROM endpoint_alerts_triggered + WHERE endpoint_id = (SELECT endpoint_id FROM endpoints WHERE endpoint_key = $1 LIMIT 1) + AND configuration_checksum NOT IN (` + for i := range checksums { + query += fmt.Sprintf("$%d,", i+2) + args = append(args, checksums[i]) + } + query = query[:len(query)-1] + ")" // Remove the last comma and add the closing parenthesis + result, err = s.db.Exec(query, args...) 
+ } + if err != nil { + log.Printf("[sql.DeleteAllTriggeredAlertsNotInChecksumsByEndpoint] Failed to delete rows for endpoint with key=%s that do not belong to any of checksums=%v: %s", ep.Key(), checksums, err.Error()) + return 0 + } + // Return number of rows deleted + rowsAffects, _ := result.RowsAffected() + return int(rowsAffects) +} + // Clear deletes everything from the store func (s *Store) Clear() { _, _ = s.db.Exec("DELETE FROM endpoints") diff --git a/storage/store/sql/sql_test.go b/storage/store/sql/sql_test.go index da611a4a..b00eee28 100644 --- a/storage/store/sql/sql_test.go +++ b/storage/store/sql/sql_test.go @@ -1,9 +1,11 @@ package sql import ( + "errors" "testing" "time" + "github.com/TwiN/gatus/v5/alerting/alert" "github.com/TwiN/gatus/v5/config/endpoint" "github.com/TwiN/gatus/v5/storage/store/common" "github.com/TwiN/gatus/v5/storage/store/common/paging" @@ -81,13 +83,13 @@ var ( ) func TestNewStore(t *testing.T) { - if _, err := NewStore("", "TestNewStore.db", false); err != ErrDatabaseDriverNotSpecified { + if _, err := NewStore("", t.TempDir()+"/TestNewStore.db", false); !errors.Is(err, ErrDatabaseDriverNotSpecified) { t.Error("expected error due to blank driver parameter") } - if _, err := NewStore("sqlite", "", false); err != ErrPathNotSpecified { + if _, err := NewStore("sqlite", "", false); !errors.Is(err, ErrPathNotSpecified) { t.Error("expected error due to blank path parameter") } - if store, err := NewStore("sqlite", t.TempDir()+"/TestNewStore.db", false); err != nil { + if store, err := NewStore("sqlite", t.TempDir()+"/TestNewStore.db", true); err != nil { t.Error("shouldn't have returned any error, got", err.Error()) } else { _ = store.db.Close() @@ -168,6 +170,40 @@ func TestStore_InsertCleansUpEventsAndResultsProperly(t *testing.T) { store.Clear() } +func TestStore_InsertWithCaching(t *testing.T) { + store, _ := NewStore("sqlite", t.TempDir()+"/TestStore_InsertWithCaching.db", true) + defer store.Close() + // Add 2 results + store.Insert(&testEndpoint, &testSuccessfulResult) + store.Insert(&testEndpoint, &testSuccessfulResult) + // Verify that they exist + endpointStatuses, _ := store.GetAllEndpointStatuses(paging.NewEndpointStatusParams().WithResults(1, 20)) + if numberOfEndpointStatuses := len(endpointStatuses); numberOfEndpointStatuses != 1 { + t.Fatalf("expected 1 EndpointStatus, got %d", numberOfEndpointStatuses) + } + if len(endpointStatuses[0].Results) != 2 { + t.Fatalf("expected 2 results, got %d", len(endpointStatuses[0].Results)) + } + // Add 2 more results + store.Insert(&testEndpoint, &testUnsuccessfulResult) + store.Insert(&testEndpoint, &testUnsuccessfulResult) + // Verify that they exist + endpointStatuses, _ = store.GetAllEndpointStatuses(paging.NewEndpointStatusParams().WithResults(1, 20)) + if numberOfEndpointStatuses := len(endpointStatuses); numberOfEndpointStatuses != 1 { + t.Fatalf("expected 1 EndpointStatus, got %d", numberOfEndpointStatuses) + } + if len(endpointStatuses[0].Results) != 4 { + t.Fatalf("expected 4 results, got %d", len(endpointStatuses[0].Results)) + } + // Clear the store, which should also clear the cache + store.Clear() + // Verify that they no longer exist + endpointStatuses, _ = store.GetAllEndpointStatuses(paging.NewEndpointStatusParams().WithResults(1, 20)) + if numberOfEndpointStatuses := len(endpointStatuses); numberOfEndpointStatuses != 0 { + t.Fatalf("expected 0 EndpointStatus, got %d", numberOfEndpointStatuses) + } +} + func TestStore_Persistence(t *testing.T) { path := t.TempDir() + 
"/TestStore_Persistence.db" store, _ := NewStore("sqlite", path, false) @@ -368,10 +404,10 @@ func TestStore_NoRows(t *testing.T) { defer store.Close() tx, _ := store.db.Begin() defer tx.Rollback() - if _, err := store.getLastEndpointResultSuccessValue(tx, 1); err != errNoRowsReturned { + if _, err := store.getLastEndpointResultSuccessValue(tx, 1); !errors.Is(err, errNoRowsReturned) { t.Errorf("should've %v, got %v", errNoRowsReturned, err) } - if _, err := store.getAgeOfOldestEndpointUptimeEntry(tx, 1); err != errNoRowsReturned { + if _, err := store.getAgeOfOldestEndpointUptimeEntry(tx, 1); !errors.Is(err, errNoRowsReturned) { t.Errorf("should've %v, got %v", errNoRowsReturned, err) } } @@ -564,3 +600,131 @@ func TestCacheKey(t *testing.T) { }) } } + +func TestTriggeredEndpointAlertsPersistence(t *testing.T) { + store, _ := NewStore("sqlite", t.TempDir()+"/TestTriggeredEndpointAlertsPersistence.db", false) + defer store.Close() + yes, desc := false, "description" + ep := testEndpoint + ep.NumberOfSuccessesInARow = 0 + alrt := &alert.Alert{ + Type: alert.TypePagerDuty, + Enabled: &yes, + FailureThreshold: 4, + SuccessThreshold: 2, + Description: &desc, + SendOnResolved: &yes, + Triggered: true, + ResolveKey: "1234567", + } + // Alert just triggered, so NumberOfSuccessesInARow is 0 + if err := store.UpsertTriggeredEndpointAlert(&ep, alrt); err != nil { + t.Fatal("expected no error, got", err.Error()) + } + exists, resolveKey, numberOfSuccessesInARow, err := store.GetTriggeredEndpointAlert(&ep, alrt) + if err != nil { + t.Fatal("expected no error, got", err.Error()) + } + if !exists { + t.Error("expected triggered alert to exist") + } + if resolveKey != alrt.ResolveKey { + t.Errorf("expected resolveKey %s, got %s", alrt.ResolveKey, resolveKey) + } + if numberOfSuccessesInARow != ep.NumberOfSuccessesInARow { + t.Errorf("expected persisted NumberOfSuccessesInARow to be %d, got %d", ep.NumberOfSuccessesInARow, numberOfSuccessesInARow) + } + // Endpoint just had a successful evaluation, so NumberOfSuccessesInARow is now 1 + ep.NumberOfSuccessesInARow++ + if err := store.UpsertTriggeredEndpointAlert(&ep, alrt); err != nil { + t.Fatal("expected no error, got", err.Error()) + } + exists, resolveKey, numberOfSuccessesInARow, err = store.GetTriggeredEndpointAlert(&ep, alrt) + if err != nil { + t.Error("expected no error, got", err.Error()) + } + if !exists { + t.Error("expected triggered alert to exist") + } + if resolveKey != alrt.ResolveKey { + t.Errorf("expected resolveKey %s, got %s", alrt.ResolveKey, resolveKey) + } + if numberOfSuccessesInARow != ep.NumberOfSuccessesInARow { + t.Errorf("expected persisted NumberOfSuccessesInARow to be %d, got %d", ep.NumberOfSuccessesInARow, numberOfSuccessesInARow) + } + // Simulate the endpoint having another successful evaluation, which means the alert is now resolved, + // and we should delete the triggered alert from the store + ep.NumberOfSuccessesInARow++ + if err := store.DeleteTriggeredEndpointAlert(&ep, alrt); err != nil { + t.Fatal("expected no error, got", err.Error()) + } + exists, _, _, err = store.GetTriggeredEndpointAlert(&ep, alrt) + if err != nil { + t.Error("expected no error, got", err.Error()) + } + if exists { + t.Error("expected triggered alert to no longer exist as it has been deleted") + } +} + +func TestStore_DeleteAllTriggeredAlertsNotInChecksumsByEndpoint(t *testing.T) { + store, _ := NewStore("sqlite", t.TempDir()+"/TestStore_DeleteAllTriggeredAlertsNotInChecksumsByEndpoint.db", false) + defer store.Close() + yes, desc := false, 
"description" + ep1 := testEndpoint + ep1.Name = "ep1" + ep2 := testEndpoint + ep2.Name = "ep2" + alert1 := alert.Alert{ + Type: alert.TypePagerDuty, + Enabled: &yes, + FailureThreshold: 4, + SuccessThreshold: 2, + Description: &desc, + SendOnResolved: &yes, + Triggered: true, + ResolveKey: "1234567", + } + alert2 := alert1 + alert2.Type, alert2.ResolveKey = alert.TypeSlack, "" + alert3 := alert2 + if err := store.UpsertTriggeredEndpointAlert(&ep1, &alert1); err != nil { + t.Fatal("expected no error, got", err.Error()) + } + if err := store.UpsertTriggeredEndpointAlert(&ep1, &alert2); err != nil { + t.Fatal("expected no error, got", err.Error()) + } + if err := store.UpsertTriggeredEndpointAlert(&ep2, &alert3); err != nil { + t.Fatal("expected no error, got", err.Error()) + } + if exists, _, _, _ := store.GetTriggeredEndpointAlert(&ep1, &alert1); !exists { + t.Error("expected alert1 to have been deleted") + } + if exists, _, _, _ := store.GetTriggeredEndpointAlert(&ep1, &alert2); !exists { + t.Error("expected alert2 to exist for ep1") + } + if exists, _, _, _ := store.GetTriggeredEndpointAlert(&ep2, &alert3); !exists { + t.Error("expected alert3 to exist for ep2") + } + // Now we simulate the alert configuration being updated, and the alert being resolved + if deleted := store.DeleteAllTriggeredAlertsNotInChecksumsByEndpoint(&ep1, []string{alert2.Checksum()}); deleted != 1 { + t.Errorf("expected 1 triggered alert to be deleted, got %d", deleted) + } + if exists, _, _, _ := store.GetTriggeredEndpointAlert(&ep1, &alert1); exists { + t.Error("expected alert1 to have been deleted") + } + if exists, _, _, _ := store.GetTriggeredEndpointAlert(&ep1, &alert2); !exists { + t.Error("expected alert2 to exist for ep1") + } + if exists, _, _, _ := store.GetTriggeredEndpointAlert(&ep2, &alert3); !exists { + t.Error("expected alert3 to exist for ep2") + } + // Now let's just assume all alerts for ep1 were removed + if deleted := store.DeleteAllTriggeredAlertsNotInChecksumsByEndpoint(&ep1, []string{}); deleted != 1 { + t.Errorf("expected 1 triggered alert to be deleted, got %d", deleted) + } + // Make sure the alert for ep2 still exists + if exists, _, _, _ := store.GetTriggeredEndpointAlert(&ep2, &alert3); !exists { + t.Error("expected alert3 to exist for ep2") + } +} diff --git a/storage/store/store.go b/storage/store/store.go index 4dea6dcf..5cb20e36 100644 --- a/storage/store/store.go +++ b/storage/store/store.go @@ -5,6 +5,7 @@ import ( "log" "time" + "github.com/TwiN/gatus/v5/alerting/alert" "github.com/TwiN/gatus/v5/config/endpoint" "github.com/TwiN/gatus/v5/storage" "github.com/TwiN/gatus/v5/storage/store/common/paging" @@ -41,6 +42,21 @@ type Store interface { // Used to delete endpoints that have been persisted but are no longer part of the configured endpoints DeleteAllEndpointStatusesNotInKeys(keys []string) int + // GetTriggeredEndpointAlert returns whether the triggered alert for the specified endpoint as well as the necessary information to resolve it + GetTriggeredEndpointAlert(ep *endpoint.Endpoint, alert *alert.Alert) (exists bool, resolveKey string, numberOfSuccessesInARow int, err error) + + // UpsertTriggeredEndpointAlert inserts/updates a triggered alert for an endpoint + // Used for persistence of triggered alerts across application restarts + UpsertTriggeredEndpointAlert(ep *endpoint.Endpoint, triggeredAlert *alert.Alert) error + + // DeleteTriggeredEndpointAlert deletes a triggered alert for an endpoint + DeleteTriggeredEndpointAlert(ep *endpoint.Endpoint, triggeredAlert 
*alert.Alert) error + + // DeleteAllTriggeredAlertsNotInChecksumsByEndpoint removes all triggered alerts owned by an endpoint whose alert + // configurations are not provided in the checksums list. + // This prevents triggered alerts that have been removed or modified from lingering in the database. + DeleteAllTriggeredAlertsNotInChecksumsByEndpoint(ep *endpoint.Endpoint, checksums []string) int + // Clear deletes everything from the store Clear() diff --git a/watchdog/alerting.go b/watchdog/alerting.go index 09e73a40..e02d1086 100644 --- a/watchdog/alerting.go +++ b/watchdog/alerting.go @@ -7,6 +7,7 @@ import ( "github.com/TwiN/gatus/v5/alerting" "github.com/TwiN/gatus/v5/config/endpoint" + "github.com/TwiN/gatus/v5/storage/store" ) // HandleAlerting takes care of alerts to resolve and alerts to trigger based on result success or failure @@ -50,9 +51,12 @@ func handleAlertsToTrigger(ep *endpoint.Endpoint, result *endpoint.Result, alert log.Printf("[watchdog.handleAlertsToTrigger] Failed to send an alert for endpoint=%s: %s", ep.Name, err.Error()) } else { endpointAlert.Triggered = true + if err := store.Get().UpsertTriggeredEndpointAlert(ep, endpointAlert); err != nil { + log.Printf("[watchdog.handleAlertsToTrigger] Failed to persist triggered endpoint alert for endpoint with key=%s: %s", ep.Key(), err.Error()) + } } } else { - log.Printf("[watchdog.handleAlertsToResolve] Not sending alert of type=%s despite being TRIGGERED, because the provider wasn't configured properly", endpointAlert.Type) + log.Printf("[watchdog.handleAlertsToTrigger] Not sending alert of type=%s despite being TRIGGERED, because the provider wasn't configured properly", endpointAlert.Type) } } } @@ -60,21 +64,31 @@ func handleAlertsToTrigger(ep *endpoint.Endpoint, result *endpoint.Result, alert func handleAlertsToResolve(ep *endpoint.Endpoint, result *endpoint.Result, alertingConfig *alerting.Config, debug bool) { ep.NumberOfSuccessesInARow++ for _, endpointAlert := range ep.Alerts { - if !endpointAlert.IsEnabled() || !endpointAlert.Triggered || endpointAlert.SuccessThreshold > ep.NumberOfSuccessesInARow { + isStillBelowSuccessThreshold := endpointAlert.SuccessThreshold > ep.NumberOfSuccessesInARow + if isStillBelowSuccessThreshold && endpointAlert.IsEnabled() { + // Persist NumberOfSuccessesInARow + if err := store.Get().UpsertTriggeredEndpointAlert(ep, endpointAlert); err != nil { + log.Printf("[watchdog.handleAlertsToResolve] Failed to update triggered endpoint alert for endpoint with key=%s: %s", ep.Key(), err.Error()) + } + } + if !endpointAlert.IsEnabled() || !endpointAlert.Triggered || isStillBelowSuccessThreshold { continue } // Even if the alert provider returns an error, we still set the alert's Triggered variable to false. // Further explanation can be found on Alert's Triggered field. 
endpointAlert.Triggered = false + if err := store.Get().DeleteTriggeredEndpointAlert(ep, endpointAlert); err != nil { + log.Printf("[watchdog.handleAlertsToResolve] Failed to delete persisted triggered endpoint alert for endpoint with key=%s: %s", ep.Key(), err.Error()) + } if !endpointAlert.IsSendingOnResolved() { continue } alertProvider := alertingConfig.GetAlertingProviderByAlertType(endpointAlert.Type) if alertProvider != nil { - log.Printf("[watchdog.handleAlertsToResolve] Sending %s alert because alert for endpoint=%s with description='%s' has been RESOLVED", endpointAlert.Type, ep.Name, endpointAlert.GetDescription()) + log.Printf("[watchdog.handleAlertsToResolve] Sending %s alert because alert for endpoint with key=%s with description='%s' has been RESOLVED", endpointAlert.Type, ep.Key(), endpointAlert.GetDescription()) err := alertProvider.Send(ep, endpointAlert, result, true) if err != nil { - log.Printf("[watchdog.handleAlertsToResolve] Failed to send an alert for endpoint=%s: %s", ep.Name, err.Error()) + log.Printf("[watchdog.handleAlertsToResolve] Failed to send an alert for endpoint with key=%s: %s", ep.Key(), err.Error()) } } else { log.Printf("[watchdog.handleAlertsToResolve] Not sending alert of type=%s despite being RESOLVED, because the provider wasn't configured properly", endpointAlert.Type)
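
Illustrative note (not part of the patch): the persistence added above keys each triggered alert by Alert.Checksum(), so changing any field of an alert's configuration invalidates the persisted entry and DeleteAllTriggeredAlertsNotInChecksumsByEndpoint removes it on the next startup. The standalone sketch below mirrors the hashing logic of the Checksum() method introduced in alerting/alert/alert.go; the alert values used are hypothetical and only serve to show that a changed failure-threshold yields a different checksum.

package main

import (
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"strconv"
)

// checksum mirrors Alert.Checksum() from this patch: type, enabled flag,
// send-on-resolved flag, thresholds and description are hashed together.
func checksum(alertType string, enabled, sendOnResolved bool, successThreshold, failureThreshold int, description string) string {
	hash := sha256.New()
	hash.Write([]byte(alertType + "_" +
		strconv.FormatBool(enabled) + "_" +
		strconv.FormatBool(sendOnResolved) + "_" +
		strconv.Itoa(successThreshold) + "_" +
		strconv.Itoa(failureThreshold) + "_" +
		description))
	return hex.EncodeToString(hash.Sum(nil))
}

func main() {
	before := checksum("slack", true, false, 2, 3, "healthcheck failed")
	after := checksum("slack", true, false, 2, 5, "healthcheck failed") // failure-threshold changed from 3 to 5
	// Different checksums mean a previously persisted triggered alert for the old
	// configuration would be dropped by the cleanup that runs in initializeStorage.
	fmt.Println(before == after) // false
}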