diff --git a/.github/assets/slack-alerts.png b/.github/assets/slack-alerts.png new file mode 100644 index 00000000..3f9e3cad Binary files /dev/null and b/.github/assets/slack-alerts.png differ diff --git a/README.md b/README.md index c2798003..d99f0d02 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ core applications: https://status.twinnation.org/ - [Sending a GraphQL request](#sending-a-graphql-request) - [Configuring Slack alerts](#configuring-slack-alerts) - [Configuring Twilio alerts](#configuring-twilio-alerts) - - [Configuring custom alert](#configuring-custom-alerts) + - [Configuring custom alerts](#configuring-custom-alerts) ## Features @@ -67,38 +67,40 @@ This example would look like this: ![Simple example](.github/assets/example.png) -Note that you can also add environment variables in the your configuration file (i.e. `$DOMAIN`, `${DOMAIN}`) +Note that you can also add environment variables in the configuration file (i.e. `$DOMAIN`, `${DOMAIN}`) ### Configuration -| Parameter | Description | Default | -| --------------------------------- | --------------------------------------------------------------- | -------------- | -| `metrics` | Whether to expose metrics at /metrics | `false` | -| `services` | List of services to monitor | Required `[]` | -| `services[].name` | Name of the service. Can be anything. | Required `""` | -| `services[].url` | URL to send the request to | Required `""` | -| `services[].conditions` | Conditions used to determine the health of the service | `[]` | -| `services[].interval` | Duration to wait between every status check | `60s` | -| `services[].method` | Request method | `GET` | -| `services[].graphql` | Whether to wrap the body in a query param (`{"query":"$body"}`) | `false` | -| `services[].body` | Request body | `""` | -| `services[].headers` | Request headers | `{}` | -| `services[].alerts[].type` | Type of alert. Valid types: `slack`, `twilio`, `custom` | Required `""` | -| `services[].alerts[].enabled` | Whether to enable the alert | `false` | -| `services[].alerts[].threshold` | Number of failures in a row needed before triggering the alert | `3` | -| `services[].alerts[].description` | Description of the alert. Will be included in the alert sent | `""` | -| `alerting` | Configuration for alerting | `{}` | -| `alerting.slack` | Webhook to use for alerts of type `slack` | `""` | -| `alerting.twilio` | Settings for alerts of type `twilio` | `""` | -| `alerting.twilio.sid` | Twilio account SID | Required `""` | -| `alerting.twilio.token` | Twilio auth token | Required `""` | -| `alerting.twilio.from` | Number to send Twilio alerts from | Required `""` | -| `alerting.twilio.to` | Number to send twilio alerts to | Required `""` | -| `alerting.custom` | Configuration for custom actions on failure or alerts | `""` | -| `alerting.custom.url` | Custom alerting request url | `""` | -| `alerting.custom.body` | Custom alerting request body. | `""` | -| `alerting.custom.headers` | Custom alerting request headers | `{}` | +| Parameter | Description | Default | +| -------------------------------------- | --------------------------------------------------------------- | -------------- | +| `debug` | Whether to enable debug logs | `false` | +| `metrics` | Whether to expose metrics at /metrics | `false` | +| `services` | List of services to monitor | Required `[]` | +| `services[].name` | Name of the service. Can be anything. 
| Required `""` | +| `services[].url` | URL to send the request to | Required `""` | +| `services[].conditions` | Conditions used to determine the health of the service | `[]` | +| `services[].interval` | Duration to wait between every status check | `60s` | +| `services[].method` | Request method | `GET` | +| `services[].graphql` | Whether to wrap the body in a query param (`{"query":"$body"}`) | `false` | +| `services[].body` | Request body | `""` | +| `services[].headers` | Request headers | `{}` | +| `services[].alerts[].type` | Type of alert. Valid types: `slack`, `twilio`, `custom` | Required `""` | +| `services[].alerts[].enabled` | Whether to enable the alert | `false` | +| `services[].alerts[].threshold` | Number of failures in a row needed before triggering the alert | `3` | +| `services[].alerts[].description` | Description of the alert. Will be included in the alert sent | `""` | +| `services[].alerts[].send-on-resolved` | Whether to send a notification once a triggered alert subsides | `false` | +| `alerting` | Configuration for alerting | `{}` | +| `alerting.slack` | Webhook to use for alerts of type `slack` | `""` | +| `alerting.twilio` | Settings for alerts of type `twilio` | `""` | +| `alerting.twilio.sid` | Twilio account SID | Required `""` | +| `alerting.twilio.token` | Twilio auth token | Required `""` | +| `alerting.twilio.from` | Number to send Twilio alerts from | Required `""` | +| `alerting.twilio.to` | Number to send twilio alerts to | Required `""` | +| `alerting.custom` | Configuration for custom actions on failure or alerts | `""` | +| `alerting.custom.url` | Custom alerting request url | `""` | +| `alerting.custom.body` | Custom alerting request body. | `""` | +| `alerting.custom.headers` | Custom alerting request headers | `{}` | ### Conditions @@ -121,7 +123,7 @@ Here are some examples of conditions you can use: ## Docker -Building the Docker image is done as following: +Building the Docker image is done as follows: ``` docker build . -t gatus @@ -194,33 +196,37 @@ services: - type: slack enabled: true description: "healthcheck failed 3 times in a row" + send-on-resolved: true - type: slack enabled: true threshold: 5 description: "healthcheck failed 5 times in a row" + send-on-resolved: true conditions: - "[STATUS] == 200" - "[BODY].status == UP" - "[RESPONSE_TIME] < 300" ``` +Here's an example of what the notifications look like: + +![Slack notifications](.github/assets/slack-alerts.png) + + ### Configuring Twilio alerts ```yaml alerting: twilio: - sid: **** - token: **** - from: +1-234-567-8901 - to: +1-234-567-8901 + sid: "..." + token: "..." + from: "+1-234-567-8901" + to: "+1-234-567-8901" services: - name: twinnation interval: 30s url: "https://twinnation.org/health" alerts: - - type: twilio - enabled: true - description: "healthcheck failed 3 times in a row" - type: twilio enabled: true threshold: 5 @@ -242,7 +248,10 @@ would then check if the service that started failing was recently deployed, and roll it back. The values `[ALERT_DESCRIPTION]` and `[SERVICE_NAME]` are automatically substituted for the alert description and the -service name respectively in the body (`alerting.custom.body`) and the url (`alerting.custom.url`). +service name respectively in the body (`alerting.custom.body`) as well as the url (`alerting.custom.url`). + +If you have `send-on-resolved` set to `true`, you may want to use `[ALERT_TRIGGERED_OR_RESOLVED]` to differentiate +the notifications. It will be replaced for either `TRIGGERED` or `RESOLVED`, based on the situation. 
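As a purely illustrative aside (not part of this diff), the sketch below shows what that substitution amounts to. It reuses the body template from the custom alert example further down, and the service name and description values are simply borrowed from that same example:

```go
package main

import (
	"fmt"
	"strings"
)

func main() {
	// Same template as the custom alert example further down in this README diff.
	body := `{"text": "[ALERT_TRIGGERED_OR_RESOLVED]: [SERVICE_NAME] - [ALERT_DESCRIPTION]"}`
	for _, state := range []string{"TRIGGERED", "RESOLVED"} {
		rendered := strings.ReplaceAll(body, "[ALERT_TRIGGERED_OR_RESOLVED]", state)
		rendered = strings.ReplaceAll(rendered, "[SERVICE_NAME]", "twinnation")
		rendered = strings.ReplaceAll(rendered, "[ALERT_DESCRIPTION]", "healthcheck failed 10 times in a row")
		// First iteration prints the TRIGGERED payload, second prints the RESOLVED one.
		fmt.Println(rendered)
	}
}
```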
For all intents and purpose, we'll configure the custom alert with a Slack webhook, but you can call anything you want. @@ -253,7 +262,7 @@ alerting: method: "POST" body: | { - "text": "[SERVICE_NAME] - [ALERT_DESCRIPTION]" + "text": "[ALERT_TRIGGERED_OR_RESOLVED]: [SERVICE_NAME] - [ALERT_DESCRIPTION]" } services: - name: twinnation @@ -263,6 +272,7 @@ services: - type: custom enabled: true threshold: 10 + send-on-resolved: true description: "healthcheck failed 10 times in a row" conditions: - "[STATUS] == 200" diff --git a/config/config.go b/config/config.go index 23638975..2ba7fe9d 100644 --- a/config/config.go +++ b/config/config.go @@ -22,6 +22,7 @@ var ( type Config struct { Metrics bool `yaml:"metrics"` + Debug bool `yaml:"debug"` Alerting *core.AlertingConfig `yaml:"alerting"` Services []*core.Service `yaml:"services"` } diff --git a/core/alert.go b/core/alert.go index bc8a12db..ebded0a7 100644 --- a/core/alert.go +++ b/core/alert.go @@ -13,6 +13,9 @@ type Alert struct { // Description of the alert. Will be included in the alert sent. Description string `yaml:"description"` + + // SendOnResolved defines whether to send a second notification when the issue has been resolved + SendOnResolved bool `yaml:"send-on-resolved"` } type AlertType string diff --git a/core/alerting.go b/core/alerting.go index 9cd83b6a..06daa331 100644 --- a/core/alerting.go +++ b/core/alerting.go @@ -2,9 +2,11 @@ package core import ( "bytes" + "encoding/base64" "fmt" "github.com/TwinProduction/gatus/client" "net/http" + "net/url" "strings" ) @@ -21,6 +23,10 @@ type TwilioAlertProvider struct { To string `yaml:"to"` } +func (provider *TwilioAlertProvider) IsValid() bool { + return len(provider.Token) > 0 && len(provider.SID) > 0 && len(provider.From) > 0 && len(provider.To) > 0 +} + type CustomAlertProvider struct { Url string `yaml:"url"` Method string `yaml:"method,omitempty"` @@ -28,31 +34,49 @@ type CustomAlertProvider struct { Headers map[string]string `yaml:"headers,omitempty"` } -func (provider *CustomAlertProvider) buildRequest(serviceName, alertDescription string) *http.Request { +func (provider *CustomAlertProvider) IsValid() bool { + return len(provider.Url) > 0 +} + +func (provider *CustomAlertProvider) buildRequest(serviceName, alertDescription string, resolved bool) *http.Request { body := provider.Body - url := provider.Url - if strings.Contains(provider.Body, "[ALERT_DESCRIPTION]") { - body = strings.ReplaceAll(provider.Body, "[ALERT_DESCRIPTION]", alertDescription) + providerUrl := provider.Url + if strings.Contains(body, "[ALERT_DESCRIPTION]") { + body = strings.ReplaceAll(body, "[ALERT_DESCRIPTION]", alertDescription) } - if strings.Contains(provider.Body, "[SERVICE_NAME]") { - body = strings.ReplaceAll(provider.Body, "[SERVICE_NAME]", serviceName) + if strings.Contains(body, "[SERVICE_NAME]") { + body = strings.ReplaceAll(body, "[SERVICE_NAME]", serviceName) } - if strings.Contains(provider.Url, "[ALERT_DESCRIPTION]") { - url = strings.ReplaceAll(provider.Url, "[ALERT_DESCRIPTION]", alertDescription) + if strings.Contains(body, "[ALERT_TRIGGERED_OR_RESOLVED]") { + if resolved { + body = strings.ReplaceAll(body, "[ALERT_TRIGGERED_OR_RESOLVED]", "RESOLVED") + } else { + body = strings.ReplaceAll(body, "[ALERT_TRIGGERED_OR_RESOLVED]", "TRIGGERED") + } } - if strings.Contains(provider.Url, "[SERVICE_NAME]") { - url = strings.ReplaceAll(provider.Url, "[SERVICE_NAME]", serviceName) + if strings.Contains(providerUrl, "[ALERT_DESCRIPTION]") { + providerUrl = strings.ReplaceAll(providerUrl, 
"[ALERT_DESCRIPTION]", alertDescription) + } + if strings.Contains(providerUrl, "[SERVICE_NAME]") { + providerUrl = strings.ReplaceAll(providerUrl, "[SERVICE_NAME]", serviceName) + } + if strings.Contains(providerUrl, "[ALERT_TRIGGERED_OR_RESOLVED]") { + if resolved { + providerUrl = strings.ReplaceAll(providerUrl, "[ALERT_TRIGGERED_OR_RESOLVED]", "RESOLVED") + } else { + providerUrl = strings.ReplaceAll(providerUrl, "[ALERT_TRIGGERED_OR_RESOLVED]", "TRIGGERED") + } } bodyBuffer := bytes.NewBuffer([]byte(body)) - request, _ := http.NewRequest(provider.Method, url, bodyBuffer) + request, _ := http.NewRequest(provider.Method, providerUrl, bodyBuffer) for k, v := range provider.Headers { request.Header.Set(k, v) } return request } -func (provider *CustomAlertProvider) Send(serviceName, alertDescription string) error { - request := provider.buildRequest(serviceName, alertDescription) +func (provider *CustomAlertProvider) Send(serviceName, alertDescription string, resolved bool) error { + request := provider.buildRequest(serviceName, alertDescription, resolved) response, err := client.GetHttpClient().Do(request) if err != nil { return err @@ -62,3 +86,64 @@ func (provider *CustomAlertProvider) Send(serviceName, alertDescription string) } return nil } + +func CreateSlackCustomAlertProvider(slackWebHookUrl string, service *Service, alert *Alert, result *Result, resolved bool) *CustomAlertProvider { + var message string + var color string + if resolved { + message = fmt.Sprintf("An alert for *%s* has been resolved after %d failures in a row", service.Name, service.NumberOfFailuresInARow) + color = "#36A64F" + } else { + message = fmt.Sprintf("An alert for *%s* has been triggered", service.Name) + color = "#DD0000" + } + var results string + for _, conditionResult := range result.ConditionResults { + var prefix string + if conditionResult.Success { + prefix = ":heavy_check_mark:" + } else { + prefix = ":x:" + } + results += fmt.Sprintf("%s - `%s`\n", prefix, conditionResult.Condition) + } + return &CustomAlertProvider{ + Url: slackWebHookUrl, + Method: "POST", + Body: fmt.Sprintf(`{ + "text": "", + "attachments": [ + { + "title": ":helmet_with_white_cross: Gatus", + "text": "%s:\n> %s", + "short": false, + "color": "%s", + "fields": [ + { + "title": "Condition results", + "value": "%s", + "short": false + } + ] + }, + ] +}`, message, alert.Description, color, results), + Headers: map[string]string{"Content-Type": "application/json"}, + } +} + +func CreateTwilioCustomAlertProvider(provider *TwilioAlertProvider, message string) *CustomAlertProvider { + return &CustomAlertProvider{ + Url: fmt.Sprintf("https://api.twilio.com/2010-04-01/Accounts/%s/Messages.json", provider.SID), + Method: "POST", + Body: url.Values{ + "To": {provider.To}, + "From": {provider.From}, + "Body": {message}, + }.Encode(), + Headers: map[string]string{ + "Content-Type": "application/x-www-form-urlencoded", + "Authorization": fmt.Sprintf("Basic %s", base64.StdEncoding.EncodeToString([]byte(fmt.Sprintf("%s:%s", provider.SID, provider.Token)))), + }, + } +} diff --git a/core/service.go b/core/service.go index 1a5850c3..2df3b05e 100644 --- a/core/service.go +++ b/core/service.go @@ -46,7 +46,7 @@ type Service struct { // Alerts is the alerting configuration for the service in case of failure Alerts []*Alert `yaml:"alerts"` - numberOfFailuresInARow int + NumberOfFailuresInARow int } func (service *Service) Validate() { @@ -94,22 +94,16 @@ func (service *Service) EvaluateConditions() *Result { } } result.Timestamp = time.Now() - if 
result.Success { - service.numberOfFailuresInARow = 0 - // TODO: Send notification that alert has been resolved? - } else { - service.numberOfFailuresInARow++ - } return result } func (service *Service) GetAlertsTriggered() []Alert { var alerts []Alert - if service.numberOfFailuresInARow == 0 { + if service.NumberOfFailuresInARow == 0 { return alerts } for _, alert := range service.Alerts { - if alert.Enabled && alert.Threshold == service.numberOfFailuresInARow { + if alert.Enabled && alert.Threshold == service.NumberOfFailuresInARow { alerts = append(alerts, *alert) continue } diff --git a/go.sum b/go.sum index 69d7b4ec..61c6e3a2 100644 --- a/go.sum +++ b/go.sum @@ -18,6 +18,7 @@ github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/me github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.2 h1:6nsPYzhq5kReh6QImI3k5qWzO4PEbvbIW2cwSfR/6xs= github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= diff --git a/main.go b/main.go index 23d72080..ba8fe163 100644 --- a/main.go +++ b/main.go @@ -3,7 +3,6 @@ package main import ( "bytes" "compress/gzip" - "encoding/json" "github.com/TwinProduction/gatus/config" "github.com/TwinProduction/gatus/watchdog" "github.com/prometheus/client_golang/prometheus/promhttp" @@ -53,12 +52,11 @@ func serviceResultsHandler(writer http.ResponseWriter, r *http.Request) { if isExpired := cachedServiceResultsTimestamp.IsZero() || time.Now().Sub(cachedServiceResultsTimestamp) > CacheTTL; isExpired { buffer := &bytes.Buffer{} gzipWriter := gzip.NewWriter(buffer) - serviceResults := watchdog.GetServiceResults() - data, err := json.Marshal(serviceResults) + data, err := watchdog.GetJsonEncodedServiceResults() if err != nil { - log.Printf("[main][serviceResultsHandler] Unable to marshall object to JSON: %s", err.Error()) + log.Printf("[main][serviceResultsHandler] Unable to marshal object to JSON: %s", err.Error()) writer.WriteHeader(http.StatusInternalServerError) - _, _ = writer.Write([]byte("Unable to marshall object to JSON")) + _, _ = writer.Write([]byte("Unable to marshal object to JSON")) return } gzipWriter.Write(data) diff --git a/watchdog/watchdog.go b/watchdog/watchdog.go index 97707575..2b5bb45e 100644 --- a/watchdog/watchdog.go +++ b/watchdog/watchdog.go @@ -1,25 +1,34 @@ package watchdog import ( - "encoding/base64" + "encoding/json" "fmt" "github.com/TwinProduction/gatus/config" "github.com/TwinProduction/gatus/core" "github.com/TwinProduction/gatus/metric" "log" - "net/url" "sync" "time" ) var ( serviceResults = make(map[string][]*core.Result) - rwLock sync.RWMutex + + // serviceResultsMutex is used to prevent concurrent map access + serviceResultsMutex sync.RWMutex + + // monitoringMutex is used to prevent multiple services from being evaluated at the same time. + // Without this, conditions using response time may become inaccurate. 
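+	// The lock is held for the full duration of a check (request, metrics, and alerting), so checks run one at a time.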
+ monitoringMutex sync.Mutex ) -// GetServiceResults returns a list of the last 20 results for each services -func GetServiceResults() *map[string][]*core.Result { - return &serviceResults +// GetJsonEncodedServiceResults returns a list of the last 20 results for each services encoded using json.Marshal. +// The reason why the encoding is done here is because we use a mutex to prevent concurrent map access. +func GetJsonEncodedServiceResults() ([]byte, error) { + serviceResultsMutex.RLock() + data, err := json.Marshal(serviceResults) + serviceResultsMutex.RUnlock() + return data, err } // Monitor loops over each services and starts a goroutine to monitor each services separately @@ -33,71 +42,72 @@ func Monitor(cfg *config.Config) { // monitor monitors a single service in a loop func monitor(service *core.Service) { + cfg := config.Get() for { // By placing the lock here, we prevent multiple services from being monitored at the exact same time, which // could cause performance issues and return inaccurate results - rwLock.Lock() - log.Printf("[watchdog][monitor] Monitoring serviceName=%s", service.Name) + monitoringMutex.Lock() + if cfg.Debug { + log.Printf("[watchdog][monitor] Monitoring serviceName=%s", service.Name) + } result := service.EvaluateConditions() metric.PublishMetricsForService(service, result) + serviceResultsMutex.Lock() serviceResults[service.Name] = append(serviceResults[service.Name], result) if len(serviceResults[service.Name]) > 20 { serviceResults[service.Name] = serviceResults[service.Name][1:] } - rwLock.Unlock() + serviceResultsMutex.Unlock() var extra string if !result.Success { extra = fmt.Sprintf("responseBody=%s", result.Body) } log.Printf( - "[watchdog][monitor] Finished monitoring serviceName=%s; errors=%d; requestDuration=%s; %s", + "[watchdog][monitor] Monitored serviceName=%s; success=%v; errors=%d; requestDuration=%s; %s", service.Name, + result.Success, len(result.Errors), result.Duration.Round(time.Millisecond), extra, ) + handleAlerting(service, result) + if cfg.Debug { + log.Printf("[watchdog][monitor] Waiting for interval=%s before monitoring serviceName=%s again", service.Interval, service.Name) + } + monitoringMutex.Unlock() + time.Sleep(service.Interval) + } +} - cfg := config.Get() - if cfg.Alerting != nil { - for _, alertTriggered := range service.GetAlertsTriggered() { +func handleAlerting(service *core.Service, result *core.Result) { + cfg := config.Get() + if cfg.Alerting == nil { + return + } + if result.Success { + if service.NumberOfFailuresInARow > 0 { + for _, alert := range service.Alerts { + if !alert.Enabled || !alert.SendOnResolved || alert.Threshold > service.NumberOfFailuresInARow { + continue + } var alertProvider *core.CustomAlertProvider - if alertTriggered.Type == core.SlackAlert { + if alert.Type == core.SlackAlert { if len(cfg.Alerting.Slack) > 0 { - log.Printf("[watchdog][monitor] Sending Slack alert because alert with description=%s has been triggered", alertTriggered.Description) - alertProvider = &core.CustomAlertProvider{ - Url: cfg.Alerting.Slack, - Method: "POST", - Body: fmt.Sprintf(`{"text":"*[Gatus]*\n*service:* %s\n*description:* %s"}`, service.Name, alertTriggered.Description), - Headers: map[string]string{"Content-Type": "application/json"}, - } + log.Printf("[watchdog][handleAlerting] Sending Slack alert because alert with description=%s has been resolved", alert.Description) + alertProvider = core.CreateSlackCustomAlertProvider(cfg.Alerting.Slack, service, alert, result, true) } else { - 
log.Printf("[watchdog][monitor] Not sending Slack alert despite being triggered, because there is no Slack webhook configured") + log.Printf("[watchdog][handleAlerting] Not sending Slack alert despite being triggered, because there is no Slack webhook configured") } - } else if alertTriggered.Type == core.TwilioAlert { - if len(cfg.Alerting.Twilio.Token) > 0 && - len(cfg.Alerting.Twilio.SID) > 0 && - len(cfg.Alerting.Twilio.From) > 0 && - len(cfg.Alerting.Twilio.To) > 0 { - log.Printf("[watchdog][monitor] Sending Twilio alert because alert with description=%s has been triggered", alertTriggered.Description) - alertProvider = &core.CustomAlertProvider{ - Url: fmt.Sprintf("https://api.twilio.com/2010-04-01/Accounts/%s/Messages.json", cfg.Alerting.Twilio.SID), - Method: "POST", - Body: url.Values{ - "To": {cfg.Alerting.Twilio.To}, - "From": {cfg.Alerting.Twilio.From}, - "Body": {fmt.Sprintf("%s - %s", service.Name, alertTriggered.Description)}, - }.Encode(), - Headers: map[string]string{ - "Content-Type": "application/x-www-form-urlencoded", - "Authorization": fmt.Sprintf("Basic %s", base64.StdEncoding.EncodeToString([]byte(fmt.Sprintf("%s:%s", cfg.Alerting.Twilio.SID, cfg.Alerting.Twilio.Token)))), - }, - } + } else if alert.Type == core.TwilioAlert { + if cfg.Alerting.Twilio != nil && cfg.Alerting.Twilio.IsValid() { + log.Printf("[watchdog][handleAlerting] Sending Twilio alert because alert with description=%s has been resolved", alert.Description) + alertProvider = core.CreateTwilioCustomAlertProvider(cfg.Alerting.Twilio, fmt.Sprintf("RESOLVED: %s - %s", service.Name, alert.Description)) } else { - log.Printf("[watchdog][monitor] Not sending Twilio alert despite being triggered, because twilio config settings missing") + log.Printf("[watchdog][handleAlerting] Not sending Twilio alert despite being resolved, because Twilio isn't configured properly") } - } else if alertTriggered.Type == core.CustomAlert { - if cfg.Alerting.Custom != nil && len(cfg.Alerting.Custom.Url) > 0 { - log.Printf("[watchdog][monitor] Sending custom alert because alert with description=%s has been triggered", alertTriggered.Description) + } else if alert.Type == core.CustomAlert { + if cfg.Alerting.Custom != nil && cfg.Alerting.Custom.IsValid() { + log.Printf("[watchdog][handleAlerting] Sending custom alert because alert with description=%s has been resolved", alert.Description) alertProvider = &core.CustomAlertProvider{ Url: cfg.Alerting.Custom.Url, Method: cfg.Alerting.Custom.Method, @@ -105,19 +115,59 @@ func monitor(service *core.Service) { Headers: cfg.Alerting.Custom.Headers, } } else { - log.Printf("[watchdog][monitor] Not sending custom alert despite being triggered, because there is no custom url configured") + log.Printf("[watchdog][handleAlerting] Not sending custom alert despite being resolved, because the custom provider isn't configured properly") } } if alertProvider != nil { - err := alertProvider.Send(service.Name, alertTriggered.Description) + err := alertProvider.Send(service.Name, alert.Description, true) if err != nil { - log.Printf("[watchdog][monitor] Ran into error sending an alert: %s", err.Error()) + log.Printf("[watchdog][handleAlerting] Ran into error sending an alert: %s", err.Error()) } } } } - - log.Printf("[watchdog][monitor] Waiting for interval=%s before monitoring serviceName=%s", service.Interval, service.Name) - time.Sleep(service.Interval) + service.NumberOfFailuresInARow = 0 + } else { + service.NumberOfFailuresInARow++ + for _, alert := range service.Alerts { + // If the alert 
hasn't been triggered, move to the next one + if !alert.Enabled || alert.Threshold != service.NumberOfFailuresInARow { + continue + } + var alertProvider *core.CustomAlertProvider + if alert.Type == core.SlackAlert { + if len(cfg.Alerting.Slack) > 0 { + log.Printf("[watchdog][handleAlerting] Sending Slack alert because alert with description=%s has been triggered", alert.Description) + alertProvider = core.CreateSlackCustomAlertProvider(cfg.Alerting.Slack, service, alert, result, false) + } else { + log.Printf("[watchdog][handleAlerting] Not sending Slack alert despite being triggered, because there is no Slack webhook configured") + } + } else if alert.Type == core.TwilioAlert { + if cfg.Alerting.Twilio != nil && cfg.Alerting.Twilio.IsValid() { + log.Printf("[watchdog][handleAlerting] Sending Twilio alert because alert with description=%s has been triggered", alert.Description) + alertProvider = core.CreateTwilioCustomAlertProvider(cfg.Alerting.Twilio, fmt.Sprintf("TRIGGERED: %s - %s", service.Name, alert.Description)) + } else { + log.Printf("[watchdog][handleAlerting] Not sending Twilio alert despite being triggered, because Twilio config settings missing") + } + } else if alert.Type == core.CustomAlert { + if cfg.Alerting.Custom != nil && cfg.Alerting.Custom.IsValid() { + log.Printf("[watchdog][handleAlerting] Sending custom alert because alert with description=%s has been triggered", alert.Description) + alertProvider = &core.CustomAlertProvider{ + Url: cfg.Alerting.Custom.Url, + Method: cfg.Alerting.Custom.Method, + Body: cfg.Alerting.Custom.Body, + Headers: cfg.Alerting.Custom.Headers, + } + } else { + log.Printf("[watchdog][handleAlerting] Not sending custom alert despite being triggered, because there is no custom url configured") + } + } + if alertProvider != nil { + err := alertProvider.Send(service.Name, alert.Description, false) + if err != nil { + log.Printf("[watchdog][handleAlerting] Ran into error sending an alert: %s", err.Error()) + } + } + } } }
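For reference, a rough, self-contained sketch of the trigger/resolve decision that `handleAlerting` implements in this diff; the `shouldNotify` helper below is hypothetical and only condenses the two conditions used there:

```go
package main

import "fmt"

// Alert mirrors the fields handleAlerting relies on in this diff.
type Alert struct {
	Enabled        bool
	Threshold      int
	SendOnResolved bool
}

// shouldNotify condenses the two branches of handleAlerting:
// on failure, an alert fires exactly when the failure streak reaches its threshold;
// on recovery, a resolved notification goes out only if send-on-resolved is enabled
// and the streak had reached the threshold (i.e. the alert had actually triggered).
func shouldNotify(alert Alert, failuresInARow int, resolved bool) bool {
	if !alert.Enabled {
		return false
	}
	if resolved {
		return alert.SendOnResolved && alert.Threshold <= failuresInARow
	}
	return alert.Threshold == failuresInARow
}

func main() {
	alert := Alert{Enabled: true, Threshold: 3, SendOnResolved: true}
	fmt.Println(shouldNotify(alert, 3, false)) // true: third consecutive failure triggers the alert
	fmt.Println(shouldNotify(alert, 2, false)) // false: threshold not reached yet
	fmt.Println(shouldNotify(alert, 5, true))  // true: service recovered after the alert had triggered
}
```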