Mirror of https://github.com/TwiN/gatus.git (synced 2025-02-16 10:20:00 +01:00)

Commit 75b7a41c9d: Add support for PagerDuty
Parent: cf6a74f862

README.md (295 lines changed)
@@ -17,14 +17,16 @@ core applications: https://status.twinnation.org/

- [Usage](#usage)
- [Configuration](#configuration)
- [Conditions](#conditions)
- [Alerting](#alerting)
- [Configuring Slack alerts](#configuring-slack-alerts)
- [Configuring PagerDuty alerts](#configuring-pagerduty-alerts)
- [Configuring Twilio alerts](#configuring-twilio-alerts)
- [Configuring custom alerts](#configuring-custom-alerts)
- [Docker](#docker)
- [Running the tests](#running-the-tests)
- [Using in Production](#using-in-production)
- [FAQ](#faq)
- [Sending a GraphQL request](#sending-a-graphql-request)

## Features
@@ -72,35 +74,36 @@ Note that you can also add environment variables in the configuration file (i.e.

### Configuration

| Parameter                                     | Description                                                                   | Default        |
| --------------------------------------------- | ----------------------------------------------------------------------------- | -------------- |
| `debug`                                       | Whether to enable debug logs                                                  | `false`        |
| `metrics`                                     | Whether to expose metrics at /metrics                                         | `false`        |
| `services`                                    | List of services to monitor                                                   | Required `[]`  |
| `services[].name`                             | Name of the service. Can be anything.                                         | Required `""`  |
| `services[].url`                              | URL to send the request to                                                    | Required `""`  |
| `services[].conditions`                       | Conditions used to determine the health of the service                        | `[]`           |
| `services[].interval`                         | Duration to wait between every status check                                   | `60s`          |
| `services[].method`                           | Request method                                                                | `GET`          |
| `services[].graphql`                          | Whether to wrap the body in a query param (`{"query":"$body"}`)               | `false`        |
| `services[].body`                             | Request body                                                                  | `""`           |
| `services[].headers`                          | Request headers                                                               | `{}`           |
| `services[].alerts[].type`                    | Type of alert. Valid types: `slack`, `pagerduty`, `twilio`, `custom`          | Required `""`  |
| `services[].alerts[].enabled`                 | Whether to enable the alert                                                   | `false`        |
| `services[].alerts[].threshold`               | Number of failures in a row needed before triggering the alert                | `3`            |
| `services[].alerts[].description`             | Description of the alert. Will be included in the alert sent                  | `""`           |
| `services[].alerts[].send-on-resolved`        | Whether to send a notification once a triggered alert subsides                | `false`        |
| `services[].alerts[].success-before-resolved` | Number of successes in a row needed before sending a resolved notification    | `2`            |
| `alerting`                                    | Configuration for alerting                                                    | `{}`           |
| `alerting.slack`                              | Webhook to use for alerts of type `slack`                                     | `""`           |
| `alerting.pagerduty`                          | PagerDuty Events API v2 integration key, used for alerts of type `pagerduty`  | `""`           |
| `alerting.twilio`                             | Settings for alerts of type `twilio`                                          | `""`           |
| `alerting.twilio.sid`                         | Twilio account SID                                                            | Required `""`  |
| `alerting.twilio.token`                       | Twilio auth token                                                             | Required `""`  |
| `alerting.twilio.from`                        | Number to send Twilio alerts from                                             | Required `""`  |
| `alerting.twilio.to`                          | Number to send Twilio alerts to                                               | Required `""`  |
| `alerting.custom`                             | Configuration for custom actions on failure or alerts                         | `""`           |
| `alerting.custom.url`                         | Custom alerting request URL                                                   | `""`           |
| `alerting.custom.body`                        | Custom alerting request body                                                  | `""`           |
| `alerting.custom.headers`                     | Custom alerting request headers                                               | `{}`           |
### Conditions

@@ -121,6 +124,136 @@ Here are some examples of conditions you can use:

| `len([BODY].name) == 8` | String at jsonpath `$.name` has a length of 8 | `{"name":"john.doe"}` | `{"name":"bob"}` |


### Alerting


#### Configuring Slack alerts
```yaml
alerting:
  slack: "https://hooks.slack.com/services/**********/**********/**********"
services:
  - name: twinnation
    interval: 30s
    url: "https://twinnation.org/health"
    alerts:
      - type: slack
        enabled: true
        description: "healthcheck failed 3 times in a row"
        send-on-resolved: true
      - type: slack
        enabled: true
        threshold: 5
        description: "healthcheck failed 5 times in a row"
        send-on-resolved: true
    conditions:
      - "[STATUS] == 200"
      - "[BODY].status == UP"
      - "[RESPONSE_TIME] < 300"
```

Here's an example of what the notifications look like:

![Slack notifications](.github/assets/slack-alerts.png)

#### Configuring PagerDuty alerts

It is highly recommended to set `services[].alerts[].send-on-resolved` to `true` for alerts
of type `pagerduty`: unlike other alert types, the resolved notification does not create
another incident, but instead marks the existing incident as resolved on PagerDuty. The
`pagerduty` value under `alerting` is the 32-character integration key of your PagerDuty
service, which is used as the Events API v2 routing key.

```yaml
alerting:
  pagerduty: "********************************"
services:
  - name: twinnation
    interval: 30s
    url: "https://twinnation.org/health"
    alerts:
      - type: pagerduty
        enabled: true
        threshold: 3
        description: "healthcheck failed 3 times in a row"
        send-on-resolved: true
        success-before-resolved: 5
    conditions:
      - "[STATUS] == 200"
      - "[BODY].status == UP"
      - "[RESPONSE_TIME] < 300"
```

#### Configuring Twilio alerts

```yaml
alerting:
  twilio:
    sid: "..."
    token: "..."
    from: "+1-234-567-8901"
    to: "+1-234-567-8901"
services:
  - name: twinnation
    interval: 30s
    url: "https://twinnation.org/health"
    alerts:
      - type: twilio
        enabled: true
        threshold: 5
        description: "healthcheck failed 5 times in a row"
    conditions:
      - "[STATUS] == 200"
      - "[BODY].status == UP"
      - "[RESPONSE_TIME] < 300"
```

#### Configuring custom alerts

While they're called alerts, you can use this feature to call anything.

For instance, you could automate rollbacks by having an application that keeps track of new deployments, and by
leveraging Gatus, you could have Gatus call that application's endpoint when a service starts failing. Your application
would then check whether the service that started failing was recently deployed and, if it was, automatically
roll it back.

The values `[ALERT_DESCRIPTION]` and `[SERVICE_NAME]` are automatically substituted with the alert description and the
service name, respectively, in both the body (`alerting.custom.body`) and the URL (`alerting.custom.url`).

If you have `send-on-resolved` set to `true`, you may want to use `[ALERT_TRIGGERED_OR_RESOLVED]` to differentiate
the notifications. It will be replaced with either `TRIGGERED` or `RESOLVED`, depending on the situation; the sketch
below illustrates the substitution.
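As a rough illustration of that substitution (the real logic lives in `CustomAlertProvider.buildRequest`, which is not part of this diff, and the helper name below is made up for the example):

```go
package main

import (
	"fmt"
	"strings"
)

// interpolate is a hypothetical helper that mimics the placeholder substitution described above.
func interpolate(template, serviceName, alertDescription string, resolved bool) string {
	triggeredOrResolved := "TRIGGERED"
	if resolved {
		triggeredOrResolved = "RESOLVED"
	}
	template = strings.ReplaceAll(template, "[SERVICE_NAME]", serviceName)
	template = strings.ReplaceAll(template, "[ALERT_DESCRIPTION]", alertDescription)
	template = strings.ReplaceAll(template, "[ALERT_TRIGGERED_OR_RESOLVED]", triggeredOrResolved)
	return template
}

func main() {
	body := `{"text": "[ALERT_TRIGGERED_OR_RESOLVED]: [SERVICE_NAME] - [ALERT_DESCRIPTION]"}`
	// Prints: {"text": "TRIGGERED: twinnation - healthcheck failed 10 times in a row"}
	fmt.Println(interpolate(body, "twinnation", "healthcheck failed 10 times in a row", false))
}
```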
For all intents and purposes, we'll configure the custom alert with a Slack webhook, but you can call anything you want.

```yaml
alerting:
  custom:
    url: "https://hooks.slack.com/services/**********/**********/**********"
    method: "POST"
    body: |
      {
        "text": "[ALERT_TRIGGERED_OR_RESOLVED]: [SERVICE_NAME] - [ALERT_DESCRIPTION]"
      }
services:
  - name: twinnation
    interval: 30s
    url: "https://twinnation.org/health"
    alerts:
      - type: custom
        enabled: true
        threshold: 10
        send-on-resolved: true
        description: "healthcheck failed 10 times in a row"
    conditions:
      - "[STATUS] == 200"
      - "[BODY].status == UP"
      - "[RESPONSE_TIME] < 300"
```

## Docker
@@ -173,101 +306,3 @@ will send a `POST` request to `http://localhost:8080/playground` with the follow

```json
{"query":" {\n user(gender: \"female\") {\n id\n name\n gender\n avatar\n }\n }"}
```
alerting/alerting.go (new file, 158 lines)
@@ -0,0 +1,158 @@

package alerting

import (
	"encoding/json"
	"fmt"
	"github.com/TwinProduction/gatus/config"
	"github.com/TwinProduction/gatus/core"
	"log"
)

// Handle takes care of alerts to resolve and alerts to trigger based on result success or failure
func Handle(service *core.Service, result *core.Result) {
	cfg := config.Get()
	if cfg.Alerting == nil {
		return
	}
	if result.Success {
		handleAlertsToResolve(service, result, cfg)
	} else {
		handleAlertsToTrigger(service, result, cfg)
	}
}

func handleAlertsToTrigger(service *core.Service, result *core.Result, cfg *config.Config) {
	service.NumberOfSuccessesInARow = 0
	service.NumberOfFailuresInARow++
	for _, alert := range service.Alerts {
		// Skip alerts that are not enabled or whose failure threshold hasn't been reached exactly
		if !alert.Enabled || alert.Threshold != service.NumberOfFailuresInARow {
			continue
		}
		if alert.Triggered {
			if cfg.Debug {
				log.Printf("[alerting][handleAlertsToTrigger] Alert with description='%s' has already been triggered, skipping", alert.Description)
			}
			continue
		}
		var alertProvider *core.CustomAlertProvider
		if alert.Type == core.SlackAlert {
			if len(cfg.Alerting.Slack) > 0 {
				log.Printf("[alerting][handleAlertsToTrigger] Sending Slack alert because alert with description='%s' has been triggered", alert.Description)
				alertProvider = core.CreateSlackCustomAlertProvider(cfg.Alerting.Slack, service, alert, result, false)
			} else {
				log.Printf("[alerting][handleAlertsToTrigger] Not sending Slack alert despite being triggered, because there is no Slack webhook configured")
			}
		} else if alert.Type == core.PagerDutyAlert {
			if len(cfg.Alerting.PagerDuty) > 0 {
				log.Printf("[alerting][handleAlertsToTrigger] Sending PagerDuty alert because alert with description='%s' has been triggered", alert.Description)
				alertProvider = core.CreatePagerDutyCustomAlertProvider(cfg.Alerting.PagerDuty, "trigger", "", service, fmt.Sprintf("TRIGGERED: %s - %s", service.Name, alert.Description))
			} else {
				log.Printf("[alerting][handleAlertsToTrigger] Not sending PagerDuty alert despite being triggered, because PagerDuty isn't configured properly")
			}
		} else if alert.Type == core.TwilioAlert {
			if cfg.Alerting.Twilio != nil && cfg.Alerting.Twilio.IsValid() {
				log.Printf("[alerting][handleAlertsToTrigger] Sending Twilio alert because alert with description='%s' has been triggered", alert.Description)
				alertProvider = core.CreateTwilioCustomAlertProvider(cfg.Alerting.Twilio, fmt.Sprintf("TRIGGERED: %s - %s", service.Name, alert.Description))
			} else {
				log.Printf("[alerting][handleAlertsToTrigger] Not sending Twilio alert despite being triggered, because Twilio config settings missing")
			}
		} else if alert.Type == core.CustomAlert {
			if cfg.Alerting.Custom != nil && cfg.Alerting.Custom.IsValid() {
				log.Printf("[alerting][handleAlertsToTrigger] Sending custom alert because alert with description='%s' has been triggered", alert.Description)
				alertProvider = &core.CustomAlertProvider{
					Url:     cfg.Alerting.Custom.Url,
					Method:  cfg.Alerting.Custom.Method,
					Body:    cfg.Alerting.Custom.Body,
					Headers: cfg.Alerting.Custom.Headers,
				}
			} else {
				log.Printf("[alerting][handleAlertsToTrigger] Not sending custom alert despite being triggered, because there is no custom url configured")
			}
		}
		if alertProvider != nil {
			// TODO: retry on error
			var err error
			// PagerDuty is the only provider whose response body must be read: its dedup_key is stored
			// in alert.ResolveKey so that the incident can be resolved later
			if alert.Type == core.PagerDutyAlert {
				var body []byte
				body, err = alertProvider.Send(service.Name, alert.Description, true)
				if err == nil {
					var response pagerDutyResponse
					err = json.Unmarshal(body, &response)
					if err != nil {
						log.Printf("[alerting][handleAlertsToTrigger] Ran into error unmarshaling pager duty response: %s", err.Error())
					} else {
						alert.ResolveKey = response.DedupKey
					}
				}
			} else {
				_, err = alertProvider.Send(service.Name, alert.Description, false)
			}
			if err != nil {
				log.Printf("[alerting][handleAlertsToTrigger] Ran into error sending an alert: %s", err.Error())
			} else {
				alert.Triggered = true
			}
		}
	}
}

func handleAlertsToResolve(service *core.Service, result *core.Result, cfg *config.Config) {
	service.NumberOfSuccessesInARow++
	for _, alert := range service.Alerts {
		// Skip alerts that aren't enabled, aren't currently triggered, or haven't passed enough times in a row yet
		if !alert.Enabled || !alert.Triggered || alert.SuccessBeforeResolved > service.NumberOfSuccessesInARow {
			continue
		}
		alert.Triggered = false
		if !alert.SendOnResolved {
			continue
		}
		var alertProvider *core.CustomAlertProvider
		if alert.Type == core.SlackAlert {
			if len(cfg.Alerting.Slack) > 0 {
				log.Printf("[alerting][handleAlertsToResolve] Sending Slack alert because alert with description='%s' has been resolved", alert.Description)
				alertProvider = core.CreateSlackCustomAlertProvider(cfg.Alerting.Slack, service, alert, result, true)
			} else {
				log.Printf("[alerting][handleAlertsToResolve] Not sending Slack alert despite being resolved, because there is no Slack webhook configured")
			}
		} else if alert.Type == core.PagerDutyAlert {
			if len(cfg.Alerting.PagerDuty) > 0 {
				log.Printf("[alerting][handleAlertsToResolve] Sending PagerDuty alert because alert with description='%s' has been resolved", alert.Description)
				alertProvider = core.CreatePagerDutyCustomAlertProvider(cfg.Alerting.PagerDuty, "resolve", alert.ResolveKey, service, fmt.Sprintf("RESOLVED: %s - %s", service.Name, alert.Description))
			} else {
				log.Printf("[alerting][handleAlertsToResolve] Not sending PagerDuty alert despite being resolved, because PagerDuty isn't configured properly")
			}
		} else if alert.Type == core.TwilioAlert {
			if cfg.Alerting.Twilio != nil && cfg.Alerting.Twilio.IsValid() {
				log.Printf("[alerting][handleAlertsToResolve] Sending Twilio alert because alert with description='%s' has been resolved", alert.Description)
				alertProvider = core.CreateTwilioCustomAlertProvider(cfg.Alerting.Twilio, fmt.Sprintf("RESOLVED: %s - %s", service.Name, alert.Description))
			} else {
				log.Printf("[alerting][handleAlertsToResolve] Not sending Twilio alert despite being resolved, because Twilio isn't configured properly")
			}
		} else if alert.Type == core.CustomAlert {
			if cfg.Alerting.Custom != nil && cfg.Alerting.Custom.IsValid() {
				log.Printf("[alerting][handleAlertsToResolve] Sending custom alert because alert with description='%s' has been resolved", alert.Description)
				alertProvider = &core.CustomAlertProvider{
					Url:     cfg.Alerting.Custom.Url,
					Method:  cfg.Alerting.Custom.Method,
					Body:    cfg.Alerting.Custom.Body,
					Headers: cfg.Alerting.Custom.Headers,
				}
			} else {
				log.Printf("[alerting][handleAlertsToResolve] Not sending custom alert despite being resolved, because the custom provider isn't configured properly")
			}
		}
		if alertProvider != nil {
			// TODO: retry on error
			_, err := alertProvider.Send(service.Name, alert.Description, true)
			if err != nil {
				log.Printf("[alerting][handleAlertsToResolve] Ran into error sending an alert: %s", err.Error())
			} else {
				if alert.Type == core.PagerDutyAlert {
					alert.ResolveKey = ""
				}
			}
		}
	}
	service.NumberOfFailuresInARow = 0
}
alerting/pagerduty.go (new file, 7 lines)
@@ -0,0 +1,7 @@

package alerting

// pagerDutyResponse maps the fields of interest from a PagerDuty Events API v2 response
type pagerDutyResponse struct {
	Status   string `json:"status"`
	Message  string `json:"message"`
	DedupKey string `json:"dedup_key"`
}
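For context, a successful call to the Events API v2 returns a small JSON document whose `dedup_key` is what `alerting.Handle` stores in `alert.ResolveKey`. A minimal, self-contained sketch of that unmarshaling step (the response values below are illustrative, not captured from a real call):

```go
package main

import (
	"encoding/json"
	"fmt"
	"log"
)

// Same shape as the unexported struct defined in alerting/pagerduty.go.
type pagerDutyResponse struct {
	Status   string `json:"status"`
	Message  string `json:"message"`
	DedupKey string `json:"dedup_key"`
}

func main() {
	// Illustrative body; the real one comes back from https://events.pagerduty.com/v2/enqueue
	body := []byte(`{"status":"success","message":"Event processed","dedup_key":"ba4332b54f2d4dd09e6bdeff67a4e46b"}`)
	var response pagerDutyResponse
	if err := json.Unmarshal(body, &response); err != nil {
		log.Fatalf("unmarshaling PagerDuty response failed: %s", err)
	}
	// The dedup_key is kept so that a later "resolve" event can target the same incident
	fmt.Println(response.DedupKey)
}
```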
@@ -16,12 +16,24 @@ type Alert struct {

 	// SendOnResolved defines whether to send a second notification when the issue has been resolved
 	SendOnResolved bool `yaml:"send-on-resolved"`
+
+	// SuccessBeforeResolved defines the number of successes in a row required before a triggered alert is marked as resolved
+	SuccessBeforeResolved int `yaml:"success-before-resolved"`
+
+	// ResolveKey is an optional field that is used by some providers (i.e. PagerDuty's dedup_key) to resolve
+	// ongoing/triggered incidents
+	ResolveKey string
+
+	// Triggered is used to determine whether an alert has been triggered. When an alert is resolved, this value
+	// should be set back to false. It is used to prevent the same alert from going out twice.
+	Triggered bool
 }

 type AlertType string

 const (
-	SlackAlert  AlertType = "slack"
-	TwilioAlert AlertType = "twilio"
-	CustomAlert AlertType = "custom"
+	SlackAlert     AlertType = "slack"
+	PagerDutyAlert AlertType = "pagerduty"
+	TwilioAlert    AlertType = "twilio"
+	CustomAlert    AlertType = "custom"
 )
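To show how the `yaml` tags above line up with the README example, here is a small sketch that unmarshals one alert entry into a stand-in struct whose tags match the configuration keys (it assumes `gopkg.in/yaml.v2`; the project's actual YAML dependency is not shown in this diff):

```go
package main

import (
	"fmt"
	"log"

	"gopkg.in/yaml.v2" // assumed for this sketch only
)

// alert is a local stand-in whose yaml tags mirror the configuration keys used in the README.
type alert struct {
	Type                  string `yaml:"type"`
	Enabled               bool   `yaml:"enabled"`
	Threshold             int    `yaml:"threshold"`
	Description           string `yaml:"description"`
	SendOnResolved        bool   `yaml:"send-on-resolved"`
	SuccessBeforeResolved int    `yaml:"success-before-resolved"`
}

func main() {
	data := []byte(`
type: pagerduty
enabled: true
threshold: 3
description: "healthcheck failed 3 times in a row"
send-on-resolved: true
success-before-resolved: 5
`)
	var a alert
	if err := yaml.Unmarshal(data, &a); err != nil {
		log.Fatal(err)
	}
	fmt.Printf("%+v\n", a) // every field of the README example ends up on the struct
}
```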
@@ -5,15 +5,17 @@ import (
 	"encoding/base64"
 	"fmt"
 	"github.com/TwinProduction/gatus/client"
+	"io/ioutil"
 	"net/http"
 	"net/url"
 	"strings"
 )

 type AlertingConfig struct {
-	Slack  string               `yaml:"slack"`
-	Twilio *TwilioAlertProvider `yaml:"twilio"`
-	Custom *CustomAlertProvider `yaml:"custom"`
+	Slack     string               `yaml:"slack"`
+	PagerDuty string               `yaml:"pagerduty"`
+	Twilio    *TwilioAlertProvider `yaml:"twilio"`
+	Custom    *CustomAlertProvider `yaml:"custom"`
 }

 type TwilioAlertProvider struct {
@@ -75,26 +77,32 @@ func (provider *CustomAlertProvider) buildRequest(serviceName, alertDescription
 	return request
 }

-func (provider *CustomAlertProvider) Send(serviceName, alertDescription string, resolved bool) error {
+// Send a request to the alert provider and return the body
+func (provider *CustomAlertProvider) Send(serviceName, alertDescription string, resolved bool) ([]byte, error) {
 	request := provider.buildRequest(serviceName, alertDescription, resolved)
 	response, err := client.GetHttpClient().Do(request)
 	if err != nil {
-		return err
+		return nil, err
 	}
 	if response.StatusCode > 399 {
-		return fmt.Errorf("call to provider alert returned status code %d", response.StatusCode)
+		body, err := ioutil.ReadAll(response.Body)
+		if err != nil {
+			return nil, fmt.Errorf("call to provider alert returned status code %d", response.StatusCode)
+		} else {
+			return nil, fmt.Errorf("call to provider alert returned status code %d: %s", response.StatusCode, string(body))
+		}
 	}
-	return nil
+	return ioutil.ReadAll(response.Body)
 }

 func CreateSlackCustomAlertProvider(slackWebHookUrl string, service *Service, alert *Alert, result *Result, resolved bool) *CustomAlertProvider {
 	var message string
 	var color string
 	if resolved {
-		message = fmt.Sprintf("An alert for *%s* has been resolved after %d failures in a row", service.Name, service.NumberOfFailuresInARow)
+		message = fmt.Sprintf("An alert for *%s* has been resolved after passing successfully %d time(s) in a row", service.Name, alert.SuccessBeforeResolved)
 		color = "#36A64F"
 	} else {
-		message = fmt.Sprintf("An alert for *%s* has been triggered", service.Name)
+		message = fmt.Sprintf("An alert for *%s* has been triggered due to having failed %d time(s) in a row", service.Name, alert.Threshold)
 		color = "#DD0000"
 	}
 	var results string
@@ -147,3 +155,24 @@ func CreateTwilioCustomAlertProvider(provider *TwilioAlertProvider, message stri
 		},
 	}
 }
+
+// https://developer.pagerduty.com/docs/events-api-v2/trigger-events/
+func CreatePagerDutyCustomAlertProvider(routingKey, eventAction, resolveKey string, service *Service, message string) *CustomAlertProvider {
+	return &CustomAlertProvider{
+		Url:    "https://events.pagerduty.com/v2/enqueue",
+		Method: "POST",
+		Body: fmt.Sprintf(`{
+  "routing_key": "%s",
+  "dedup_key": "%s",
+  "event_action": "%s",
+  "payload": {
+    "summary": "%s",
+    "source": "%s",
+    "severity": "critical"
+  }
+}`, routingKey, resolveKey, eventAction, message, service.Name),
+		Headers: map[string]string{
+			"Content-Type": "application/json",
+		},
+	}
+}
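Since the event body above is assembled with `fmt.Sprintf`, a `summary` containing a double quote would produce invalid JSON. As a point of comparison only (not part of this commit), the same payload shape can be built with `encoding/json`, which escapes such characters:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// pagerDutyEvent mirrors the payload shape used by CreatePagerDutyCustomAlertProvider.
type pagerDutyEvent struct {
	RoutingKey  string           `json:"routing_key"`
	DedupKey    string           `json:"dedup_key,omitempty"`
	EventAction string           `json:"event_action"`
	Payload     pagerDutyPayload `json:"payload"`
}

type pagerDutyPayload struct {
	Summary  string `json:"summary"`
	Source   string `json:"source"`
	Severity string `json:"severity"`
}

func main() {
	event := pagerDutyEvent{
		RoutingKey:  "********************************", // the PagerDuty integration key
		EventAction: "trigger",
		Payload: pagerDutyPayload{
			Summary:  `TRIGGERED: twinnation - healthcheck "failed" 3 times in a row`,
			Source:   "twinnation",
			Severity: "critical",
		},
	}
	body, err := json.Marshal(event)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(body)) // quotes inside the summary are escaped automatically
}
```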
@@ -46,7 +46,8 @@ type Service struct {
 	// Alerts is the alerting configuration for the service in case of failure
 	Alerts []*Alert `yaml:"alerts"`

-	NumberOfFailuresInARow int
+	NumberOfFailuresInARow  int
+	NumberOfSuccessesInARow int
 }

 func (service *Service) Validate() {

@@ -64,6 +65,9 @@ func (service *Service) Validate() {
 		if alert.Threshold <= 0 {
 			alert.Threshold = 3
 		}
+		if alert.SuccessBeforeResolved <= 0 {
+			alert.SuccessBeforeResolved = 2
+		}
 	}
 	if len(service.Url) == 0 {
 		panic(ErrNoUrl)
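To make the interplay between `threshold` and `success-before-resolved` concrete, here is a self-contained sketch with simplified stand-in types (not the real `core` package) showing when an alert with `threshold: 3` and `success-before-resolved: 2` triggers and resolves:

```go
package main

import "fmt"

// Simplified stand-ins for core.Alert and core.Service; field names follow the diff above.
type alert struct {
	Threshold             int
	SuccessBeforeResolved int
	Triggered             bool
}

type service struct {
	NumberOfFailuresInARow  int
	NumberOfSuccessesInARow int
}

func main() {
	a := alert{Threshold: 3, SuccessBeforeResolved: 2}
	s := service{}
	results := []bool{false, false, false, false, true, true}
	for i, success := range results {
		if success {
			s.NumberOfSuccessesInARow++
			if a.Triggered && s.NumberOfSuccessesInARow >= a.SuccessBeforeResolved {
				a.Triggered = false
				fmt.Printf("check %d: resolved\n", i+1)
			}
			s.NumberOfFailuresInARow = 0
		} else {
			s.NumberOfSuccessesInARow = 0
			s.NumberOfFailuresInARow++
			if !a.Triggered && s.NumberOfFailuresInARow == a.Threshold {
				a.Triggered = true
				fmt.Printf("check %d: triggered\n", i+1)
			}
		}
	}
	// Prints "check 3: triggered" followed by "check 6: resolved".
}
```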
@@ -3,6 +3,7 @@ package watchdog
 import (
 	"encoding/json"
 	"fmt"
+	"github.com/TwinProduction/gatus/alerting"
 	"github.com/TwinProduction/gatus/config"
 	"github.com/TwinProduction/gatus/core"
 	"github.com/TwinProduction/gatus/metric"

@@ -70,7 +71,7 @@ func monitor(service *core.Service) {
 		result.Duration.Round(time.Millisecond),
 		extra,
 	)
-	handleAlerting(service, result)
+	alerting.Handle(service, result)
 	if cfg.Debug {
 		log.Printf("[watchdog][monitor] Waiting for interval=%s before monitoring serviceName=%s again", service.Interval, service.Name)
 	}

@@ -78,96 +79,3 @@ func monitor(service *core.Service) {
 		time.Sleep(service.Interval)
 	}
 }
-
-func handleAlerting(service *core.Service, result *core.Result) {
-	cfg := config.Get()
-	if cfg.Alerting == nil {
-		return
-	}
-	if result.Success {
-		if service.NumberOfFailuresInARow > 0 {
-			for _, alert := range service.Alerts {
-				if !alert.Enabled || !alert.SendOnResolved || alert.Threshold > service.NumberOfFailuresInARow {
-					continue
-				}
-				var alertProvider *core.CustomAlertProvider
-				if alert.Type == core.SlackAlert {
-					if len(cfg.Alerting.Slack) > 0 {
-						log.Printf("[watchdog][handleAlerting] Sending Slack alert because alert with description=%s has been resolved", alert.Description)
-						alertProvider = core.CreateSlackCustomAlertProvider(cfg.Alerting.Slack, service, alert, result, true)
-					} else {
-						log.Printf("[watchdog][handleAlerting] Not sending Slack alert despite being triggered, because there is no Slack webhook configured")
-					}
-				} else if alert.Type == core.TwilioAlert {
-					if cfg.Alerting.Twilio != nil && cfg.Alerting.Twilio.IsValid() {
-						log.Printf("[watchdog][handleAlerting] Sending Twilio alert because alert with description=%s has been resolved", alert.Description)
-						alertProvider = core.CreateTwilioCustomAlertProvider(cfg.Alerting.Twilio, fmt.Sprintf("RESOLVED: %s - %s", service.Name, alert.Description))
-					} else {
-						log.Printf("[watchdog][handleAlerting] Not sending Twilio alert despite being resolved, because Twilio isn't configured properly")
-					}
-				} else if alert.Type == core.CustomAlert {
-					if cfg.Alerting.Custom != nil && cfg.Alerting.Custom.IsValid() {
-						log.Printf("[watchdog][handleAlerting] Sending custom alert because alert with description=%s has been resolved", alert.Description)
-						alertProvider = &core.CustomAlertProvider{
-							Url:     cfg.Alerting.Custom.Url,
-							Method:  cfg.Alerting.Custom.Method,
-							Body:    cfg.Alerting.Custom.Body,
-							Headers: cfg.Alerting.Custom.Headers,
-						}
-					} else {
-						log.Printf("[watchdog][handleAlerting] Not sending custom alert despite being resolved, because the custom provider isn't configured properly")
-					}
-				}
-				if alertProvider != nil {
-					err := alertProvider.Send(service.Name, alert.Description, true)
-					if err != nil {
-						log.Printf("[watchdog][handleAlerting] Ran into error sending an alert: %s", err.Error())
-					}
-				}
-			}
-		}
-		service.NumberOfFailuresInARow = 0
-	} else {
-		service.NumberOfFailuresInARow++
-		for _, alert := range service.Alerts {
-			// If the alert hasn't been triggered, move to the next one
-			if !alert.Enabled || alert.Threshold != service.NumberOfFailuresInARow {
-				continue
-			}
-			var alertProvider *core.CustomAlertProvider
-			if alert.Type == core.SlackAlert {
-				if len(cfg.Alerting.Slack) > 0 {
-					log.Printf("[watchdog][handleAlerting] Sending Slack alert because alert with description=%s has been triggered", alert.Description)
-					alertProvider = core.CreateSlackCustomAlertProvider(cfg.Alerting.Slack, service, alert, result, false)
-				} else {
-					log.Printf("[watchdog][handleAlerting] Not sending Slack alert despite being triggered, because there is no Slack webhook configured")
-				}
-			} else if alert.Type == core.TwilioAlert {
-				if cfg.Alerting.Twilio != nil && cfg.Alerting.Twilio.IsValid() {
-					log.Printf("[watchdog][handleAlerting] Sending Twilio alert because alert with description=%s has been triggered", alert.Description)
-					alertProvider = core.CreateTwilioCustomAlertProvider(cfg.Alerting.Twilio, fmt.Sprintf("TRIGGERED: %s - %s", service.Name, alert.Description))
-				} else {
-					log.Printf("[watchdog][handleAlerting] Not sending Twilio alert despite being triggered, because Twilio config settings missing")
-				}
-			} else if alert.Type == core.CustomAlert {
-				if cfg.Alerting.Custom != nil && cfg.Alerting.Custom.IsValid() {
-					log.Printf("[watchdog][handleAlerting] Sending custom alert because alert with description=%s has been triggered", alert.Description)
-					alertProvider = &core.CustomAlertProvider{
-						Url:     cfg.Alerting.Custom.Url,
-						Method:  cfg.Alerting.Custom.Method,
-						Body:    cfg.Alerting.Custom.Body,
-						Headers: cfg.Alerting.Custom.Headers,
-					}
-				} else {
-					log.Printf("[watchdog][handleAlerting] Not sending custom alert despite being triggered, because there is no custom url configured")
-				}
-			}
-			if alertProvider != nil {
-				err := alertProvider.Send(service.Name, alert.Description, false)
-				if err != nil {
-					log.Printf("[watchdog][handleAlerting] Ran into error sending an alert: %s", err.Error())
-				}
-			}
-		}
-	}
-}