Merge pull request #52 from TwinProduction/service-groups

Add service groups
This commit is contained in:
Chris C 2020-11-27 08:48:54 -05:00 committed by GitHub
commit 35c232d925
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 254 additions and 66 deletions

BIN
.github/assets/service-groups.png vendored Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 39 KiB

View File

@ -43,6 +43,7 @@ core applications: https://status.twinnation.org/
- [Monitoring using DNS queries](#monitoring-using-dns-queries)
- [Basic authentication](#basic-authentication)
- [disable-monitoring-lock](#disable-monitoring-lock)
- [Service groups](#service-groups)
## Features
@ -97,6 +98,7 @@ Note that you can also add environment variables in the configuration file (i.e.
| `metrics` | Whether to expose metrics at /metrics | `false` |
| `services` | List of services to monitor | Required `[]` |
| `services[].name` | Name of the service. Can be anything. | Required `""` |
| `services[].group` | Group name. Used to group multiple services together on the dashboard. See [Service groups](#service-groups). | `""` |
| `services[].url` | URL to send the request to | Required `""` |
| `services[].method` | Request method | `GET` |
| `services[].insecure` | Whether to skip verifying the server's certificate chain and host name | `false` |
@ -614,3 +616,49 @@ There are three main reasons why you might want to disable the monitoring lock:
technically, if you create 100 services with a 1 seconds interval, Gatus will send 100 requests per second)
- You have a _lot_ of services to monitor
- You want to test multiple services at very short interval (< 5s)
### Service groups
Service groups are used for grouping multiple services together on the dashboard.
```yaml
services:
- name: frontend
group: core
url: "https://example.org/"
interval: 5m
conditions:
- "[STATUS] == 200"
- name: backend
group: core
url: "https://example.org/"
interval: 5m
conditions:
- "[STATUS] == 200"
- name: monitoring
group: internal
url: "https://example.org/"
interval: 5m
conditions:
- "[STATUS] == 200"
- name: nas
group: internal
url: "https://example.org/"
interval: 5m
conditions:
- "[STATUS] == 200"
- name: random service that isn't part of a group
url: "https://example.org/"
interval: 5m
conditions:
- "[STATUS] == 200"
```
The configuration above will result in a dashboard that looks like this:
![Gatus Service Groups](.github/assets/service-groups.png)

View File

@ -1,15 +1,37 @@
metrics: true
services:
- name: twinnation
- name: frontend
group: core
url: "https://twinnation.org/health"
interval: 30s
interval: 1m
conditions:
- "[STATUS] == 200"
- "[BODY].status == UP"
- "[RESPONSE_TIME] < 1000"
- name: backend
group: core
url: "http://example.org/"
interval: 5m
conditions:
- "[STATUS] == 200"
- name: monitoring
group: internal
url: "http://example.com/"
interval: 5m
conditions:
- "[STATUS] == 200"
- name: nas
group: internal
url: "https://example.org/"
interval: 5m
conditions:
- "[STATUS] == 200"
- name: cat-fact
url: "https://cat-fact.herokuapp.com/facts/random"
interval: 1m
interval: 5m
conditions:
- "[STATUS] == 200"
- "[BODY].deleted == false"

10
core/condition-result.go Normal file
View File

@ -0,0 +1,10 @@
package core
// ConditionResult result of a Condition
type ConditionResult struct {
// Condition that was evaluated
Condition string `json:"condition"`
// Success whether the condition was met (successful) or not (failed)
Success bool `json:"success"`
}

11
core/health-status.go Normal file
View File

@ -0,0 +1,11 @@
package core
// HealthStatus is the status of Gatus
type HealthStatus struct {
// Status is the state of Gatus (UP/DOWN)
Status string `json:"status"`
// Message is an accompanying description of why the status is as reported.
// If the Status is UP, no message will be provided
Message string `json:"message,omitempty"`
}

View File

@ -4,22 +4,12 @@ import (
"time"
)
// HealthStatus is the status of Gatus
type HealthStatus struct {
// Status is the state of Gatus (UP/DOWN)
Status string `json:"status"`
// Message is an accompanying description of why the status is as reported.
// If the Status is UP, no message will be provided
Message string `json:"message,omitempty"`
}
// Result of the evaluation of a Service
type Result struct {
// HTTPStatus is the HTTP response status code
HTTPStatus int `json:"status"`
// DNSRCode is the response code of DNS query in human readable version
// DNSRCode is the response code of a DNS query in a human readable format
DNSRCode string `json:"dns-rcode"`
// Body is the response body
@ -52,12 +42,3 @@ type Result struct {
// CertificateExpiration is the duration before the certificate expires
CertificateExpiration time.Duration `json:"certificate-expiration,omitempty"`
}
// ConditionResult result of a Condition
type ConditionResult struct {
// Condition that was evaluated
Condition string `json:"condition"`
// Success whether the condition was met (successful) or not (failed)
Success bool `json:"success"`
}

27
core/service-status.go Normal file
View File

@ -0,0 +1,27 @@
package core
// ServiceStatus contains the evaluation Results of a Service
type ServiceStatus struct {
// Group the service is a part of. Used for grouping multiple services together on the front end.
Group string `json:"group,omitempty"`
// Results is the list of service evaluation results
Results []*Result `json:"results"`
}
// NewServiceStatus creates a new ServiceStatus
func NewServiceStatus(service *Service) *ServiceStatus {
return &ServiceStatus{
Group: service.Group,
Results: make([]*Result, 0),
}
}
// AddResult adds a Result to ServiceStatus.Results and makes sure that there are
// no more than 20 results in the Results slice
func (ss *ServiceStatus) AddResult(result *Result) {
ss.Results = append(ss.Results, result)
if len(ss.Results) > 20 {
ss.Results = ss.Results[1:]
}
}

View File

@ -0,0 +1,22 @@
package core
import "testing"
func TestNewServiceStatus(t *testing.T) {
service := &Service{Group: "test"}
serviceStatus := NewServiceStatus(service)
if serviceStatus.Group != service.Group {
t.Errorf("expected %s, got %s", service.Group, serviceStatus.Group)
}
}
func TestServiceStatus_AddResult(t *testing.T) {
service := &Service{Group: "test"}
serviceStatus := NewServiceStatus(service)
for i := 0; i < 50; i++ {
serviceStatus.AddResult(&Result{})
}
if len(serviceStatus.Results) != 20 {
t.Errorf("expected serviceStatus.Results to not exceed a length of 20")
}
}

View File

@ -30,6 +30,9 @@ type Service struct {
// Name of the service. Can be anything.
Name string `yaml:"name"`
// Group the service is a part of. Used for grouping multiple services together on the front end.
Group string `yaml:"group,omitempty"`
// URL to send the request to
URL string `yaml:"url"`

30
main.go
View File

@ -18,19 +18,19 @@ import (
const cacheTTL = 10 * time.Second
var (
cachedServiceResults []byte
cachedServiceResultsGzipped []byte
cachedServiceResultsTimestamp time.Time
cachedServiceStatuses []byte
cachedServiceStatusesGzipped []byte
cachedServiceStatusesTimestamp time.Time
)
func main() {
cfg := loadConfiguration()
resultsHandler := serviceResultsHandler
statusesHandler := serviceStatusesHandler
if cfg.Security != nil && cfg.Security.IsValid() {
resultsHandler = security.Handler(serviceResultsHandler, cfg.Security)
statusesHandler = security.Handler(serviceStatusesHandler, cfg.Security)
}
http.HandleFunc("/favicon.ico", favIconHandler) // favicon needs to be always served from the root
http.HandleFunc(cfg.Web.PrependWithContextRoot("/api/v1/results"), resultsHandler)
http.HandleFunc(cfg.Web.PrependWithContextRoot("/api/v1/statuses"), statusesHandler)
http.HandleFunc(cfg.Web.PrependWithContextRoot("/health"), healthHandler)
http.Handle(cfg.Web.ContextRoot, GzipHandler(http.StripPrefix(cfg.Web.ContextRoot, http.FileServer(http.Dir("./static")))))
@ -56,29 +56,29 @@ func loadConfiguration() *config.Config {
return config.Get()
}
func serviceResultsHandler(writer http.ResponseWriter, r *http.Request) {
if isExpired := cachedServiceResultsTimestamp.IsZero() || time.Now().Sub(cachedServiceResultsTimestamp) > cacheTTL; isExpired {
func serviceStatusesHandler(writer http.ResponseWriter, r *http.Request) {
if isExpired := cachedServiceStatusesTimestamp.IsZero() || time.Now().Sub(cachedServiceStatusesTimestamp) > cacheTTL; isExpired {
buffer := &bytes.Buffer{}
gzipWriter := gzip.NewWriter(buffer)
data, err := watchdog.GetJSONEncodedServiceResults()
data, err := watchdog.GetJSONEncodedServiceStatuses()
if err != nil {
log.Printf("[main][serviceResultsHandler] Unable to marshal object to JSON: %s", err.Error())
log.Printf("[main][serviceStatusesHandler] Unable to marshal object to JSON: %s", err.Error())
writer.WriteHeader(http.StatusInternalServerError)
_, _ = writer.Write([]byte("Unable to marshal object to JSON"))
return
}
gzipWriter.Write(data)
gzipWriter.Close()
cachedServiceResults = data
cachedServiceResultsGzipped = buffer.Bytes()
cachedServiceResultsTimestamp = time.Now()
cachedServiceStatuses = data
cachedServiceStatusesGzipped = buffer.Bytes()
cachedServiceStatusesTimestamp = time.Now()
}
var data []byte
if strings.Contains(r.Header.Get("Accept-Encoding"), "gzip") {
writer.Header().Set("Content-Encoding", "gzip")
data = cachedServiceResultsGzipped
data = cachedServiceStatusesGzipped
} else {
data = cachedServiceResults
data = cachedServiceStatuses
}
writer.Header().Add("Content-type", "application/json")
writer.WriteHeader(http.StatusOK)

View File

@ -99,6 +99,13 @@
#settings select:focus {
box-shadow: none;
}
.service-group {
cursor: pointer;
user-select: none;
}
.service-group h5:hover {
color: #1b1e21 !important;
}
</style>
</head>
<body>
@ -162,7 +169,7 @@
function showTooltip(serviceName, index, element) {
userClickedStatus = false;
clearTimeout(timerHandler);
let serviceResult = serviceStatuses[serviceName][index];
let serviceResult = serviceStatuses[serviceName].results[index];
$("#tooltip-timestamp").text(prettifyTimestamp(serviceResult.timestamp));
$("#tooltip-response-time").text(parseInt(serviceResult.duration/1000000) + "ms");
// Populate the condition section
@ -219,8 +226,8 @@
return "<span class='status badge badge-danger' style='width: 5%' onmouseenter='showTooltip(\""+serviceName+"\", "+index+", this)' onmouseleave='fadeTooltip()' onclick='userClickedStatus = !userClickedStatus;'>X</span>";
}
function refreshResults() {
$.getJSON("./api/v1/results", function (data) {
function refreshStatuses() {
$.getJSON("./api/v1/statuses", function (data) {
// Update the table only if there's a change
if (JSON.stringify(serviceStatuses) !== JSON.stringify(data)) {
serviceStatuses = data;
@ -230,16 +237,17 @@
}
function buildTable() {
let output = "";
let outputByGroup = {};
for (let serviceName in serviceStatuses) {
let serviceStatusOverTime = "";
let hostname = serviceStatuses[serviceName][serviceStatuses[serviceName].length-1].hostname
let serviceStatus = serviceStatuses[serviceName];
let hostname = serviceStatus.results[serviceStatus.results.length-1].hostname;
let minResponseTime = null;
let maxResponseTime = null;
let newestTimestamp = null;
let oldestTimestamp = null;
for (let key in serviceStatuses[serviceName]) {
let serviceResult = serviceStatuses[serviceName][key];
for (let key in serviceStatus.results) {
let serviceResult = serviceStatus.results[key];
serviceStatusOverTime = createStatusBadge(serviceName, key, serviceResult.success) + serviceStatusOverTime;
const responseTime = parseInt(serviceResult.duration/1000000);
if (minResponseTime == null || minResponseTime > responseTime) {
@ -256,8 +264,8 @@
oldestTimestamp = timestamp;
}
}
output += ""
+ "<div class='container py-3 border-left border-right border-top border-black'>"
let output = ""
+ "<div class='container py-3 border-left border-right border-top border-black rounded-0'>"
+ " <div class='row mb-2'>"
+ " <div class='col-md-10'>"
+ " <span class='font-weight-bold'>" + serviceName + "</span> <span class='text-secondary font-weight-lighter'>- " + hostname + "</span>"
@ -280,10 +288,59 @@
+ " </div>"
+ " </div>"
+ "</div>";
// create an empty entry if this group is new
if (!outputByGroup[serviceStatus.group]) {
outputByGroup[serviceStatus.group] = "";
}
outputByGroup[serviceStatus.group] += output;
}
let output = "";
for (let group in outputByGroup) {
// Services that don't have a group should be skipped and left for last
if (group === 'undefined') {
continue
}
let key = group.replace(/[^a-zA-Z0-9]/g, '');
let existingGroupContentSelector = $("#service-group-" + key + "-content");
let isCurrentlyHidden = existingGroupContentSelector.length && existingGroupContentSelector[0].style.display === 'none';
let groupStatus = "<span class='text-success'>&#10003;</span>";
if (outputByGroup[group].includes("badge badge-danger")) {
groupStatus = "<span class='text-warning'>~</span>";
}
output += ""
+ "<div class='mt-" + (output.length ? '4' : '3') + "'>"
+ " <div class='container pt-2 border-left border-right border-top border-black border-bottom service-group' id='service-group-" + key + "' data-group='" + key + "' onclick='toggleGroup(this)'>"
+ " <h5 class='text-secondary text-monospace pb-0'>"
+ " " + groupStatus + " " + group
+ " <span class='float-right service-group-arrow' id='service-group-" + key + "-arrow'>" + (isCurrentlyHidden ? "&#9660;" : "&#9650;") + "</span>"
+ " </h5>"
+ " </div>"
+ " <div class='service-group-content' id='service-group-" + key + "-content' style='" + (isCurrentlyHidden ? "display: none;" : "") + "'>"
+ " " + outputByGroup[group]
+ " </div>"
+ "</div>";
}
// Add all services that don't have a group at the end
if (outputByGroup['undefined']) {
output += ""
+ "<div class='mt-" + (output.length ? '4' : '3') + "'>"
+ " " + outputByGroup['undefined']
+ "</div>"
}
$("#results").html(output);
}
function toggleGroup(element) {
let selector = $("#service-group-" + element.dataset.group + "-content");
selector.toggle("fast", function() {
if (selector.length && selector[0].style.display === 'none') {
$("#service-group-" + element.dataset.group + "-arrow").html("&#9660;");
} else {
$("#service-group-" + element.dataset.group + "-arrow").html("&#9650;");
}
});
}
function prettifyTimestamp(timestamp) {
let date = new Date(timestamp);
let YYYY = date.getFullYear();
@ -318,15 +375,15 @@
}
function setRefreshInterval(seconds) {
refreshResults();
refreshStatuses();
refreshIntervalHandler = setInterval(function() {
refreshResults();
}, seconds * 1000)
refreshStatuses();
}, seconds * 1000);
}
$("#refresh-rate").change(function() {
clearInterval(refreshIntervalHandler);
setRefreshInterval($(this).val())
setRefreshInterval($(this).val());
});
setRefreshInterval(30);
$("#refresh-rate").val(30);

View File

@ -13,22 +13,22 @@ import (
)
var (
serviceResults = make(map[string][]*core.Result)
serviceStatuses = make(map[string]*core.ServiceStatus)
// serviceResultsMutex is used to prevent concurrent map access
serviceResultsMutex sync.RWMutex
// serviceStatusesMutex is used to prevent concurrent map access
serviceStatusesMutex sync.RWMutex
// monitoringMutex is used to prevent multiple services from being evaluated at the same time.
// Without this, conditions using response time may become inaccurate.
monitoringMutex sync.Mutex
)
// GetJSONEncodedServiceResults returns a list of the last 20 results for each services encoded using json.Marshal.
// GetJSONEncodedServiceStatuses returns a list of core.ServiceStatus for each services encoded using json.Marshal.
// The reason why the encoding is done here is because we use a mutex to prevent concurrent map access.
func GetJSONEncodedServiceResults() ([]byte, error) {
serviceResultsMutex.RLock()
data, err := json.Marshal(serviceResults)
serviceResultsMutex.RUnlock()
func GetJSONEncodedServiceStatuses() ([]byte, error) {
serviceStatusesMutex.RLock()
data, err := json.Marshal(serviceStatuses)
serviceStatusesMutex.RUnlock()
return data, err
}
@ -55,12 +55,7 @@ func monitor(service *core.Service) {
}
result := service.EvaluateHealth()
metric.PublishMetricsForService(service, result)
serviceResultsMutex.Lock()
serviceResults[service.Name] = append(serviceResults[service.Name], result)
if len(serviceResults[service.Name]) > 20 {
serviceResults[service.Name] = serviceResults[service.Name][1:]
}
serviceResultsMutex.Unlock()
UpdateServiceStatuses(service, result)
var extra string
if !result.Success {
extra = fmt.Sprintf("responseBody=%s", result.Body)
@ -83,3 +78,15 @@ func monitor(service *core.Service) {
time.Sleep(service.Interval)
}
}
// UpdateServiceStatuses updates the slice of service statuses
func UpdateServiceStatuses(service *core.Service, result *core.Result) {
serviceStatusesMutex.Lock()
serviceStatus, exists := serviceStatuses[service.Name]
if !exists {
serviceStatus = core.NewServiceStatus(service)
serviceStatuses[service.Name] = serviceStatus
}
serviceStatus.AddResult(result)
serviceStatusesMutex.Unlock()
}