2024-09-08 12:06:14 +02:00
|
|
|
package healthcheck
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
2024-09-17 10:04:17 +02:00
|
|
|
"os"
|
|
|
|
"strconv"
|
2024-09-08 12:06:14 +02:00
|
|
|
"time"
|
2024-09-17 10:04:17 +02:00
|
|
|
|
|
|
|
log "github.com/sirupsen/logrus"
|
|
|
|
)
|
|
|
|
|
|
|
|
// Configuration of the attempt threshold used by the health check sender.
const (
	// defaultAttemptThreshold is the threshold returned by
	// getAttemptThresholdFromEnv when the environment variable is unset or
	// does not hold a valid integer.
	defaultAttemptThreshold = 1

	// defaultAttemptThresholdEnv is the name of the environment variable
	// that overrides defaultAttemptThreshold.
	defaultAttemptThresholdEnv = "NB_RELAY_HC_ATTEMPT_THRESHOLD"
)
|
|
|
|
|
|
|
|
// Timing parameters of the health check sender loop.
var (
	// healthCheckInterval is the period of the ticker that triggers a health
	// check signal on Sender.HealthCheck.
	healthCheckInterval = 25 * time.Second

	// healthCheckTimeout is added to healthCheckInterval to obtain the
	// period of the timeout ticker (see Sender.getTimeoutTime).
	healthCheckTimeout = 20 * time.Second
)
|
|
|
|
|
|
|
|
// Sender is a healthcheck sender
|
|
|
|
// It will send healthcheck signal to the receiver
|
|
|
|
// If the receiver does not receive the signal in a certain time, it will send a timeout signal and stop to work
|
|
|
|
// It will also stop if the context is canceled
|
|
|
|
type Sender struct {
|
2024-09-17 10:04:17 +02:00
|
|
|
log *log.Entry
|
2024-09-08 12:06:14 +02:00
|
|
|
// HealthCheck is a channel to send health check signal to the peer
|
|
|
|
HealthCheck chan struct{}
|
|
|
|
// Timeout is a channel to the health check signal is not received in a certain time
|
|
|
|
Timeout chan struct{}
|
|
|
|
|
2024-09-17 10:04:17 +02:00
|
|
|
ack chan struct{}
|
|
|
|
alive bool
|
|
|
|
attemptThreshold int
|
2024-09-08 12:06:14 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// NewSender creates a new healthcheck sender
|
2024-09-17 10:04:17 +02:00
|
|
|
func NewSender(log *log.Entry) *Sender {
|
2024-09-08 12:06:14 +02:00
|
|
|
hc := &Sender{
|
2024-09-17 10:04:17 +02:00
|
|
|
log: log,
|
|
|
|
HealthCheck: make(chan struct{}, 1),
|
|
|
|
Timeout: make(chan struct{}, 1),
|
|
|
|
ack: make(chan struct{}, 1),
|
|
|
|
attemptThreshold: getAttemptThresholdFromEnv(),
|
2024-09-08 12:06:14 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
return hc
|
|
|
|
}
|
|
|
|
|
|
|
|
// OnHCResponse sends an acknowledgment signal to the sender
|
|
|
|
func (hc *Sender) OnHCResponse() {
|
|
|
|
select {
|
|
|
|
case hc.ack <- struct{}{}:
|
|
|
|
default:
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (hc *Sender) StartHealthCheck(ctx context.Context) {
|
|
|
|
ticker := time.NewTicker(healthCheckInterval)
|
|
|
|
defer ticker.Stop()
|
|
|
|
|
2024-09-17 10:04:17 +02:00
|
|
|
timeoutTicker := time.NewTicker(hc.getTimeoutTime())
|
|
|
|
defer timeoutTicker.Stop()
|
2024-09-08 12:06:14 +02:00
|
|
|
|
|
|
|
defer close(hc.HealthCheck)
|
|
|
|
defer close(hc.Timeout)
|
|
|
|
|
2024-09-17 10:04:17 +02:00
|
|
|
failureCounter := 0
|
2024-09-08 12:06:14 +02:00
|
|
|
for {
|
|
|
|
select {
|
|
|
|
case <-ticker.C:
|
|
|
|
hc.HealthCheck <- struct{}{}
|
2024-09-17 10:04:17 +02:00
|
|
|
case <-timeoutTicker.C:
|
|
|
|
if hc.alive {
|
|
|
|
hc.alive = false
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
failureCounter++
|
|
|
|
if failureCounter < hc.attemptThreshold {
|
|
|
|
hc.log.Warnf("Health check failed attempt %d.", failureCounter)
|
|
|
|
continue
|
|
|
|
}
|
2024-09-08 12:06:14 +02:00
|
|
|
hc.Timeout <- struct{}{}
|
|
|
|
return
|
|
|
|
case <-hc.ack:
|
2024-09-17 10:04:17 +02:00
|
|
|
failureCounter = 0
|
|
|
|
hc.alive = true
|
2024-09-08 12:06:14 +02:00
|
|
|
case <-ctx.Done():
|
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2024-09-17 10:04:17 +02:00
|
|
|
|
|
|
|
func (hc *Sender) getTimeoutTime() time.Duration {
|
|
|
|
return healthCheckInterval + healthCheckTimeout
|
|
|
|
}
|
|
|
|
|
|
|
|
func getAttemptThresholdFromEnv() int {
|
|
|
|
if attemptThreshold := os.Getenv(defaultAttemptThresholdEnv); attemptThreshold != "" {
|
|
|
|
threshold, err := strconv.ParseInt(attemptThreshold, 10, 64)
|
|
|
|
if err != nil {
|
|
|
|
log.Errorf("Failed to parse attempt threshold from environment variable \"%s\" should be an integer. Using default value", attemptThreshold)
|
|
|
|
return defaultAttemptThreshold
|
|
|
|
}
|
|
|
|
return int(threshold)
|
|
|
|
}
|
|
|
|
return defaultAttemptThreshold
|
|
|
|
}
|