Extend netbird status command to include health information (#1471)

* Adds management, signal, and relay (STUN/TURN) health probes to the status command (see the first sketch after this list).

* Adds a reason when the management or signal connection is disconnected.

* Adds the last WireGuard handshake time and received/sent bytes per peer (see the second sketch after this list).
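
The probes are created by the daemon and handed to the engine through `RunClientWithProbes` (see the diff below). As a rough illustration of the shape such a probe can take, here is a minimal request/response sketch consistent with the `NewProbe()`/`Probe()` usage in this diff; the actual `internal.Probe` implementation may differ:

```go
package internal

import "context"

// Probe is a sketch of an on-demand health-check handle: the status side calls
// Probe() and the engine side answers via Receive(). Illustrative only; the
// real internal.Probe may be implemented differently.
type Probe struct {
	request chan struct{}
	result  chan bool
}

// NewProbe returns an initialized probe.
func NewProbe() *Probe {
	return &Probe{
		request: make(chan struct{}),
		result:  make(chan bool),
	}
}

// Probe asks the receiving side to run its health check and returns the result.
// If nothing is listening yet (engine not running), it reports failure.
func (p *Probe) Probe() bool {
	select {
	case p.request <- struct{}{}:
		return <-p.result
	default:
		return false
	}
}

// Receive runs check for every probe request until ctx is done.
func (p *Probe) Receive(ctx context.Context, check func() bool) {
	for {
		select {
		case <-ctx.Done():
			return
		case <-p.request:
			p.result <- check()
		}
	}
}
```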
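
The last-handshake time and byte counters are per-peer WireGuard statistics; the diff below only shows how they are copied into the gRPC status response. As an illustration of how such counters can be read from a WireGuard device, here is a sketch using the `wgctrl` library (the interface name `wt0` is a placeholder, and this is not necessarily how the PR collects them):

```go
package main

import (
	"fmt"
	"log"

	"golang.zx2c4.com/wireguard/wgctrl"
)

func main() {
	// Open a wgctrl client; it talks to the kernel or userspace WireGuard implementation.
	client, err := wgctrl.New()
	if err != nil {
		log.Fatalf("open wgctrl: %v", err)
	}
	defer client.Close()

	// "wt0" is a placeholder interface name for this illustration.
	device, err := client.Device("wt0")
	if err != nil {
		log.Fatalf("read device: %v", err)
	}

	// Each peer carries the last handshake time and transfer counters.
	for _, p := range device.Peers {
		fmt.Printf("peer %s: last handshake %s, rx %d B, tx %d B\n",
			p.PublicKey.String(), p.LastHandshakeTime, p.ReceiveBytes, p.TransmitBytes)
	}
}
```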
Author: Viktor Liu
Date: 2024-01-22 12:20:24 +01:00
Committed by: GitHub
Parent: d4194cba6a
Commit: a7d6632298
28 changed files with 1571 additions and 339 deletions


@@ -21,6 +21,8 @@ import (
"github.com/netbirdio/netbird/version"
)
const probeThreshold = time.Second * 5
// Server for service control.
type Server struct {
rootCtx context.Context
@@ -37,6 +39,12 @@ type Server struct {
proto.UnimplementedDaemonServiceServer
statusRecorder *peer.Status
mgmProbe *internal.Probe
signalProbe *internal.Probe
relayProbe *internal.Probe
wgProbe *internal.Probe
lastProbe time.Time
}
type oauthAuthFlow struct {
@@ -53,7 +61,11 @@ func New(ctx context.Context, configPath, logFile string) *Server {
latestConfigInput: internal.ConfigInput{
ConfigPath: configPath,
},
logFile: logFile,
logFile: logFile,
mgmProbe: internal.NewProbe(),
signalProbe: internal.NewProbe(),
relayProbe: internal.NewProbe(),
wgProbe: internal.NewProbe(),
}
}
@@ -105,7 +117,7 @@ func (s *Server) Start() error {
}
go func() {
if err := internal.RunClient(ctx, config, s.statusRecorder); err != nil {
if err := internal.RunClientWithProbes(ctx, config, s.statusRecorder, s.mgmProbe, s.signalProbe, s.relayProbe, s.wgProbe); err != nil {
log.Errorf("init connections: %v", err)
}
}()
@@ -409,7 +421,7 @@ func (s *Server) Up(callerCtx context.Context, _ *proto.UpRequest) (*proto.UpRes
}
go func() {
if err := internal.RunClient(ctx, s.config, s.statusRecorder); err != nil {
if err := internal.RunClientWithProbes(ctx, s.config, s.statusRecorder, s.mgmProbe, s.signalProbe, s.relayProbe, s.wgProbe); err != nil {
log.Errorf("run client connection: %v", err)
return
}
@@ -433,7 +445,7 @@ func (s *Server) Down(_ context.Context, _ *proto.DownRequest) (*proto.DownRespo
return &proto.DownResponse{}, nil
}
// Status starts engine work in the daemon.
// Status returns the daemon status
func (s *Server) Status(
_ context.Context,
msg *proto.StatusRequest,
@@ -455,6 +467,8 @@ func (s *Server) Status(
}
if msg.GetFullPeerStatus {
s.runProbes()
fullStatus := s.statusRecorder.GetFullStatus()
pbFullStatus := toProtoFullStatus(fullStatus)
statusResponse.FullStatus = pbFullStatus
@@ -463,6 +477,20 @@ func (s *Server) Status(
return &statusResponse, nil
}
func (s *Server) runProbes() {
if time.Since(s.lastProbe) > probeThreshold {
managementHealthy := s.mgmProbe.Probe()
signalHealthy := s.signalProbe.Probe()
relayHealthy := s.relayProbe.Probe()
wgHealthy := s.wgProbe.Probe()
// Update lastProbe only if all probes were successful
if managementHealthy && signalHealthy && relayHealthy && wgHealthy {
s.lastProbe = time.Now()
}
}
}
// GetConfig of the daemon.
func (s *Server) GetConfig(_ context.Context, _ *proto.GetConfigRequest) (*proto.GetConfigResponse, error) {
s.mutex.Lock()
@@ -503,13 +531,20 @@ func toProtoFullStatus(fullStatus peer.FullStatus) *proto.FullStatus {
SignalState: &proto.SignalState{},
LocalPeerState: &proto.LocalPeerState{},
Peers: []*proto.PeerState{},
Relays: []*proto.RelayState{},
}
pbFullStatus.ManagementState.URL = fullStatus.ManagementState.URL
pbFullStatus.ManagementState.Connected = fullStatus.ManagementState.Connected
if err := fullStatus.ManagementState.Error; err != nil {
pbFullStatus.ManagementState.Error = err.Error()
}
pbFullStatus.SignalState.URL = fullStatus.SignalState.URL
pbFullStatus.SignalState.Connected = fullStatus.SignalState.Connected
if err := fullStatus.SignalState.Error; err != nil {
pbFullStatus.SignalState.Error = err.Error()
}
pbFullStatus.LocalPeerState.IP = fullStatus.LocalPeerState.IP
pbFullStatus.LocalPeerState.PubKey = fullStatus.LocalPeerState.PubKey
@@ -518,17 +553,34 @@ func toProtoFullStatus(fullStatus peer.FullStatus) *proto.FullStatus {
for _, peerState := range fullStatus.Peers {
pbPeerState := &proto.PeerState{
IP: peerState.IP,
PubKey: peerState.PubKey,
ConnStatus: peerState.ConnStatus.String(),
ConnStatusUpdate: timestamppb.New(peerState.ConnStatusUpdate),
Relayed: peerState.Relayed,
Direct: peerState.Direct,
LocalIceCandidateType: peerState.LocalIceCandidateType,
RemoteIceCandidateType: peerState.RemoteIceCandidateType,
Fqdn: peerState.FQDN,
IP: peerState.IP,
PubKey: peerState.PubKey,
ConnStatus: peerState.ConnStatus.String(),
ConnStatusUpdate: timestamppb.New(peerState.ConnStatusUpdate),
Relayed: peerState.Relayed,
Direct: peerState.Direct,
LocalIceCandidateType: peerState.LocalIceCandidateType,
RemoteIceCandidateType: peerState.RemoteIceCandidateType,
LocalIceCandidateEndpoint: peerState.LocalIceCandidateEndpoint,
RemoteIceCandidateEndpoint: peerState.RemoteIceCandidateEndpoint,
Fqdn: peerState.FQDN,
LastWireguardHandshake: timestamppb.New(peerState.LastWireguardHandshake),
BytesRx: peerState.BytesRx,
BytesTx: peerState.BytesTx,
}
pbFullStatus.Peers = append(pbFullStatus.Peers, pbPeerState)
}
for _, relayState := range fullStatus.Relays {
pbRelayState := &proto.RelayState{
URI: relayState.URI.String(),
Available: relayState.Err == nil,
}
if err := relayState.Err; err != nil {
pbRelayState.Error = err.Error()
}
pbFullStatus.Relays = append(pbFullStatus.Relays, pbRelayState)
}
return &pbFullStatus
}
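
With the conversions above, the relay health, last WireGuard handshake, and transfer counters are available to any daemon client over gRPC, which is what the `netbird status` command renders. Below is an illustrative sketch of a standalone client reading the new fields; the socket address and the plaintext dial options are assumptions for illustration:

```go
package main

import (
	"context"
	"fmt"
	"log"
	"time"

	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"

	"github.com/netbirdio/netbird/client/proto"
)

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	// The daemon address is an assumption (Linux default socket); adjust per platform.
	conn, err := grpc.DialContext(ctx, "unix:///var/run/netbird.sock",
		grpc.WithTransportCredentials(insecure.NewCredentials()))
	if err != nil {
		log.Fatalf("dial daemon: %v", err)
	}
	defer conn.Close()

	client := proto.NewDaemonServiceClient(conn)
	resp, err := client.Status(ctx, &proto.StatusRequest{GetFullPeerStatus: true})
	if err != nil {
		log.Fatalf("status: %v", err)
	}

	full := resp.GetFullStatus()
	// Relay (STUN/TURN) health introduced by this change.
	for _, r := range full.GetRelays() {
		fmt.Printf("relay %s available=%v error=%q\n", r.GetURI(), r.GetAvailable(), r.GetError())
	}
	// Per-peer WireGuard handshake and transfer counters introduced by this change.
	for _, p := range full.GetPeers() {
		fmt.Printf("peer %s handshake=%s rx=%d tx=%d\n",
			p.GetFqdn(), p.GetLastWireguardHandshake().AsTime().Format(time.RFC3339),
			p.GetBytesRx(), p.GetBytesTx())
	}
}
```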