Monitor network changes and restart engine on detection (#1904)

This commit is contained in:
Viktor Liu
2024-05-07 18:50:34 +02:00
committed by GitHub
parent 2e0047daea
commit 920877964f
38 changed files with 1027 additions and 190 deletions

View File

@ -2,6 +2,7 @@ package internal
import (
"context"
"errors"
"fmt"
"math/rand"
"net"
@ -21,6 +22,7 @@ import (
"github.com/netbirdio/netbird/client/firewall/manager"
"github.com/netbirdio/netbird/client/internal/acl"
"github.com/netbirdio/netbird/client/internal/dns"
"github.com/netbirdio/netbird/client/internal/networkmonitor"
"github.com/netbirdio/netbird/client/internal/peer"
"github.com/netbirdio/netbird/client/internal/relay"
"github.com/netbirdio/netbird/client/internal/rosenpass"
@ -60,6 +62,9 @@ type EngineConfig struct {
// WgPrivateKey is a Wireguard private key of our peer (it MUST never leave the machine)
WgPrivateKey wgtypes.Key
// NetworkMonitor is a flag to enable network monitoring
NetworkMonitor bool
// IFaceBlackList is a list of network interfaces to ignore when discovering connection candidates (ICE related)
IFaceBlackList []string
DisableIPv6Discovery bool
@ -114,9 +119,11 @@ type Engine struct {
// clientRoutes is the most recent list of clientRoutes received from the Management Service
clientRoutes route.HAMap
cancel context.CancelFunc
clientCtx context.Context
clientCancel context.CancelFunc
ctx context.Context
ctx context.Context
cancel context.CancelFunc
wgInterface *iface.WGIface
wgProxyFactory *wgproxy.Factory
@ -126,6 +133,8 @@ type Engine struct {
// networkSerial is the latest CurrentSerial (state ID) of the network sent by the Management service
networkSerial uint64
networkWatcher *networkmonitor.NetworkWatcher
sshServerFunc func(hostKeyPEM []byte, addr string) (nbssh.Server, error)
sshServer nbssh.Server
@ -151,8 +160,8 @@ type Peer struct {
// NewEngine creates a new Connection Engine
func NewEngine(
ctx context.Context,
cancel context.CancelFunc,
clientCtx context.Context,
clientCancel context.CancelFunc,
signalClient signal.Client,
mgmClient mgm.Client,
config *EngineConfig,
@ -160,8 +169,8 @@ func NewEngine(
statusRecorder *peer.Status,
) *Engine {
return NewEngineWithProbes(
ctx,
cancel,
clientCtx,
clientCancel,
signalClient,
mgmClient,
config,
@ -176,8 +185,8 @@ func NewEngine(
// NewEngineWithProbes creates a new Connection Engine with probes attached
func NewEngineWithProbes(
ctx context.Context,
cancel context.CancelFunc,
clientCtx context.Context,
clientCancel context.CancelFunc,
signalClient signal.Client,
mgmClient mgm.Client,
config *EngineConfig,
@ -188,9 +197,10 @@ func NewEngineWithProbes(
relayProbe *Probe,
wgProbe *Probe,
) *Engine {
return &Engine{
ctx: ctx,
cancel: cancel,
clientCtx: clientCtx,
clientCancel: clientCancel,
signal: signalClient,
mgmClient: mgmClient,
peerConns: make(map[string]*peer.Conn),
@ -202,7 +212,7 @@ func NewEngineWithProbes(
networkSerial: 0,
sshServerFunc: nbssh.DefaultSSHServer,
statusRecorder: statusRecorder,
wgProxyFactory: wgproxy.NewFactory(config.WgPort),
networkWatcher: networkmonitor.New(),
mgmProbe: mgmProbe,
signalProbe: signalProbe,
relayProbe: relayProbe,
@ -214,6 +224,13 @@ func (e *Engine) Stop() error {
e.syncMsgMux.Lock()
defer e.syncMsgMux.Unlock()
if e.cancel != nil {
e.cancel()
}
// stopping network monitor first to avoid starting the engine again
e.networkWatcher.Stop()
err := e.removeAllPeers()
if err != nil {
return err
@ -222,7 +239,7 @@ func (e *Engine) Stop() error {
e.clientRoutes = nil
// very ugly but we want to remove peers from the WireGuard interface first before removing interface.
// Removing peers happens in the conn.CLose() asynchronously
// Removing peers happens in the conn.Close() asynchronously
time.Sleep(500 * time.Millisecond)
e.close()
@ -237,6 +254,13 @@ func (e *Engine) Start() error {
e.syncMsgMux.Lock()
defer e.syncMsgMux.Unlock()
if e.cancel != nil {
e.cancel()
}
e.ctx, e.cancel = context.WithCancel(e.clientCtx)
e.wgProxyFactory = wgproxy.NewFactory(e.clientCtx, e.config.WgPort)
wgIface, err := e.newWgIface()
if err != nil {
log.Errorf("failed creating wireguard interface instance %s: [%s]", e.config.WgIfaceName, err)
@ -320,6 +344,21 @@ func (e *Engine) Start() error {
e.receiveManagementEvents()
e.receiveProbeEvents()
if e.config.NetworkMonitor {
// starting network monitor at the very last to avoid disruptions
go e.networkWatcher.Start(e.ctx, func() {
log.Infof("Network monitor detected network change, restarting engine")
if err := e.Stop(); err != nil {
log.Errorf("Failed to stop engine: %v", err)
}
if err := e.Start(); err != nil {
log.Errorf("Failed to start engine: %v", err)
}
})
} else {
log.Infof("Network monitor is disabled, not starting")
}
return nil
}
@ -588,12 +627,12 @@ func (e *Engine) updateConfig(conf *mgmProto.PeerConfig) error {
// E.g. when a new peer has been registered and we are allowed to connect to it.
func (e *Engine) receiveManagementEvents() {
go func() {
err := e.mgmClient.Sync(e.handleSync)
err := e.mgmClient.Sync(e.ctx, e.handleSync)
if err != nil {
// happens if management is unavailable for a long time.
// We want to cancel the operation of the whole client
_ = CtxGetState(e.ctx).Wrap(ErrResetConnection)
e.cancel()
e.clientCancel()
return
}
log.Debugf("stopped receiving updates from Management Service")
@ -869,11 +908,12 @@ func (e *Engine) connWorker(conn *peer.Conn, peerKey string) {
conn.UpdateStunTurn(append(e.STUNs, e.TURNs...))
e.syncMsgMux.Unlock()
err := conn.Open()
err := conn.Open(e.ctx)
if err != nil {
log.Debugf("connection to peer %s failed: %v", peerKey, err)
switch err.(type) {
case *peer.ConnectionClosedError:
var connectionClosedError *peer.ConnectionClosedError
switch {
case errors.As(err, &connectionClosedError):
// conn has been forced to close, so we exit the loop
return
default:
@ -984,7 +1024,7 @@ func (e *Engine) createPeerConn(pubKey string, allowedIPs string) (*peer.Conn, e
func (e *Engine) receiveSignalEvents() {
go func() {
// connect to a stream of messages coming from the signal server
err := e.signal.Receive(func(msg *sProto.Message) error {
err := e.signal.Receive(e.ctx, func(msg *sProto.Message) error {
e.syncMsgMux.Lock()
defer e.syncMsgMux.Unlock()
@ -1058,7 +1098,7 @@ func (e *Engine) receiveSignalEvents() {
// happens if signal is unavailable for a long time.
// We want to cancel the operation of the whole client
_ = CtxGetState(e.ctx).Wrap(ErrResetConnection)
e.cancel()
e.clientCancel()
return
}
}()
@ -1119,13 +1159,16 @@ func (e *Engine) parseNATExternalIPMappings() []string {
}
func (e *Engine) close() {
if err := e.wgProxyFactory.Free(); err != nil {
log.Errorf("failed closing ebpf proxy: %s", err)
if e.wgProxyFactory != nil {
if err := e.wgProxyFactory.Free(); err != nil {
log.Errorf("failed closing ebpf proxy: %s", err)
}
}
// stop/restore DNS first so dbus and friends don't complain because of a missing interface
if e.dnsServer != nil {
e.dnsServer.Stop()
e.dnsServer = nil
}
if e.routeManager != nil {