Graceful conn worker shutdown (#2022)

Because the connWorker goroutines operate on the e.peerConns list, we must ensure all workers have exited before we modify the contents of e.peerConns.
If we do not, the engine will start new connWorkers for the existing entries, and they will open connections to the same peers in parallel.
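
A minimal sketch of the pattern this change adopts, assuming an engine that runs one conn worker goroutine per peer (the type and field names below are illustrative, not the actual NetBird engine API): each worker is registered with a sync.WaitGroup before it starts, and the peer list is only rebuilt after Wait() has confirmed that every worker exited.

package main

import (
	"context"
	"fmt"
	"sync"
	"time"
)

// engine is a stand-in for the real Engine: it owns a peer list and one
// conn worker goroutine per peer.
type engine struct {
	ctx    context.Context
	cancel context.CancelFunc
	wg     sync.WaitGroup
	mu     sync.Mutex
	peers  map[string]struct{}
}

func (e *engine) addPeer(key string) {
	e.mu.Lock()
	e.peers[key] = struct{}{}
	e.mu.Unlock()

	e.wg.Add(1) // register the worker before it starts
	go e.connWorker(key)
}

func (e *engine) connWorker(key string) {
	defer e.wg.Done()
	for {
		// cancellable wait instead of a bare time.Sleep
		select {
		case <-e.ctx.Done():
			return
		case <-time.After(500 * time.Millisecond):
		}
		// ... retry the connection to the peer here ...
	}
}

func (e *engine) stop() {
	e.cancel()  // unblocks every worker's select
	e.wg.Wait() // all workers have exited; the peer list is safe to modify
	e.mu.Lock()
	e.peers = map[string]struct{}{}
	e.mu.Unlock()
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	e := &engine{ctx: ctx, cancel: cancel, peers: map[string]struct{}{}}
	e.addPeer("peer-a")
	time.Sleep(time.Second)
	e.stop()
	fmt.Println("all conn workers exited")
}

Without the Wait() call, stop() could reset the peer list while an old worker is still looping over it, and a subsequent addPeer for the same key would start a second worker connecting to the same peer in parallel, which is exactly the situation described above.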
Zoltan Papp
2024-05-22 11:15:29 +02:00
committed by GitHub
parent b8717b8956
commit 61034aaf4d
2 changed files with 19 additions and 2 deletions


@@ -150,6 +150,8 @@ type Engine struct {
 	signalProbe *Probe
 	relayProbe *Probe
 	wgProbe *Probe
+
+	wgConnWorker sync.WaitGroup
 }

 // Peer is an instance of the Connection Peer
@@ -245,6 +247,7 @@ func (e *Engine) Stop() error {
 	time.Sleep(500 * time.Millisecond)
 	e.close()
+	e.wgConnWorker.Wait()
 	log.Infof("stopped Netbird Engine")
 	return nil
 }
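
The connWorker hunk below replaces the bare time.Sleep with a select on the engine's context, so a worker wakes up as soon as shutdown begins instead of keeping Stop() blocked in wgConnWorker.Wait(). A small standalone sketch of that cancellable, jittered back-off (the helper name sleepWithJitter is illustrative, not part of the codebase):

package main

import (
	"context"
	"fmt"
	"math/rand"
	"time"
)

// sleepWithJitter blocks for a random duration in [min, max) milliseconds and
// returns true, or returns false as soon as ctx is cancelled, letting the
// caller leave its retry loop during shutdown.
func sleepWithJitter(ctx context.Context, min, max int) bool {
	duration := time.Duration(rand.Intn(max-min)+min) * time.Millisecond
	select {
	case <-ctx.Done():
		return false
	case <-time.After(duration):
		return true
	}
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	go func() {
		time.Sleep(200 * time.Millisecond)
		cancel() // simulate Stop() shutting the engine down
	}()
	if !sleepWithJitter(ctx, 500, 2000) {
		fmt.Println("cancelled before the back-off elapsed")
	}
}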
@@ -869,18 +872,25 @@ func (e *Engine) addNewPeer(peerConfig *mgmProto.RemotePeerConfig) error {
 			log.Warnf("error adding peer %s to status recorder, got error: %v", peerKey, err)
 		}
+		e.wgConnWorker.Add(1)
 		go e.connWorker(conn, peerKey)
 	}
 	return nil
 }

 func (e *Engine) connWorker(conn *peer.Conn, peerKey string) {
+	defer e.wgConnWorker.Done()
 	for {
 		// randomize starting time a bit
 		min := 500
 		max := 2000
-		time.Sleep(time.Duration(rand.Intn(max-min)+min) * time.Millisecond)
+		duration := time.Duration(rand.Intn(max-min)+min) * time.Millisecond
+		select {
+		case <-e.ctx.Done():
+			return
+		case <-time.After(duration):
+		}
 		// if peer has been removed -> give up
 		if !e.peerExists(peerKey) {