[client] Avoid deadlock when auto connect and early exit (#2528)

Maycon Santos 2024-09-04 19:22:33 +02:00 committed by GitHub
parent 1ff7a953a0
commit c52b406afa
2 changed files with 28 additions and 18 deletions
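For context on the title: before this change the daemon waited for engine start-up with a sync.WaitGroup, and runningWg.Done() was only called once the engine reached the connected state. If the client exited early instead (for example on a permission-denied login error), Done() was never called and the waiter blocked forever on runningWg.Wait(). The sketch below is illustrative only, not NetBird code (startEngine and readyCh are made-up names); it shows the replacement pattern this commit adopts: the worker reports success or failure on a chan error, and the waiter selects on that channel and on context cancellation, so neither an early exit nor a cancelled caller can wedge it.

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// startEngine stands in for the real engine start-up. Instead of calling
// WaitGroup.Done, it reports the outcome on readyCh, so the waiter is woken
// up on failure as well as on success.
func startEngine(ctx context.Context, readyCh chan error) {
	select {
	case <-time.After(100 * time.Millisecond): // pretend start-up work
	case <-ctx.Done():
		return
	}
	// Simulate an early, unrecoverable exit (e.g. permission denied).
	// With a WaitGroup, this path would simply never call Done().
	readyCh <- errors.New("login required")
	close(readyCh)
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()

	readyCh := make(chan error)
	go startEngine(ctx, readyCh)

	// The waiter always makes progress: engine ready, engine failed,
	// or the caller's context expired.
	select {
	case err := <-readyCh:
		if err != nil {
			fmt.Println("engine failed to start:", err)
			return
		}
		fmt.Println("engine is ready")
	case <-ctx.Done():
		fmt.Println("gave up waiting:", ctx.Err())
	}
}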

Changed file 1 of 2

@@ -61,9 +61,9 @@ func (c *ConnectClient) Run() error {
 // RunWithProbes runs the client's main logic with probes attached
 func (c *ConnectClient) RunWithProbes(
 	probes *ProbeHolder,
-	runningWg *sync.WaitGroup,
+	runningChan chan error,
 ) error {
-	return c.run(MobileDependency{}, probes, runningWg)
+	return c.run(MobileDependency{}, probes, runningChan)
 }
 
 // RunOnAndroid with main logic on mobile system
@@ -104,7 +104,7 @@ func (c *ConnectClient) RunOniOS(
 func (c *ConnectClient) run(
 	mobileDependency MobileDependency,
 	probes *ProbeHolder,
-	runningWg *sync.WaitGroup,
+	runningChan chan error,
 ) error {
 	defer func() {
 		if r := recover(); r != nil {
@@ -195,6 +195,7 @@ func (c *ConnectClient) run(
 		log.Debug(err)
 		if s, ok := gstatus.FromError(err); ok && (s.Code() == codes.PermissionDenied) {
 			state.Set(StatusNeedsLogin)
+			_ = c.Stop()
 			return backoff.Permanent(wrapErr(err)) // unrecoverable error
 		}
 		return wrapErr(err)
@@ -263,8 +264,9 @@ func (c *ConnectClient) run(
 	log.Infof("Netbird engine started, the IP is: %s", peerConfig.GetAddress())
 	state.Set(StatusConnected)
 
-	if runningWg != nil {
-		runningWg.Done()
+	if runningChan != nil {
+		runningChan <- nil
+		close(runningChan)
 	}
 
 	<-engineCtx.Done()
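The send-then-close above gives the readiness signal one-shot semantics: the first receiver gets the explicit nil "engine is up" value, and because the channel is closed right after, any later receive returns the zero value immediately instead of blocking. A tiny standalone illustration of that property (toy code, not from the repository):

package main

import "fmt"

func main() {
	ch := make(chan error)
	go func() {
		ch <- nil // first receive gets the explicit "ready" signal
		close(ch) // every later receive returns the zero value without blocking
	}()
	fmt.Println(<-ch) // <nil>, the sent value
	fmt.Println(<-ch) // <nil>, zero value from the closed channel
}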
@@ -287,6 +289,7 @@ func (c *ConnectClient) run(
 		log.Debugf("exiting client retry loop due to unrecoverable error: %s", err)
 		if s, ok := gstatus.FromError(err); ok && (s.Code() == codes.PermissionDenied) {
 			state.Set(StatusNeedsLogin)
+			_ = c.Stop()
 		}
 		return err
 	}

Changed file 2 of 2

@@ -142,13 +142,11 @@ func (s *Server) Start() error {
 		s.sessionWatcher.SetOnExpireListener(s.onSessionExpire)
 	}
 
-	runningWg := sync.WaitGroup{}
-	runningWg.Add(1)
-	if !config.DisableAutoConnect {
-		go s.connectWithRetryRuns(ctx, config, s.statusRecorder, &runningWg)
+	if config.DisableAutoConnect {
+		return nil
 	}
-	runningWg.Wait()
 
+	go s.connectWithRetryRuns(ctx, config, s.statusRecorder, nil)
 
 	return nil
 }
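With auto-connect disabled, Start() now returns before launching anything; with it enabled, connectWithRetryRuns is started with a nil channel because nothing in Start() waits for readiness. That is only safe because the send site shown earlier is guarded by if runningChan != nil: in Go, a send on a nil channel blocks forever. A small demonstration of the guard, with assumed names rather than repository code:

package main

import "fmt"

// notifyReady reports readiness only when a listener registered a channel;
// an unguarded send on a nil channel would block this goroutine forever.
func notifyReady(readyCh chan error) {
	if readyCh != nil {
		readyCh <- nil
		close(readyCh)
	}
}

func main() {
	// Fire-and-forget path (like Server.Start): no listener, nothing blocks.
	notifyReady(nil)
	fmt.Println("start path returned immediately")

	// Waiting path (like Server.Up): the caller receives the signal.
	readyCh := make(chan error)
	go notifyReady(readyCh)
	fmt.Println("up path received:", <-readyCh)
}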
@@ -157,7 +155,7 @@ func (s *Server) Start() error {
 // mechanism to keep the client connected even when the connection is lost.
 // we cancel retry if the client receive a stop or down command, or if disable auto connect is configured.
 func (s *Server) connectWithRetryRuns(ctx context.Context, config *internal.Config, statusRecorder *peer.Status,
-	runningWg *sync.WaitGroup,
+	runningChan chan error,
 ) {
 	backOff := getConnectWithBackoff(ctx)
 	retryStarted := false
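The comment above states the retry contract: keep reconnecting with backoff until the context is cancelled (a stop/down command, or auto-connect disabled), and give up permanently on unrecoverable errors such as permission denied. A condensed sketch of that shape using the github.com/cenkalti/backoff/v4 package (the backoff.Permanent call in the client hunk above suggests this family of API, though the exact import is not shown here); connectOnce and errNeedsLogin are hypothetical stand-ins:

package main

import (
	"context"
	"errors"
	"log"

	"github.com/cenkalti/backoff/v4"
)

var errNeedsLogin = errors.New("permission denied: needs login")

// connectOnce stands in for one connection attempt of the real client.
func connectOnce() error { return errNeedsLogin }

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Exponential backoff that stops as soon as ctx is cancelled
	// (e.g. a stop/down command received by the daemon).
	bo := backoff.WithContext(backoff.NewExponentialBackOff(), ctx)

	err := backoff.Retry(func() error {
		if err := connectOnce(); err != nil {
			if errors.Is(err, errNeedsLogin) {
				// Unrecoverable: stop retrying instead of looping forever.
				return backoff.Permanent(err)
			}
			return err // transient: retry after the next backoff interval
		}
		return nil
	}, bo)
	if err != nil {
		log.Printf("client retry loop exited: %v", err)
	}
}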
@@ -196,7 +194,7 @@ func (s *Server) connectWithRetryRuns(ctx context.Context, config *internal.Conf
 			WgProbe: s.wgProbe,
 		}
 
-		err := s.connectClient.RunWithProbes(&probes, runningWg)
+		err := s.connectClient.RunWithProbes(&probes, runningChan)
 		if err != nil {
 			log.Debugf("run client connection exited with error: %v. Will retry in the background", err)
 		}
@@ -587,13 +585,22 @@ func (s *Server) Up(callerCtx context.Context, _ *proto.UpRequest) (*proto.UpRes
 	s.statusRecorder.UpdateManagementAddress(s.config.ManagementURL.String())
 	s.statusRecorder.UpdateRosenpass(s.config.RosenpassEnabled, s.config.RosenpassPermissive)
 
-	runningWg := sync.WaitGroup{}
-	runningWg.Add(1)
-	go s.connectWithRetryRuns(ctx, s.config, s.statusRecorder, &runningWg)
+	runningChan := make(chan error)
+	go s.connectWithRetryRuns(ctx, s.config, s.statusRecorder, runningChan)
 
-	runningWg.Wait()
-	return &proto.UpResponse{}, nil
+	for {
+		select {
+		case err := <-runningChan:
+			if err != nil {
+				log.Debugf("waiting for engine to become ready failed: %s", err)
+			} else {
+				return &proto.UpResponse{}, nil
+			}
+		case <-callerCtx.Done():
+			log.Debug("context done, stopping the wait for engine to become ready")
+			return nil, callerCtx.Err()
+		}
+	}
 }
 
 // Down engine work in the daemon.
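The reworked Up() changes the handler's blocking behaviour in two ways: a failed start attempt no longer ends the wait (the loop keeps listening so a later retry can still succeed), and cancellation of the caller's gRPC context now unblocks the handler, whereas runningWg.Wait() could previously park it indefinitely. A compact sketch of that wait-until-ready-or-cancelled loop, with hypothetical names:

package main

import (
	"context"
	"log"
)

// waitForEngine blocks until the engine reports a successful start or the
// caller gives up. Failed attempts are logged and the wait continues, which
// mirrors how the retry loop keeps feeding resultCh until a start succeeds.
func waitForEngine(ctx context.Context, resultCh <-chan error) error {
	for {
		select {
		case err := <-resultCh:
			if err != nil {
				log.Printf("start attempt failed, waiting for retry: %v", err)
				continue
			}
			return nil // engine is up
		case <-ctx.Done():
			return ctx.Err() // caller cancelled or timed out
		}
	}
}

func main() {
	resultCh := make(chan error, 1)
	resultCh <- nil // pretend the first attempt succeeded
	if err := waitForEngine(context.Background(), resultCh); err == nil {
		log.Println("engine ready")
	}
}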