[client] Prevent blocking on channel writes (#3474)

The "runningChan" provides feedback to the UI or any client about whether the service is up and running. If the client exits earlier than when the service successfully starts, then this channel causes a block.

- Added a timeout for reading from the channel so the caller is never blocked for too long
- Made the channel write operations non-blocking (see the sketch below)
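
To make the intent concrete, here is a minimal, self-contained sketch of the two patterns this change applies; it is illustrative only and not taken from the NetBird codebase (`startService` and `waitReady` are hypothetical names):

```go
package main

import (
	"context"
	"fmt"
	"time"
)

// startService signals readiness without ever blocking: the send falls
// through to the default case when the buffer is full, so this goroutine
// cannot get stuck even if nobody ever reads from the channel.
func startService(ready chan struct{}) {
	time.Sleep(100 * time.Millisecond) // pretend the engine takes a moment to start
	select {
	case ready <- struct{}{}:
	default:
	}
}

// waitReady bounds how long the caller waits for the readiness signal.
func waitReady(ctx context.Context, ready <-chan struct{}) error {
	ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
	defer cancel()

	select {
	case <-ready:
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

func main() {
	ready := make(chan struct{}, 1) // capacity 1 so an early signal is not lost
	go startService(ready)
	fmt.Println(waitReady(context.Background(), ready)) // prints <nil> once the service is up
}
```
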
Zoltan Papp 2025-03-10 13:17:09 +01:00 committed by GitHub
parent 6bef474e9e
commit aaa23beeec
3 changed files with 23 additions and 25 deletions


@@ -134,10 +134,11 @@ func (c *Client) Start(startCtx context.Context) error {
 	// either startup error (permanent backoff err) or nil err (successful engine up)
 	// TODO: make after-startup backoff err available
-	run := make(chan error, 1)
+	run := make(chan struct{}, 1)
+	clientErr := make(chan error, 1)
 	go func() {
 		if err := client.Run(run); err != nil {
-			run <- err
+			clientErr <- err
 		}
 	}()
@@ -147,13 +148,9 @@ func (c *Client) Start(startCtx context.Context) error {
 			return fmt.Errorf("stop error after context done. Stop error: %w. Context done: %w", stopErr, startCtx.Err())
 		}
 		return startCtx.Err()
-	case err := <-run:
-		if err != nil {
-			if stopErr := client.Stop(); stopErr != nil {
-				return fmt.Errorf("stop error after failed to startup. Stop error: %w. Start error: %w", stopErr, err)
-			}
-			return fmt.Errorf("startup: %w", err)
-		}
+	case err := <-clientErr:
+		return fmt.Errorf("startup: %w", err)
+	case <-run:
 	}
 
 	c.connect = client


@@ -61,7 +61,7 @@ func NewConnectClient(
 }
 
 // Run with main logic.
-func (c *ConnectClient) Run(runningChan chan error) error {
+func (c *ConnectClient) Run(runningChan chan struct{}) error {
 	return c.run(MobileDependency{}, runningChan)
 }
@@ -102,7 +102,7 @@ func (c *ConnectClient) RunOniOS(
 	return c.run(mobileDependency, nil)
 }
 
-func (c *ConnectClient) run(mobileDependency MobileDependency, runningChan chan error) error {
+func (c *ConnectClient) run(mobileDependency MobileDependency, runningChan chan struct{}) error {
 	defer func() {
 		if r := recover(); r != nil {
 			rec := c.statusRecorder
@@ -159,7 +159,6 @@ func (c *ConnectClient) run(mobileDependency MobileDependency, runningChan chan
 	}
 	defer c.statusRecorder.ClientStop()
 
-	runningChanOpen := true
 	operation := func() error {
 		// if context cancelled we not start new backoff cycle
 		if c.isContextCancelled() {
@@ -282,10 +281,11 @@ func (c *ConnectClient) run(mobileDependency MobileDependency, runningChan chan
 		log.Infof("Netbird engine started, the IP is: %s", peerConfig.GetAddress())
 		state.Set(StatusConnected)
 
-		if runningChan != nil && runningChanOpen {
-			runningChan <- nil
-			close(runningChan)
-			runningChanOpen = false
+		if runningChan != nil {
+			select {
+			case runningChan <- struct{}{}:
+			default:
+			}
 		}
 
 		<-engineCtx.Done()
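
Both callers in this commit create the channel with capacity 1 (`run` in `Client.Start` and `runningChan` in `Server.Up`), so the `default` branch above only drops a signal when one is already pending, and a late reader still observes readiness. A small stand-alone sketch of that property (hypothetical code, not part of the commit):

```go
package main

import "fmt"

// signal performs the same non-blocking send as the change above.
func signal(ready chan struct{}) {
	select {
	case ready <- struct{}{}:
	default:
	}
}

func main() {
	ready := make(chan struct{}, 1)
	signal(ready) // accepted: the buffer has room even though nobody reads yet
	signal(ready) // dropped: a signal is already pending, but nothing is lost
	<-ready       // a late reader still observes the readiness signal
	fmt.Println("readiness observed")
}
```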


@@ -160,7 +160,7 @@ func (s *Server) Start() error {
 // mechanism to keep the client connected even when the connection is lost.
 // we cancel retry if the client receive a stop or down command, or if disable auto connect is configured.
 func (s *Server) connectWithRetryRuns(ctx context.Context, config *internal.Config, statusRecorder *peer.Status,
-	runningChan chan error,
+	runningChan chan struct{},
 ) {
 	backOff := getConnectWithBackoff(ctx)
 	retryStarted := false
@@ -628,20 +628,21 @@ func (s *Server) Up(callerCtx context.Context, _ *proto.UpRequest) (*proto.UpRes
 	s.statusRecorder.UpdateManagementAddress(s.config.ManagementURL.String())
 	s.statusRecorder.UpdateRosenpass(s.config.RosenpassEnabled, s.config.RosenpassPermissive)
 
-	runningChan := make(chan error)
-	go s.connectWithRetryRuns(ctx, s.config, s.statusRecorder, runningChan)
+	timeoutCtx, cancel := context.WithTimeout(callerCtx, 10*time.Second)
+	defer cancel()
+	runningChan := make(chan struct{}, 1) // buffered channel to do not lose the signal
+	go s.connectWithRetryRuns(ctx, s.config, s.statusRecorder, runningChan)
 
 	for {
 		select {
-		case err := <-runningChan:
-			if err != nil {
-				log.Debugf("waiting for engine to become ready failed: %s", err)
-			} else {
-				return &proto.UpResponse{}, nil
-			}
+		case <-runningChan:
+			return &proto.UpResponse{}, nil
 		case <-callerCtx.Done():
			log.Debug("context done, stopping the wait for engine to become ready")
 			return nil, callerCtx.Err()
+		case <-timeoutCtx.Done():
+			log.Debug("up is timed out, stopping the wait for engine to become ready")
+			return nil, timeoutCtx.Err()
 		}
 	}
 }
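
One subtlety in the select above: `timeoutCtx` is derived from `callerCtx`, so cancelling the caller also fires the timeout case and the select may take either branch; the two cases mainly differ in the log message and the returned error. A minimal sketch of that behaviour, assuming only the standard library (not code from the commit):

```go
package main

import (
	"context"
	"fmt"
	"time"
)

func main() {
	callerCtx, cancelCaller := context.WithCancel(context.Background())
	timeoutCtx, cancel := context.WithTimeout(callerCtx, 10*time.Second)
	defer cancel()

	ready := make(chan struct{}, 1) // never signalled in this sketch

	go func() {
		time.Sleep(50 * time.Millisecond)
		cancelCaller() // the caller gives up before the engine becomes ready
	}()

	// Cancelling callerCtx also cancels timeoutCtx, so both Done channels
	// become ready and either of the last two cases may be chosen.
	select {
	case <-ready:
		fmt.Println("engine ready")
	case <-callerCtx.Done():
		fmt.Println("caller cancelled:", callerCtx.Err())
	case <-timeoutCtx.Done():
		fmt.Println("timed out or caller cancelled:", timeoutCtx.Err())
	}
}
```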