Gracefully handle quick relaunches of VNet daemon (#44271)

* Use a timeout when waiting for TUN device * Return error if VNEDaemonService receives another startVnet message * Sleep and call daemon again on VNEAlreadyRunningError * Reduce ThrottleInterval to 5 seconds * Define VNEErrorDomain as C string * Define error code as int, make ErrAlreadyRunning private * Simplify and rename sleep to sleepOrDone * Document error_code of StartVnetResult * Use extern for VNEAlreadyRunningError
gravitational · Jul 19, 2024 · 58d431d · 58d431d
1 parent cbf2a9d
commit 58d431d
Show file tree

Hide file tree

Showing 10 changed files with 148 additions and 23 deletions.
diff --git a/...cos/tsh/tsh.app/Contents/Library/LaunchDaemons/com.gravitational.teleport.tsh.vnetd.plist b/...cos/tsh/tsh.app/Contents/Library/LaunchDaemons/com.gravitational.teleport.tsh.vnetd.plist
@@ -20,5 +20,7 @@
 	<string>/var/log/vnet.log</string>
 	<key>StandardOutPath</key>
 	<string>/var/log/vnet.log</string>
+	<key>ThrottleInterval</key>
+	<integer>5</integer>
 </dict>
 </plist>
diff --git a/...ets/macos/tshdev/tsh.app/Contents/Library/LaunchDaemons/com.goteleport.tshdev.vnetd.plist b/...ets/macos/tshdev/tsh.app/Contents/Library/LaunchDaemons/com.goteleport.tshdev.vnetd.plist
@@ -20,5 +20,7 @@
 	<string>/var/log/vnet.log</string>
 	<key>StandardOutPath</key>
 	<string>/var/log/vnet.log</string>
+	<key>ThrottleInterval</key>
+	<integer>5</integer>
 </dict>
 </plist>
diff --git a/lib/vnet/daemon/client_darwin.go b/lib/vnet/daemon/client_darwin.go
@@ -23,6 +23,7 @@ package daemon
 // #cgo LDFLAGS: -framework Foundation -framework ServiceManagement
 // #include <stdlib.h>
 // #include "client_darwin.h"
+// #include "protocol_darwin.h"
 import "C"
 
 import (
@@ -79,8 +80,27 @@ func RegisterAndCall(ctx context.Context, config Config) error {
 		}
 	}
 
-	if err := startByCalling(ctx, bundlePath, config); err != nil {
-		return trace.Wrap(err)
+	if err = startByCalling(ctx, bundlePath, config); err != nil {
+		if !errors.Is(err, errAlreadyRunning) {
+			return trace.Wrap(err)
+		}
+
+		// If the daemon was already running, it might mean two things:
+		//
+		// 1. The user attempted to start a second instance of VNet.
+		// 2. The user has stopped the previous instance of VNet and immediately started a new one,
+		// before the daemon had a chance to notice that the previous instance was stopped and exit too.
+		//
+		// In the second case, we want to wait and repeat the call to the daemon, in case the daemon was
+		// just about to quit.
+		log.DebugContext(ctx, "VNet daemon is already running, waiting to see if it's going to shut down")
+		if err := sleepOrDone(ctx, 2*CheckUnprivilegedProcessInterval); err != nil {
+			return trace.Wrap(err)
+		}
+
+		if err := startByCalling(ctx, bundlePath, config); err != nil {
+			return trace.Wrap(err)
+		}
 	}
 
 	// TODO(ravicious): Implement monitoring the state of the daemon.
@@ -125,17 +145,17 @@ func register(ctx context.Context, bundlePath string) (ServiceStatus, error) {
 	return ServiceStatus(result.service_status), nil
 }
 
-const waitingForEnablementTimeout = time.Minute
-
-var waitingForEnablementTimeoutExceeded = errors.New("the login item was not enabled within the timeout")
-
 // waitForEnablement periodically checks if the status of the daemon has changed to
 // [serviceStatusEnabled]. This happens when the user approves the login item in system settings.
 func waitForEnablement(ctx context.Context, bundlePath string) error {
 	ticker := time.NewTicker(time.Second)
 	defer ticker.Stop()
 
-	ctx, cancel := context.WithTimeoutCause(ctx, waitingForEnablementTimeout, waitingForEnablementTimeoutExceeded)
+	// It should be less than receiveTunTimeout in the vnet package
+	// so that the user sees the error about the background item first.
+	const waitingForEnablementTimeout = 50 * time.Second
+	ctx, cancel := context.WithTimeoutCause(ctx, waitingForEnablementTimeout,
+		errors.New("the background item was not enabled within the timeout"))
 	defer cancel()
 
 	for {
@@ -276,7 +296,15 @@ func startByCalling(ctx context.Context, bundlePath string, config Config) error
 		C.StartVnet(&req, &res)
 
 		if !res.ok {
-			errC <- trace.Errorf("could not start VNet daemon: %v (%v)", C.GoString(res.error_description), C.GoString(res.error_domain))
+			errorDomain := C.GoString(res.error_domain)
+			errorCode := int(res.error_code)
+
+			if errorDomain == vnetErrorDomain && errorCode == errorCodeAlreadyRunning {
+				errC <- trace.Wrap(errAlreadyRunning)
+				return
+			}
+
+			errC <- trace.Errorf("could not start VNet daemon: %v", C.GoString(res.error_description))
 			return
 		}
 
@@ -290,3 +318,24 @@ func startByCalling(ctx context.Context, bundlePath string, config Config) error
 		return trace.Wrap(err, "connecting to the VNet daemon")
 	}
 }
+
+var (
+	// vnetErrorDomain is a custom error domain used for Objective-C errors that pertain to VNet.
+	vnetErrorDomain = C.GoString(C.VNEErrorDomain)
+	// errorCodeAlreadyRunning is returned within [vnetErrorDomain] errors to indicate that the daemon
+	// received a message to start after it was already running.
+	errorCodeAlreadyRunning = int(C.VNEAlreadyRunningError)
+	errAlreadyRunning       = errors.New("VNet is already running")
+)
+
+func sleepOrDone(ctx context.Context, d time.Duration) error {
+	timer := time.NewTimer(d)
+	defer timer.Stop()
+
+	select {
+	case <-ctx.Done():
+		return trace.Wrap(ctx.Err())
+	case <-timer.C:
+		return nil
+	}
+}
diff --git a/lib/vnet/daemon/client_darwin.h b/lib/vnet/daemon/client_darwin.h
@@ -45,6 +45,9 @@ typedef struct StartVnetRequest {
 typedef struct StartVnetResult {
   bool ok;
   const char *error_domain;
+  // error_code is code taken from an NSError instance encountered during the call to StartVnet.
+  // If ok is false, error_code is greater than zero and identifies the reason behind the error.
+  int error_code;
   const char *error_description;
 } StartVnetResult;
 

diff --git a/lib/vnet/daemon/client_darwin.m b/lib/vnet/daemon/client_darwin.m
@@ -117,8 +117,8 @@ - (void)startVnet:(VnetConfig *)vnetConfig completion:(void (^)(NSError *))compl
   }];
 
   [(id<VNEDaemonProtocol>)proxy startVnet:vnetConfig
-                               completion:^(void) {
-                                 completion(nil);
+                               completion:^(NSError *error) {
+                                 completion(error);
                                }];
 }
 
@@ -143,6 +143,7 @@ void StartVnet(StartVnetRequest *request, StartVnetResult *outResult) {
                completion:^(NSError *error) {
                  if (error) {
                    outResult->error_domain = VNECopyNSString([error domain]);
+                   outResult->error_code = (int)[error code];
                    outResult->error_description = VNECopyNSString([error description]);
                    dispatch_semaphore_signal(sema);
                    return;

diff --git a/lib/vnet/daemon/common.go b/lib/vnet/daemon/common.go
@@ -17,6 +17,8 @@
 package daemon
 
 import (
+	"time"
+
 	"github.com/gravitational/trace"
 )
 
@@ -47,3 +49,7 @@ func (c *Config) CheckAndSetDefaults() error {
 	}
 	return nil
 }
+
+// CheckUnprivilegedProcessInterval denotes how often the admin process should check if the
+// unprivileged process has quit.
+const CheckUnprivilegedProcessInterval = time.Second
diff --git a/lib/vnet/daemon/protocol_darwin.h b/lib/vnet/daemon/protocol_darwin.h
@@ -1,6 +1,16 @@
 #ifndef TELEPORT_LIB_VNET_DAEMON_PROTOCOL_DARWIN_H_
 #define TELEPORT_LIB_VNET_DAEMON_PROTOCOL_DARWIN_H_
 
+#import <Foundation/Foundation.h>
+
+// VNEErrorDomain is a custom error domain used for Objective-C errors that pertain to VNet.
+extern const char* const VNEErrorDomain;
+
+// VNEAlreadyRunningError indicates that the daemon already received a VNet config.
+// It won't accept a new one during its lifetime, instead it's expected to stop, after
+// which the client might spawn a new instance of the daemon.
+extern const int VNEAlreadyRunningError;
+
 typedef struct VnetConfig {
   const char *socket_path;
   const char *ipv6_prefix;
@@ -9,11 +19,13 @@ typedef struct VnetConfig {
 } VnetConfig;
 
 @protocol VNEDaemonProtocol
-// startVnet starts VNet in a separate thread and returns immediately.
-// Only the first call to this method starts VNet. Subsequent calls are noops.
-// The daemon process exits after VNet is stopped, after which it can be
-// spawned again by calling this method.
-- (void)startVnet:(VnetConfig *)vnetConfig completion:(void (^)(void))completion;
+// startVnet passes the config back to Go code (which then starts VNet in a separate thread)
+// and returns immediately.
+//
+// Only the first call to this method starts VNet. Subsequent calls return VNEAlreadyRunningError.
+// The daemon process exits after VNet is stopped, after which it can be spawned again by calling
+// this method.
+- (void)startVnet:(VnetConfig *)vnetConfig completion:(void (^)(NSError *error))completion;
 @end
 
 #endif /* TELEPORT_LIB_VNET_DAEMON_PROTOCOL_DARWIN_H_ */
diff --git a/lib/vnet/daemon/protocol_darwin.m b/lib/vnet/daemon/protocol_darwin.m
@@ -0,0 +1,22 @@
+//go:build vnetdaemon
+// +build vnetdaemon
+
+// Teleport
+// Copyright (C) 2024 Gravitational, Inc.
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+const char* const VNEErrorDomain = "com.Gravitational.Vnet.ErrorDomain";
+
+const int VNEAlreadyRunningError = 1;
diff --git a/lib/vnet/daemon/service_darwin.m b/lib/vnet/daemon/service_darwin.m
@@ -29,7 +29,10 @@
 @interface VNEDaemonService () <NSXPCListenerDelegate, VNEDaemonProtocol>
 
 @property(nonatomic, strong, readwrite) NSXPCListener *listener;
+// started describes whether the XPC listener is listening for new connections.
 @property(nonatomic, readwrite) BOOL started;
+// gotConfig describes if the daemon received a VNet config from a client.
+@property(nonatomic, readwrite) BOOL gotConfig;
 
 @property(nonatomic, readwrite) NSString *socketPath;
 @property(nonatomic, readwrite) NSString *ipv6Prefix;
@@ -76,14 +79,31 @@ - (void)waitForVnetConfig {
 
 #pragma mark - VNEDaemonProtocol
 
-- (void)startVnet:(VnetConfig *)vnetConfig completion:(void (^)(void))completion {
+- (void)startVnet:(VnetConfig *)vnetConfig completion:(void (^)(NSError *error))completion {
   @synchronized(self) {
+    // startVnet is expected to be called only once per daemon's lifetime.
+    // Between the process with the daemon client exiting and the admin process (which runs the
+    // daemon) noticing this and exiting as well, a new client can be spawned and startVnet might
+    // end up getting called again.
+    //
+    // In such scenarios, we want to return an error so that the client can wait for the daemon
+    // to exit and retry the call.
+    if (_gotConfig) {
+      NSError *error = [[NSError alloc] initWithDomain:@(VNEErrorDomain)
+                                                  code:VNEAlreadyRunningError
+                                              userInfo:nil];
+      completion(error);
+      return;
+    }
+
+    _gotConfig = YES;
     _socketPath = @(vnetConfig->socket_path);
     _ipv6Prefix = @(vnetConfig->ipv6_prefix);
     _dnsAddr = @(vnetConfig->dns_addr);
     _homePath = @(vnetConfig->home_path);
+
     dispatch_semaphore_signal(_gotVnetConfigSema);
-    completion();
+    completion(nil);
   }
 }
 

diff --git a/lib/vnet/setup.go b/lib/vnet/setup.go
@@ -18,6 +18,7 @@ package vnet
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"log/slog"
 	"os"
@@ -95,9 +96,16 @@ func SetupAndRun(ctx context.Context, config *SetupAndRunConfig) (*ProcessManage
 		recvTUNErr <- err
 	}()
 
+	// It should be more than waitingForEnablementTimeout in the vnet/daemon package
+	// so that the user sees the error about the background item first.
+	const receiveTunTimeout = time.Minute
+	receiveTunCtx, cancel := context.WithTimeoutCause(ctx, receiveTunTimeout,
+		errors.New("admin process did not send back TUN device within timeout"))
+	defer cancel()
+
 	select {
-	case <-ctx.Done():
-		return nil, trace.Wrap(ctx.Err())
+	case <-receiveTunCtx.Done():
+		return nil, trace.Wrap(context.Cause(receiveTunCtx))
 	case <-processCtx.Done():
 		return nil, trace.Wrap(context.Cause(processCtx))
 	case err := <-recvTUNErr:
@@ -110,7 +118,7 @@ func SetupAndRun(ctx context.Context, config *SetupAndRunConfig) (*ProcessManage
 				slog.DebugContext(ctx, "Error from recvTUNErr ignored in favor of processCtx.Err", "error", err)
 				return nil, trace.Wrap(context.Cause(processCtx))
 			}
-			return nil, trace.Wrap(err, "receiving TUN from admin process")
+			return nil, trace.Wrap(err, "receiving TUN device from admin process")
 		}
 	}
 
@@ -230,9 +238,9 @@ func AdminSetup(ctx context.Context, config daemon.Config) error {
 	}()
 
 	// Stay alive until we get an error on errCh, indicating that the osConfig loop exited.
-	// If the socket is deleted, indicating that the parent process exited, cancel the context and then wait
-	// for the osConfig loop to exit and send an err on errCh.
-	ticker := time.NewTicker(time.Second)
+	// If the socket is deleted, indicating that the unprivileged process exited, cancel the context
+	// and then wait for the osConfig loop to exit and send an err on errCh.
+	ticker := time.NewTicker(daemon.CheckUnprivilegedProcessInterval)
 	defer ticker.Stop()
 	for {
 		select {