Skip to content

Commit

Permalink
Gracefully handle quick relaunches of VNet daemon (#44271)
Browse files Browse the repository at this point in the history
* Use a timeout when waiting for TUN device

* Return error if VNEDaemonService receives another startVnet message

* Sleep and call daemon again on VNEAlreadyRunningError

* Reduce ThrottleInterval to 5 seconds

* Define VNEErrorDomain as C string

* Define error code as int, make ErrAlreadyRunning private

* Simplify and rename sleep to sleepOrDone

* Document error_code of StartVnetResult

* Use extern for VNEAlreadyRunningError
  • Loading branch information
ravicious authored Jul 19, 2024
1 parent cbf2a9d commit 58d431d
Show file tree
Hide file tree
Showing 10 changed files with 148 additions and 23 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,7 @@
<string>/var/log/vnet.log</string>
<key>StandardOutPath</key>
<string>/var/log/vnet.log</string>
<key>ThrottleInterval</key>
<integer>5</integer>
</dict>
</plist>
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,7 @@
<string>/var/log/vnet.log</string>
<key>StandardOutPath</key>
<string>/var/log/vnet.log</string>
<key>ThrottleInterval</key>
<integer>5</integer>
</dict>
</plist>
65 changes: 57 additions & 8 deletions lib/vnet/daemon/client_darwin.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ package daemon
// #cgo LDFLAGS: -framework Foundation -framework ServiceManagement
// #include <stdlib.h>
// #include "client_darwin.h"
// #include "protocol_darwin.h"
import "C"

import (
Expand Down Expand Up @@ -79,8 +80,27 @@ func RegisterAndCall(ctx context.Context, config Config) error {
}
}

if err := startByCalling(ctx, bundlePath, config); err != nil {
return trace.Wrap(err)
if err = startByCalling(ctx, bundlePath, config); err != nil {
if !errors.Is(err, errAlreadyRunning) {
return trace.Wrap(err)
}

// If the daemon was already running, it might mean two things:
//
// 1. The user attempted to start a second instance of VNet.
// 2. The user has stopped the previous instance of VNet and immediately started a new one,
// before the daemon had a chance to notice that the previous instance was stopped and exit too.
//
// In the second case, we want to wait and repeat the call to the daemon, in case the daemon was
// just about to quit.
log.DebugContext(ctx, "VNet daemon is already running, waiting to see if it's going to shut down")
if err := sleepOrDone(ctx, 2*CheckUnprivilegedProcessInterval); err != nil {
return trace.Wrap(err)
}

if err := startByCalling(ctx, bundlePath, config); err != nil {
return trace.Wrap(err)
}
}

// TODO(ravicious): Implement monitoring the state of the daemon.
Expand Down Expand Up @@ -125,17 +145,17 @@ func register(ctx context.Context, bundlePath string) (ServiceStatus, error) {
return ServiceStatus(result.service_status), nil
}

const waitingForEnablementTimeout = time.Minute

var waitingForEnablementTimeoutExceeded = errors.New("the login item was not enabled within the timeout")

// waitForEnablement periodically checks if the status of the daemon has changed to
// [serviceStatusEnabled]. This happens when the user approves the login item in system settings.
func waitForEnablement(ctx context.Context, bundlePath string) error {
ticker := time.NewTicker(time.Second)
defer ticker.Stop()

ctx, cancel := context.WithTimeoutCause(ctx, waitingForEnablementTimeout, waitingForEnablementTimeoutExceeded)
// It should be less than receiveTunTimeout in the vnet package
// so that the user sees the error about the background item first.
const waitingForEnablementTimeout = 50 * time.Second
ctx, cancel := context.WithTimeoutCause(ctx, waitingForEnablementTimeout,
errors.New("the background item was not enabled within the timeout"))
defer cancel()

for {
Expand Down Expand Up @@ -276,7 +296,15 @@ func startByCalling(ctx context.Context, bundlePath string, config Config) error
C.StartVnet(&req, &res)

if !res.ok {
errC <- trace.Errorf("could not start VNet daemon: %v (%v)", C.GoString(res.error_description), C.GoString(res.error_domain))
errorDomain := C.GoString(res.error_domain)
errorCode := int(res.error_code)

if errorDomain == vnetErrorDomain && errorCode == errorCodeAlreadyRunning {
errC <- trace.Wrap(errAlreadyRunning)
return
}

errC <- trace.Errorf("could not start VNet daemon: %v", C.GoString(res.error_description))
return
}

Expand All @@ -290,3 +318,24 @@ func startByCalling(ctx context.Context, bundlePath string, config Config) error
return trace.Wrap(err, "connecting to the VNet daemon")
}
}

var (
// vnetErrorDomain is a custom error domain used for Objective-C errors that pertain to VNet.
vnetErrorDomain = C.GoString(C.VNEErrorDomain)
// errorCodeAlreadyRunning is returned within [vnetErrorDomain] errors to indicate that the daemon
// received a message to start after it was already running.
errorCodeAlreadyRunning = int(C.VNEAlreadyRunningError)
errAlreadyRunning = errors.New("VNet is already running")
)

func sleepOrDone(ctx context.Context, d time.Duration) error {
timer := time.NewTimer(d)
defer timer.Stop()

select {
case <-ctx.Done():
return trace.Wrap(ctx.Err())
case <-timer.C:
return nil
}
}
3 changes: 3 additions & 0 deletions lib/vnet/daemon/client_darwin.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ typedef struct StartVnetRequest {
typedef struct StartVnetResult {
bool ok;
const char *error_domain;
// error_code is code taken from an NSError instance encountered during the call to StartVnet.
// If ok is false, error_code is greater than zero and identifies the reason behind the error.
int error_code;
const char *error_description;
} StartVnetResult;

Expand Down
5 changes: 3 additions & 2 deletions lib/vnet/daemon/client_darwin.m
Original file line number Diff line number Diff line change
Expand Up @@ -117,8 +117,8 @@ - (void)startVnet:(VnetConfig *)vnetConfig completion:(void (^)(NSError *))compl
}];

[(id<VNEDaemonProtocol>)proxy startVnet:vnetConfig
completion:^(void) {
completion(nil);
completion:^(NSError *error) {
completion(error);
}];
}

Expand All @@ -143,6 +143,7 @@ void StartVnet(StartVnetRequest *request, StartVnetResult *outResult) {
completion:^(NSError *error) {
if (error) {
outResult->error_domain = VNECopyNSString([error domain]);
outResult->error_code = (int)[error code];
outResult->error_description = VNECopyNSString([error description]);
dispatch_semaphore_signal(sema);
return;
Expand Down
6 changes: 6 additions & 0 deletions lib/vnet/daemon/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
package daemon

import (
"time"

"github.com/gravitational/trace"
)

Expand Down Expand Up @@ -47,3 +49,7 @@ func (c *Config) CheckAndSetDefaults() error {
}
return nil
}

// CheckUnprivilegedProcessInterval denotes how often the admin process should check if the
// unprivileged process has quit.
const CheckUnprivilegedProcessInterval = time.Second
22 changes: 17 additions & 5 deletions lib/vnet/daemon/protocol_darwin.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
#ifndef TELEPORT_LIB_VNET_DAEMON_PROTOCOL_DARWIN_H_
#define TELEPORT_LIB_VNET_DAEMON_PROTOCOL_DARWIN_H_

#import <Foundation/Foundation.h>

// VNEErrorDomain is a custom error domain used for Objective-C errors that pertain to VNet.
extern const char* const VNEErrorDomain;

// VNEAlreadyRunningError indicates that the daemon already received a VNet config.
// It won't accept a new one during its lifetime, instead it's expected to stop, after
// which the client might spawn a new instance of the daemon.
extern const int VNEAlreadyRunningError;

typedef struct VnetConfig {
const char *socket_path;
const char *ipv6_prefix;
Expand All @@ -9,11 +19,13 @@ typedef struct VnetConfig {
} VnetConfig;

@protocol VNEDaemonProtocol
// startVnet starts VNet in a separate thread and returns immediately.
// Only the first call to this method starts VNet. Subsequent calls are noops.
// The daemon process exits after VNet is stopped, after which it can be
// spawned again by calling this method.
- (void)startVnet:(VnetConfig *)vnetConfig completion:(void (^)(void))completion;
// startVnet passes the config back to Go code (which then starts VNet in a separate thread)
// and returns immediately.
//
// Only the first call to this method starts VNet. Subsequent calls return VNEAlreadyRunningError.
// The daemon process exits after VNet is stopped, after which it can be spawned again by calling
// this method.
- (void)startVnet:(VnetConfig *)vnetConfig completion:(void (^)(NSError *error))completion;
@end

#endif /* TELEPORT_LIB_VNET_DAEMON_PROTOCOL_DARWIN_H_ */
22 changes: 22 additions & 0 deletions lib/vnet/daemon/protocol_darwin.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
//go:build vnetdaemon
// +build vnetdaemon

// Teleport
// Copyright (C) 2024 Gravitational, Inc.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

const char* const VNEErrorDomain = "com.Gravitational.Vnet.ErrorDomain";

const int VNEAlreadyRunningError = 1;
24 changes: 22 additions & 2 deletions lib/vnet/daemon/service_darwin.m
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,10 @@
@interface VNEDaemonService () <NSXPCListenerDelegate, VNEDaemonProtocol>

@property(nonatomic, strong, readwrite) NSXPCListener *listener;
// started describes whether the XPC listener is listening for new connections.
@property(nonatomic, readwrite) BOOL started;
// gotConfig describes if the daemon received a VNet config from a client.
@property(nonatomic, readwrite) BOOL gotConfig;

@property(nonatomic, readwrite) NSString *socketPath;
@property(nonatomic, readwrite) NSString *ipv6Prefix;
Expand Down Expand Up @@ -76,14 +79,31 @@ - (void)waitForVnetConfig {

#pragma mark - VNEDaemonProtocol

- (void)startVnet:(VnetConfig *)vnetConfig completion:(void (^)(void))completion {
- (void)startVnet:(VnetConfig *)vnetConfig completion:(void (^)(NSError *error))completion {
@synchronized(self) {
// startVnet is expected to be called only once per daemon's lifetime.
// Between the process with the daemon client exiting and the admin process (which runs the
// daemon) noticing this and exiting as well, a new client can be spawned and startVnet might
// end up getting called again.
//
// In such scenarios, we want to return an error so that the client can wait for the daemon
// to exit and retry the call.
if (_gotConfig) {
NSError *error = [[NSError alloc] initWithDomain:@(VNEErrorDomain)
code:VNEAlreadyRunningError
userInfo:nil];
completion(error);
return;
}

_gotConfig = YES;
_socketPath = @(vnetConfig->socket_path);
_ipv6Prefix = @(vnetConfig->ipv6_prefix);
_dnsAddr = @(vnetConfig->dns_addr);
_homePath = @(vnetConfig->home_path);

dispatch_semaphore_signal(_gotVnetConfigSema);
completion();
completion(nil);
}
}

Expand Down
20 changes: 14 additions & 6 deletions lib/vnet/setup.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package vnet

import (
"context"
"errors"
"fmt"
"log/slog"
"os"
Expand Down Expand Up @@ -95,9 +96,16 @@ func SetupAndRun(ctx context.Context, config *SetupAndRunConfig) (*ProcessManage
recvTUNErr <- err
}()

// It should be more than waitingForEnablementTimeout in the vnet/daemon package
// so that the user sees the error about the background item first.
const receiveTunTimeout = time.Minute
receiveTunCtx, cancel := context.WithTimeoutCause(ctx, receiveTunTimeout,
errors.New("admin process did not send back TUN device within timeout"))
defer cancel()

select {
case <-ctx.Done():
return nil, trace.Wrap(ctx.Err())
case <-receiveTunCtx.Done():
return nil, trace.Wrap(context.Cause(receiveTunCtx))
case <-processCtx.Done():
return nil, trace.Wrap(context.Cause(processCtx))
case err := <-recvTUNErr:
Expand All @@ -110,7 +118,7 @@ func SetupAndRun(ctx context.Context, config *SetupAndRunConfig) (*ProcessManage
slog.DebugContext(ctx, "Error from recvTUNErr ignored in favor of processCtx.Err", "error", err)
return nil, trace.Wrap(context.Cause(processCtx))
}
return nil, trace.Wrap(err, "receiving TUN from admin process")
return nil, trace.Wrap(err, "receiving TUN device from admin process")
}
}

Expand Down Expand Up @@ -230,9 +238,9 @@ func AdminSetup(ctx context.Context, config daemon.Config) error {
}()

// Stay alive until we get an error on errCh, indicating that the osConfig loop exited.
// If the socket is deleted, indicating that the parent process exited, cancel the context and then wait
// for the osConfig loop to exit and send an err on errCh.
ticker := time.NewTicker(time.Second)
// If the socket is deleted, indicating that the unprivileged process exited, cancel the context
// and then wait for the osConfig loop to exit and send an err on errCh.
ticker := time.NewTicker(daemon.CheckUnprivilegedProcessInterval)
defer ticker.Stop()
for {
select {
Expand Down

0 comments on commit 58d431d

Please sign in to comment.