Skip to content

Commit

Permalink
[v16] Improve handling of various lifecycle erorrs in VNet (#42220)
Browse files Browse the repository at this point in the history
* Split vnet.Run into vnet.Setup and vnet.Run

lib/teleterm cannot reuse this function due to different semantics
of contexts within an RPC vs a CLI command. It needs one
function that has request-response semantics and another that blocks
that can be run from a separate goroutine.

* tsh vnet: Select on ctx.Done too when awaiting TUN device

Without this extra select on ctx.Done, interrupting tsh vnet at that
moment would return an error about using a closed socket. That's because
when the context gets canceled, another goroutine closes the socket.

Instead, we should wait for either the cancelation of the context or
socket.AcceptUnix returning a value.

To test this, run `tsh vnet`. When the osascript prompt is shown, go
back to the terminal and Ctrl+C the command. The previous version would
show an error about using closed connection whereas this exits cleanly.

* Do not show context canceled errors to user

The previous version would show "context canceled" after sending SIGINT
to tsh vnet after the user entered the password in the prompt and  the
TUN device was set up.

* Fix handling of admin subcommand exiting prematurely with no err

If the socket file was removed, the admin subcommand stops. The previous
version of VNet would continue running. This version correctly
propagates the fact that the admin subcommand has quit prematurely with
no error and shuts down vnet.Manager as well.

* Correctly report admin subcommand quitting prematurely

Because of the changes from a couple of commits ago related to ignoring
context.Canceled errors, if the admin subcommand was killed or has
crashed, VNet would stop with no error message.

This commit fixes that and propagates the error from osascript.

* Avoid race condition when ctx gets canceled on awaiting TUN device

* Refactor detecting premature exit with no error

* Simplify shared VNet setup code

Squashed from #41653 (only lib/vnet and tool/tsh).

* Rename taskErr to expectedErr

* Remove special treatment of context.Canceled in tsh vnet

* Turn socket closer into critical task
  • Loading branch information
ravicious authored Jun 3, 2024
1 parent 7ca68a0 commit 4624b60
Show file tree
Hide file tree
Showing 8 changed files with 350 additions and 221 deletions.
172 changes: 112 additions & 60 deletions lib/vnet/setup.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package vnet

import (
"context"
"fmt"
"log/slog"
"os"
"time"
Expand All @@ -27,69 +28,142 @@ import (
"golang.zx2c4.com/wireguard/tun"
)

// Run is a blocking call to create and start Teleport VNet.
func Run(ctx context.Context, appProvider AppProvider) error {
// SetupAndRun creates a network stack for VNet and runs it in the background. To do this, it also
// needs to launch an admin subcommand in the background. It returns [ProcessManager] which controls
// the lifecycle of both background tasks.
//
// The caller is expected to call Close on the process manager to close the network stack, clean
// up any resources used by it and terminate the admin subcommand.
//
// ctx is used to wait for setup steps that happen before SetupAndRun hands out the control to the
// process manager. If ctx gets canceled during SetupAndRun, the process manager gets closed along
// with its background tasks.
func SetupAndRun(ctx context.Context, appProvider AppProvider) (*ProcessManager, error) {
ipv6Prefix, err := NewIPv6Prefix()
if err != nil {
return trace.Wrap(err)
return nil, trace.Wrap(err)
}

dnsIPv6 := ipv6WithSuffix(ipv6Prefix, []byte{2})

ctx, cancel := context.WithCancel(ctx)
defer cancel()
pm, processCtx := newProcessManager()
success := false
defer func() {
if !success {
// Closes the socket and background tasks.
pm.Close()
}
}()

// Create the socket that's used to receive the TUN device from the admin subcommand.
socket, socketPath, err := createUnixSocket()
if err != nil {
return nil, trace.Wrap(err)
}
slog.DebugContext(ctx, "Created unix socket for admin subcommand", "socket", socketPath)
pm.AddCriticalBackgroundTask("socket closer", func() error {
// Keep the socket open until the process context is canceled.
// Closing the socket signals the admin subcommand to terminate.
<-processCtx.Done()
return trace.NewAggregate(processCtx.Err(), socket.Close())
})

tunCh, adminCommandErrCh := CreateAndSetupTUNDevice(ctx, ipv6Prefix.String(), dnsIPv6.String())
pm.AddCriticalBackgroundTask("admin subcommand", func() error {
return trace.Wrap(execAdminSubcommand(processCtx, socketPath, ipv6Prefix.String(), dnsIPv6.String()))
})

recvTUNErr := make(chan error, 1)
var tun tun.Device
go func() {
// Unblocks after receiving a TUN device or when the context gets canceled (and thus socket gets
// closed).
tunDevice, err := receiveTUNDevice(socket)
tun = tunDevice
recvTUNErr <- err
}()

var tun TUNDevice
select {
case err := <-adminCommandErrCh:
return trace.Wrap(err)
case tun = <-tunCh:
case <-ctx.Done():
return nil, trace.Wrap(ctx.Err())
case <-processCtx.Done():
return nil, trace.Wrap(context.Cause(processCtx))
case err := <-recvTUNErr:
if err != nil {
if processCtx.Err() != nil {
// Both errors being present means that VNet failed to receive a TUN device because of a
// problem with the admin subcommand.
// Returning error from processCtx will be more informative to the user, e.g., the error
// will say "password prompt closed by user" instead of "read from closed socket".
slog.DebugContext(ctx, "Error from recvTUNErr ignored in favor of processCtx.Err", "error", err)
return nil, trace.Wrap(context.Cause(processCtx))
}
return nil, trace.Wrap(err, "receiving TUN from admin subcommand")
}
}

appResolver, err := NewTCPAppResolver(appProvider)
if err != nil {
return trace.Wrap(err)
return nil, trace.Wrap(err)
}

manager, err := NewManager(&Config{
ns, err := newNetworkStack(&Config{
TUNDevice: tun,
IPv6Prefix: ipv6Prefix,
DNSIPv6: dnsIPv6,
TCPHandlerResolver: appResolver,
})
if err != nil {
return trace.Wrap(err)
return nil, trace.Wrap(err)
}

allErrors := make(chan error, 2)
g, ctx := errgroup.WithContext(ctx)
g.Go(func() error {
// Make sure to cancel the context if manager.Run terminates for any reason.
defer cancel()
err := trace.Wrap(manager.Run(ctx), "running VNet manager")
allErrors <- err
return err
pm.AddCriticalBackgroundTask("network stack", func() error {
return trace.Wrap(ns.Run(processCtx))
})
g.Go(func() error {
var adminCommandErr error
select {
case adminCommandErr = <-adminCommandErrCh:
// The admin command exited before the context was canceled, cancel everything and exit.
cancel()
case <-ctx.Done():
// The context has been canceled, the admin command should now exit.
adminCommandErr = <-adminCommandErrCh

success = true
return pm, nil
}

func newProcessManager() (*ProcessManager, context.Context) {
ctx, cancel := context.WithCancel(context.Background())
g, ctx := errgroup.WithContext(ctx)

return &ProcessManager{
g: g,
cancel: cancel,
}, ctx
}

// ProcessManager handles background tasks needed to run VNet.
// Its semantics are similar to an error group with a context, but it cancels the context whenever
// any task returns prematurely, that is, a task exits while the context was not canceled.
type ProcessManager struct {
g *errgroup.Group
cancel context.CancelFunc
}

// AddCriticalBackgroundTask adds a function to the error group. [task] is expected to block until
// the context returned by [newProcessManager] gets canceled. The context gets canceled either by
// calling Close on [ProcessManager] or if any task returns.
func (pm *ProcessManager) AddCriticalBackgroundTask(name string, task func() error) {
pm.g.Go(func() error {
err := task()
if err == nil {
// Make sure to always return an error so that the errgroup context is canceled.
err = fmt.Errorf("critical task %q exited prematurely", name)
}
adminCommandErr = trace.Wrap(adminCommandErr, "running admin subcommand")
allErrors <- adminCommandErr
return adminCommandErr
return trace.Wrap(err)
})
// Deliberately ignoring the error from g.Wait() to return an aggregate of all errors.
_ = g.Wait()
close(allErrors)
return trace.NewAggregateFromChannel(allErrors, context.Background())
}

// Wait blocks and waits for the background tasks to finish, which typically happens when another
// goroutine calls Close on the process manager.
func (pm *ProcessManager) Wait() error {
return trace.Wrap(pm.g.Wait())
}

// Close stops any active background tasks by canceling the underlying context.
func (pm *ProcessManager) Close() {
pm.cancel()
}

// AdminSubcommand is the tsh subcommand that should run as root that will create and setup a TUN device and
Expand Down Expand Up @@ -136,28 +210,6 @@ func AdminSubcommand(ctx context.Context, socketPath, ipv6Prefix, dnsAddr string
}
}

// CreateAndSetupTUNDevice creates a virtual network device and configures the host OS to use that device for
// VNet connections.
//
// If not already running as root, it will spawn a root process to handle the TUN creation and host
// configuration.
//
// After the TUN device is created, it will be sent on the result channel. Any error will be sent on the err
// channel. Always select on both the result channel and the err channel when waiting for a result.
//
// This will keep running until [ctx] is canceled or an unrecoverable error is encountered, in order to keep
// the host OS configuration up to date.
func CreateAndSetupTUNDevice(ctx context.Context, ipv6Prefix, dnsAddr string) (<-chan tun.Device, <-chan error) {
if os.Getuid() == 0 {
// We can get here if the user runs `tsh vnet` as root, but it is not in the expected path when
// started as a regular user. Typically we expect `tsh vnet` to be run as a non-root user, and for
// AdminSubcommand to directly call createAndSetupTUNDeviceAsRoot.
return createAndSetupTUNDeviceAsRoot(ctx, ipv6Prefix, dnsAddr)
} else {
return createAndSetupTUNDeviceWithoutRoot(ctx, ipv6Prefix, dnsAddr)
}
}

// createAndSetupTUNDeviceAsRoot creates a virtual network device and configures the host OS to use that device for
// VNet connections.
//
Expand Down
97 changes: 37 additions & 60 deletions lib/vnet/setup_darwin.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ import (
"context"
"errors"
"fmt"
"log/slog"
"net"
"os"
"os/exec"
Expand All @@ -33,7 +32,6 @@ import (

"github.com/google/uuid"
"github.com/gravitational/trace"
"golang.org/x/sync/errgroup"
"golang.org/x/sys/unix"
"golang.zx2c4.com/wireguard/tun"

Expand All @@ -42,56 +40,16 @@ import (
"github.com/gravitational/teleport/api/types"
)

// createAndSetupTUNDeviceWithoutRoot creates a virtual network device and configures the host OS to use that
// device for VNet connections. It will spawn a root process to handle the TUN creation and host
// configuration.
//
// After the TUN device is created, it will be sent on the result channel. Any error will be sent on the err
// channel. Always select on both the result channel and the err channel when waiting for a result.
//
// This will keep running until [ctx] is canceled or an unrecoverable error is encountered, in order to keep
// the host OS configuration up to date.
func createAndSetupTUNDeviceWithoutRoot(ctx context.Context, ipv6Prefix, dnsAddr string) (<-chan tun.Device, <-chan error) {
tunCh := make(chan tun.Device, 1)
errCh := make(chan error, 1)

slog.InfoContext(ctx, "Spawning child process as root to create and setup TUN device")
socket, socketPath, err := createUnixSocket()
// receiveTUNDevice is a blocking call which waits for the admin subcommand to pass over the socket
// the name and fd of the TUN device.
func receiveTUNDevice(socket *net.UnixListener) (tun.Device, error) {
tunName, tunFd, err := recvTUNNameAndFd(socket)
if err != nil {
errCh <- trace.Wrap(err, "creating unix socket")
return tunCh, errCh
}

// Make sure all goroutines complete before sending an err on the error chan, to be sure they all have a
// chance to clean up before the process terminates.
g, ctx := errgroup.WithContext(ctx)
g.Go(func() error {
// Requirements:
// - must close the socket concurrently with recvTUNNameAndFd if the context is canceled to unblock
// a stuck AcceptUnix (can't defer).
// - must close the socket exactly once before letting the process terminate.
<-ctx.Done()
return trace.Wrap(socket.Close())
})
g.Go(func() error {
// Admin command is expected to run until ctx is canceled.
return trace.Wrap(execAdminSubcommand(ctx, socketPath, ipv6Prefix, dnsAddr))
})
g.Go(func() error {
tunName, tunFd, err := recvTUNNameAndFd(ctx, socket)
if err != nil {
return trace.Wrap(err, "receiving TUN name and file descriptor")
}
tunDevice, err := tun.CreateTUNFromFile(os.NewFile(tunFd, tunName), 0)
if err != nil {
return trace.Wrap(err, "creating TUN device from file descriptor")
}
tunCh <- tunDevice
return nil
})
go func() { errCh <- g.Wait() }()
return nil, trace.Wrap(err, "receiving TUN name and file descriptor")
}

return tunCh, errCh
tunDevice, err := tun.CreateTUNFromFile(os.NewFile(tunFd, tunName), 0)
return tunDevice, trace.Wrap(err, "creating TUN device from file descriptor")
}

func execAdminSubcommand(ctx context.Context, socketPath, ipv6Prefix, dnsAddr string) error {
Expand Down Expand Up @@ -135,10 +93,36 @@ do shell script quoted form of executableName & `+
if strings.Contains(stderr, "-128") {
return trace.Errorf("password prompt closed by user")
}
return trace.Wrap(exitError, "admin subcommand exited, stderr: %s", stderr)

if errors.Is(ctx.Err(), context.Canceled) {
// osascript exiting due to canceled context.
return ctx.Err()
}

stderrDesc := ""
if stderr != "" {
stderrDesc = fmt.Sprintf(", stderr: %s", stderr)
}
return trace.Wrap(exitError, "osascript exited%s", stderrDesc)
}

return trace.Wrap(err)
}

if ctx.Err() == nil {
// The admin subcommand is expected to run until VNet gets stopped (in other words, until ctx
// gets canceled).
//
// If it exits with no error _before_ ctx is canceled, then it most likely means that the socket
// was unexpectedly removed. When the socket gets removed, the admin subcommand assumes that the
// unprivileged process (executing this code here) has quit and thus it should quit as well. But
// we know that it's not the case, so in this scenario we return an error instead.
//
// If we don't return an error here, then other code won't be properly notified about the fact
// that the admin process has quit.
return trace.Errorf("admin subcommand exited prematurely with no error (likely because socket was removed)")
}

return nil
}

Expand Down Expand Up @@ -178,19 +162,12 @@ func sendTUNNameAndFd(socketPath, tunName string, fd uintptr) error {

// recvTUNNameAndFd receives the name of a TUN device and its open file descriptor over a unix socket, meant
// for passing the TUN from the root process which must create it to the user process.
func recvTUNNameAndFd(ctx context.Context, socket *net.UnixListener) (string, uintptr, error) {
func recvTUNNameAndFd(socket *net.UnixListener) (string, uintptr, error) {
conn, err := socket.AcceptUnix()
if err != nil {
return "", 0, trace.Wrap(err, "accepting connection on unix socket")
}

// Close the connection early to unblock reads if the context is canceled.
ctx, cancel := context.WithCancel(ctx)
defer cancel()
go func() {
<-ctx.Done()
conn.Close()
}()
defer conn.Close()

msg := make([]byte, 128)
oob := make([]byte, unix.CmsgSpace(4)) // Fd is 4 bytes
Expand Down
15 changes: 11 additions & 4 deletions lib/vnet/setup_other.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ package vnet

import (
"context"
"net"
"runtime"

"github.com/gravitational/trace"
Expand All @@ -32,16 +33,22 @@ var (
ErrVnetNotImplemented = &trace.NotImplementedError{Message: "VNet is not implemented on " + runtime.GOOS}
)

func createAndSetupTUNDeviceWithoutRoot(ctx context.Context, ipv6Prefix, dnsAddr string) (<-chan tun.Device, <-chan error) {
errCh := make(chan error, 1)
errCh <- trace.Wrap(ErrVnetNotImplemented)
return nil, errCh
func createUnixSocket() (*net.UnixListener, string, error) {
return nil, "", trace.Wrap(ErrVnetNotImplemented)
}

func sendTUNNameAndFd(socketPath, tunName string, fd uintptr) error {
return trace.Wrap(ErrVnetNotImplemented)
}

func receiveTUNDevice(socket *net.UnixListener) (tun.Device, error) {
return nil, trace.Wrap(ErrVnetNotImplemented)
}

func configureOS(ctx context.Context, cfg *osConfig) error {
return trace.Wrap(ErrVnetNotImplemented)
}

func execAdminSubcommand(ctx context.Context, socketPath, ipv6Prefix, dnsAddr string) error {
return trace.Wrap(ErrVnetNotImplemented)
}
Loading

0 comments on commit 4624b60

Please sign in to comment.