Skip to content

Commit

Permalink
Added actionable errors for network issues.
Browse files Browse the repository at this point in the history
Updated Application Access to return actionable error message when
possible for network errors.
  • Loading branch information
russjones committed Nov 20, 2024
1 parent 192e8eb commit 4bf032e
Show file tree
Hide file tree
Showing 3 changed files with 126 additions and 3 deletions.
4 changes: 4 additions & 0 deletions api/defaults/defaults.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ const (
// DefaultIdleTimeout is a default idle connection timeout.
DefaultIdleTimeout = 30 * time.Second

// DefaultDialTimeout is the default time to wait for a connection to be
// established.
DefaultDialTimeout = 5 * time.Second

// KeepAliveCountMax is the number of keep-alive messages that can be sent
// without receiving a response from the client before the client is
// disconnected. The max count mirrors ClientAliveCountMax of sshd.
Expand Down
58 changes: 55 additions & 3 deletions lib/srv/app/transport.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,19 @@ package app
import (
"context"
"crypto/tls"
"io"
"log/slog"
"net"
"net/http"
"net/url"
"path"
"slices"
"strings"

"github.com/gravitational/trace"

"github.com/gravitational/teleport"
apidefaults "github.com/gravitational/teleport/api/defaults"
"github.com/gravitational/teleport/api/types"
"github.com/gravitational/teleport/api/types/wrappers"
"github.com/gravitational/teleport/lib"
Expand Down Expand Up @@ -97,6 +100,14 @@ func newTransport(ctx context.Context, c *transportConfig) (*transport, error) {
if err != nil {
return nil, trace.Wrap(err)
}

// Add a timeout to the dialer so failures to establish network connections
// don't cause requests to hang forever.
d := net.Dialer{
Timeout: apidefaults.DefaultDialTimeout,
}
tr.DialContext = d.DialContext

tr.TLSClientConfig, err = configureTLS(c)
if err != nil {
return nil, trace.Wrap(err)
Expand Down Expand Up @@ -143,15 +154,39 @@ func (t *transport) RoundTrip(r *http.Request) (*http.Response, error) {
return nil, trace.Wrap(err)
}

// Forward the request to the target application and emit an audit event.
// Add a timeout to the request, so slow servers don't cause requests to
// hang forever.
timeout, cancel := context.WithTimeout(r.Context(), apidefaults.DefaultIOTimeout)
defer cancel()
r = r.WithContext(timeout)

// Forward the request to the target application.
//
// If a network error occured when connecting to the target application, log
// and return a helpful error message to the user and Teleport
// administrator.
resp, err := t.tr.RoundTrip(r)
if message, ok := utils.CanExplainNetworkError(err); ok {
t.log.DebugContext(r.Context(), "Request failed with a network error.",
"raw_error", err, "human_error", message)

code := trace.ErrorToCode(err)
return &http.Response{
StatusCode: code,
Status: http.StatusText(code),
Proto: r.Proto,
ProtoMajor: r.ProtoMajor,
ProtoMinor: r.ProtoMinor,
Body: io.NopCloser(strings.NewReader(charWrap(message))),
TLS: r.TLS,
}, nil
}
if err != nil {
return nil, trace.Wrap(err)
}
status := uint32(resp.StatusCode)

// Emit the event to the audit log.
if err := sessCtx.Audit.OnRequest(t.closeContext, sessCtx, r, status, nil /*aws endpoint*/); err != nil {
if err := sessCtx.Audit.OnRequest(t.closeContext, sessCtx, r, uint32(resp.StatusCode), nil /*aws endpoint*/); err != nil {
return nil, trace.Wrap(err)
}

Expand Down Expand Up @@ -293,3 +328,20 @@ func host(addr string) string {
}
return host
}

// charWrap wraps a line to about 80 characters to make it easier to read.
func charWrap(message string) string {
var n int
var sb strings.Builder
for _, word := range strings.Fields(message) {
sb.WriteString(word)
sb.WriteString(" ")

n += len(word) + 1
if n > 80 {
sb.WriteString("\n")
n = 0
}
}
return sb.String()
}
67 changes: 67 additions & 0 deletions lib/utils/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@
package utils

import (
"context"
"errors"
"fmt"
"io"
"net"
"strings"
Expand All @@ -28,6 +30,7 @@ import (
"github.com/gravitational/trace"

"github.com/gravitational/teleport/api/constants"
"github.com/gravitational/teleport/api/defaults"
)

// IsUseOfClosedNetworkError returns true if the specified error
Expand Down Expand Up @@ -86,6 +89,70 @@ func IsUntrustedCertErr(err error) bool {
strings.Contains(errMsg, "certificate is not trusted")
}

// CanExplainNetworkError returns a simple to understand error message that can
// be used to debug common network and/or protocol errors.
func CanExplainNetworkError(err error) (string, bool) {
var oerr *net.OpError
var derr *net.DNSError

switch {
// Connection refused errors can be reproduced by attempting to connect to a
// host:port that no process is listening on. The raw error typically looks
// like the following:
//
// dial tcp 127.0.0.1:8000: connect: connection refused
case errors.Is(err, syscall.ECONNREFUSED):
return "Connection refused. Run \"nc -vz a.b.c.d PORT\" on the Teleport " +
"agent to verify the target application is running and listening on " +
"the expected host and port.", true
// Host unreachable errors can be reproduced by running
// "ip route add unreachable a.b.c.d" to update the routing table to make
// the host unreachable. Packets will be discarded and an ICMP message
// will be returned. The raw error typically looks like the following:
//
// dial tcp 10.10.10.10:8000: connect: no route to host
case errors.Is(err, syscall.EHOSTUNREACH):
return "No route to host. Run \"ip route get a.b.c.d\" on the Teleport " +
"agent to verify a route to the target application exists in the " +
"routing table.", true
// Connection reset errors can be reproduced by creating a HTTP server that
// accepts requests but closes the connection before writing a response. The
// raw error typically looks like the following:
//
// read tcp 127.0.0.1:49764->127.0.0.1:8000: read: connection reset by peer
case errors.Is(err, syscall.ECONNRESET):
return "Connection reset by peer. Run \"curl -v a.b.c.d\" on the Teleport " +
"agent to verify the target application (or a load balancer in the " +
"network path) is not abruptly closing the connection after accepting it.", true
// I/O timeouts can be reproduced by creating a server with a customer
// listener that will time.Sleep after Accept(). The raw error typically
// looks like the following:
//
// dial tcp 127.0.0.1:8000: i/o timeout
case errors.As(err, &oerr) && oerr.Timeout():
return fmt.Sprintf("Network I/O timeout. Run \"nc -vz a.b.c.d\" on the "+
"Teleport agent to verify the target application is accepting network "+
"connections in under %v.", defaults.DefaultDialTimeout), true
// Slow responses can be reprodued by creating a HTTP server that does a
// time.Sleep before responding. The raw error typically looks like the following:
//
// context deadline exceeded
case errors.Is(err, context.DeadlineExceeded):
return fmt.Sprintf("Timeout waiting for response. Run \"curl -v a.b.c.d\" on the "+
"Teleport agent to verify the target application is responding to "+
"requests in under %v.", defaults.DefaultIOTimeout), true
// No such host errors can be reproduced by attempting to resolve a invalid
// domain name. The raw error typically looks like the following:
//
// dial tcp: lookup fasfasfasf.com: no such host
case errors.As(err, &derr) && derr.IsNotFound:
return "No such host. Run \"dig +short fqdn\" on the Teleport agent to " +
"verify the target application has a valid DNS entry.", true
}

return "", false
}

const (
// SelfSignedCertsMsg is a helper message to point users towards helpful documentation.
SelfSignedCertsMsg = "Your proxy certificate is not trusted or expired. " +
Expand Down

0 comments on commit 4bf032e

Please sign in to comment.