From 4bf032edb889768f0a62ce5698e712d78e085fa4 Mon Sep 17 00:00:00 2001 From: Russell Jones Date: Wed, 30 Oct 2024 09:52:38 -0700 Subject: [PATCH] Added actionable errors for network issues. Updated Application Access to return actionable error message when possible for network errors. --- api/defaults/defaults.go | 4 +++ lib/srv/app/transport.go | 58 ++++++++++++++++++++++++++++++++-- lib/utils/errors.go | 67 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 126 insertions(+), 3 deletions(-) diff --git a/api/defaults/defaults.go b/api/defaults/defaults.go index 88624cd12ce1d..35b006c0c497a 100644 --- a/api/defaults/defaults.go +++ b/api/defaults/defaults.go @@ -34,6 +34,10 @@ const ( // DefaultIdleTimeout is a default idle connection timeout. DefaultIdleTimeout = 30 * time.Second + // DefaultDialTimeout is the default time to wait for a connection to be + // established. + DefaultDialTimeout = 5 * time.Second + // KeepAliveCountMax is the number of keep-alive messages that can be sent // without receiving a response from the client before the client is // disconnected. The max count mirrors ClientAliveCountMax of sshd. diff --git a/lib/srv/app/transport.go b/lib/srv/app/transport.go index d019af70dbe9f..314e17003e296 100644 --- a/lib/srv/app/transport.go +++ b/lib/srv/app/transport.go @@ -21,16 +21,19 @@ package app import ( "context" "crypto/tls" + "io" "log/slog" "net" "net/http" "net/url" "path" "slices" + "strings" "github.com/gravitational/trace" "github.com/gravitational/teleport" + apidefaults "github.com/gravitational/teleport/api/defaults" "github.com/gravitational/teleport/api/types" "github.com/gravitational/teleport/api/types/wrappers" "github.com/gravitational/teleport/lib" @@ -97,6 +100,14 @@ func newTransport(ctx context.Context, c *transportConfig) (*transport, error) { if err != nil { return nil, trace.Wrap(err) } + + // Add a timeout to the dialer so failures to establish network connections + // don't cause requests to hang forever. + d := net.Dialer{ + Timeout: apidefaults.DefaultDialTimeout, + } + tr.DialContext = d.DialContext + tr.TLSClientConfig, err = configureTLS(c) if err != nil { return nil, trace.Wrap(err) @@ -143,15 +154,39 @@ func (t *transport) RoundTrip(r *http.Request) (*http.Response, error) { return nil, trace.Wrap(err) } - // Forward the request to the target application and emit an audit event. + // Add a timeout to the request, so slow servers don't cause requests to + // hang forever. + timeout, cancel := context.WithTimeout(r.Context(), apidefaults.DefaultIOTimeout) + defer cancel() + r = r.WithContext(timeout) + + // Forward the request to the target application. + // + // If a network error occured when connecting to the target application, log + // and return a helpful error message to the user and Teleport + // administrator. resp, err := t.tr.RoundTrip(r) + if message, ok := utils.CanExplainNetworkError(err); ok { + t.log.DebugContext(r.Context(), "Request failed with a network error.", + "raw_error", err, "human_error", message) + + code := trace.ErrorToCode(err) + return &http.Response{ + StatusCode: code, + Status: http.StatusText(code), + Proto: r.Proto, + ProtoMajor: r.ProtoMajor, + ProtoMinor: r.ProtoMinor, + Body: io.NopCloser(strings.NewReader(charWrap(message))), + TLS: r.TLS, + }, nil + } if err != nil { return nil, trace.Wrap(err) } - status := uint32(resp.StatusCode) // Emit the event to the audit log. - if err := sessCtx.Audit.OnRequest(t.closeContext, sessCtx, r, status, nil /*aws endpoint*/); err != nil { + if err := sessCtx.Audit.OnRequest(t.closeContext, sessCtx, r, uint32(resp.StatusCode), nil /*aws endpoint*/); err != nil { return nil, trace.Wrap(err) } @@ -293,3 +328,20 @@ func host(addr string) string { } return host } + +// charWrap wraps a line to about 80 characters to make it easier to read. +func charWrap(message string) string { + var n int + var sb strings.Builder + for _, word := range strings.Fields(message) { + sb.WriteString(word) + sb.WriteString(" ") + + n += len(word) + 1 + if n > 80 { + sb.WriteString("\n") + n = 0 + } + } + return sb.String() +} diff --git a/lib/utils/errors.go b/lib/utils/errors.go index ed557b2168b76..018f2b3dc6ed0 100644 --- a/lib/utils/errors.go +++ b/lib/utils/errors.go @@ -19,7 +19,9 @@ package utils import ( + "context" "errors" + "fmt" "io" "net" "strings" @@ -28,6 +30,7 @@ import ( "github.com/gravitational/trace" "github.com/gravitational/teleport/api/constants" + "github.com/gravitational/teleport/api/defaults" ) // IsUseOfClosedNetworkError returns true if the specified error @@ -86,6 +89,70 @@ func IsUntrustedCertErr(err error) bool { strings.Contains(errMsg, "certificate is not trusted") } +// CanExplainNetworkError returns a simple to understand error message that can +// be used to debug common network and/or protocol errors. +func CanExplainNetworkError(err error) (string, bool) { + var oerr *net.OpError + var derr *net.DNSError + + switch { + // Connection refused errors can be reproduced by attempting to connect to a + // host:port that no process is listening on. The raw error typically looks + // like the following: + // + // dial tcp 127.0.0.1:8000: connect: connection refused + case errors.Is(err, syscall.ECONNREFUSED): + return "Connection refused. Run \"nc -vz a.b.c.d PORT\" on the Teleport " + + "agent to verify the target application is running and listening on " + + "the expected host and port.", true + // Host unreachable errors can be reproduced by running + // "ip route add unreachable a.b.c.d" to update the routing table to make + // the host unreachable. Packets will be discarded and an ICMP message + // will be returned. The raw error typically looks like the following: + // + // dial tcp 10.10.10.10:8000: connect: no route to host + case errors.Is(err, syscall.EHOSTUNREACH): + return "No route to host. Run \"ip route get a.b.c.d\" on the Teleport " + + "agent to verify a route to the target application exists in the " + + "routing table.", true + // Connection reset errors can be reproduced by creating a HTTP server that + // accepts requests but closes the connection before writing a response. The + // raw error typically looks like the following: + // + // read tcp 127.0.0.1:49764->127.0.0.1:8000: read: connection reset by peer + case errors.Is(err, syscall.ECONNRESET): + return "Connection reset by peer. Run \"curl -v a.b.c.d\" on the Teleport " + + "agent to verify the target application (or a load balancer in the " + + "network path) is not abruptly closing the connection after accepting it.", true + // I/O timeouts can be reproduced by creating a server with a customer + // listener that will time.Sleep after Accept(). The raw error typically + // looks like the following: + // + // dial tcp 127.0.0.1:8000: i/o timeout + case errors.As(err, &oerr) && oerr.Timeout(): + return fmt.Sprintf("Network I/O timeout. Run \"nc -vz a.b.c.d\" on the "+ + "Teleport agent to verify the target application is accepting network "+ + "connections in under %v.", defaults.DefaultDialTimeout), true + // Slow responses can be reprodued by creating a HTTP server that does a + // time.Sleep before responding. The raw error typically looks like the following: + // + // context deadline exceeded + case errors.Is(err, context.DeadlineExceeded): + return fmt.Sprintf("Timeout waiting for response. Run \"curl -v a.b.c.d\" on the "+ + "Teleport agent to verify the target application is responding to "+ + "requests in under %v.", defaults.DefaultIOTimeout), true + // No such host errors can be reproduced by attempting to resolve a invalid + // domain name. The raw error typically looks like the following: + // + // dial tcp: lookup fasfasfasf.com: no such host + case errors.As(err, &derr) && derr.IsNotFound: + return "No such host. Run \"dig +short fqdn\" on the Teleport agent to " + + "verify the target application has a valid DNS entry.", true + } + + return "", false +} + const ( // SelfSignedCertsMsg is a helper message to point users towards helpful documentation. SelfSignedCertsMsg = "Your proxy certificate is not trusted or expired. " +