From f8654aece4b6dde13a8b89b67dc64ace9f44d541 Mon Sep 17 00:00:00 2001 From: Russell Jones Date: Wed, 4 Dec 2024 14:02:17 -0800 Subject: [PATCH] Added actionable errors for network issues. (#49246) (#49788) Added actionable errors for common network issues. Updated Application Access to use actionable errors. --- lib/srv/app/transport.go | 62 +++++++++++++++++++++++++++++++++++-- lib/utils/errors.go | 66 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 125 insertions(+), 3 deletions(-) diff --git a/lib/srv/app/transport.go b/lib/srv/app/transport.go index d019af70dbe9f..262ce70626825 100644 --- a/lib/srv/app/transport.go +++ b/lib/srv/app/transport.go @@ -21,12 +21,15 @@ package app import ( "context" "crypto/tls" + "io" "log/slog" "net" "net/http" "net/url" "path" "slices" + "strings" + "time" "github.com/gravitational/trace" @@ -97,6 +100,12 @@ func newTransport(ctx context.Context, c *transportConfig) (*transport, error) { if err != nil { return nil, trace.Wrap(err) } + + // Add a timeout to control how long it takes to (start) getting a response + // from the target server. This allows Teleport to show the user a helpful + // error message when the target service is slow in responding. + tr.ResponseHeaderTimeout = requestTimeout + tr.TLSClientConfig, err = configureTLS(c) if err != nil { return nil, trace.Wrap(err) @@ -143,15 +152,35 @@ func (t *transport) RoundTrip(r *http.Request) (*http.Response, error) { return nil, trace.Wrap(err) } - // Forward the request to the target application and emit an audit event. + // Forward the request to the target application. + // + // If a network error occurred when connecting to the target application, + // log and return a helpful error message to the user and Teleport + // administrator. 
resp, err := t.tr.RoundTrip(r) + if message, ok := utils.CanExplainNetworkError(err); ok { + if t.log.Enabled(r.Context(), slog.LevelDebug) { + t.log.DebugContext(r.Context(), "application request failed with a network error", + "raw_error", err, "human_error", strings.Join(strings.Fields(message), " ")) + } + + code := trace.ErrorToCode(err) + return &http.Response{ + StatusCode: code, + Status: http.StatusText(code), + Proto: r.Proto, + ProtoMajor: r.ProtoMajor, + ProtoMinor: r.ProtoMinor, + Body: io.NopCloser(strings.NewReader(charWrap(message))), + TLS: r.TLS, + }, nil + } if err != nil { return nil, trace.Wrap(err) } - status := uint32(resp.StatusCode) // Emit the event to the audit log. - if err := sessCtx.Audit.OnRequest(t.closeContext, sessCtx, r, status, nil /*aws endpoint*/); err != nil { + if err := sessCtx.Audit.OnRequest(t.closeContext, sessCtx, r, uint32(resp.StatusCode), nil /*aws endpoint*/); err != nil { return nil, trace.Wrap(err) } @@ -293,3 +322,30 @@ func host(addr string) string { } return host } + +// charWrap wraps a line to about 80 characters to make it easier to read. +func charWrap(message string) string { + var sb strings.Builder + for _, line := range strings.Split(message, "\n") { + var n int + for _, word := range strings.Fields(line) { + sb.WriteString(word) + sb.WriteString(" ") + + n += len(word) + 1 + if n > 80 { + sb.WriteString("\n") + n = 0 + } + } + sb.WriteString("\n") + } + return sb.String() +} + +const ( + // requestTimeout is the timeout to receive a response from the upstream + // server. Start it out large (not to break things) and slowly decrease it + // over time. 
+ requestTimeout = 5 * time.Minute +) diff --git a/lib/utils/errors.go b/lib/utils/errors.go index ed557b2168b76..14e56b188c418 100644 --- a/lib/utils/errors.go +++ b/lib/utils/errors.go @@ -19,6 +19,7 @@ package utils import ( + "context" "errors" "io" "net" @@ -86,6 +87,71 @@ func IsUntrustedCertErr(err error) bool { strings.Contains(errMsg, "certificate is not trusted") } +// CanExplainNetworkError returns a simple to understand error message that can +// be used to debug common network and/or protocol errors. +func CanExplainNetworkError(err error) (string, bool) { + var derr *net.DNSError + + switch { + // Connection refused errors can be reproduced by attempting to connect to a + // host:port that no process is listening on. The raw error typically looks + // like the following: + // + // dial tcp 127.0.0.1:8000: connect: connection refused + case errors.Is(err, syscall.ECONNREFUSED): + return `Connection Refused + +Teleport was unable to connect to the requested host, possibly because the server is not running. Ensure the server is running and listening on the correct port. + +Use "nc -vz HOST PORT" to help debug this issue.`, true + // Host unreachable errors can be reproduced by running + // "ip route add unreachable HOST" to update the routing table to make + // the host unreachable. Packets will be discarded and an ICMP message + // will be returned. The raw error typically looks like the following: + // + // dial tcp 10.10.10.10:8000: connect: no route to host + case errors.Is(err, syscall.EHOSTUNREACH): + return `No Route to Host + +Teleport could not connect to the requested host, likely because there is no valid network path to reach it. Check the network routing table to ensure a valid path to the host exists. + +Use "ping HOST" and "ip route get HOST" to help debug this issue.`, true + // Connection reset errors can be reproduced by creating a HTTP server that + // accepts requests but closes the connection before writing a response. 
The + // raw error typically looks like the following: + // + // read tcp 127.0.0.1:49764->127.0.0.1:8000: read: connection reset by peer + case errors.Is(err, syscall.ECONNRESET): + return `Connection Reset by Peer + +Teleport could not complete the request because the server abruptly closed the connection before the response was received. To resolve this issue, ensure the server (or load balancer) does not have a timeout terminating the connection early and verify that the server is not crash looping. + +Use protocol-specific tools (e.g., curl, psql) to help debug this issue.`, true + // Slow responses can be reproduced by creating a HTTP server that does a + // time.Sleep before responding. The raw error typically looks like the following: + // + // context deadline exceeded + case errors.Is(err, context.DeadlineExceeded): + return `Context Deadline Exceeded + +Teleport did not receive a response within the timeout period, likely due to the system being overloaded, network congestion, or a firewall blocking traffic. To resolve this issue, connect to the host directly and ensure it is responding promptly. + +Use protocol-specific tools (e.g., curl, psql) to assist in debugging this issue.`, true + // No such host errors can be reproduced by attempting to resolve an invalid + // domain name. The raw error typically looks like the following: + // + // dial tcp: lookup qweqweqwe.com: no such host + case errors.As(err, &derr) && derr.IsNotFound: + return `No Such Host + +Teleport was unable to resolve the provided domain name, likely because the domain does not exist. To resolve this issue, verify the domain is correct and ensure the DNS resolver is properly resolving it. + +Use "dig +short HOST" to help debug this issue.`, true + } + + return "", false +} + const ( // SelfSignedCertsMsg is a helper message to point users towards helpful documentation. SelfSignedCertsMsg = "Your proxy certificate is not trusted or expired. " +