Skip to content

Commit

Permalink
Added actionable errors for network issues. (#49246) (#49787)
Browse files Browse the repository at this point in the history
Added actionable errors for common network issues. Updated Application
Access to use actionable errors.
  • Loading branch information
russjones authored Dec 4, 2024
1 parent 8ec7063 commit 4c06657
Show file tree
Hide file tree
Showing 2 changed files with 125 additions and 3 deletions.
62 changes: 59 additions & 3 deletions lib/srv/app/transport.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,15 @@ package app
import (
"context"
"crypto/tls"
"io"
"log/slog"
"net"
"net/http"
"net/url"
"path"
"slices"
"strings"
"time"

"github.com/gravitational/trace"

Expand Down Expand Up @@ -97,6 +100,12 @@ func newTransport(ctx context.Context, c *transportConfig) (*transport, error) {
if err != nil {
return nil, trace.Wrap(err)
}

// Add a timeout that bounds how long Teleport waits to start receiving a
// response from the target server. This allows Teleport to show the user a
// helpful error message when the target service is slow to respond.
tr.ResponseHeaderTimeout = requestTimeout

tr.TLSClientConfig, err = configureTLS(c)
if err != nil {
return nil, trace.Wrap(err)
Expand Down Expand Up @@ -143,15 +152,35 @@ func (t *transport) RoundTrip(r *http.Request) (*http.Response, error) {
return nil, trace.Wrap(err)
}

// Forward the request to the target application and emit an audit event.
// Forward the request to the target application.
//
// If a network error occurred when connecting to the target application,
// log and return a helpful error message to the user and Teleport
// administrator.
resp, err := t.tr.RoundTrip(r)
if message, ok := utils.CanExplainNetworkError(err); ok {
if t.log.Enabled(r.Context(), slog.LevelDebug) {
t.log.DebugContext(r.Context(), "application request failed with a network error",
"raw_error", err, "human_error", strings.Join(strings.Fields(message), " "))
}

code := trace.ErrorToCode(err)
return &http.Response{
StatusCode: code,
Status: http.StatusText(code),
Proto: r.Proto,
ProtoMajor: r.ProtoMajor,
ProtoMinor: r.ProtoMinor,
Body: io.NopCloser(strings.NewReader(charWrap(message))),
TLS: r.TLS,
}, nil
}
if err != nil {
return nil, trace.Wrap(err)
}
status := uint32(resp.StatusCode)

// Emit the event to the audit log.
if err := sessCtx.Audit.OnRequest(t.closeContext, sessCtx, r, status, nil /*aws endpoint*/); err != nil {
if err := sessCtx.Audit.OnRequest(t.closeContext, sessCtx, r, uint32(resp.StatusCode), nil /*aws endpoint*/); err != nil {
return nil, trace.Wrap(err)
}

Expand Down Expand Up @@ -293,3 +322,30 @@ func host(addr string) string {
}
return host
}

// charWrap re-flows message so each output line is about 80 characters long,
// making it easier to read.
//
// The message is processed one input line at a time (split on "\n"). Within a
// line, runs of whitespace are collapsed to a single space and a line break is
// inserted after the word that pushes the running length past 80 bytes, so a
// line may exceed 80 by up to one word. Length is measured in bytes, not
// runes.
//
// Every input line, including an empty one, is terminated by exactly one
// "\n" in the result. Output lines never carry trailing whitespace, and a
// wrap that falls on the last word of an input line does not produce an
// extra blank line.
func charWrap(message string) string {
	const width = 80

	var sb strings.Builder
	for _, line := range strings.Split(message, "\n") {
		// n tracks the length of the current output line, counting one
		// separator per word.
		n := 0
		for _, word := range strings.Fields(line) {
			// Emit the separator lazily, right before the next word, so the
			// final word of a line is never followed by a stray space or by
			// a wrap newline on top of the line terminator.
			switch {
			case n > width:
				sb.WriteString("\n")
				n = 0
			case n > 0:
				sb.WriteString(" ")
			}

			sb.WriteString(word)
			n += len(word) + 1
		}
		sb.WriteString("\n")
	}
	return sb.String()
}

// requestTimeout caps how long to wait for a response from the upstream
// server. The value is deliberately generous to begin with so existing
// deployments do not break; the intent is to tighten it gradually over time.
const requestTimeout = 5 * time.Minute
66 changes: 66 additions & 0 deletions lib/utils/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
package utils

import (
"context"
"errors"
"io"
"net"
Expand Down Expand Up @@ -86,6 +87,71 @@ func IsUntrustedCertErr(err error) bool {
strings.Contains(errMsg, "certificate is not trusted")
}

// CanExplainNetworkError checks err against a set of common network and
// protocol failures. When err matches one of them, it returns a
// human-readable explanation (a title line, a description, and a debugging
// hint) together with true. Otherwise it returns an empty string and false.
//
// The checks run in a fixed order and the first match wins.
func CanExplainNetworkError(err error) (string, bool) {
	// Connection refused: reproduce by connecting to a host:port that no
	// process is listening on. The raw error typically looks like:
	//
	// dial tcp 127.0.0.1:8000: connect: connection refused
	if errors.Is(err, syscall.ECONNREFUSED) {
		return `Connection Refused
Teleport was unable to connect to the requested host, possibly because the server is not running. Ensure the server is running and listening on the correct port.
Use "nc -vz HOST PORT" to help debug this issue.`, true
	}

	// Host unreachable: reproduce by running "ip route add unreachable HOST"
	// so the routing table marks the host unreachable. Packets are discarded
	// and an ICMP message is returned. The raw error typically looks like:
	//
	// dial tcp 10.10.10.10:8000: connect: no route to host
	if errors.Is(err, syscall.EHOSTUNREACH) {
		return `No Route to Host
Teleport could not connect to the requested host, likely because there is no valid network path to reach it. Check the network routing table to ensure a valid path to the host exists.
Use "ping HOST" and "ip route get HOST" to help debug this issue.`, true
	}

	// Connection reset: reproduce with an HTTP server that accepts requests
	// but closes the connection before writing a response. The raw error
	// typically looks like:
	//
	// read tcp 127.0.0.1:49764->127.0.0.1:8000: read: connection reset by peer
	if errors.Is(err, syscall.ECONNRESET) {
		return `Connection Reset by Peer
Teleport could not complete the request because the server abruptly closed the connection before the response was received. To resolve this issue, ensure the server (or load balancer) does not have a timeout terminating the connection early and verify that the server is not crash looping.
Use protocol-specific tools (e.g., curl, psql) to help debug this issue.`, true
	}

	// Slow response: reproduce with an HTTP server that does a time.Sleep
	// before responding. The raw error typically looks like:
	//
	// context deadline exceeded
	if errors.Is(err, context.DeadlineExceeded) {
		return `Context Deadline Exceeded
Teleport did not receive a response within the timeout period, likely due to the system being overloaded, network congestion, or a firewall blocking traffic. To resolve this issue, connect to the host directly and ensure it is responding promptly.
Use protocol-specific tools (e.g., curl, psql) to assist in debugging this issue.`, true
	}

	// No such host: reproduce by resolving an invalid domain name. The raw
	// error typically looks like:
	//
	// dial tcp: lookup qweqweqwe.com: no such host
	var dnsErr *net.DNSError
	if errors.As(err, &dnsErr) && dnsErr.IsNotFound {
		return `No Such Host
Teleport was unable to resolve the provided domain name, likely because the domain does not exist. To resolve this issue, verify the domain is correct and ensure the DNS resolver is properly resolving it.
Use "dig +short HOST" to help debug this issue.`, true
	}

	return "", false
}

const (
// SelfSignedCertsMsg is a helper message to point users towards helpful documentation.
SelfSignedCertsMsg = "Your proxy certificate is not trusted or expired. " +
Expand Down

0 comments on commit 4c06657

Please sign in to comment.