Skip to content

Commit

Permalink
Avoid returning 500 for TRACE_TOO_LARGE (grafana#4160)
Browse files Browse the repository at this point in the history
* Avoid returning 500 when for TRACE_TOO_LARGE

* Test for frontend return of 413

* Check the GRPC message content as well

* Move status code rewrite to the frontend combiner for tracebyid

* Match the error string in tests

* Quit and not return an error

* Move from 413 to 422 after some discussion

* Update changelog

* Update modules/frontend/combiner/trace_by_id.go

Co-authored-by: Javier Molina Reyes <[email protected]>

* Lint

---------

Co-authored-by: Javier Molina Reyes <[email protected]>
  • Loading branch information
zalegrala and javiermolinar authored Oct 18, 2024
1 parent 280128f commit 67be243
Show file tree
Hide file tree
Showing 6 changed files with 33 additions and 15 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
* [CHANGE] No longer send the final diff in GRPC streaming. Instead we rely on the streamed intermediate results. [#4062](https://github.com/grafana/tempo/pull/4062) (@joe-elliott)
* [CHANGE] Update Go to 1.23.1 [#4146](https://github.com/grafana/tempo/pull/4146) [#4147](https://github.com/grafana/tempo/pull/4147) (@javiermolinar)
* [CHANGE] TraceQL: Add range condition for byte predicates [#4198](https://github.com/grafana/tempo/pull/4198) (@ie-pham)
* [CHANGE] Return 422 for TRACE_TOO_LARGE queries [#4160](https://github.com/grafana/tempo/pull/4160) (@zalegrala)
* [FEATURE] Discarded span logging `log_discarded_spans` [#3957](https://github.com/grafana/tempo/issues/3957) (@dastrobu)
* [FEATURE] TraceQL support for instrumentation scope [#3967](https://github.com/grafana/tempo/pull/3967) (@ie-pham)
* [ENHANCEMENT] TraceQL: Attribute iterators collect matched array values [#3867](https://github.com/grafana/tempo/pull/3867) (@electron0zero, @stoewer)
Expand Down
17 changes: 9 additions & 8 deletions integration/e2e/limits_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (

"github.com/grafana/tempo/integration/util"
"github.com/grafana/tempo/pkg/httpclient"
"github.com/grafana/tempo/pkg/model/trace"
"github.com/grafana/tempo/pkg/tempopb"
tempoUtil "github.com/grafana/tempo/pkg/util"
"github.com/grafana/tempo/pkg/util/test"
Expand Down Expand Up @@ -233,22 +234,22 @@ func TestQueryLimits(t *testing.T) {
querierClient := httpclient.New("http://"+tempo.Endpoint(3200)+"/querier", tempoUtil.FakeTenantID)

_, err = client.QueryTrace(tempoUtil.TraceIDToHexString(traceID[:]))
require.ErrorContains(t, err, "trace exceeds max size")
require.ErrorContains(t, err, "failed with response: 500") // confirm frontend returns 500
require.ErrorContains(t, err, trace.ErrTraceTooLarge.Error())
require.ErrorContains(t, err, "failed with response: 422") // confirm frontend returns 422

_, err = querierClient.QueryTrace(tempoUtil.TraceIDToHexString(traceID[:]))
require.ErrorContains(t, err, "trace exceeds max size")
require.ErrorContains(t, err, "failed with response: 500") // todo: this should return 400 ideally so the frontend does not retry, but does not currently
require.ErrorContains(t, err, trace.ErrTraceTooLarge.Error())
require.ErrorContains(t, err, "failed with response: 422")

// complete block timeout is 10 seconds
time.Sleep(15 * time.Second)
_, err = client.QueryTrace(tempoUtil.TraceIDToHexString(traceID[:]))
require.ErrorContains(t, err, "trace exceeds max size")
require.ErrorContains(t, err, "failed with response: 500") // confirm frontend returns 500
require.ErrorContains(t, err, trace.ErrTraceTooLarge.Error())
require.ErrorContains(t, err, "failed with response: 422") // confirm frontend returns 422

_, err = querierClient.QueryTrace(tempoUtil.TraceIDToHexString(traceID[:]))
require.ErrorContains(t, err, "trace exceeds max size")
require.ErrorContains(t, err, "failed with response: 400") // confirm querier returns 400
require.ErrorContains(t, err, trace.ErrTraceTooLarge.Error())
require.ErrorContains(t, err, "failed with response: 422") // confirm querier returns 422
}

func TestLimitsPartialSuccess(t *testing.T) {
Expand Down
13 changes: 13 additions & 0 deletions modules/frontend/combiner/trace_by_id.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package combiner

import (
"bytes"
"errors"
"fmt"
"io"
"net/http"
Expand Down Expand Up @@ -83,6 +84,13 @@ func (c *traceByIDCombiner) AddResponse(r PipelineResponse) error {

// Consume the trace
_, err = c.c.Consume(resp.Trace)

if errors.Is(err, trace.ErrTraceTooLarge) {
c.code = http.StatusUnprocessableEntity
c.statusMessage = fmt.Sprint(err)
return nil
}

return err
}

Expand Down Expand Up @@ -156,6 +164,11 @@ func (c *traceByIDCombiner) shouldQuit() bool {
return false
}

// test special case for 422
if c.code == http.StatusUnprocessableEntity {
return true
}

// bail on other 400s
if c.code/100 == 4 {
return true
Expand Down
6 changes: 3 additions & 3 deletions modules/frontend/combiner/trace_by_id_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,15 +48,15 @@ func TestTraceByIDShouldQuit(t *testing.T) {
should = c.ShouldQuit()
require.False(t, should)

// trace too large, should not quit but should return an error
// trace too large, should quit and should not return an error
c = NewTraceByID(1, api.HeaderAcceptJSON)
err = c.AddResponse(toHTTPProtoResponse(t, &tempopb.TraceByIDResponse{
Trace: test.MakeTrace(1, nil),
Metrics: &tempopb.TraceByIDMetrics{},
}, 200))
require.Error(t, err)
require.NoError(t, err)
should = c.ShouldQuit()
require.False(t, should)
require.True(t, should)
}

func TestTraceByIDHonorsContentType(t *testing.T) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,9 @@ func (c statusCodeAdjustWare) RoundTrip(req Request) (*http.Response, error) {
// if the frontend issues a bad request then externally we need to represent that as an
// internal error
// exceptions
// 422 - unprocessable entity
// 429 - too many requests
if resp.StatusCode >= 400 && resp.StatusCode < 500 && resp.StatusCode != 429 {
if resp.StatusCode >= 400 && resp.StatusCode < 500 && resp.StatusCode != 429 && resp.StatusCode != 422 {
resp.StatusCode = http.StatusInternalServerError
resp.Status = http.StatusText(http.StatusInternalServerError)
// leave the body alone. it will preserve the original error message
Expand Down
8 changes: 5 additions & 3 deletions modules/querier/http.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"errors"
"fmt"
"net/http"
"strings"
"time"

"github.com/golang/protobuf/jsonpb" //nolint:all //deprecated
Expand Down Expand Up @@ -425,9 +426,10 @@ func handleError(w http.ResponseWriter, err error) {
return
}

// todo: better understand all errors returned from queriers and categorize more as 4XX
if errors.Is(err, trace.ErrTraceTooLarge) {
http.Error(w, err.Error(), http.StatusBadRequest)
// TODO: better understand all errors returned from queriers and categorize more as 4XX
// NOTE: we receive a GRPC error from the ingesters, and so we need to check the string content of error as well.
if errors.Is(err, trace.ErrTraceTooLarge) || strings.Contains(err.Error(), trace.ErrTraceTooLarge.Error()) {
http.Error(w, err.Error(), http.StatusUnprocessableEntity)
return
}

Expand Down

0 comments on commit 67be243

Please sign in to comment.