From 47b60d6f526a871af4bb48ab149665b1aa20c3df Mon Sep 17 00:00:00 2001 From: "Grot (@grafanabot)" <43478413+grafanabot@users.noreply.github.com> Date: Wed, 17 Jan 2024 10:19:40 -0600 Subject: [PATCH] [release-2.8.x] fix(log results cache): compose empty response based on the request (#11658) Backport e915efc7f81350ea82d4dcbe105055075df6fc76 from #11657 --- **What this PR does / why we need it**: Log results cache when handling a hit composes an empty response based on the cached request. But the limit or direction fields in the cached request need not match with the current request being served. This causes the log results cache to return a response with incorrect limit. This incorrect limit could then get applied when merging responses upstream (split by interval mw for ex.) This pr fixes this by composing the response based on the request being served. I also thought about updating the cache key to include both limit and direction to have a clear separation, but I left it as is for the following reason: if a time range contains no log lines, that result would not change irrespective of a different limit or direction **Which issue(s) this PR fixes**: Fixes # **Special notes for your reviewer**: **Checklist** - [x] Reviewed the [`CONTRIBUTING.md`](https://github.com/grafana/loki/blob/main/CONTRIBUTING.md) guide (**required**) - [ ] Documentation added - [x] Tests updated - [x] `CHANGELOG.md` updated - [ ] If the change is worth mentioning in the release notes, add `add-to-release-notes` label - [ ] Changes that require user attention or interaction to upgrade are documented in `docs/sources/setup/upgrade/_index.md` - [ ] For Helm chart changes bump the Helm chart version in `production/helm/loki/Chart.yaml` and update `production/helm/loki/CHANGELOG.md` and `production/helm/loki/README.md`. [Example PR](https://github.com/grafana/loki/commit/d10549e3ece02120974929894ee333d07755d213) - [ ] If the change is deprecating or removing a configuration option, update the `deprecated-config.yaml` and `deleted-config.yaml` files respectively in the `tools/deprecated-config-checker` directory. [Example PR](https://github.com/grafana/loki/pull/10840/commits/0d4416a4b03739583349934b96f272fb4f685d15) --------- Co-authored-by: Ashwanth --- CHANGELOG.md | 56 +++++++++++++++++++ pkg/querier/queryrange/log_result_cache.go | 5 +- .../queryrange/log_result_cache_test.go | 49 ++++++++++++++++ 3 files changed, 108 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 172b9a924ac68..52b67a0072a0e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,62 @@ * [10375](https://github.com/grafana/loki/pull/10375) **trevorwhitney**: Fix ingester query when getting label values by passing matchers * [11601](https://github.com/grafana/loki/pull/11601) **dannykopping** Ruler: Fixed a panic that can be caused by concurrent read-write access of tenant configs when there are a large amount of rules. +* [11606](https://github.com/grafana/loki/pull/11606) **dannykopping** Fixed regression adding newlines to HTTP error response bodies which may break client integrations. +* [11657](https://github.com/grafana/loki/pull/11657) **ashwanthgoli** Log results cache: compose empty response based on the request being served to avoid returning incorrect limit or direction. + +##### Changes + +* [11490](https://github.com/grafana/loki/pull/11490) **andresperezl**: Helm: Use `/ingester/shutdown` for `preStop` hook in write pods. +* [10366](https://github.com/grafana/loki/pull/10366) **shantanualsi** Upgrade thanos objstore, dskit and other modules +* [10451](https://github.com/grafana/loki/pull/10451) **shantanualsi** Upgrade thanos `objstore` +* [10814](https://github.com/grafana/loki/pull/10814) **shantanualsi,kaviraj** Upgrade prometheus to v0.47.1 and dskit +* [10959](https://github.com/grafana/loki/pull/10959) **slim-bean** introduce a backoff wait on subquery retries. +* [11121](https://github.com/grafana/loki/pull/11121) **periklis** Ensure all lifecycler cfgs ref a valid IPv6 addr and port combination +* [10650](https://github.com/grafana/loki/pull/10650) **matthewpi** Ensure the frontend uses a valid IPv6 addr and port combination + +#### Promtail + +* [10752](https://github.com/grafana/loki/pull/10752) **gonzalesraul**: structured_metadata: enable structured_metadata convert labels + +##### Enhancements + +* [10416](https://github.com/grafana/loki/pull/10416) **lpugoy**: Lambda-Promtail: Add support for WAF logs in S3 +* [10301](https://github.com/grafana/loki/pull/10301) **wildum**: users can now define `additional_fields` in cloudflare configuration. +* [10755](https://github.com/grafana/loki/pull/10755) **hainenber**: Lambda-Promtail: Add support for dropping labels passed via env var + +##### Changes + +* [10677](https://github.com/grafana/loki/pull/10677) **chaudum** Remove deprecated `stream_lag_labels` setting from both the `options` and `client` configuration sections. +* [10689](https://github.com/grafana/loki/pull/10689) **dylanguedes**: Ingester: Make jitter to be 20% of flush check period instead of 1%. +* [11420](https://github.com/grafana/loki/pull/11420) **zry98**: Show a clearer reason in "disable watchConfig" log message when server is disabled. + +##### Fixes + +* [10631](https://github.com/grafana/loki/pull/10631) **thampiotr**: Fix race condition in cleaning up metrics when stopping to tail files. +* [10798](https://github.com/grafana/loki/pull/10798) **hainenber**: Fix agent panicking after reloaded due to duplicate metric collector registration. +* [10848](https://github.com/grafana/loki/pull/10848) **rgroothuijsen**: Correctly parse list of drop stage sources from YAML. + +#### LogCLI + +#### Mixins + +* [11087](https://github.com/grafana/loki/pull/11087) **JoaoBraveCoding**: Adds structured metadata panels for ingested data + +#### Fixes + +#### FluentD + +#### Jsonnet + +* [11312](https://github.com/grafana/loki/pull/11312) **sentoz**: Loki ksonnet: Do not generate configMap for consul if you are using memberlist + +* [11020](https://github.com/grafana/loki/pull/11020) **ashwanthgoli**: Loki ksonnet: Do not generate table-manager manifests if shipper store is in-use. + +* [10784](https://github.com/grafana/loki/pull/10894) **slim-bean** Update index gateway client to use a headless service. + +* [10542](https://github.com/grafana/loki/pull/10542) **chaudum**: Remove legacy deployment mode for ingester (Deployment, without WAL) and instead always run them as StatefulSet. + +## 2.9.2 (2023-10-16) ### All Changes diff --git a/pkg/querier/queryrange/log_result_cache.go b/pkg/querier/queryrange/log_result_cache.go index e009fda6d9e3a..7386cc8677ad1 100644 --- a/pkg/querier/queryrange/log_result_cache.go +++ b/pkg/querier/queryrange/log_result_cache.go @@ -103,7 +103,8 @@ func (l *logResultCache) Do(ctx context.Context, req queryrangebase.Request) (qu interval := validation.SmallestPositiveNonZeroDurationPerTenant(tenantIDs, l.limits.QuerySplitDuration) // skip caching by if interval is unset - if interval == 0 { + // skip caching when limit is 0 as it would get registerted as empty result in the cache even if that time range contains log lines. + if interval == 0 || lokiReq.Limit == 0 { return l.next.Do(ctx, req) } // The first subquery might not be aligned. @@ -178,7 +179,7 @@ func (l *logResultCache) handleMiss(ctx context.Context, cacheKey string, req *L func (l *logResultCache) handleHit(ctx context.Context, cacheKey string, cachedRequest *LokiRequest, lokiReq *LokiRequest) (queryrangebase.Response, error) { l.metrics.CacheHit.Inc() // we start with an empty response - result := emptyResponse(cachedRequest) + result := emptyResponse(lokiReq) // if the request is the same and cover the whole time range, // we can just return the cached result. if cachedRequest.StartTs.UnixNano() <= lokiReq.StartTs.UnixNano() && cachedRequest.EndTs.UnixNano() >= lokiReq.EndTs.UnixNano() { diff --git a/pkg/querier/queryrange/log_result_cache_test.go b/pkg/querier/queryrange/log_result_cache_test.go index 5e2f172e5025d..078d235cbf01b 100644 --- a/pkg/querier/queryrange/log_result_cache_test.go +++ b/pkg/querier/queryrange/log_result_cache_test.go @@ -526,6 +526,54 @@ func Test_LogResultFillingGap(t *testing.T) { fake.AssertExpectations(t) } +func Test_LogResultCacheDifferentLimit(t *testing.T) { + var ( + ctx = user.InjectOrgID(context.Background(), "foo") + lrc = NewLogResultCache( + log.NewNopLogger(), + fakeLimits{ + splits: map[string]time.Duration{"foo": time.Minute}, + }, + cache.NewMockCache(), + nil, + nil, + nil, + ) + ) + + req1 := &LokiRequest{ + StartTs: time.Unix(0, time.Minute.Nanoseconds()), + EndTs: time.Unix(0, 2*time.Minute.Nanoseconds()), + Limit: entriesLimit, + } + + req2 := &LokiRequest{ + StartTs: time.Unix(0, time.Minute.Nanoseconds()), + EndTs: time.Unix(0, 2*time.Minute.Nanoseconds()), + Limit: 10, + } + + fake := newFakeResponse([]mockResponse{ + { + RequestResponse: queryrangebase.RequestResponse{ + Request: req1, + Response: emptyResponse(req1), + }, + }, + }) + + h := lrc.Wrap(fake) + + resp, err := h.Do(ctx, req1) + require.NoError(t, err) + require.Equal(t, emptyResponse(req1), resp) + resp, err = h.Do(ctx, req2) + require.NoError(t, err) + require.Equal(t, emptyResponse(req2), resp) + + fake.AssertExpectations(t) +} + func TestExtractLokiResponse(t *testing.T) { for _, tc := range []struct { name string @@ -623,6 +671,7 @@ func newFakeResponse(responses []mockResponse) fakeResponse { for _, r := range responses { m.On("Do", mock.Anything, r.Request).Return(r.Response, r.err).Once() } + return fakeResponse{ Mock: m, }