Skip to content

Commit

Permalink
results cache: add new stat query_length_served to measure cache ef…
Browse files Browse the repository at this point in the history
…fectiveness (#11589)

**What this PR does / why we need it**:
The cache hit rate that is currently measured using metrics and stats
does not account for the fact that a cache hit could return partial
results. When we query the cache for a key, we get back a list of
extents, and these need not cover the entire (split) range of the cache
key.

This PR adds a new cache stat called `query_length_served` to better
measure the cache's effectiveness.
```
query_length - sum(length of downstream queries)
```

**Which issue(s) this PR fixes**:
Fixes #<issue number>

**Special notes for your reviewer**:

**Checklist**
- [ ] Reviewed the
[`CONTRIBUTING.md`](https://github.com/grafana/loki/blob/main/CONTRIBUTING.md)
guide (**required**)
- [x] Documentation added
- [x] Tests updated
- [x] `CHANGELOG.md` updated
- [ ] If the change is worth mentioning in the release notes, add
`add-to-release-notes` label
- [ ] Changes that require user attention or interaction to upgrade are
documented in `docs/sources/setup/upgrade/_index.md`
- [ ] For Helm chart changes bump the Helm chart version in
`production/helm/loki/Chart.yaml` and update
`production/helm/loki/CHANGELOG.md` and
`production/helm/loki/README.md`. [Example
PR](d10549e)
- [ ] If the change is deprecating or removing a configuration option,
update the `deprecated-config.yaml` and `deleted-config.yaml` files
respectively in the `tools/deprecated-config-checker` directory.
[Example
PR](0d4416a)
  • Loading branch information
ashwanthgoli authored Jan 5, 2024
1 parent f04d0db commit 852becf
Show file tree
Hide file tree
Showing 13 changed files with 302 additions and 125 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
* [11284](https://github.com/grafana/loki/pull/11284) **ashwanthgoli** Config: Adds `frontend.max-query-capacity` to tune per-tenant query capacity.
* [11539](https://github.com/grafana/loki/pull/11539) **kaviraj,ashwanthgoli** Support caching /series and /labels query results
* [11545](https://github.com/grafana/loki/pull/11545) **dannykopping** Force correct memcached timeout when fetching chunks.
* [11589](https://github.com/grafana/loki/pull/11589) **ashwanthgoli** Results Cache: Adds `query_length_served` cache stat to measure the length of the query served from cache.

##### Fixes
* [11074](https://github.com/grafana/loki/pull/11074) **hainenber** Fix panic in lambda-promtail due to mishandling of empty DROP_LABELS env var.
Expand Down
4 changes: 4 additions & 0 deletions pkg/logql/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ func RecordRangeAndInstantQueryMetrics(
"cache_result_req", stats.Caches.Result.EntriesRequested,
"cache_result_hit", stats.Caches.Result.EntriesFound,
"cache_result_download_time", stats.Caches.Result.CacheDownloadTime(),
"cache_result_query_length_served", stats.Caches.Result.CacheQueryLengthServed(),
}...)

logValues = append(logValues, tagsToKeyValues(queryTags)...)
Expand Down Expand Up @@ -228,6 +229,7 @@ func RecordLabelQueryMetrics(
"cache_label_results_hit", stats.Caches.LabelResult.EntriesFound,
"cache_label_results_stored", stats.Caches.LabelResult.EntriesStored,
"cache_label_results_download_time", stats.Caches.LabelResult.CacheDownloadTime(),
"cache_label_results_query_length_served", stats.Caches.LabelResult.CacheQueryLengthServed(),
)

execLatency.WithLabelValues(status, queryType, "").Observe(stats.Summary.ExecTime)
Expand Down Expand Up @@ -341,6 +343,7 @@ func RecordSeriesQueryMetrics(ctx context.Context, log log.Logger, start, end ti
"cache_series_results_hit", stats.Caches.SeriesResult.EntriesFound,
"cache_series_results_stored", stats.Caches.SeriesResult.EntriesStored,
"cache_series_results_download_time", stats.Caches.SeriesResult.CacheDownloadTime(),
"cache_series_results_query_length_served", stats.Caches.SeriesResult.CacheQueryLengthServed(),
)

if shard != nil {
Expand Down Expand Up @@ -428,6 +431,7 @@ func RecordVolumeQueryMetrics(ctx context.Context, log log.Logger, start, end ti
"cache_volume_results_hit", stats.Caches.VolumeResult.EntriesFound,
"cache_volume_results_stored", stats.Caches.VolumeResult.EntriesStored,
"cache_volume_results_download_time", stats.Caches.VolumeResult.CacheDownloadTime(),
"cache_volume_results_query_length_served", stats.Caches.VolumeResult.CacheQueryLengthServed(),
)

execLatency.WithLabelValues(status, queryType, "").Observe(stats.Summary.ExecTime)
Expand Down
22 changes: 12 additions & 10 deletions pkg/logql/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,16 +108,17 @@ func TestLogLabelsQuery(t *testing.T) {
},
Caches: stats.Caches{
LabelResult: stats.Cache{
EntriesRequested: 2,
EntriesFound: 1,
EntriesStored: 1,
DownloadTime: 80,
EntriesRequested: 2,
EntriesFound: 1,
EntriesStored: 1,
DownloadTime: 80,
QueryLengthServed: 10,
},
},
})
require.Regexp(t,
fmt.Sprintf(
"level=info org_id=foo traceID=%s sampled=true latency=slow query_type=labels splits=0 start=.* end=.* start_delta=1h0m0.* end_delta=.* length=1h0m0s duration=25.25s status=200 label=foo query= query_hash=2166136261 total_entries=12 cache_label_results_req=2 cache_label_results_hit=1 cache_label_results_stored=1 cache_label_results_download_time=80ns\n",
"level=info org_id=foo traceID=%s sampled=true latency=slow query_type=labels splits=0 start=.* end=.* start_delta=1h0m0.* end_delta=.* length=1h0m0s duration=25.25s status=200 label=foo query= query_hash=2166136261 total_entries=12 cache_label_results_req=2 cache_label_results_hit=1 cache_label_results_stored=1 cache_label_results_download_time=80ns cache_label_results_query_length_served=10ns\n",
sp.Context().(jaeger.SpanContext).SpanID().String(),
),
buf.String())
Expand All @@ -142,16 +143,17 @@ func TestLogSeriesQuery(t *testing.T) {
},
Caches: stats.Caches{
SeriesResult: stats.Cache{
EntriesRequested: 2,
EntriesFound: 1,
EntriesStored: 1,
DownloadTime: 80,
EntriesRequested: 2,
EntriesFound: 1,
EntriesStored: 1,
DownloadTime: 80,
QueryLengthServed: 10,
},
},
})
require.Regexp(t,
fmt.Sprintf(
"level=info org_id=foo traceID=%s sampled=true latency=slow query_type=series splits=0 start=.* end=.* start_delta=1h0m0.* end_delta=.* length=1h0m0s duration=25.25s status=200 match=\"{container_name=.*\"}:{app=.*}\" query_hash=23523089 total_entries=10 cache_series_results_req=2 cache_series_results_hit=1 cache_series_results_stored=1 cache_series_results_download_time=80ns\n",
"level=info org_id=foo traceID=%s sampled=true latency=slow query_type=series splits=0 start=.* end=.* start_delta=1h0m0.* end_delta=.* length=1h0m0s duration=25.25s status=200 match=\"{container_name=.*\"}:{app=.*}\" query_hash=23523089 total_entries=10 cache_series_results_req=2 cache_series_results_hit=1 cache_series_results_stored=1 cache_series_results_download_time=80ns cache_series_results_query_length_served=10ns\n",
sp.Context().(jaeger.SpanContext).SpanID().String(),
),
buf.String())
Expand Down
15 changes: 15 additions & 0 deletions pkg/logqlmodel/stats/context.go
Original file line number Diff line number Diff line change
Expand Up @@ -231,12 +231,17 @@ func (c *Cache) Merge(m Cache) {
c.BytesSent += m.BytesSent
c.BytesReceived += m.BytesReceived
c.DownloadTime += m.DownloadTime
c.QueryLengthServed += m.QueryLengthServed
}

func (c *Cache) CacheDownloadTime() time.Duration {
return time.Duration(c.DownloadTime)
}

func (c *Cache) CacheQueryLengthServed() time.Duration {
return time.Duration(c.QueryLengthServed)
}

func (r *Result) MergeSplit(m Result) {
m.Summary.Splits = 1
r.Merge(m)
Expand Down Expand Up @@ -429,6 +434,16 @@ func (c *Context) AddCacheRequest(t CacheType, i int) {
atomic.AddInt32(&stats.Requests, int32(i))
}

// AddCacheQueryLengthServed measures the length of the query served from cache
func (c *Context) AddCacheQueryLengthServed(t CacheType, i time.Duration) {
stats := c.getCacheStatsByType(t)
if stats == nil {
return
}

atomic.AddInt64(&stats.QueryLengthServed, int64(i))
}

func (c *Context) AddSplitQueries(num int64) {
atomic.AddInt64(&c.result.Summary.Splits, num)
}
Expand Down
10 changes: 7 additions & 3 deletions pkg/logqlmodel/stats/context_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,8 @@ func TestResult_Merge(t *testing.T) {
EntriesFound: 2,
},
Result: Cache{
EntriesStored: 3,
EntriesStored: 3,
QueryLengthServed: int64(3 * time.Hour),
},
},
Summary: Summary{
Expand Down Expand Up @@ -272,7 +273,8 @@ func TestResult_Merge(t *testing.T) {
EntriesFound: 2 * 2,
},
Result: Cache{
EntriesStored: 2 * 3,
EntriesStored: 2 * 3,
QueryLengthServed: int64(2 * 3 * time.Hour),
},
},
Summary: Summary{
Expand Down Expand Up @@ -325,6 +327,7 @@ func TestCaches(t *testing.T) {

statsCtx.AddCacheRequest(ChunkCache, 5)
statsCtx.AddCacheEntriesStored(ResultCache, 3)
statsCtx.AddCacheQueryLengthServed(ResultCache, 3*time.Hour)
statsCtx.AddCacheEntriesRequested(IndexCache, 22)
statsCtx.AddCacheBytesRetrieved(ChunkCache, 1024)
statsCtx.AddCacheBytesSent(ChunkCache, 512)
Expand All @@ -341,7 +344,8 @@ func TestCaches(t *testing.T) {
EntriesFound: 2,
},
Result: Cache{
EntriesStored: 3,
EntriesStored: 3,
QueryLengthServed: int64(time.Hour * 3),
},
}, statsCtx.Caches())
}
Loading

0 comments on commit 852becf

Please sign in to comment.