Skip to content

Commit

Permalink
results cache: add new stat query_length_served to measure cache ef…
Browse files Browse the repository at this point in the history
…fectiveness (#11589)

**What this PR does / why we need it**:
The cache hit rate that is currently measured using metrics and stats
does not account for the fact that a cache hit could return partial
results. When we query the cache for a key, we get back a list of
extents, and these need not cover the entire (split) range of the cache
key.

This PR adds a new cache stat called `query_length_served` to better
measure the cache's effectiveness.
```
query_length - sum(length of downstream queries)
```

**Which issue(s) this PR fixes**:
Fixes #<issue number>

**Special notes for your reviewer**:

**Checklist**
- [ ] Reviewed the
[`CONTRIBUTING.md`](https://github.com/grafana/loki/blob/main/CONTRIBUTING.md)
guide (**required**)
- [x] Documentation added
- [x] Tests updated
- [x] `CHANGELOG.md` updated
- [ ] If the change is worth mentioning in the release notes, add
`add-to-release-notes` label
- [ ] Changes that require user attention or interaction to upgrade are
documented in `docs/sources/setup/upgrade/_index.md`
- [ ] For Helm chart changes bump the Helm chart version in
`production/helm/loki/Chart.yaml` and update
`production/helm/loki/CHANGELOG.md` and
`production/helm/loki/README.md`. [Example
PR](d10549e)
- [ ] If the change is deprecating or removing a configuration option,
update the `deprecated-config.yaml` and `deleted-config.yaml` files
respectively in the `tools/deprecated-config-checker` directory.
[Example
PR](0d4416a)
  • Loading branch information
ashwanthgoli authored Jan 5, 2024
1 parent f04d0db commit 852becf
Show file tree
Hide file tree
Showing 13 changed files with 302 additions and 125 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
* [11284](https://github.com/grafana/loki/pull/11284) **ashwanthgoli** Config: Adds `frontend.max-query-capacity` to tune per-tenant query capacity.
* [11539](https://github.com/grafana/loki/pull/11539) **kaviraj,ashwanthgoli** Support caching /series and /labels query results
* [11545](https://github.com/grafana/loki/pull/11545) **dannykopping** Force correct memcached timeout when fetching chunks.
* [11589](https://github.com/grafana/loki/pull/11589) **ashwanthgoli** Results Cache: Adds `query_length_served` cache stat to measure the length of the query served from cache.

##### Fixes
* [11074](https://github.com/grafana/loki/pull/11074) **hainenber** Fix panic in lambda-promtail due to mishandling of empty DROP_LABELS env var.
Expand Down
4 changes: 4 additions & 0 deletions pkg/logql/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ func RecordRangeAndInstantQueryMetrics(
"cache_result_req", stats.Caches.Result.EntriesRequested,
"cache_result_hit", stats.Caches.Result.EntriesFound,
"cache_result_download_time", stats.Caches.Result.CacheDownloadTime(),
"cache_result_query_length_served", stats.Caches.Result.CacheQueryLengthServed(),
}...)

logValues = append(logValues, tagsToKeyValues(queryTags)...)
Expand Down Expand Up @@ -228,6 +229,7 @@ func RecordLabelQueryMetrics(
"cache_label_results_hit", stats.Caches.LabelResult.EntriesFound,
"cache_label_results_stored", stats.Caches.LabelResult.EntriesStored,
"cache_label_results_download_time", stats.Caches.LabelResult.CacheDownloadTime(),
"cache_label_results_query_length_served", stats.Caches.LabelResult.CacheQueryLengthServed(),
)

execLatency.WithLabelValues(status, queryType, "").Observe(stats.Summary.ExecTime)
Expand Down Expand Up @@ -341,6 +343,7 @@ func RecordSeriesQueryMetrics(ctx context.Context, log log.Logger, start, end ti
"cache_series_results_hit", stats.Caches.SeriesResult.EntriesFound,
"cache_series_results_stored", stats.Caches.SeriesResult.EntriesStored,
"cache_series_results_download_time", stats.Caches.SeriesResult.CacheDownloadTime(),
"cache_series_results_query_length_served", stats.Caches.SeriesResult.CacheQueryLengthServed(),
)

if shard != nil {
Expand Down Expand Up @@ -428,6 +431,7 @@ func RecordVolumeQueryMetrics(ctx context.Context, log log.Logger, start, end ti
"cache_volume_results_hit", stats.Caches.VolumeResult.EntriesFound,
"cache_volume_results_stored", stats.Caches.VolumeResult.EntriesStored,
"cache_volume_results_download_time", stats.Caches.VolumeResult.CacheDownloadTime(),
"cache_volume_results_query_length_served", stats.Caches.VolumeResult.CacheQueryLengthServed(),
)

execLatency.WithLabelValues(status, queryType, "").Observe(stats.Summary.ExecTime)
Expand Down
22 changes: 12 additions & 10 deletions pkg/logql/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,16 +108,17 @@ func TestLogLabelsQuery(t *testing.T) {
},
Caches: stats.Caches{
LabelResult: stats.Cache{
EntriesRequested: 2,
EntriesFound: 1,
EntriesStored: 1,
DownloadTime: 80,
EntriesRequested: 2,
EntriesFound: 1,
EntriesStored: 1,
DownloadTime: 80,
QueryLengthServed: 10,
},
},
})
require.Regexp(t,
fmt.Sprintf(
"level=info org_id=foo traceID=%s sampled=true latency=slow query_type=labels splits=0 start=.* end=.* start_delta=1h0m0.* end_delta=.* length=1h0m0s duration=25.25s status=200 label=foo query= query_hash=2166136261 total_entries=12 cache_label_results_req=2 cache_label_results_hit=1 cache_label_results_stored=1 cache_label_results_download_time=80ns\n",
"level=info org_id=foo traceID=%s sampled=true latency=slow query_type=labels splits=0 start=.* end=.* start_delta=1h0m0.* end_delta=.* length=1h0m0s duration=25.25s status=200 label=foo query= query_hash=2166136261 total_entries=12 cache_label_results_req=2 cache_label_results_hit=1 cache_label_results_stored=1 cache_label_results_download_time=80ns cache_label_results_query_length_served=10ns\n",
sp.Context().(jaeger.SpanContext).SpanID().String(),
),
buf.String())
Expand All @@ -142,16 +143,17 @@ func TestLogSeriesQuery(t *testing.T) {
},
Caches: stats.Caches{
SeriesResult: stats.Cache{
EntriesRequested: 2,
EntriesFound: 1,
EntriesStored: 1,
DownloadTime: 80,
EntriesRequested: 2,
EntriesFound: 1,
EntriesStored: 1,
DownloadTime: 80,
QueryLengthServed: 10,
},
},
})
require.Regexp(t,
fmt.Sprintf(
"level=info org_id=foo traceID=%s sampled=true latency=slow query_type=series splits=0 start=.* end=.* start_delta=1h0m0.* end_delta=.* length=1h0m0s duration=25.25s status=200 match=\"{container_name=.*\"}:{app=.*}\" query_hash=23523089 total_entries=10 cache_series_results_req=2 cache_series_results_hit=1 cache_series_results_stored=1 cache_series_results_download_time=80ns\n",
"level=info org_id=foo traceID=%s sampled=true latency=slow query_type=series splits=0 start=.* end=.* start_delta=1h0m0.* end_delta=.* length=1h0m0s duration=25.25s status=200 match=\"{container_name=.*\"}:{app=.*}\" query_hash=23523089 total_entries=10 cache_series_results_req=2 cache_series_results_hit=1 cache_series_results_stored=1 cache_series_results_download_time=80ns cache_series_results_query_length_served=10ns\n",
sp.Context().(jaeger.SpanContext).SpanID().String(),
),
buf.String())
Expand Down
15 changes: 15 additions & 0 deletions pkg/logqlmodel/stats/context.go
Original file line number Diff line number Diff line change
Expand Up @@ -231,12 +231,17 @@ func (c *Cache) Merge(m Cache) {
c.BytesSent += m.BytesSent
c.BytesReceived += m.BytesReceived
c.DownloadTime += m.DownloadTime
c.QueryLengthServed += m.QueryLengthServed
}

func (c *Cache) CacheDownloadTime() time.Duration {
return time.Duration(c.DownloadTime)
}

func (c *Cache) CacheQueryLengthServed() time.Duration {
return time.Duration(c.QueryLengthServed)
}

func (r *Result) MergeSplit(m Result) {
m.Summary.Splits = 1
r.Merge(m)
Expand Down Expand Up @@ -429,6 +434,16 @@ func (c *Context) AddCacheRequest(t CacheType, i int) {
atomic.AddInt32(&stats.Requests, int32(i))
}

// AddCacheQueryLengthServed measures the length of the query served from cache
func (c *Context) AddCacheQueryLengthServed(t CacheType, i time.Duration) {
stats := c.getCacheStatsByType(t)
if stats == nil {
return
}

atomic.AddInt64(&stats.QueryLengthServed, int64(i))
}

func (c *Context) AddSplitQueries(num int64) {
atomic.AddInt64(&c.result.Summary.Splits, num)
}
Expand Down
10 changes: 7 additions & 3 deletions pkg/logqlmodel/stats/context_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,8 @@ func TestResult_Merge(t *testing.T) {
EntriesFound: 2,
},
Result: Cache{
EntriesStored: 3,
EntriesStored: 3,
QueryLengthServed: int64(3 * time.Hour),
},
},
Summary: Summary{
Expand Down Expand Up @@ -272,7 +273,8 @@ func TestResult_Merge(t *testing.T) {
EntriesFound: 2 * 2,
},
Result: Cache{
EntriesStored: 2 * 3,
EntriesStored: 2 * 3,
QueryLengthServed: int64(2 * 3 * time.Hour),
},
},
Summary: Summary{
Expand Down Expand Up @@ -325,6 +327,7 @@ func TestCaches(t *testing.T) {

statsCtx.AddCacheRequest(ChunkCache, 5)
statsCtx.AddCacheEntriesStored(ResultCache, 3)
statsCtx.AddCacheQueryLengthServed(ResultCache, 3*time.Hour)
statsCtx.AddCacheEntriesRequested(IndexCache, 22)
statsCtx.AddCacheBytesRetrieved(ChunkCache, 1024)
statsCtx.AddCacheBytesSent(ChunkCache, 512)
Expand All @@ -341,7 +344,8 @@ func TestCaches(t *testing.T) {
EntriesFound: 2,
},
Result: Cache{
EntriesStored: 3,
EntriesStored: 3,
QueryLengthServed: int64(time.Hour * 3),
},
}, statsCtx.Caches())
}
Loading

0 comments on commit 852becf

Please sign in to comment.