Introduce separate histogram metric for bloom query latency
Signed-off-by: Christian Haudum <[email protected]>
chaudum committed Jan 16, 2024
1 parent a17cb9c commit 1e6aab6
Showing 2 changed files with 23 additions and 10 deletions.
13 changes: 7 additions & 6 deletions integration/loki_micro_services_test.go
@@ -1218,7 +1218,8 @@ func TestBloomFiltersEndToEnd(t *testing.T) {
 		metrics, err := cliBloomCompactor.Metrics()
 		require.NoError(t, err)
 		successfulRunCount := getMetricValue(t, "loki_bloomcompactor_runs_completed_total", metrics)
-		return successfulRunCount >= 1
+		t.Log("successful bloom compactor runs", successfulRunCount)
+		return successfulRunCount == 1
 	}, 30*time.Second, time.Second)

 	// use bloom gateway to perform needle in the haystack queries
@@ -1238,18 +1239,18 @@ func TestBloomFiltersEndToEnd(t *testing.T) {
 	bloomGwMetrics, err := cliBloomGateway.Metrics()
 	require.NoError(t, err)

-	mf, err := extractMetricFamily("loki_bloom_gateway_store_latency", bloomGwMetrics)
-	require.NoError(t, err)
-
 	unfilteredCount := getMetricValue(t, "loki_bloom_gateway_chunkrefs_pre_filtering", bloomGwMetrics)
 	require.Equal(t, float64(10), unfilteredCount)

 	filteredCount := getMetricValue(t, "loki_bloom_gateway_chunkrefs_post_filtering", bloomGwMetrics)
 	require.Equal(t, float64(1), filteredCount)

+	mf, err := extractMetricFamily("loki_bloom_gateway_bloom_query_latency", bloomGwMetrics)
+	require.NoError(t, err)
+
 	count := getValueFromMetricFamilyWithFunc(mf, &dto.LabelPair{
-		Name:  proto.String("operation"),
-		Value: proto.String("ForEach"),
+		Name:  proto.String("status"),
+		Value: proto.String("success"),
 	}, func(m *dto.Metric) uint64 {
 		return m.Histogram.GetSampleCount()
 	})
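The assertions above lean on three helpers from Loki's integration test package (getMetricValue, extractMetricFamily, getValueFromMetricFamilyWithFunc) whose definitions are not part of this diff; the test now reads the sample count of the new loki_bloom_gateway_bloom_query_latency histogram for status="success" instead of the operation="ForEach" series of the store latency metric. As a hedged sketch only (signatures inferred from the call sites, parsing done with the Prometheus text exposition parser; the real helpers may differ), they could look like this:

package integration

import (
	"fmt"
	"strings"
	"testing"

	dto "github.com/prometheus/client_model/go"
	"github.com/prometheus/common/expfmt"
	"github.com/stretchr/testify/require"
)

// extractMetricFamily parses a raw /metrics payload (text exposition format)
// and returns the named metric family.
func extractMetricFamily(name, metrics string) (*dto.MetricFamily, error) {
	var parser expfmt.TextParser
	families, err := parser.TextToMetricFamilies(strings.NewReader(metrics))
	if err != nil {
		return nil, err
	}
	mf, ok := families[name]
	if !ok {
		return nil, fmt.Errorf("metric family %q not found", name)
	}
	return mf, nil
}

// getMetricValue returns the value of the first sample of a counter or gauge
// family, e.g. loki_bloom_gateway_chunkrefs_pre_filtering.
func getMetricValue(t *testing.T, name, metrics string) float64 {
	t.Helper()
	mf, err := extractMetricFamily(name, metrics)
	require.NoError(t, err)
	m := mf.GetMetric()[0]
	if c := m.GetCounter(); c != nil {
		return c.GetValue()
	}
	return m.GetGauge().GetValue()
}

// getValueFromMetricFamilyWithFunc applies fn to the first metric that carries
// the given label pair, e.g. status="success" on the new latency histogram.
func getValueFromMetricFamilyWithFunc(mf *dto.MetricFamily, lp *dto.LabelPair, fn func(*dto.Metric) uint64) uint64 {
	for _, m := range mf.GetMetric() {
		for _, l := range m.GetLabel() {
			if l.GetName() == lp.GetName() && l.GetValue() == lp.GetValue() {
				return fn(m)
			}
		}
	}
	return 0
}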
20 changes: 16 additions & 4 deletions pkg/bloomgateway/worker.go
@@ -27,6 +27,7 @@ type workerMetrics struct {
 	dequeueErrors      *prometheus.CounterVec
 	dequeueWaitTime    *prometheus.SummaryVec
 	storeAccessLatency *prometheus.HistogramVec
+	bloomQueryLatency  *prometheus.HistogramVec
 }

 func newWorkerMetrics(registerer prometheus.Registerer, namespace, subsystem string) *workerMetrics {
@@ -50,6 +51,13 @@ func newWorkerMetrics(registerer prometheus.Registerer, namespace, subsystem str
 			Name:      "dequeue_wait_time",
 			Help:      "Time spent waiting for dequeuing tasks from queue",
 		}, labels),
+		bloomQueryLatency: promauto.With(registerer).NewHistogramVec(prometheus.HistogramOpts{
+			Namespace: namespace,
+			Subsystem: subsystem,
+			Name:      "bloom_query_latency",
+			Help:      "Latency in seconds of processing bloom blocks",
+		}, append(labels, "status")),
+		// TODO(chaudum): Move this metric into the bloomshipper
 		storeAccessLatency: promauto.With(registerer).NewHistogramVec(prometheus.HistogramOpts{
 			Namespace: namespace,
 			Subsystem: subsystem,
@@ -210,20 +218,18 @@ func (w *worker) stopping(err error) error {
 }

 func (w *worker) processBlocksWithCallback(taskCtx context.Context, tenant string, day time.Time, blockRefs []bloomshipper.BlockRef, boundedRefs []boundedTasks) error {
-	storeFetchStart := time.Now()
 	return w.store.ForEach(taskCtx, tenant, blockRefs, func(bq *v1.BlockQuerier, minFp, maxFp uint64) error {
-		w.metrics.storeAccessLatency.WithLabelValues(w.id, "ForEach").Observe(time.Since(storeFetchStart).Seconds())
 		for _, b := range boundedRefs {
 			if b.blockRef.MinFingerprint == minFp && b.blockRef.MaxFingerprint == maxFp {
-				processBlock(bq, day, b.tasks)
+				w.processBlock(bq, day, b.tasks)
 				return nil
 			}
 		}
 		return nil
 	})
 }

-func processBlock(blockQuerier *v1.BlockQuerier, day time.Time, tasks []Task) {
+func (w *worker) processBlock(blockQuerier *v1.BlockQuerier, day time.Time, tasks []Task) {
 	schema, err := blockQuerier.Schema()
 	if err != nil {
 		for _, t := range tasks {
@@ -234,10 +240,16 @@ func processBlock(blockQuerier *v1.BlockQuerier, day time.Time, tasks []Task) {
 	tokenizer := v1.NewNGramTokenizer(schema.NGramLen(), 0)
 	it := newTaskMergeIterator(day, tokenizer, tasks...)
 	fq := blockQuerier.Fuse([]v1.PeekingIterator[v1.Request]{it})
+
+	start := time.Now()
 	err = fq.Run()
+
+	label := "success"
 	if err != nil {
+		label = "failure"
 		for _, t := range tasks {
 			t.ErrCh <- errors.Wrap(err, "failed to run chunk check")
 		}
 	}
+	w.metrics.bloomQueryLatency.WithLabelValues(w.id, label).Observe(time.Since(start).Seconds())
 }
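The worker change itself follows a common Prometheus pattern: time the fused query run, derive a status label from the returned error, and observe the elapsed seconds exactly once, so the histogram's sample count equals the number of bloom query runs (which is what the integration test checks via GetSampleCount). A standalone, hedged sketch of that pattern, with illustrative names and the default registerer standing in for the worker's registerer and label set:

package main

import (
	"errors"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

// bloomQueryLatency mirrors the metric added in worker.go: a histogram
// partitioned by worker id and outcome status. Registration on the default
// registerer and the exact label names are assumptions of this sketch.
var bloomQueryLatency = promauto.NewHistogramVec(prometheus.HistogramOpts{
	Namespace: "loki",
	Subsystem: "bloom_gateway",
	Name:      "bloom_query_latency",
	Help:      "Latency in seconds of processing bloom blocks",
}, []string{"worker", "status"})

// timedRun executes fn, labels the observation "success" or "failure"
// depending on the returned error, and records the latency exactly once.
func timedRun(workerID string, fn func() error) error {
	start := time.Now()
	err := fn()

	status := "success"
	if err != nil {
		status = "failure"
	}
	bloomQueryLatency.WithLabelValues(workerID, status).Observe(time.Since(start).Seconds())
	return err
}

func main() {
	_ = timedRun("worker-0", func() error { return nil })
	_ = timedRun("worker-0", func() error { return errors.New("bloom query failed") })
}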
