Skip to content

Commit

Permalink
chore(blooms): bloom_client records cache locality score (grafana#12458)
Browse files Browse the repository at this point in the history
This is a helpful way to measure how effectively we distribute filter queries to bloom-gws. The score should mostly converge to 1 once we start using jump-hash instead of the ring for these queries, and it gives us a good way to measure that improvement.
  • Loading branch information
owen-d authored and rhnasc committed Apr 12, 2024
1 parent 850820a commit 0f85c29
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 12 deletions.
37 changes: 26 additions & 11 deletions pkg/bloomgateway/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -144,11 +144,12 @@ type Client interface {
}

type GatewayClient struct {
cfg ClientConfig
limits Limits
logger log.Logger
pool *ringclient.Pool
ring ring.ReadRing
cfg ClientConfig
limits Limits
logger log.Logger
metrics *clientMetrics
pool *ringclient.Pool
ring ring.ReadRing
}

func NewClient(
Expand Down Expand Up @@ -206,11 +207,12 @@ func NewClient(
}

return &GatewayClient{
cfg: cfg,
logger: logger,
limits: limits,
pool: clientpool.NewPool("bloom-gateway", cfg.PoolConfig, cfg.Ring, ringclient.PoolAddrFunc(poolFactory), logger, metricsNamespace),
ring: readRing,
cfg: cfg,
logger: logger,
limits: limits,
metrics: newClientMetrics(registerer),
pool: clientpool.NewPool("bloom-gateway", cfg.PoolConfig, cfg.Ring, ringclient.PoolAddrFunc(poolFactory), logger, metricsNamespace),
ring: readRing,
}, nil
}

Expand All @@ -231,7 +233,7 @@ func shuffleAddrs(addrs []string) []string {

// FilterChunks implements Client
func (c *GatewayClient) FilterChunks(ctx context.Context, tenant string, from, through model.Time, groups []*logproto.GroupedChunkRefs, plan plan.QueryPlan) ([]*logproto.GroupedChunkRefs, error) {
if !c.limits.BloomGatewayEnabled(tenant) {
if !c.limits.BloomGatewayEnabled(tenant) || len(groups) == 0 {
return groups, nil
}

Expand All @@ -246,7 +248,20 @@ func (c *GatewayClient) FilterChunks(ctx context.Context, tenant string, from, t
if err != nil {
return nil, errors.Wrap(err, "bloom gateway get replication sets")
}

servers = partitionByReplicationSet(groups, servers)
if len(servers) > 0 {
// cache locality score (higher is better):
// `% keyspace / % instances`. Ideally converges to 1 (querying x% of keyspace requires x% of instances),
// but can be less if the keyspace is not evenly distributed across instances. Ideal operation will see the range of
// `1-2/num_instances` -> `1`, where the former represents slight
// overlap on instances to the left and right of the range.
firstFp, lastFp := groups[0].Fingerprint, groups[len(groups)-1].Fingerprint
pctKeyspace := float64(lastFp-firstFp) / float64(math.MaxUint64)
pctInstances := float64(len(servers)) / float64(len(rs.Instances))
cacheLocalityScore := pctKeyspace / pctInstances
c.metrics.cacheLocalityScore.Observe(cacheLocalityScore)
}

results := make([][]*logproto.GroupedChunkRefs, len(servers))
count := 0
Expand Down
2 changes: 1 addition & 1 deletion pkg/bloomgateway/client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ func TestBloomGatewayClient(t *testing.T) {
require.NoError(t, err)
res, err := c.FilterChunks(context.Background(), "tenant", model.Now(), model.Now(), nil, plan.QueryPlan{AST: expr})
require.NoError(t, err)
require.Equal(t, []*logproto.GroupedChunkRefs{}, res)
require.Equal(t, 0, len(res))
})
}

Expand Down
18 changes: 18 additions & 0 deletions pkg/bloomgateway/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,31 @@ import (

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"

"github.com/grafana/loki/v3/pkg/util/constants"
)

// metrics groups the worker and server metric sets. Both are embedded by
// pointer, so their fields are promoted and reachable directly on metrics.
type metrics struct {
*workerMetrics
*serverMetrics
}

// clientMetrics holds the metrics created by newClientMetrics under the
// "bloom_gateway_client" subsystem.
type clientMetrics struct {
// cacheLocalityScore is a histogram of the cache locality score, i.e. the
// % of keyspace touched divided by the % of bloom gateways required.
cacheLocalityScore prometheus.Histogram
}

// newClientMetrics builds the bloom gateway client metrics, creating them
// through promauto against the supplied registerer.
//
// The cache locality score is the % of keyspace touched divided by the % of
// bloom gateways required, and ideally converges to 1. The bucket layout
// therefore has to reach just past 1; otherwise every healthy observation is
// counted only in the implicit +Inf bucket and cannot be told apart from
// scores well above 1.
func newClientMetrics(registerer prometheus.Registerer) *clientMetrics {
	return &clientMetrics{
		cacheLocalityScore: promauto.With(registerer).NewHistogram(prometheus.HistogramOpts{
			Namespace: constants.Loki,
			Subsystem: "bloom_gateway_client",
			Name:      "cache_locality_score",
			Help:      "Cache locality score of the bloom filter, as measured by % of keyspace touched / % of bloom_gws required",
			// Upper bounds: 0.01, 0.21, 0.41, 0.61, 0.81, 1.01. The first five
			// are the same as before; the sixth captures the ideal score of 1.
			Buckets: prometheus.LinearBuckets(0.01, 0.2, 6),
		}),
	}
}

type serverMetrics struct {
inflightRequests prometheus.Summary
requestedSeries prometheus.Histogram
Expand Down

0 comments on commit 0f85c29

Please sign in to comment.