diff --git a/pkg/storage/bloom/v1/bloom_tokenizer.go b/pkg/storage/bloom/v1/bloom_tokenizer.go index 7d2ba41b7f49c..f0365f7cc78d8 100644 --- a/pkg/storage/bloom/v1/bloom_tokenizer.go +++ b/pkg/storage/bloom/v1/bloom_tokenizer.go @@ -97,11 +97,14 @@ func (bt *BloomTokenizer) newBloom() *Bloom { } } +// Populates a bloom filter(s) with the tokens from the given chunks. +// Called once per series func (bt *BloomTokenizer) Populate( blooms SizedIterator[*Bloom], chks Iterator[ChunkRefWithIter], ch chan *BloomCreation, ) { + clear(bt.cache) // MUST always clear the cache before starting a new series var next bool // All but the last bloom are considered full -- send back unaltered diff --git a/pkg/storage/bloom/v1/bloom_tokenizer_test.go b/pkg/storage/bloom/v1/bloom_tokenizer_test.go index 7685faaa92427..ad7040ddf431e 100644 --- a/pkg/storage/bloom/v1/bloom_tokenizer_test.go +++ b/pkg/storage/bloom/v1/bloom_tokenizer_test.go @@ -288,6 +288,45 @@ func BenchmarkPopulateSeriesWithBloom(b *testing.B) { } } +func TestTokenizerClearsCacheBetweenPopulateCalls(t *testing.T) { + bt := NewBloomTokenizer(DefaultNGramLength, DefaultNGramSkip, 0, NewMetrics(nil)) + line := "foobarbazz" + var blooms []*Bloom + + for i := 0; i < 2; i++ { + ch := make(chan *BloomCreation) + itr, err := chunkRefItrFromLines(line) + require.NoError(t, err) + go bt.Populate( + NewEmptyIter[*Bloom](), + NewSliceIter([]ChunkRefWithIter{ + { + Ref: ChunkRef{}, + Itr: itr, + }, + }), + ch, + ) + var ct int + for created := range ch { + blooms = append(blooms, created.Bloom) + ct++ + } + // ensure we created one bloom for each call + require.Equal(t, 1, ct) + + } + + for _, bloom := range blooms { + toks := bt.lineTokenizer.Tokens(line) + for toks.Next() { + token := toks.At() + require.True(t, bloom.Test(token)) + } + require.NoError(t, toks.Err()) + } +} + func BenchmarkMapClear(b *testing.B) { bt := NewBloomTokenizer(DefaultNGramLength, DefaultNGramSkip, 0, metrics) for i := 0; i < b.N; i++ {