From 7116a80faf2dd2f3c5009a2a07571d3ed764e0e1 Mon Sep 17 00:00:00 2001 From: "Grot (@grafanabot)" <43478413+grafanabot@users.noreply.github.com> Date: Tue, 2 Jul 2024 10:59:16 +0200 Subject: [PATCH] chore: [k207] fix(blooms): ensure tokenizer cache is reset between series (#13373) Backport 04bc3a423c8ea9e7c945b15dffb83d674bab3a68 from #13370 --- Fixes bug where ngrams were not added to blooms b/c they had been added previously to a potentially different series. This caused blooms to fail membership tests incorrectly. Co-authored-by: Owen Diehl --- pkg/storage/bloom/v1/bloom_tokenizer.go | 3 ++ pkg/storage/bloom/v1/bloom_tokenizer_test.go | 39 ++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/pkg/storage/bloom/v1/bloom_tokenizer.go b/pkg/storage/bloom/v1/bloom_tokenizer.go index 7d2ba41b7f49c..f0365f7cc78d8 100644 --- a/pkg/storage/bloom/v1/bloom_tokenizer.go +++ b/pkg/storage/bloom/v1/bloom_tokenizer.go @@ -97,11 +97,14 @@ func (bt *BloomTokenizer) newBloom() *Bloom { } } +// Populates a bloom filter(s) with the tokens from the given chunks. +// Called once per series func (bt *BloomTokenizer) Populate( blooms SizedIterator[*Bloom], chks Iterator[ChunkRefWithIter], ch chan *BloomCreation, ) { + clear(bt.cache) // MUST always clear the cache before starting a new series var next bool // All but the last bloom are considered full -- send back unaltered diff --git a/pkg/storage/bloom/v1/bloom_tokenizer_test.go b/pkg/storage/bloom/v1/bloom_tokenizer_test.go index 7685faaa92427..ad7040ddf431e 100644 --- a/pkg/storage/bloom/v1/bloom_tokenizer_test.go +++ b/pkg/storage/bloom/v1/bloom_tokenizer_test.go @@ -288,6 +288,45 @@ func BenchmarkPopulateSeriesWithBloom(b *testing.B) { } } +func TestTokenizerClearsCacheBetweenPopulateCalls(t *testing.T) { + bt := NewBloomTokenizer(DefaultNGramLength, DefaultNGramSkip, 0, NewMetrics(nil)) + line := "foobarbazz" + var blooms []*Bloom + + for i := 0; i < 2; i++ { + ch := make(chan *BloomCreation) + itr, err := chunkRefItrFromLines(line) + require.NoError(t, err) + go bt.Populate( + NewEmptyIter[*Bloom](), + NewSliceIter([]ChunkRefWithIter{ + { + Ref: ChunkRef{}, + Itr: itr, + }, + }), + ch, + ) + var ct int + for created := range ch { + blooms = append(blooms, created.Bloom) + ct++ + } + // ensure we created one bloom for each call + require.Equal(t, 1, ct) + + } + + for _, bloom := range blooms { + toks := bt.lineTokenizer.Tokens(line) + for toks.Next() { + token := toks.At() + require.True(t, bloom.Test(token)) + } + require.NoError(t, toks.Err()) + } +} + func BenchmarkMapClear(b *testing.B) { bt := NewBloomTokenizer(DefaultNGramLength, DefaultNGramSkip, 0, metrics) for i := 0; i < b.N; i++ {