diff --git a/pkg/storage/bloom/v1/bloom_tokenizer.go b/pkg/storage/bloom/v1/bloom_tokenizer.go index 7060052438190..e2659180b4eac 100644 --- a/pkg/storage/bloom/v1/bloom_tokenizer.go +++ b/pkg/storage/bloom/v1/bloom_tokenizer.go @@ -11,7 +11,6 @@ import ( "github.com/grafana/loki/pkg/chunkenc" "github.com/grafana/loki/pkg/logproto" "github.com/grafana/loki/pkg/logql/log" - "github.com/grafana/loki/pkg/storage/bloom/v1/filter" "github.com/grafana/loki/pkg/storage/chunk" util_log "github.com/grafana/loki/pkg/util/log" //"github.com/grafana/loki/tools/tsdb/helpers" @@ -69,20 +68,26 @@ func clearCache(cache map[string]interface{}) { } } -func (bt *BloomTokenizer) PopulateSBF(sbf *filter.ScalableBloomFilter, chunks []chunk.Chunk) { +func (bt *BloomTokenizer) PopulateSeriesWithBloom(seriesWithBloom *SeriesWithBloom, chunks []chunk.Chunk) { clearCache(bt.cache) for idx := range chunks { lc := chunks[idx].Data.(*chunkenc.Facade).LokiChunk() bt.chunkIDTokenizer.Reinit(chunks[idx].ChunkRef) // TODO: error handling - itr, _ := lc.Iterator( + itr, err := lc.Iterator( context.Background(), time.Unix(0, 0), // TODO: Parameterize/better handle the timestamps? time.Unix(0, math.MaxInt64), logproto.FORWARD, log.NewNoopPipeline().ForStream(chunks[idx].Metric), ) + if err != nil { + level.Info(util_log.Logger).Log("chunk iterator cannot be created") + return + } + + defer itr.Close() for itr.Next() && itr.Error() == nil { toks := bt.chunkIDTokenizer.Tokens(itr.Entry().Line) @@ -94,7 +99,7 @@ func (bt *BloomTokenizer) PopulateSBF(sbf *filter.ScalableBloomFilter, chunks [] if !found { bt.cache[str] = nil - sbf.TestAndAdd(tok.Key) + seriesWithBloom.Bloom.ScalableBloomFilter.TestAndAdd(tok.Key) if len(bt.cache) > 150000 { // While crude, this has proven efficient in performance testing. This speaks to the similarity in log lines near each other clearCache(bt.cache) @@ -103,6 +108,11 @@ func (bt *BloomTokenizer) PopulateSBF(sbf *filter.ScalableBloomFilter, chunks [] } } } + seriesWithBloom.Series.Chunks = append(seriesWithBloom.Series.Chunks, ChunkRef{ + Start: chunks[idx].From, + End: chunks[idx].Through, + Checksum: chunks[idx].Checksum, + }) } // for each chunk } diff --git a/tools/tsdb/bloom-tester/lib.go b/tools/tsdb/bloom-tester/lib.go index 2a40f162a8995..18ceb14b6b611 100644 --- a/tools/tsdb/bloom-tester/lib.go +++ b/tools/tsdb/bloom-tester/lib.go @@ -350,7 +350,17 @@ func analyze(metrics *Metrics, sampler Sampler, indexShipper indexshipper.IndexS startTime := time.Now().UnixMilli() sbf := experiment.bloom() - bloomTokenizer.PopulateSBF(sbf, got) + bloom := bt.Bloom{ + ScalableBloomFilter: *sbf, + } + series := bt.Series{ + Fingerprint: fp, + } + swb := bt.SeriesWithBloom{ + Bloom: &bloom, + Series: &series, + } + bloomTokenizer.PopulateSeriesWithBloom(&swb, got) endTime := time.Now().UnixMilli() if len(got) > 0 { @@ -361,7 +371,7 @@ func analyze(metrics *Metrics, sampler Sampler, indexShipper indexshipper.IndexS float64(estimatedCount(sbf.Capacity(), sbf.FillRatio())), ) - writeSBF(sbf, + writeSBF(&swb.Bloom.ScalableBloomFilter, os.Getenv("DIR"), fmt.Sprint(bucketPrefix, experiment.name), os.Getenv("BUCKET"),