-
Notifications
You must be signed in to change notification settings - Fork 3.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[bloom-compactor] downloading chunks in batches #11649
Changes from all commits
4dcf3c8
e90f8d5
a97a9e4
3ef9d53
ec446f1
6c8131a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,7 +22,7 @@ import ( | |
) | ||
|
||
type compactorTokenizer interface { | ||
PopulateSeriesWithBloom(bloom *v1.SeriesWithBloom, chunks []chunk.Chunk) error | ||
PopulateSeriesWithBloom(bloom *v1.SeriesWithBloom, chunkBatchesIterator v1.Iterator[[]chunk.Chunk]) error | ||
vlad-diachenko marked this conversation as resolved.
Show resolved
Hide resolved
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: I think we can avoid modifying the argument of this method as well as its implementation at bloom_tokenizer.go. We can call PopulateSeriesWithBloom for each call of the iterator Next(). E.g. on batchesIterator, err := newChunkBatchesIterator(ctx, storeClient.chunk, chunkRefs, limits.BloomCompactorChunksBatchSize(job.tenantID))
if err != nil {
return fmt.Errorf("error creating chunks batches iterator: %w", err)
}
for batchesIterator.Next() {
if err := batchesIterator.Err() ...
chunks := batchesIterator.At()
err = bt.PopulateSeriesWithBloom(&bloomForChks, chunks)
}
err = bt.PopulateSeriesWithBloom(&bloomForChks, batchesIterator) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This would affect the metrics we expose inside So what about PopulateSeriesWithBloom receiving an iterator of chunks ( func (c *chunksBatchesIterator) loadNextBatch() error {
// Clamp the batch size to the number of chunks still pending, so the
// final batch may be smaller than c.batchSize.
batchSize := c.batchSize
chunksToDownloadCount := len(c.chunksToDownload)
if chunksToDownloadCount < batchSize {
batchSize = chunksToDownloadCount
}
// Slice off the next batch and shrink the pending list; both share the
// same backing array, so no copying happens here.
chunksToDownload := c.chunksToDownload[:batchSize]
c.chunksToDownload = c.chunksToDownload[batchSize:]
// Download the batch eagerly; on error the pending list has already been
// advanced, so callers must not retry this iterator after a failure.
newBatch, err := c.client.GetChunks(c.context, chunksToDownload)
if err != nil {
return err
}
c.currentBatch = newBatch
return nil
}
// Next advances the iterator to the next chunk, lazily downloading a new
// batch from the chunk client whenever the current batch is exhausted.
// It returns false when all chunks have been consumed or a download failed;
// callers must check Err() after Next() returns false.
func (c *chunksBatchesIterator) Next() bool {
	// Loop (rather than a single if) so that a client returning an empty
	// batch cannot cause an out-of-range access on c.currentBatch[0] below;
	// we keep loading until we have at least one chunk or run out of work.
	for len(c.currentBatch) == 0 {
		if len(c.chunksToDownload) == 0 {
			return false
		}
		if c.err = c.loadNextBatch(); c.err != nil {
			return false
		}
	}
	// Pop the first chunk from the current batch and expose it via At().
	c.currentChunk = c.currentBatch[0]
	c.currentBatch = c.currentBatch[1:]
	return true
}
// At returns the chunk most recently yielded by Next.
// It is only valid after Next has returned true.
func (c *chunksBatchesIterator) At() chunk.Chunk {
return c.currentChunk
} There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, it will do the same... but I do not believe that we win anything by changing it this way... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. wdyt? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure, not a strong opinion! |
||
} | ||
|
||
type chunkClient interface { | ||
|
@@ -86,7 +86,7 @@ func makeChunkRefs(chksMetas []tsdbindex.ChunkMeta, tenant string, fp model.Fing | |
return chunkRefs | ||
} | ||
|
||
func buildBloomFromSeries(seriesMeta seriesMeta, fpRate float64, tokenizer compactorTokenizer, chunks []chunk.Chunk) (v1.SeriesWithBloom, error) { | ||
func buildBloomFromSeries(seriesMeta seriesMeta, fpRate float64, tokenizer compactorTokenizer, chunks v1.Iterator[[]chunk.Chunk]) (v1.SeriesWithBloom, error) { | ||
// Create a bloom for this series | ||
bloomForChks := v1.SeriesWithBloom{ | ||
Series: &v1.Series{ | ||
|
@@ -155,21 +155,20 @@ func createLocalDirName(workingDir string, job Job) string { | |
} | ||
|
||
// Compacts given list of chunks, uploads them to storage and returns a list of bloomBlocks | ||
func compactNewChunks( | ||
ctx context.Context, | ||
func compactNewChunks(ctx context.Context, | ||
logger log.Logger, | ||
job Job, | ||
fpRate float64, | ||
bt compactorTokenizer, | ||
storeClient chunkClient, | ||
builder blockBuilder, | ||
limits Limits, | ||
) (bloomshipper.Block, error) { | ||
// Ensure the context has not been canceled (ie. compactor shutdown has been triggered). | ||
if err := ctx.Err(); err != nil { | ||
return bloomshipper.Block{}, err | ||
} | ||
|
||
bloomIter := newLazyBloomBuilder(ctx, job, storeClient, bt, fpRate, logger) | ||
bloomIter := newLazyBloomBuilder(ctx, job, storeClient, bt, logger, limits) | ||
|
||
// Build and upload bloomBlock to storage | ||
block, err := buildBlockFromBlooms(ctx, logger, builder, bloomIter, job) | ||
|
@@ -182,13 +181,14 @@ func compactNewChunks( | |
} | ||
|
||
type lazyBloomBuilder struct { | ||
ctx context.Context | ||
metas v1.Iterator[seriesMeta] | ||
tenant string | ||
client chunkClient | ||
bt compactorTokenizer | ||
fpRate float64 | ||
logger log.Logger | ||
ctx context.Context | ||
metas v1.Iterator[seriesMeta] | ||
tenant string | ||
client chunkClient | ||
bt compactorTokenizer | ||
fpRate float64 | ||
logger log.Logger | ||
chunksBatchSize int | ||
|
||
cur v1.SeriesWithBloom // returned by At() | ||
err error // returned by Err() | ||
|
@@ -198,15 +198,16 @@ type lazyBloomBuilder struct { | |
// which are used by the blockBuilder to write a bloom block. | ||
// We use an iterator to avoid loading all blooms into memory first, before | ||
// building the block. | ||
func newLazyBloomBuilder(ctx context.Context, job Job, client chunkClient, bt compactorTokenizer, fpRate float64, logger log.Logger) *lazyBloomBuilder { | ||
func newLazyBloomBuilder(ctx context.Context, job Job, client chunkClient, bt compactorTokenizer, logger log.Logger, limits Limits) *lazyBloomBuilder { | ||
return &lazyBloomBuilder{ | ||
ctx: ctx, | ||
metas: v1.NewSliceIter(job.seriesMetas), | ||
client: client, | ||
tenant: job.tenantID, | ||
bt: bt, | ||
fpRate: fpRate, | ||
logger: logger, | ||
ctx: ctx, | ||
metas: v1.NewSliceIter(job.seriesMetas), | ||
client: client, | ||
tenant: job.tenantID, | ||
bt: bt, | ||
fpRate: limits.BloomFalsePositiveRate(job.tenantID), | ||
logger: logger, | ||
chunksBatchSize: limits.BloomCompactorChunksBatchSize(job.tenantID), | ||
} | ||
} | ||
|
||
|
@@ -218,20 +219,18 @@ func (it *lazyBloomBuilder) Next() bool { | |
} | ||
meta := it.metas.At() | ||
|
||
// Get chunks data from list of chunkRefs | ||
chks, err := it.client.GetChunks(it.ctx, makeChunkRefs(meta.chunkRefs, it.tenant, meta.seriesFP)) | ||
batchesIterator, err := newChunkBatchesIterator(it.ctx, it.client, makeChunkRefs(meta.chunkRefs, it.tenant, meta.seriesFP), it.chunksBatchSize) | ||
if err != nil { | ||
it.err = err | ||
it.cur = v1.SeriesWithBloom{} | ||
level.Debug(it.logger).Log("err in getChunks", err) | ||
level.Debug(it.logger).Log("msg", "err creating chunks batches iterator", "err", err) | ||
return false | ||
} | ||
|
||
it.cur, err = buildBloomFromSeries(meta, it.fpRate, it.bt, chks) | ||
it.cur, err = buildBloomFromSeries(meta, it.fpRate, it.bt, batchesIterator) | ||
if err != nil { | ||
it.err = err | ||
it.cur = v1.SeriesWithBloom{} | ||
level.Debug(it.logger).Log("err in buildBloomFromSeries", err) | ||
level.Debug(it.logger).Log("msg", "err in buildBloomFromSeries", "err", err) | ||
return false | ||
} | ||
return true | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
package bloomcompactor | ||
|
||
import ( | ||
"context" | ||
"errors" | ||
|
||
"github.com/grafana/loki/pkg/storage/chunk" | ||
) | ||
|
||
type chunksBatchesIterator struct { | ||
context context.Context | ||
client chunkClient | ||
chunksToDownload []chunk.Chunk | ||
batchSize int | ||
|
||
currentBatch []chunk.Chunk | ||
err error | ||
} | ||
|
||
func newChunkBatchesIterator(context context.Context, client chunkClient, chunksToDownload []chunk.Chunk, batchSize int) (*chunksBatchesIterator, error) { | ||
if batchSize <= 0 { | ||
vlad-diachenko marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return nil, errors.New("batchSize must be greater than 0") | ||
} | ||
return &chunksBatchesIterator{context: context, client: client, chunksToDownload: chunksToDownload, batchSize: batchSize}, nil | ||
} | ||
|
||
func (c *chunksBatchesIterator) Next() bool { | ||
if len(c.chunksToDownload) == 0 { | ||
return false | ||
} | ||
batchSize := c.batchSize | ||
chunksToDownloadCount := len(c.chunksToDownload) | ||
if chunksToDownloadCount < batchSize { | ||
batchSize = chunksToDownloadCount | ||
} | ||
chunksToDownload := c.chunksToDownload[:batchSize] | ||
c.chunksToDownload = c.chunksToDownload[batchSize:] | ||
c.currentBatch, c.err = c.client.GetChunks(c.context, chunksToDownload) | ||
return c.err == nil | ||
} | ||
|
||
func (c *chunksBatchesIterator) Err() error { | ||
return c.err | ||
} | ||
|
||
func (c *chunksBatchesIterator) At() []chunk.Chunk { | ||
return c.currentBatch | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
package bloomcompactor | ||
|
||
import ( | ||
"context" | ||
"errors" | ||
"testing" | ||
|
||
"github.com/stretchr/testify/require" | ||
|
||
"github.com/grafana/loki/pkg/storage/chunk" | ||
tsdbindex "github.com/grafana/loki/pkg/storage/stores/shipper/indexshipper/tsdb/index" | ||
) | ||
|
||
func Test_chunksBatchesIterator(t *testing.T) { | ||
tests := map[string]struct { | ||
batchSize int | ||
chunksToDownload []chunk.Chunk | ||
constructorError error | ||
|
||
hadNextCount int | ||
}{ | ||
"expected error if batch size is set to 0": { | ||
batchSize: 0, | ||
constructorError: errors.New("batchSize must be greater than 0"), | ||
}, | ||
"expected no error if there are no chunks": { | ||
hadNextCount: 0, | ||
batchSize: 10, | ||
}, | ||
"expected 1 call to the client": { | ||
chunksToDownload: createFakeChunks(10), | ||
hadNextCount: 1, | ||
batchSize: 20, | ||
}, | ||
"expected 1 call to the client(2)": { | ||
chunksToDownload: createFakeChunks(10), | ||
hadNextCount: 1, | ||
batchSize: 10, | ||
}, | ||
"expected 2 calls to the client": { | ||
chunksToDownload: createFakeChunks(10), | ||
hadNextCount: 2, | ||
batchSize: 6, | ||
}, | ||
"expected 10 calls to the client": { | ||
chunksToDownload: createFakeChunks(10), | ||
hadNextCount: 10, | ||
batchSize: 1, | ||
}, | ||
} | ||
for name, data := range tests { | ||
t.Run(name, func(t *testing.T) { | ||
client := &fakeClient{} | ||
iterator, err := newChunkBatchesIterator(context.Background(), client, data.chunksToDownload, data.batchSize) | ||
if data.constructorError != nil { | ||
require.Equal(t, err, data.constructorError) | ||
return | ||
} | ||
hadNextCount := 0 | ||
var downloadedChunks []chunk.Chunk | ||
for iterator.Next() { | ||
hadNextCount++ | ||
downloaded := iterator.At() | ||
downloadedChunks = append(downloadedChunks, downloaded...) | ||
require.LessOrEqual(t, len(downloaded), data.batchSize) | ||
} | ||
require.NoError(t, iterator.Err()) | ||
require.Equal(t, data.chunksToDownload, downloadedChunks) | ||
require.Equal(t, data.hadNextCount, client.callsCount) | ||
require.Equal(t, data.hadNextCount, hadNextCount) | ||
}) | ||
} | ||
} | ||
|
||
func createFakeChunks(count int) []chunk.Chunk { | ||
metas := make([]tsdbindex.ChunkMeta, 0, count) | ||
for i := 0; i < count; i++ { | ||
metas = append(metas, tsdbindex.ChunkMeta{ | ||
Checksum: uint32(i), | ||
MinTime: int64(i), | ||
MaxTime: int64(i + 100), | ||
KB: uint32(i * 100), | ||
Entries: uint32(i * 10), | ||
}) | ||
} | ||
return makeChunkRefs(metas, "fake", 0xFFFF) | ||
} | ||
|
||
type fakeClient struct { | ||
callsCount int | ||
} | ||
|
||
func (f *fakeClient) GetChunks(_ context.Context, chunks []chunk.Chunk) ([]chunk.Chunk, error) { | ||
f.callsCount++ | ||
return chunks, nil | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Same here
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would prefer to pass the limits because it's extendable...