chore: Clean up unused bloom filter related code #14183

Merged 6 commits on Sep 19, 2024

Changes from 1 commit:
Remove n-gram length and n-gram skip
Signed-off-by: Christian Haudum <[email protected]>
chaudum committed Sep 19, 2024
commit 7f6bf488ec0c4f6918d2f0e3b25915039ef20927
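
The caller-visible change: both public constructors drop their n-gram parameters. A before/after sketch of the two signatures, assembled from the diffs below:

// Before
func NewBlockOptions(enc compression.Encoding, nGramLength, nGramSkip, maxBlockSizeBytes, maxBloomSizeBytes uint64) BlockOptions
func NewBloomTokenizer(nGramLen, nGramSkip int, maxBloomSize int, metrics *Metrics, logger log.Logger) *BloomTokenizer

// After
func NewBlockOptions(enc compression.Encoding, maxBlockSizeBytes, maxBloomSizeBytes uint64) BlockOptions
func NewBloomTokenizer(maxBloomSize int, metrics *Metrics, logger log.Logger) *BloomTokenizer
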
4 changes: 1 addition & 3 deletions pkg/bloombuild/builder/builder.go
@@ -340,11 +340,9 @@ func (b *Builder) processTask(
 
     var (
         blockCt      int
-        nGramSize    = uint64(b.limits.BloomNGramLength(tenant))
-        nGramSkip    = uint64(b.limits.BloomNGramSkip(tenant))
         maxBlockSize = uint64(b.limits.BloomMaxBlockSize(tenant))
         maxBloomSize = uint64(b.limits.BloomMaxBloomSize(tenant))
-        blockOpts    = v1.NewBlockOptions(blockEnc, nGramSize, nGramSkip, maxBlockSize, maxBloomSize)
+        blockOpts    = v1.NewBlockOptions(blockEnc, maxBlockSize, maxBloomSize)
         created      []bloomshipper.Meta
         totalSeries  int
         bytesAdded   int

2 changes: 0 additions & 2 deletions pkg/bloombuild/builder/config.go
@@ -38,8 +38,6 @@ func (cfg *Config) Validate() error {
 
 type Limits interface {
     BloomBlockEncoding(tenantID string) string
-    BloomNGramLength(tenantID string) int
-    BloomNGramSkip(tenantID string) int
     BloomMaxBlockSize(tenantID string) int
     BloomMaxBloomSize(tenantID string) int
 }

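For downstream implementers, a minimal sketch of the trimmed interface in use. The type and return values here are hypothetical; only the method set comes from the diff above:

// hypotheticalLimits satisfies the trimmed Limits interface with static
// values; real implementations read these from per-tenant configuration.
type hypotheticalLimits struct{}

func (hypotheticalLimits) BloomBlockEncoding(tenantID string) string { return "snappy" }
func (hypotheticalLimits) BloomMaxBlockSize(tenantID string) int     { return 200 << 20 } // 200 MiB, example
func (hypotheticalLimits) BloomMaxBloomSize(tenantID string) int     { return 128 << 20 } // 128 MiB, example
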
2 changes: 0 additions & 2 deletions pkg/bloombuild/builder/spec.go
@@ -90,8 +90,6 @@ func NewSimpleBloomGenerator(
         reporter: reporter,
 
         tokenizer: v1.NewBloomTokenizer(
-            opts.Schema.NGramLen(),
-            opts.Schema.NGramSkip(),
             int(opts.UnencodedBlockOptions.MaxBloomSizeBytes),
             metrics,
             log.With(

8 changes: 4 additions & 4 deletions pkg/bloombuild/builder/spec_test.go
@@ -123,13 +123,13 @@ func TestSimpleBloomGenerator(t *testing.T) {
     }{
         {
             desc:       "SkipsIncompatibleSchemas",
-            fromSchema: v1.NewBlockOptions(enc, 3, 0, maxBlockSize, 0),
-            toSchema:   v1.NewBlockOptions(enc, 4, 0, maxBlockSize, 0),
+            fromSchema: v1.NewBlockOptions(enc, maxBlockSize, 0),
+            toSchema:   v1.NewBlockOptions(enc, maxBlockSize, 0),
         },
         {
             desc:       "CombinesBlocks",
-            fromSchema: v1.NewBlockOptions(enc, 4, 0, maxBlockSize, 0),
-            toSchema:   v1.NewBlockOptions(enc, 4, 0, maxBlockSize, 0),
+            fromSchema: v1.NewBlockOptions(enc, maxBlockSize, 0),
+            toSchema:   v1.NewBlockOptions(enc, maxBlockSize, 0),
         },
     } {
         t.Run(fmt.Sprintf("%s/%s", tc.desc, enc), func(t *testing.T) {

2 changes: 1 addition & 1 deletion pkg/bloombuild/planner/planner_test.go
@@ -188,7 +188,7 @@ func genBlock(ref bloomshipper.BlockRef) (bloomshipper.Block, error) {
     writer := v1.NewMemoryBlockWriter(indexBuf, bloomsBuf)
     reader := v1.NewByteReader(indexBuf, bloomsBuf)
 
-    blockOpts := v1.NewBlockOptions(compression.EncNone, 4, 1, 0, 0)
+    blockOpts := v1.NewBlockOptions(compression.EncNone, 0, 0)
 
     builder, err := v1.NewBlockBuilder(blockOpts, writer)
     if err != nil {

25 changes: 7 additions & 18 deletions pkg/storage/bloom/v1/bloom_tokenizer.go
@@ -23,9 +23,8 @@ type BloomTokenizer struct {
     metrics *Metrics
     logger  log.Logger
 
-    maxBloomSize  int // size in bytes
-    lineTokenizer *NGramTokenizer
-    cache         map[string]interface{}
+    maxBloomSize int // size in bytes
+    cache        map[string]interface{}
 }
 
 const cacheSize = 150000
@@ -37,25 +36,15 @@ const eightBits = 8
 // 1) The token slices generated must not be mutated externally
 // 2) The token slice must not be used after the next call to `Tokens()` as it will repopulate the slice.
 // 2) This is not thread safe.
-func NewBloomTokenizer(nGramLen, nGramSkip int, maxBloomSize int, metrics *Metrics, logger log.Logger) *BloomTokenizer {
-    level.Info(logger).Log("msg", "create new bloom tokenizer", "ngram length", nGramLen, "ngram skip", nGramSkip)
+func NewBloomTokenizer(maxBloomSize int, metrics *Metrics, logger log.Logger) *BloomTokenizer {
     return &BloomTokenizer{
-        metrics:       metrics,
-        logger:        logger,
-        cache:         make(map[string]interface{}, cacheSize),
-        lineTokenizer: NewNGramTokenizer(nGramLen, nGramSkip),
-        maxBloomSize:  maxBloomSize,
+        metrics:      metrics,
+        logger:       logger,
+        cache:        make(map[string]interface{}, cacheSize),
+        maxBloomSize: maxBloomSize,
     }
 }
 
-func (bt *BloomTokenizer) N() uint64 {
-    return uint64(bt.lineTokenizer.N())
-}
-
-func (bt *BloomTokenizer) SkipFactor() uint64 {
-    return uint64(bt.lineTokenizer.SkipFactor())
-}
-
 // prefixedToken returns a byte slice with sufficient capacity for a chunk-ref prefixed token
 // of specific ngram length, along with the length of the prefix.
 // It ensures enough capacity for the prefix and the token so additional tokens can be created

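With the line tokenizer gone from the struct, constructing a BloomTokenizer takes only the bloom size cap, metrics, and a logger. A usage sketch, assuming Loki's v3 module path and go-kit's logger; the 128 MiB cap is an arbitrary example, not a default:

import (
    "github.com/go-kit/log"

    v1 "github.com/grafana/loki/v3/pkg/storage/bloom/v1"
)

func newTokenizer() *v1.BloomTokenizer {
    return v1.NewBloomTokenizer(
        128<<20,            // maxBloomSize in bytes (example value)
        v1.NewMetrics(nil), // nil registerer, as used in this PR's tests
        log.NewNopLogger(),
    )
}
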
33 changes: 7 additions & 26 deletions pkg/storage/bloom/v1/bloom_tokenizer_test.go
@@ -28,11 +28,6 @@ import (
     "github.com/prometheus/client_golang/prometheus"
 )
 
-const (
-    DefaultNGramLength = 4
-    DefaultNGramSkip   = 0
-)
-
 var (
     four    = NewNGramTokenizer(4, 0)
     metrics = NewMetrics(prometheus.DefaultRegisterer)
@@ -82,24 +77,10 @@ func TestPrefixedKeyCreation(t *testing.T) {
     }
 }
 
-func TestSetLineTokenizer(t *testing.T) {
-    t.Parallel()
-    bt := NewBloomTokenizer(DefaultNGramLength, DefaultNGramSkip, 0, metrics, logger.NewNopLogger())
-
-    // Validate defaults
-    require.Equal(t, bt.lineTokenizer.N(), DefaultNGramLength)
-    require.Equal(t, bt.lineTokenizer.SkipFactor(), DefaultNGramSkip)
-
-    // Set new tokenizer, and validate against that
-    bt.lineTokenizer = NewNGramTokenizer(6, 7)
-    require.Equal(t, bt.lineTokenizer.N(), 6)
-    require.Equal(t, bt.lineTokenizer.SkipFactor(), 7)
-}
-
 func TestTokenizerPopulate(t *testing.T) {
     t.Parallel()
     var testLine = "this is a log line"
-    bt := NewBloomTokenizer(DefaultNGramLength, DefaultNGramSkip, 0, metrics, logger.NewNopLogger())
+    bt := NewBloomTokenizer(0, metrics, logger.NewNopLogger())
 
     metadata := push.LabelsAdapter{
         {Name: "pod", Value: "loki-1"},
@@ -144,7 +125,7 @@ func TestTokenizerPopulate(t *testing.T) {
 
 func TestBloomTokenizerPopulateWithoutPreexistingBloom(t *testing.T) {
     var testLine = "this is a log line"
-    bt := NewBloomTokenizer(DefaultNGramLength, DefaultNGramSkip, 0, metrics, logger.NewNopLogger())
+    bt := NewBloomTokenizer(0, metrics, logger.NewNopLogger())
 
     metadata := push.LabelsAdapter{
         {Name: "pod", Value: "loki-1"},
@@ -221,7 +202,7 @@ func randomStr(ln int) string {
 
 func TestTokenizerPopulateWontExceedMaxSize(t *testing.T) {
     maxSize := 4 << 10
-    bt := NewBloomTokenizer(DefaultNGramLength, DefaultNGramSkip, maxSize, NewMetrics(nil), logger.NewNopLogger())
+    bt := NewBloomTokenizer(maxSize, NewMetrics(nil), logger.NewNopLogger())
     ch := make(chan *BloomCreation)
 
     metadata := make([]push.LabelsAdapter, 0, 4<<10)
@@ -269,7 +250,7 @@ func populateAndConsumeBloom(
 func BenchmarkPopulateSeriesWithBloom(b *testing.B) {
     for i := 0; i < b.N; i++ {
         var testLine = lorem + lorem + lorem
-        bt := NewBloomTokenizer(DefaultNGramLength, DefaultNGramSkip, 0, metrics, logger.NewNopLogger())
+        bt := NewBloomTokenizer(0, metrics, logger.NewNopLogger())
 
         sbf := filter.NewScalableBloomFilter(1024, 0.01, 0.8)
 
@@ -302,7 +283,7 @@ func BenchmarkPopulateSeriesWithBloom(b *testing.B) {
 }
 
 func TestTokenizerClearsCacheBetweenPopulateCalls(t *testing.T) {
-    bt := NewBloomTokenizer(DefaultNGramLength, DefaultNGramSkip, 0, NewMetrics(nil), logger.NewNopLogger())
+    bt := NewBloomTokenizer(0, NewMetrics(nil), logger.NewNopLogger())
     md := push.LabelsAdapter{
         {Name: "trace_id", Value: "3bef3c91643bde73"},
     }
@@ -340,7 +321,7 @@ func TestTokenizerClearsCacheBetweenPopulateCalls(t *testing.T) {
 }
 
 func BenchmarkMapClear(b *testing.B) {
-    bt := NewBloomTokenizer(DefaultNGramLength, DefaultNGramSkip, 0, metrics, logger.NewNopLogger())
+    bt := NewBloomTokenizer(0, metrics, logger.NewNopLogger())
     for i := 0; i < b.N; i++ {
         for k := 0; k < cacheSize; k++ {
             bt.cache[fmt.Sprint(k)] = k
@@ -351,7 +332,7 @@ func BenchmarkMapClear(b *testing.B) {
 }
 
 func BenchmarkNewMap(b *testing.B) {
-    bt := NewBloomTokenizer(DefaultNGramLength, DefaultNGramSkip, 0, metrics, logger.NewNopLogger())
+    bt := NewBloomTokenizer(0, metrics, logger.NewNopLogger())
     for i := 0; i < b.N; i++ {
         for k := 0; k < cacheSize; k++ {
             bt.cache[fmt.Sprint(k)] = k

8 changes: 3 additions & 5 deletions pkg/storage/bloom/v1/builder.go
@@ -66,12 +66,10 @@ func (b BlockOptions) Encode(enc *encoding.Encbuf) {
     enc.PutBE64(b.BlockSize)
 }
 
-func NewBlockOptions(enc compression.Encoding, nGramLength, nGramSkip, maxBlockSizeBytes, maxBloomSizeBytes uint64) BlockOptions {
+func NewBlockOptions(enc compression.Encoding, maxBlockSizeBytes, maxBloomSizeBytes uint64) BlockOptions {
     opts := NewBlockOptionsFromSchema(Schema{
-        version:     CurrentSchemaVersion,
-        encoding:    enc,
-        nGramLength: nGramLength,
-        nGramSkip:   nGramSkip,
+        version:  CurrentSchemaVersion,
+        encoding: enc,
     })
     opts.BlockSize = maxBlockSizeBytes
     opts.UnencodedBlockOptions.MaxBloomSizeBytes = maxBloomSizeBytes

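Call sites now pass just the encoding and the two size limits; the schema version is pinned internally to CurrentSchemaVersion. A hedged sketch of the new call; import paths are assumed from Loki's v3 module layout and the sizes are illustrative:

import (
    "github.com/grafana/loki/v3/pkg/compression"

    v1 "github.com/grafana/loki/v3/pkg/storage/bloom/v1"
)

// Illustrative limits; production values come from per-tenant Limits.
var blockOpts = v1.NewBlockOptions(compression.EncSnappy, 200<<20, 128<<20)
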
24 changes: 8 additions & 16 deletions pkg/storage/bloom/v1/builder_test.go
@@ -27,10 +27,8 @@ func TestBlockOptions_RoundTrip(t *testing.T) {
     t.Parallel()
     opts := BlockOptions{
         Schema: Schema{
-            version:     CurrentSchemaVersion,
-            encoding:    compression.EncSnappy,
-            nGramLength: 10,
-            nGramSkip:   2,
+            version:  CurrentSchemaVersion,
+            encoding: compression.EncSnappy,
         },
         SeriesPageSize: 100,
         BloomPageSize:  10 << 10,
@@ -87,10 +85,8 @@ func TestBlockBuilder_RoundTrip(t *testing.T) {
         t.Run(desc, func(t *testing.T) {
             blockOpts := BlockOptions{
                 Schema: Schema{
-                    version:     CurrentSchemaVersion,
-                    encoding:    enc,
-                    nGramLength: 10,
-                    nGramSkip:   2,
+                    version:  CurrentSchemaVersion,
+                    encoding: enc,
                 },
                 SeriesPageSize: 100,
                 BloomPageSize:  10 << 10,
@@ -398,10 +394,8 @@ func TestBlockReset(t *testing.T) {
     reader := NewByteReader(indexBuf, bloomsBuf)
 
     schema := Schema{
-        version:     CurrentSchemaVersion,
-        encoding:    compression.EncSnappy,
-        nGramLength: 10,
-        nGramSkip:   2,
+        version:  CurrentSchemaVersion,
+        encoding: compression.EncSnappy,
     }
 
     builder, err := NewBlockBuilder(
@@ -456,10 +450,8 @@ func TestMergeBuilder_Roundtrip(t *testing.T) {
 
     blockOpts := BlockOptions{
         Schema: Schema{
-            version:     CurrentSchemaVersion,
-            encoding:    compression.EncSnappy, // test with different encodings?
-            nGramLength: 4,                     // needs to match values from MkBasicSeriesWithBlooms
-            nGramSkip:   0,                     // needs to match values from MkBasicSeriesWithBlooms
+            version:  CurrentSchemaVersion,
+            encoding: compression.EncSnappy, // test with different encodings?
         },
         SeriesPageSize: 100,
         BloomPageSize:  10 << 10,

6 changes: 2 additions & 4 deletions pkg/storage/bloom/v1/fuse_test.go
@@ -154,10 +154,8 @@ func TestFuseMultiPage(t *testing.T) {
     builder, err := NewBlockBuilder(
         BlockOptions{
             Schema: Schema{
-                version:     CurrentSchemaVersion,
-                encoding:    compression.EncSnappy,
-                nGramLength: 3, // we test trigrams
-                nGramSkip:   0,
+                version:  CurrentSchemaVersion,
+                encoding: compression.EncSnappy,
             },
             SeriesPageSize: 100,
             BloomPageSize:  10, // So we force one bloom per page

31 changes: 11 additions & 20 deletions pkg/storage/bloom/v1/schema.go
@@ -38,22 +38,19 @@
 )
 
 type Schema struct {
-    version                Version
-    encoding               compression.Encoding
-    nGramLength, nGramSkip uint64
+    version  Version
+    encoding compression.Encoding
 }
 
 func NewSchema() Schema {
     return Schema{
-        version:     CurrentSchemaVersion,
-        encoding:    compression.EncNone,
-        nGramLength: 0,
-        nGramSkip:   0,
+        version:  CurrentSchemaVersion,
+        encoding: compression.EncNone,
     }
 }
 
 func (s Schema) String() string {
-    return fmt.Sprintf("%s,encoding=%s,ngram=%d,skip=%d", s.version, s.encoding, s.nGramLength, s.nGramSkip)
+    return fmt.Sprintf("%s,encoding=%s", s.version, s.encoding)
 }
 
 func (s Schema) Compatible(other Schema) bool {
@@ -64,14 +61,6 @@ func (s Schema) Version() Version {
     return s.version
 }
 
-func (s Schema) NGramLen() int {
-    return int(s.nGramLength)
-}
-
-func (s Schema) NGramSkip() int {
-    return int(s.nGramSkip)
-}
-
 // byte length
 func (s Schema) Len() int {
     // magic number + version + encoding + ngram length + ngram skip
@@ -91,8 +80,9 @@ func (s *Schema) Encode(enc *encoding.Encbuf) {
     enc.PutBE32(magicNumber)
     enc.PutByte(byte(s.version))
     enc.PutByte(byte(s.encoding))
-    enc.PutBE64(s.nGramLength)
-    enc.PutBE64(s.nGramSkip)
+    // kept to keep compatibility
+    enc.PutBE64(0) // previously n-gram length
+    enc.PutBE64(0) // previously n-gram skip
 
 }
@@ -123,8 +113,9 @@ func (s *Schema) Decode(dec *encoding.Decbuf) error {
         return errors.Wrap(err, "parsing encoding")
     }
 
-    s.nGramLength = dec.Be64()
-    s.nGramSkip = dec.Be64()
+    // kept to keep compatibility
+    _ = dec.Be64() // previously n-gram length
+    _ = dec.Be64() // previously n-gram skip
 
     return dec.Err()
 }

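Note that the wire format is unchanged: Encode still writes two big-endian 64-bit fields where the n-gram parameters used to live, now zeroed, and Decode reads and discards them, so headers written before and after this commit keep the same length (4 + 1 + 1 + 8 + 8 = 22 bytes). A sketch of the layout implied by the Encode above; this struct is illustrative, not a type from the codebase:

// schemaHeader mirrors the byte layout produced by Schema.Encode.
type schemaHeader struct {
    Magic     uint32 // magicNumber, big-endian
    Version   byte   // schema version
    Encoding  byte   // block compression encoding
    Reserved1 uint64 // previously n-gram length, always 0 as of this commit
    Reserved2 uint64 // previously n-gram skip, always 0 as of this commit
}
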
6 changes: 2 additions & 4 deletions pkg/storage/bloom/v1/test_util.go
@@ -29,10 +29,8 @@ func MakeBlock(t testing.TB, nth int, fromFp, throughFp model.Fingerprint, fromT
     builder, err := NewBlockBuilder(
         BlockOptions{
             Schema: Schema{
-                version:     CurrentSchemaVersion,
-                encoding:    compression.EncSnappy,
-                nGramLength: 4, // see DefaultNGramLength in bloom_tokenizer_test.go
-                nGramSkip:   0, // see DefaultNGramSkip in bloom_tokenizer_test.go
+                version:  CurrentSchemaVersion,
+                encoding: compression.EncSnappy,
             },
             SeriesPageSize: 100,
             BloomPageSize:  10 << 10,

6 changes: 2 additions & 4 deletions pkg/storage/bloom/v1/versioned_builder_test.go
@@ -17,10 +17,8 @@ import (
 func smallBlockOpts(v Version, enc compression.Encoding) BlockOptions {
     return BlockOptions{
         Schema: Schema{
-            version:     v,
-            encoding:    enc,
-            nGramLength: 4,
-            nGramSkip:   0,
+            version:  v,
+            encoding: enc,
         },
         SeriesPageSize: 100,
         BloomPageSize:  2 << 10,