diff --git a/pkg/storage/bloom/v1/bloom_tokenizer.go b/pkg/storage/bloom/v1/bloom_tokenizer.go
index 8581b75416e40..49f92d51ac1d5 100644
--- a/pkg/storage/bloom/v1/bloom_tokenizer.go
+++ b/pkg/storage/bloom/v1/bloom_tokenizer.go
@@ -121,18 +121,23 @@ func (bt *BloomTokenizer) PopulateSeriesWithBloom(seriesWithBloom *SeriesWithBlo
 // TokenizeLine returns a slice of tokens for the given line, based on the current value of the tokenizer
 // If the tokenizer has a skip value, then the line will be tokenized multiple times,
 // starting at the beginning of the line, with "skip" number of iterations, offset by one each time
-func (bt *BloomTokenizer) TokenizeLine(line string) []Token {
-	tokens := make([]Token, 0, 100)
+// Each offset is kept as a separate slice of tokens, and all are returned in a slice of slices
+func (bt *BloomTokenizer) TokenizeLine(line string) [][]Token {
+	allTokens := make([][]Token, 0, 10)
 	if len(line) >= bt.lineTokenizer.GetMin() && len(line) >= bt.lineTokenizer.GetSkip() {
 		for i := 0; i <= bt.lineTokenizer.GetSkip(); i++ {
-			tmp := bt.lineTokenizer.Tokens(line[i:])
-			for _, token := range tmp {
+			tmpTokens := make([]Token, 0, 100)
+			tokens := bt.lineTokenizer.Tokens(line[i:])
+			for _, token := range tokens {
 				tmpToken := Token{}
 				tmpToken.Key = make([]byte, len(token.Key))
 				copy(tmpToken.Key, token.Key)
-				tokens = append(tokens, tmpToken)
+				tmpTokens = append(tmpTokens, tmpToken)
+			}
+			if len(tokens) > 0 {
+				allTokens = append(allTokens, tmpTokens)
 			}
 		}
 	}
-	return tokens
+	return allTokens
 }
diff --git a/pkg/storage/bloom/v1/bloom_tokenizer_test.go b/pkg/storage/bloom/v1/bloom_tokenizer_test.go
index 015282975f6f8..33c9770fe403e 100644
--- a/pkg/storage/bloom/v1/bloom_tokenizer_test.go
+++ b/pkg/storage/bloom/v1/bloom_tokenizer_test.go
@@ -49,28 +49,28 @@ func TestDefaultTokenizeLine(t *testing.T) {
 	for _, tc := range []struct {
 		desc  string
 		input string
-		exp   []Token
+		exp   [][]Token
 	}{
 		{
 			desc:  "empty",
 			input: "",
-			exp:   []Token{},
+			exp:   [][]Token{},
 		},
 		{
 			desc:  "single char",
 			input: "a",
-			exp:   []Token{},
+			exp:   [][]Token{},
 		},
 		{
 			desc:  "four chars",
 			input: "abcd",
-			exp: []Token{
-				{Key: []byte("abcd")}},
+			exp: [][]Token{
+				{{Key: []byte("abcd")}}},
 		},
 		{
 			desc:  "uuid partial",
 			input: "2b1a5e46-36a2-4",
-			exp: []Token{
+			exp: [][]Token{{
 				{Key: []byte("2b1a")},
 				{Key: []byte("b1a5")},
 				{Key: []byte("1a5e")},
@@ -82,7 +82,7 @@ func TestDefaultTokenizeLine(t *testing.T) {
 				{Key: []byte("-36a")},
 				{Key: []byte("36a2")},
 				{Key: []byte("6a2-")},
-				{Key: []byte("a2-4")},
+				{Key: []byte("a2-4")}},
 			},
 		},
 	} {
@@ -99,37 +99,37 @@ func TestTokenizeLineWithSkips(t *testing.T) {
 	for _, tc := range []struct {
 		desc  string
 		input string
-		exp   []Token
+		exp   [][]Token
 	}{
 		{
 			desc:  "empty",
 			input: "",
-			exp:   []Token{},
+			exp:   [][]Token{},
 		},
 		{
 			desc:  "single char",
 			input: "a",
-			exp:   []Token{},
+			exp:   [][]Token{},
 		},
 		{
 			desc:  "four chars",
 			input: "abcd",
-			exp: []Token{
-				{Key: []byte("abcd")}},
+			exp: [][]Token{{
+				{Key: []byte("abcd")}}},
 		},
 		{
 			desc:  "longer string",
 			input: "abcdefghijkl",
-			exp: []Token{
-				{Key: []byte("abcd")},
-				{Key: []byte("defg")},
-				{Key: []byte("ghij")},
-				{Key: []byte("bcde")},
-				{Key: []byte("efgh")},
-				{Key: []byte("hijk")},
-				{Key: []byte("cdef")},
-				{Key: []byte("fghi")},
-				{Key: []byte("ijkl")},
+			exp: [][]Token{
+				{{Key: []byte("abcd")},
+					{Key: []byte("defg")},
+					{Key: []byte("ghij")}},
+				{{Key: []byte("bcde")},
+					{Key: []byte("efgh")},
+					{Key: []byte("hijk")}},
+				{{Key: []byte("cdef")},
+					{Key: []byte("fghi")},
+					{Key: []byte("ijkl")}},
 			},
 		},
 	} {
@@ -178,7 +178,7 @@ func TestPopulateSeriesWithBloom(t *testing.T) {
 	bt.PopulateSeriesWithBloom(&swb, chunks)
 	tokens := bt.TokenizeLine(testLine)
-	for _, token := range tokens {
+	for _, token := range tokens[0] {
 		require.True(t, swb.Bloom.Test(token.Key))
 	}
 }
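Reviewer note: the change above keeps each skip offset's n-grams in their own inner slice instead of flattening them into one list. Below is a minimal standalone sketch of that grouping (illustrative only, not the library API; tokenizeWithSkips, n, and skip are hypothetical names). It assumes a 4-byte n-gram tokenizer with skip=2, matching the "longer string" case in TestTokenizeLineWithSkips: each pass starts one byte later, and within a pass the window advances skip+1 bytes.

package main

import "fmt"

// tokenizeWithSkips mimics the shape of the new TokenizeLine: one inner slice
// of n-grams per starting offset 0..skip, with the same min/skip length guard
// as in the diff above. (Hypothetical helper, for illustration only.)
func tokenizeWithSkips(line string, n, skip int) [][]string {
	all := make([][]string, 0, skip+1)
	if len(line) < n || len(line) < skip {
		return all
	}
	for i := 0; i <= skip; i++ {
		var group []string
		// Within a pass the window advances skip+1 bytes.
		for j := i; j+n <= len(line); j += skip + 1 {
			group = append(group, line[j:j+n])
		}
		if len(group) > 0 {
			all = append(all, group)
		}
	}
	return all
}

func main() {
	fmt.Println(tokenizeWithSkips("abcdefghijkl", 4, 2))
	// Prints: [[abcd defg ghij] [bcde efgh hijk] [cdef fghi ijkl]]
}

That grouping is also why the updated TestPopulateSeriesWithBloom can index tokens[0]: every inner slice comes from a single tokenization pass, and all passes were already added to the bloom, so checking the first pass alone is enough for that assertion.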