Tokenize line with a skip tokenizer properly
paul1r committed Nov 3, 2023
1 parent 2197d35 commit 2eec275
Showing 2 changed files with 64 additions and 2 deletions.
17 changes: 16 additions & 1 deletion pkg/storage/bloom/v1/bloom_tokenizer.go
@@ -118,6 +118,21 @@ func (bt *BloomTokenizer) PopulateSeriesWithBloom(seriesWithBloom *SeriesWithBlo
} // for each chunk
}

// TokenizeLine returns a slice of tokens for the given line, based on the current value of the tokenizer.
// If the tokenizer has a skip value of s, the line is tokenized s+1 times: the first pass starts at the
// beginning of the line, and each subsequent pass starts one byte further in.
func (bt *BloomTokenizer) TokenizeLine(line string) []Token {
    tokens := make([]Token, 0, 100)
    if len(line) >= bt.lineTokenizer.GetMin() && len(line) >= bt.lineTokenizer.GetSkip() {
        for i := 0; i <= bt.lineTokenizer.GetSkip(); i++ {
            tmp := bt.lineTokenizer.Tokens(line[i:])
            for _, token := range tmp {
                // Deep-copy each key so the returned tokens do not alias the
                // tokenizer's internal buffers.
                tmpToken := Token{}
                tmpToken.Key = make([]byte, len(token.Key))
                copy(tmpToken.Key, token.Key)
                tokens = append(tokens, tmpToken)
            }
        }
    }
    return tokens
}
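
For context, a minimal usage sketch of the new behavior — hypothetical, not part of the commit. It assumes Loki's module path for the v1 bloom package and that DefaultNGramLength is 4, as the 4-byte expected tokens in TestTokenizeLineWithSkips below imply.

    package main

    import (
        "fmt"

        "github.com/prometheus/client_golang/prometheus"

        v1 "github.com/grafana/loki/pkg/storage/bloom/v1"
    )

    func main() {
        bt, _ := v1.NewBloomTokenizer(prometheus.DefaultRegisterer)
        // Same configuration as the skip test below: 4-grams with a skip of 2.
        bt.SetLineTokenizer(v1.NewNGramTokenizer(v1.DefaultNGramLength, v1.DefaultNGramLength+1, 2))

        // With 4-grams and skip=2, TokenizeLine makes three passes over the line:
        //   pass 0 (offset 0): abcd defg ghij
        //   pass 1 (offset 1): bcde efgh hijk
        //   pass 2 (offset 2): cdef fghi ijkl
        for _, tok := range bt.TokenizeLine("abcdefghijkl") {
            fmt.Println(string(tok.Key))
        }
    }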
49 changes: 48 additions & 1 deletion pkg/storage/bloom/v1/bloom_tokenizer_test.go
@@ -39,7 +39,7 @@ func TestSetLineTokenizer(t *testing.T) {
require.Equal(t, bt.chunkIDTokenizer.GetSkip(), 2)
}

func TestDefaultTokenizeLine(t *testing.T) {
bt, _ := NewBloomTokenizer(prometheus.DefaultRegisterer)

for _, tc := range []struct {
@@ -88,6 +88,53 @@ func TestTokenizeLine(t *testing.T) {
}
}

func TestTokenizeLineWithSkips(t *testing.T) {
bt, _ := NewBloomTokenizer(prometheus.DefaultRegisterer)
bt.SetLineTokenizer(NewNGramTokenizer(DefaultNGramLength, DefaultNGramLength+1, 2))

for _, tc := range []struct {
desc string
input string
exp []Token
}{
{
desc: "empty",
input: "",
exp: []Token{},
},
{
desc: "single char",
input: "a",
exp: []Token{},
},
{
desc: "four chars",
input: "abcd",
exp: []Token{
{Key: []byte("abcd")}},
},
{
desc: "longer string",
input: "abcdefghijkl",
exp: []Token{
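// Tokens arrive grouped by pass: all offset-0 tokens, then offset-1, then offset-2.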
{Key: []byte("abcd")},
{Key: []byte("defg")},
{Key: []byte("ghij")},
{Key: []byte("bcde")},
{Key: []byte("efgh")},
{Key: []byte("hijk")},
{Key: []byte("cdef")},
{Key: []byte("fghi")},
{Key: []byte("ijkl")},
},
},
} {
t.Run(tc.desc, func(t *testing.T) {
require.Equal(t, tc.exp, bt.TokenizeLine(tc.input))
})
}
}

func TestPopulateSeriesWithBloom(t *testing.T) {
var testLine = "this is a log line"
bt, _ := NewBloomTokenizer(prometheus.DefaultRegisterer)
