Remove ngram bloom tester

Signed-off-by: Christian Haudum <[email protected]>
grafana · Sep 19, 2024 · 05e5d45 · 05e5d45
1 parent 7f6bf48
commit 05e5d45
Show file tree

Hide file tree

Showing 3 changed files with 28 additions and 373 deletions.
diff --git a/pkg/storage/bloom/v1/bloom_tester.go b/pkg/storage/bloom/v1/bloom_tester.go
@@ -2,15 +2,8 @@ package v1
 
 import (
 	"fmt"
-	"unicode/utf8"
 	"unsafe"
 
-	"github.com/grafana/regexp"
-
-	iter "github.com/grafana/loki/v3/pkg/iter/v2"
-	"github.com/grafana/loki/v3/pkg/logql/log"
-	"github.com/grafana/loki/v3/pkg/logql/log/pattern"
-	"github.com/grafana/loki/v3/pkg/logql/syntax"
 	"github.com/grafana/loki/v3/pkg/storage/bloom/v1/filter"
 )
 
@@ -39,228 +32,20 @@ func (b BloomTests) MatchesWithPrefixBuf(bloom filter.Checker, buf []byte, prefi
 	return true
 }
 
-// ExtractTestableLineFilters extracts all line filters from an expression
-// that can be tested against a bloom filter. This will skip any line filters
-// after a line format expression. A line format expression might add content
-// that the query later matches against, which can't be tested with a bloom filter.
-// E.g. For {app="fake"} |= "foo" | line_format "thisNewTextShouldMatch" |= "thisNewTextShouldMatch"
-// this function will return only the line filter for "foo" since the line filter for "thisNewTextShouldMatch"
-// wouldn't match against the bloom filter but should match against the query.
-func ExtractTestableLineFilters(expr syntax.Expr) []syntax.LineFilterExpr {
-	if expr == nil {
-		return nil
-	}
-
-	var filters []syntax.LineFilterExpr
-	var lineFmtFound bool
-	visitor := &syntax.DepthFirstTraversal{
-		VisitLineFilterFn: func(_ syntax.RootVisitor, e *syntax.LineFilterExpr) {
-			if e != nil && !lineFmtFound {
-				filters = append(filters, *e)
-			}
-		},
-		VisitLineFmtFn: func(_ syntax.RootVisitor, e *syntax.LineFmtExpr) {
-			if e != nil {
-				lineFmtFound = true
-			}
-		},
-	}
-	expr.Accept(visitor)
-	return filters
-}
-
-// FiltersToBloomTest converts a list of line filters to a BloomTest.
-// Note that all the line filters should be testable against a bloom filter.
-// Use ExtractTestableLineFilters to extract testable line filters from an expression.
-// TODO(owen-d): limits the number of bloom lookups run.
-// An arbitrarily high number can overconsume cpu and is a DoS vector.
-// TODO(owen-d): use for loop not recursion to protect callstack
-func FiltersToBloomTest(b NGramBuilder, filters ...syntax.LineFilterExpr) BloomTest {
-	tests := make(BloomTests, 0, len(filters))
-	for _, f := range filters {
-		if f.Left != nil {
-			tests = append(tests, FiltersToBloomTest(b, *f.Left))
-		}
-		if f.Or != nil {
-			left := FiltersToBloomTest(b, *f.Or)
-			right := simpleFilterToBloomTest(b, f.LineFilter)
-			tests = append(tests, newOrTest(left, right))
-			continue
-		}
-
-		tests = append(tests, simpleFilterToBloomTest(b, f.LineFilter))
-	}
-	return tests
-}
-
-func simpleFilterToBloomTest(b NGramBuilder, filter syntax.LineFilter) BloomTest {
-	switch filter.Ty {
-	case log.LineMatchNotEqual, log.LineMatchNotRegexp, log.LineMatchNotPattern:
-		// We cannot test _negated_ filters with a bloom filter since blooms are probabilistic
-		// filters that can only tell us if a string _might_ exist.
-		// For example, for `!= "foo"`, the bloom filter might tell us that the string "foo" might exist
-		// but because we are not sure, we cannot discard that chunk because it might actually not be there.
-		// Therefore, we return a test that always returns true.
-		return MatchAll
-	case log.LineMatchEqual:
-		return newStringTest(b, filter.Match)
-	case log.LineMatchRegexp:
-		return MatchAll
-	case log.LineMatchPattern:
-		return newPatternTest(b, filter.Match)
-	default:
-		return MatchAll
-	}
-}
-
-type bloomCheckerWrapper struct {
-	bloom filter.Checker
-}
-
-// Test implements the log.Checker interface
-func (b bloomCheckerWrapper) Test(line []byte, _ bool, _ bool) bool {
-	return b.bloom.Test(line)
-}
-
-// TestRegex implements the log.Checker interface
-func (b bloomCheckerWrapper) TestRegex(_ *regexp.Regexp) bool {
-	// We won't support regexes in bloom filters so we just return true
-	return true
-}
-
-type logCheckerWrapper struct {
-	checker log.Checker
-}
-
-// Test implements the filter.Checker interface
-func (l logCheckerWrapper) Test(data []byte) bool {
-	return l.checker.Test(data, true, false)
-}
-
-type matcherFilterWrapper struct {
-	filter log.Matcher
-}
-
-func (m matcherFilterWrapper) Matches(bloom filter.Checker) bool {
-	return m.filter.Matches(bloomCheckerWrapper{bloom})
-}
-
-func (m matcherFilterWrapper) MatchesWithPrefixBuf(bloom filter.Checker, buf []byte, prefixLen int) bool {
-	return m.filter.Matches(bloomCheckerWrapper{prefixedChecker{
-		checker:   bloom,
-		buf:       buf,
-		prefixLen: prefixLen,
-	}})
-}
-
-type prefixedChecker struct {
-	checker   filter.Checker
-	buf       []byte
-	prefixLen int
-}
-
-func (p prefixedChecker) Test(data []byte) bool {
-	return p.checker.Test(append(p.buf[:p.prefixLen], data...))
-}
-
 type matchAllTest struct{}
 
 var MatchAll = matchAllTest{}
 
+// Matches implements BloomTest
 func (n matchAllTest) Matches(_ filter.Checker) bool {
 	return true
 }
 
+// MatchesWithPrefixBuf implements BloomTest
 func (n matchAllTest) MatchesWithPrefixBuf(_ filter.Checker, _ []byte, _ int) bool {
 	return true
 }
 
-// NGramBuilder is an interface for tokenizing strings into ngrams
-// Extracting this interface allows us to test the bloom filter without having to use the actual tokenizer
-// TODO: This should be moved to tokenizer.go
-type NGramBuilder interface {
-	Tokens(line string) iter.Iterator[[]byte]
-	N() int
-	SkipFactor() int
-}
-
-type stringTest struct {
-	ngrams [][]byte
-}
-
-func newStringTest(b NGramBuilder, search string) (res BloomTest) {
-	// search string must be longer than the combined ngram length and skip factor
-	// in order for all possible skip offsets to have at least 1 ngram
-	skip := b.SkipFactor()
-	if ct := utf8.RuneCountInString(search); ct < b.N()+skip {
-		return MatchAll
-	}
-
-	tests := make([]stringTest, 0, skip)
-
-	for i := 0; i < skip+1; i++ {
-		searchWithOffset := search
-		for j := 0; j < i; j++ {
-			_, size := utf8.DecodeRuneInString(searchWithOffset)
-			// NB(owen-d): small bounds check for invalid utf8
-			searchWithOffset = searchWithOffset[min(size, len(searchWithOffset)):]
-		}
-
-		var test stringTest
-		it := b.Tokens(searchWithOffset)
-		for it.Next() {
-			ngram := make([]byte, len(it.At()))
-			copy(ngram, it.At())
-			test.ngrams = append(test.ngrams, ngram)
-		}
-		tests = append(tests, test)
-	}
-
-	res = tests[0]
-	for _, t := range tests[1:] {
-		res = newOrTest(res, t)
-	}
-	return res
-}
-
-// Matches implements the BloomTest interface
-func (b stringTest) Matches(bloom filter.Checker) bool {
-	for _, ngram := range b.ngrams {
-		if !bloom.Test(ngram) {
-			return false
-		}
-	}
-	return true
-}
-
-// MatchesWithPrefixBuf implements the BloomTest interface
-func (b stringTest) MatchesWithPrefixBuf(bloom filter.Checker, buf []byte, prefixLen int) bool {
-	for _, ngram := range b.ngrams {
-		buf = append(buf[:prefixLen], ngram...)
-		if !bloom.Test(buf) {
-			return false
-		}
-	}
-	return true
-}
-
-type stringMatcherFilter struct {
-	test BloomTest
-}
-
-// Matches implements the log.Filterer interface
-func (b stringMatcherFilter) Matches(test log.Checker) bool {
-	return b.test.Matches(logCheckerWrapper{test})
-}
-
-func newStringFilterFunc(b NGramBuilder) log.NewMatcherFiltererFunc {
-	return func(match []byte, _ bool) log.MatcherFilterer {
-		return log.WrapMatcher(stringMatcherFilter{
-			test: newStringTest(b, string(match)),
-		})
-	}
-}
-
 type orTest struct {
 	left, right BloomTest
 }
@@ -286,10 +71,12 @@ func newOrTest(left, right BloomTest) orTest {
 	}
 }
 
+// Matches implements BloomTest
 func (o orTest) Matches(bloom filter.Checker) bool {
 	return o.left.Matches(bloom) || o.right.Matches(bloom)
 }
 
+// MatchesWithPrefixBuf implements BloomTest
 func (o orTest) MatchesWithPrefixBuf(bloom filter.Checker, buf []byte, prefixLen int) bool {
 	return o.left.MatchesWithPrefixBuf(bloom, buf, prefixLen) || o.right.MatchesWithPrefixBuf(bloom, buf, prefixLen)
 }
@@ -305,28 +92,16 @@ func newAndTest(left, right BloomTest) andTest {
 	}
 }
 
+// Matches implements BloomTest
 func (a andTest) Matches(bloom filter.Checker) bool {
 	return a.left.Matches(bloom) && a.right.Matches(bloom)
 }
 
+// MatchesWithPrefixBuf implements BloomTest
 func (a andTest) MatchesWithPrefixBuf(bloom filter.Checker, buf []byte, prefixLen int) bool {
 	return a.left.MatchesWithPrefixBuf(bloom, buf, prefixLen) && a.right.MatchesWithPrefixBuf(bloom, buf, prefixLen)
 }
 
-func newPatternTest(b NGramBuilder, match string) BloomTest {
-	lit, err := pattern.ParseLiterals(match)
-	if err != nil {
-		return MatchAll
-	}
-
-	var res BloomTests
-
-	for _, l := range lit {
-		res = append(res, newStringTest(b, string(l)))
-	}
-	return res
-}
-
 func LabelMatchersToBloomTest(matchers ...LabelMatcher) BloomTest {
 	tests := make(BloomTests, 0, len(matchers))
 	for _, matcher := range matchers {