Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add pattern match line filter #12398

Merged
merged 6 commits into from
Apr 2, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions pkg/logql/log/filter.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (

"github.com/prometheus/prometheus/model/labels"

"github.com/grafana/loki/pkg/logql/log/pattern"
"github.com/grafana/loki/pkg/util"
)

Expand All @@ -23,6 +24,8 @@ const (
LineMatchNotEqual
LineMatchRegexp
LineMatchNotRegexp
LineMatchPattern
LineMatchNotPattern
)

func (t LineMatchType) String() string {
Expand All @@ -35,6 +38,10 @@ func (t LineMatchType) String() string {
return "|~"
case LineMatchNotRegexp:
return "!~"
case LineMatchPattern:
return "|>"
case LineMatchNotPattern:
return "!>"
default:
return ""
}
Expand Down Expand Up @@ -553,6 +560,10 @@ func NewFilter(match string, mt LineMatchType) (Filterer, error) {
return newContainsFilter([]byte(match), false), nil
case LineMatchNotEqual:
return NewNotFilter(newContainsFilter([]byte(match), false)), nil
case LineMatchPattern:
return newPatternFilterer([]byte(match), true)
case LineMatchNotPattern:
return newPatternFilterer([]byte(match), false)
default:
return nil, fmt.Errorf("unknown matcher: %v", match)
}
Expand Down Expand Up @@ -783,3 +794,37 @@ func (s *RegexSimplifier) simplifyConcatAlternate(reg *syntax.Regexp, literal []
}
return nil, false
}

// patternFilter is a line filter built from a pattern expression. It keeps
// both the parsed matcher (used by Filter) and the raw pattern bytes (used by
// Matches).
type patternFilter struct {
// matcher is the parsed form of the pattern; Filter delegates to its Test method.
matcher *pattern.Matcher
// pattern is the raw, unparsed pattern; Matches hands it to the Checker as-is.
pattern []byte
}

// newPatternFilterer parses p as a pattern line filter and wraps the result
// in a patternFilter.
//
// When match is true the filter is returned as is; when it is false the
// filter is wrapped with NewNotFilter so that the result is negated. Any
// error reported by pattern.ParseLineFilter is returned unchanged.
func newPatternFilterer(p []byte, match bool) (MatcherFilterer, error) {
	parsed, err := pattern.ParseLineFilter(p)
	if err != nil {
		return nil, err
	}
	pf := &patternFilter{matcher: parsed, pattern: p}
	if match {
		return pf, nil
	}
	return NewNotFilter(pf), nil
}

// Filter reports whether line passes the parsed pattern, as decided by the
// underlying matcher's Test method.
func (f *patternFilter) Filter(line []byte) bool {
	return f.matcher.Test(line)
}

// Matches runs the given Checker against the raw pattern bytes and returns
// its verdict.
//
// NOTE(review): both boolean flags are passed as false; their meaning is
// defined by the Checker interface, which is not visible here — confirm.
func (f *patternFilter) Matches(test Checker) bool {
	ok := test.Test(f.pattern, false, false)
	return ok
}

// ToStage adapts the filter to a pipeline Stage. The stage leaves the line
// untouched and reports, via its boolean result, whether the line passed
// Filter. The timestamp and labels builder arguments are ignored.
func (f *patternFilter) ToStage() Stage {
	keep := func(_ int64, line []byte, _ *LabelsBuilder) ([]byte, bool) {
		return line, f.Filter(line)
	}
	return StageFunc{process: keep}
}
2 changes: 1 addition & 1 deletion pkg/logql/log/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,7 @@ func (l *LogfmtParser) Process(_ int64, line []byte, lbs *LabelsBuilder) ([]byte
func (l *LogfmtParser) RequiredLabelNames() []string { return []string{} }

type PatternParser struct {
matcher pattern.Matcher
matcher *pattern.Matcher
names []string
}

Expand Down
33 changes: 24 additions & 9 deletions pkg/logql/log/pattern/ast.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,21 @@ func (e expr) validate() error {
return ErrNoCapture
}
// Consecutive captures are not allowed.
if err := e.validateNoConsecutiveCaptures(); err != nil {
return err
}
caps := e.captures()
uniq := map[string]struct{}{}
for _, c := range caps {
if _, ok := uniq[c]; ok {
return fmt.Errorf("duplicate capture name (%s): %w", c, ErrInvalidExpr)
}
uniq[c] = struct{}{}
}
return nil
}

func (e expr) validateNoConsecutiveCaptures() error {
for i, n := range e {
if i+1 >= len(e) {
break
Expand All @@ -30,21 +45,21 @@ func (e expr) validate() error {
}
}
}
return nil
}

caps := e.captures()
uniq := map[string]struct{}{}
for _, c := range caps {
if _, ok := uniq[c]; ok {
return fmt.Errorf("duplicate capture name (%s): %w", c, ErrInvalidExpr)
func (e expr) validateNoNamedCaptures() error {
for i, n := range e {
if c, ok := e[i].(capture); ok && !c.isUnnamed() {
return fmt.Errorf("%w: found '%s'", ErrCaptureNotAllowed, n.String())
}
uniq[c] = struct{}{}
}
return nil
}

func (e expr) captures() (captures []string) {
for _, n := range e {
if c, ok := n.(capture); ok && !c.isUnamed() {
if c, ok := n.(capture); ok && !c.isUnnamed() {
captures = append(captures, c.Name())
}
}
Expand All @@ -65,8 +80,8 @@ func (c capture) Name() string {
return string(c)
}

func (c capture) isUnamed() bool {
return string(c) == underscore
func (c capture) isUnnamed() bool {
return len(c) == 1 && c[0] == underscore[0]
}

type literals []byte
Expand Down
6 changes: 5 additions & 1 deletion pkg/logql/log/pattern/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,12 @@ func init() {
}

// parseExpr parses a pattern expression given as a string. It is a thin
// convenience wrapper around parseExprBytes.
func parseExpr(input string) (expr, error) {
	raw := []byte(input)
	return parseExprBytes(raw)
}

func parseExprBytes(input []byte) (expr, error) {
l := newLexer()
l.setData([]byte(input))
l.setData(input)
e := exprNewParser().Parse(l)
if e != 0 || len(l.errs) > 0 {
return nil, l.errs[0]
Expand Down
93 changes: 78 additions & 15 deletions pkg/logql/log/pattern/pattern.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,40 +6,67 @@ import (
)

var (
ErrNoCapture = errors.New("at least one capture is required")
ErrInvalidExpr = errors.New("invalid expression")
ErrNoCapture = errors.New("at least one capture is required")
ErrCaptureNotAllowed = errors.New("named captures are not allowed")
Copy link
Contributor Author

@kolesnikovae kolesnikovae Mar 29, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is debatable, but I think it makes sense: if a user tries to parse fields at the filter stage, it's better to return an explicit error rather than silently ignore the user's intent. In the future, the restriction can be removed.

Note that we're using an unnamed placeholder <_>. I think we could use a new capture identifier (such as <*>) to emphasize the difference with the pattern parse stage. However, it feels like a new syntax for pattern matching, which may confuse users. I would like to hear others' thoughts on this.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't have strong opinions on using _ vs *, but I definitely agree we should fail fast.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would vote for <_> as the behavior between these two examples feels consistent to me:

|> "dur=<_> err=<_>"
| pattern "dur=<_> err=<_>"

I think of that as telling Loki to ignore any of the content in <_>.

I'm not sure I see the reason to use <*> or maybe I'm misunderstanding something?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This popped up in the Slack conversation, so I brought it here.

Thank you, guys, for sharing your thoughts. I'll keep <_>.

ErrInvalidExpr = errors.New("invalid expression")
)

type Matcher interface {
Matches(in []byte) [][]byte
Names() []string
}

type matcher struct {
type Matcher struct {
e expr

captures [][]byte
names []string
}

func New(in string) (Matcher, error) {
func New(in string) (*Matcher, error) {
e, err := parseExpr(in)
if err != nil {
return nil, err
}
if err := e.validate(); err != nil {
return nil, err
}
return &matcher{
return &Matcher{
e: e,
captures: make([][]byte, 0, e.captureCount()),
names: e.captures(),
}, nil
}

// ParseLineFilter parses in as a pattern intended for line filtering and
// returns a Matcher that can be used with Test.
//
// An empty input yields a zero-value Matcher. Otherwise the expression is
// parsed and must pass two checks, in this order: it must not contain
// consecutive captures, and it must not contain named captures. The first
// error encountered (parse or validation) is returned as is.
func ParseLineFilter(in []byte) (*Matcher, error) {
	if len(in) == 0 {
		return &Matcher{}, nil
	}
	parsed, err := parseExprBytes(in)
	if err != nil {
		return nil, err
	}
	// Run the validators in a fixed order so the reported error is stable.
	checks := []func() error{
		parsed.validateNoConsecutiveCaptures,
		parsed.validateNoNamedCaptures,
	}
	for _, check := range checks {
		if err = check(); err != nil {
			return nil, err
		}
	}
	return &Matcher{e: parsed}, nil
}

// ParseLiterals parses in as a pattern expression and returns its literal
// segments in the order they appear, skipping every other node.
//
// The returned slice is never nil on success; it is empty when the expression
// contains no literals. A parse error is returned unchanged.
func ParseLiterals(in string) ([][]byte, error) {
	parsed, err := parseExpr(in)
	if err != nil {
		return nil, err
	}
	out := make([][]byte, 0, len(parsed))
	for i := range parsed {
		l, isLiteral := parsed[i].(literals)
		if !isLiteral {
			continue
		}
		out = append(out, l)
	}
	return out, nil
}

// Matches matches the given line with the provided pattern.
// Matches invalidates the previous returned captures array.
func (m *matcher) Matches(in []byte) [][]byte {
func (m *Matcher) Matches(in []byte) [][]byte {
if len(in) == 0 {
return nil
}
Expand All @@ -62,7 +89,7 @@ func (m *matcher) Matches(in []byte) [][]byte {
// from now we have capture - literals - capture ... (literals)?
for len(expr) != 0 {
if len(expr) == 1 { // we're ending on a capture.
if !(expr[0].(capture)).isUnamed() {
if !(expr[0].(capture)).isUnnamed() {
captures = append(captures, in)
}
return captures
Expand All @@ -73,13 +100,13 @@ func (m *matcher) Matches(in []byte) [][]byte {
i := bytes.Index(in, ls)
if i == -1 {
// if a capture is missed we return up to the end as the capture.
if !capt.isUnamed() {
if !capt.isUnnamed() {
captures = append(captures, in)
}
return captures
}

if capt.isUnamed() {
if capt.isUnnamed() {
in = in[len(ls)+i:]
continue
}
Expand All @@ -90,6 +117,42 @@ func (m *matcher) Matches(in []byte) [][]byte {
return captures
}

func (m *matcher) Names() []string {
func (m *Matcher) Names() []string {
return m.names
}

// Test reports whether the line in matches the pattern, without extracting
// any captures.
//
// The literals of the pattern are searched for left to right, each one in the
// part of the line that follows the previous literal. The rules are:
//
//   - An empty line matches only an empty pattern, and vice versa.
//   - Every literal must be found; a missing literal fails the test.
//   - Any literal other than the very first node must be preceded by at
//     least one byte (found at a non-zero position in the remaining input):
//     a zero position means repeated literals or an empty capture.
//   - A literal that is the first node is located with bytes.Index like any
//     other, so it is not required to sit at the very start of the line.
//   - If the pattern ends on a literal, nothing may remain after it. If it
//     ends on a capture, the remainder (the captured text) must not be empty.
//
// For example, "foo bar baz" does not match "<_> bar", but it matches
// "<_> baz" and "foo <_>". Empty captures are rejected too: " bar " does not
// match "<_> bar <_>", but matches "<_>bar<_>".
func (m *Matcher) Test(in []byte) bool {
	nodes := m.e
	if len(in) == 0 || len(nodes) == 0 {
		// An empty line can only match an empty pattern.
		return len(in) == 0 && len(nodes) == 0
	}
	// rest is the part of the line not yet consumed by a literal.
	rest := in
	for idx, node := range nodes {
		lit, isLiteral := node.(literals)
		if !isLiteral {
			continue
		}
		pos := bytes.Index(rest, lit)
		switch {
		case pos < 0:
			return false
		case pos == 0 && idx != 0:
			// Either repetitive literals or an empty capture.
			return false
		}
		rest = rest[pos+len(lit):]
	}
	_, endsOnCapture := nodes[len(nodes)-1].(capture)
	leftover := len(rest) != 0
	return endsOnCapture == leftover
}
Loading
Loading