Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes and improvements to pattern matching #63

Merged
merged 1 commit into from
Aug 16, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 63 additions & 49 deletions objfile/patterns.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,21 @@ func RegexpPatternFromYaraPattern(pattern string) (*RegexAndNeedle, error) {
pattern = strings.ToLower(pattern)

patLen := 0
sequenceLen := 0
needleOffset := 0
needle := make([]byte, 0)
tmpNeedle := make([]byte, 0)

resetNeedle := func() {
patLen += sequenceLen
sequenceLen = 0
if len(tmpNeedle) > len(needle) {
needle = slices.Clone(tmpNeedle)
needleOffset = patLen - len(tmpNeedle)
}
tmpNeedle = make([]byte, 0)
}

var regex_pattern string
for i := 0; i < len(pattern); {
// at the start of this loop,
Expand All @@ -82,13 +94,8 @@ func RegexpPatternFromYaraPattern(pattern string) (*RegexAndNeedle, error) {
regex_pattern += "."

i += 2
patLen += 1
if len(tmpNeedle) > len(needle) {
needle = slices.Clone(tmpNeedle)
tmpNeedle = make([]byte, 0)
} else {
tmpNeedle = make([]byte, 0)
}
resetNeedle()
sequenceLen = 1
continue
}

Expand All @@ -106,12 +113,12 @@ func RegexpPatternFromYaraPattern(pattern string) (*RegexAndNeedle, error) {
return nil, errors.New("[] didn't contain a dash")
}

_, err := strconv.Atoi(low)
lowInt, err := strconv.Atoi(low)
if err != nil {
return nil, errors.New("invalid number")
}

_, err = strconv.Atoi(high)
highInt, err := strconv.Atoi(high)
if err != nil {
return nil, errors.New("invalid number")
}
Expand All @@ -123,15 +130,16 @@ func RegexpPatternFromYaraPattern(pattern string) (*RegexAndNeedle, error) {
regex_pattern += high
regex_pattern += "}"

i += end + 1
patLen += 1
// YARA evaluates lazily, make sure we match that:
// AA BB BB
// { AA [0-1] BB }
// must produce:
// AA BB
regex_pattern += "?"

if len(tmpNeedle) > len(needle) {
needle = slices.Clone(tmpNeedle)
tmpNeedle = make([]byte, 0)
} else {
tmpNeedle = make([]byte, 0)
}
i += end + 1
resetNeedle()
sequenceLen = highInt - lowInt + 1
continue
}

Expand Down Expand Up @@ -161,13 +169,8 @@ func RegexpPatternFromYaraPattern(pattern string) (*RegexAndNeedle, error) {
regex_pattern += ")"

i += end + 1
patLen += len(choices)
if len(tmpNeedle) > len(needle) {
needle = slices.Clone(tmpNeedle)
tmpNeedle = make([]byte, 0)
} else {
tmpNeedle = make([]byte, 0)
}
resetNeedle()
sequenceLen = 1
continue
}

Expand All @@ -185,13 +188,8 @@ func RegexpPatternFromYaraPattern(pattern string) (*RegexAndNeedle, error) {
regex_pattern += "]"

i += 2
patLen += 1
if len(tmpNeedle) > len(needle) {
needle = slices.Clone(tmpNeedle)
tmpNeedle = make([]byte, 0)
} else {
tmpNeedle = make([]byte, 0)
}
resetNeedle()
sequenceLen = 1
continue
}

Expand All @@ -205,36 +203,33 @@ func RegexpPatternFromYaraPattern(pattern string) (*RegexAndNeedle, error) {
}
tmpNeedle = append(tmpNeedle, byte(byt))
i += 2
patLen += 1
sequenceLen += 1
continue
}

return nil, errors.New("unexpected value")
}

if len(tmpNeedle) > len(needle) {
needle = slices.Clone(tmpNeedle)
//tmpNeedle = make([]byte, 0) not needed at exit
}
resetNeedle()

r := binaryregexp.MustCompile(regex_pattern)
if r == nil {
// use the "single line" flag (?s) so that "." also matches "\n" like any other byte
r, err := binaryregexp.Compile("(?s)" + regex_pattern)
if err != nil {
return nil, errors.New("failed to compile regex")
}
return &RegexAndNeedle{patLen, regex_pattern, r, needle}, nil
return &RegexAndNeedle{patLen, regex_pattern, r, needleOffset, needle}, nil
}

func FindRegex(data []byte, regexInfo *RegexAndNeedle) []int {
data_len := len(data)
matches := make([]int, 0)

// use an optimized memscan to find some candidates chunks from the much large haystack
// use an optimized memscan to find some candidates chunks from the much larger haystack
needleMatches := findAllOccurrences(data, [][]byte{regexInfo.needle})
for _, needleMatch := range needleMatches {
// we might have found a needle beginning at the very end of our regex
// widen the window to regex scan from the [-regexLen:regexLen] so we scan the front too
data_start := needleMatch - regexInfo.len
data_end := needleMatch + regexInfo.len
// adjust the window to the pattern start and end
data_start := needleMatch - regexInfo.needleOffset
data_end := needleMatch + regexInfo.len - regexInfo.needleOffset
if data_start >= data_len {
continue
} else if data_start <= 0 {
Expand All @@ -250,16 +245,35 @@ func FindRegex(data []byte, regexInfo *RegexAndNeedle) []int {
// the match offset is the start index of the chunk + reMatch index
start := reMatch[0] + data_start

//end := reMatch[1]
//end := reMatch[1] + data_start
matches = append(matches, start)

// special case to handle sub-matches, which are skipped by regex but matched by YARA:
// AA AA BB CC
// { AA [0-1] BB CC }
// must produce:
// AA AA BB CC
// AA BB CC
subStart := start + 1
for {
subMatches := regexInfo.re.FindAllIndex(data[subStart:data_end], -1)
if len(subMatches) == 0 {
break
}
for _, match := range subMatches {
matches = append(matches, match[0]+subStart)
}
subStart += subMatches[0][0] + 1
}
}
}
return matches
}

type RegexAndNeedle struct {
len int
rawre string
re *binaryregexp.Regexp
needle []byte // longest fixed sub-sequence of regex
len int
rawre string
re *binaryregexp.Regexp
needleOffset int // offset of the needle's start within the pattern
needle []byte // longest fixed sub-sequence of regex
}
Loading