Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize and omit duplicate pattern matches #66

Merged
merged 1 commit into from
Oct 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 55 additions & 15 deletions objfile/patterns.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package objfile

import (
"errors"
"sort"
"strconv"
"strings"

Expand Down Expand Up @@ -238,11 +239,48 @@ func RegexpPatternFromYaraPattern(pattern string) (*RegexAndNeedle, error) {
return &RegexAndNeedle{patLen, regex_pattern, r, needleOffset, needle}, nil
}

func FindRegex(data []byte, regexInfo *RegexAndNeedle) []int {
// getOrSetRegion reports whether the (start, end) pair has already been
// recorded in regionMap, and records it when it has not.
//
// It returns true if the pair was present before the call, and false if this
// call inserted it. regionMap must be non-nil, because a new entry is written
// into it the first time a given start is seen.
func getOrSetRegion(regionMap map[int]map[int]bool, start, end int) bool {
	seenEnds, known := regionMap[start]
	if !known {
		// First region recorded for this start: create its set of ends.
		regionMap[start] = map[int]bool{end: true}
		return false
	}
	if seenEnds[end] {
		return true
	}
	seenEnds[end] = true
	return false
}

// regionMapToSlices flattens regionMap into a list of two-element slices,
// one {outerKey, innerKey} pair per inner-map entry.
//
// Map iteration order is random, so the pairs are sorted ascending by the
// outer key and then by the inner key to make the output deterministic.
// The bool values stored in the inner maps are not consulted; every inner key
// is emitted. The result is always non-nil, even when regionMap is empty.
func regionMapToSlices(regionMap map[int]map[int]bool) [][]int {
	// Count the pairs up front so the result is allocated once.
	pairCount := 0
	for _, inner := range regionMap {
		pairCount += len(inner)
	}

	pairs := make([][]int, 0, pairCount)
	for first, inner := range regionMap {
		for second := range inner {
			pairs = append(pairs, []int{first, second})
		}
	}

	// Every (first, second) pair is unique, so a plain lexicographic sort
	// yields a single well-defined order.
	sort.Slice(pairs, func(i, j int) bool {
		if pairs[i][0] != pairs[j][0] {
			return pairs[i][0] < pairs[j][0]
		}
		return pairs[i][1] < pairs[j][1]
	})
	return pairs
}

func FindRegex(data []byte, regexInfo *RegexAndNeedle) [][]int {
data_len := len(data)
matches := make([]int, 0)
matchMap := make(map[int]map[int]bool)
cacheMap := make(map[int]map[int]bool)

// use an optimized memscan to find some candidates chunks from the much larger haystack
// use an optimized memscan to find all candidates chunks from the much larger haystack
needleMatches := findAllOccurrences(data, [][]byte{regexInfo.needle})
for _, needleMatch := range needleMatches {
// adjust the window to the pattern start and end
Expand All @@ -258,35 +296,37 @@ func FindRegex(data []byte, regexInfo *RegexAndNeedle) []int {
data_end = data_len
}

// don't repeat previously scanned chunks
if getOrSetRegion(cacheMap, data_start, data_end) {
continue
}
// do the full regex scan on a very small chunk
for _, reMatch := range regexInfo.re.FindAllIndex(data[data_start:data_end], -1) {
// the match offset is the start index of the chunk + reMatch index
start := reMatch[0] + data_start
end := reMatch[1] + data_start
getOrSetRegion(matchMap, start, end)

//end := reMatch[1] + data_start
matches = append(matches, start)

// special case to handle sub-matches, which are skipped by regex but matched by YARA:
// AA AA BB CC
// { AA [0-1] BB CC }
// must produce:
// AA AA BB CC
// AA BB CC
// handle sub-matches, which are skipped by regex but matched by YARA
subStart := start + 1
for {
// don't repeat previously scanned chunks
if getOrSetRegion(cacheMap, subStart, data_end) {
break
}
subMatches := regexInfo.re.FindAllIndex(data[subStart:data_end], -1)
if len(subMatches) == 0 {
break
}

for _, match := range subMatches {
matches = append(matches, match[0]+subStart)
getOrSetRegion(matchMap, match[0]+subStart, match[1]+subStart)
}
subStart += subMatches[0][0] + 1
}
}
}
return matches

return regionMapToSlices(matchMap)
}

type RegexAndNeedle struct {
Expand Down
26 changes: 26 additions & 0 deletions objfile/patterns_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package objfile

import (
"bytes"
"reflect"
"testing"

"rsc.io/binaryregexp"
Expand Down Expand Up @@ -265,4 +266,29 @@ func TestRegexpPatternFromYaraPattern(t *testing.T) {
t.Errorf("incorrect needle")
}
})

t.Run("Repeat", func(t *testing.T) {
reg, err := RegexpPatternFromYaraPattern("{ AA [0-512] BB }")

if err != nil {
t.Errorf("pattern errored")
}

if reg.len != 514 {
t.Errorf("incorrect pattern length")
}

if reg.needleOffset != 0 {
t.Errorf("incorrect needle offset")
}

if !bytes.Equal(reg.needle, []byte{0xAA}) {
t.Errorf("incorrect needle")
}

results := FindRegex([]byte{0xAA, 0xAA, 0xAA, 0xBB, 0xAA, 0xAA, 0xBB, 0xAA, 0xBB, 0xCC}, reg)
if !reflect.DeepEqual(results, [][]int{{0, 4}, {1, 4}, {2, 4}, {4, 7}, {5, 7}, {7, 9}}) {
t.Errorf("incorrect match indexes")
}
})
}
10 changes: 5 additions & 5 deletions objfile/scanner.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ func findModuleInitPCHeader(data []byte, sectionBase uint64) []SignatureMatch {
}

for _, match := range FindRegex(data, x64reg) {
sigPtr := uint64(match) // from int
sigPtr := uint64(match[0]) // from int

// this is the pointer offset stored in the instruction
// 0x44E06A: 48 8D 0D 4F F0 24 00 lea rcx, off_69D0C0 (result: 0x24f04f)
Expand All @@ -119,7 +119,7 @@ func findModuleInitPCHeader(data []byte, sectionBase uint64) []SignatureMatch {
}

for _, match := range FindRegex(data, x86reg) {
sigPtr := uint64(match) // from int
sigPtr := uint64(match[0]) // from int

moduleDataPtr := uint64(binary.LittleEndian.Uint32(data[sigPtr+x86sig.moduleDataPtrLoc:][:4]))
matches = append(matches, SignatureMatch{
Expand All @@ -138,7 +138,7 @@ func findModuleInitPCHeader(data []byte, sectionBase uint64) []SignatureMatch {
}

for _, match := range FindRegex(data, arm64reg) {
sigPtr := uint64(match) // from int
sigPtr := uint64(match[0]) // from int

adrp := binary.LittleEndian.Uint32(data[sigPtr+ARM64_sig.moduleDataPtrADRP:][:4])
add := binary.LittleEndian.Uint32(data[sigPtr+ARM64_sig.moduleDataPtrADD:][:4])
Expand Down Expand Up @@ -169,7 +169,7 @@ func findModuleInitPCHeader(data []byte, sectionBase uint64) []SignatureMatch {
}

for _, match := range FindRegex(data, arm32reg) {
sigPtr := uint64(match) // from int
sigPtr := uint64(match[0]) // from int
ldr := binary.LittleEndian.Uint32(data[sigPtr+ARM32_sig.moduleDataPtrLDR:][:4])
// ARM PC relative is always +8 due to legacy nonsense
ldr_pointer_stub := uint64((ldr & 0x00000FFF) + 8)
Expand All @@ -190,7 +190,7 @@ func findModuleInitPCHeader(data []byte, sectionBase uint64) []SignatureMatch {
}

for _, match := range FindRegex(data, ppcBEreg) {
sigPtr := uint64(match) // from int
sigPtr := uint64(match[0]) // from int
moduleDataPtrHi := int64(binary.BigEndian.Uint16(data[sigPtr+PPC_BE_sig.moduleDataPtrHi:][:2]))
// addi takes a signed immediate
moduleDataPtrLo := int64(int16(binary.BigEndian.Uint16(data[sigPtr+PPC_BE_sig.moduleDataPtrLo:][:2])))
Expand Down
Loading