Skip to content

Commit

Permalink
Optimize pattern sub-matches
Browse files Browse the repository at this point in the history
  • Loading branch information
ViRb3 committed Aug 17, 2024
1 parent 750bb57 commit 6f4badc
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 13 deletions.
62 changes: 54 additions & 8 deletions objfile/patterns.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package objfile

import (
"errors"
"sort"
"strconv"
"strings"

Expand Down Expand Up @@ -238,11 +239,48 @@ func RegexpPatternFromYaraPattern(pattern string) (*RegexAndNeedle, error) {
return &RegexAndNeedle{patLen, regex_pattern, r, needleOffset, needle}, nil
}

func FindRegex(data []byte, regexInfo *RegexAndNeedle) []int {
// getOrSetRegion reports whether the region [start, end] has already been
// recorded in regionMap. When it has not, the region is recorded as a side
// effect and false is returned, so a second call with the same arguments
// returns true.
//
// regionMap is keyed by region start; each inner map is keyed by region end.
func getOrSetRegion(regionMap map[int]map[int]bool, start, end int) bool {
	knownEnds, seenStart := regionMap[start]
	if !seenStart {
		// first region beginning at this offset: create its set of ends
		regionMap[start] = map[int]bool{end: true}
		return false
	}
	if knownEnds[end] {
		return true
	}
	knownEnds[end] = true
	return false
}

// regionMapToSlices flattens regionMap (start -> set of ends) into a slice of
// two-element []int{start, end} pairs, ordered ascending by start and then by
// end. Every key present in an inner map yields a pair, whatever its bool
// value. The result is never nil, even for an empty map.
func regionMapToSlices(regionMap map[int]map[int]bool) [][]int {
	// count the pairs up front so the result is allocated exactly once
	pairCount := 0
	for _, ends := range regionMap {
		pairCount += len(ends)
	}

	regions := make([][]int, 0, pairCount)
	for start, ends := range regionMap {
		for end := range ends {
			regions = append(regions, []int{start, end})
		}
	}

	// map iteration order is random; (start, end) pairs are unique, so a
	// lexicographic sort gives a fully deterministic order
	sort.Slice(regions, func(i, j int) bool {
		a, b := regions[i], regions[j]
		if a[0] != b[0] {
			return a[0] < b[0]
		}
		return a[1] < b[1]
	})
	return regions
}

func FindRegex(data []byte, regexInfo *RegexAndNeedle) [][]int {
data_len := len(data)
matches := make([]int, 0)
matchMap := make(map[int]map[int]bool)
cacheMap := make(map[int]map[int]bool)

// use an optimized memscan to find some candidates chunks from the much larger haystack
// use an optimized memscan to find all candidate chunks from the much larger haystack
needleMatches := findAllOccurrences(data, [][]byte{regexInfo.needle})
for _, needleMatch := range needleMatches {
// adjust the window to the pattern start and end
Expand All @@ -258,13 +296,16 @@ func FindRegex(data []byte, regexInfo *RegexAndNeedle) []int {
data_end = data_len - 1
}

// don't repeat previously scanned chunks
if getOrSetRegion(cacheMap, data_start, data_end) {
continue
}
// do the full regex scan on a very small chunk
for _, reMatch := range regexInfo.re.FindAllIndex(data[data_start:data_end], -1) {
// the match offset is the start index of the chunk + reMatch index
start := reMatch[0] + data_start

//end := reMatch[1] + data_start
matches = append(matches, start)
end := reMatch[1] + data_start
getOrSetRegion(matchMap, start, end)

// special case to handle sub-matches, which are skipped by regex but matched by YARA:
// AA AA BB CC
Expand All @@ -274,18 +315,23 @@ func FindRegex(data []byte, regexInfo *RegexAndNeedle) []int {
// AA BB CC
subStart := start + 1
for {
// don't repeat previously scanned chunks
if getOrSetRegion(cacheMap, subStart, data_end) {
break
}
subMatches := regexInfo.re.FindAllIndex(data[subStart:data_end], -1)
if len(subMatches) == 0 {
break
}
for _, match := range subMatches {
matches = append(matches, match[0]+subStart)
getOrSetRegion(matchMap, match[0]+subStart, match[1]+subStart)
}
subStart += subMatches[0][0] + 1
}
}
}
return matches

return regionMapToSlices(matchMap)
}

type RegexAndNeedle struct {
Expand Down
10 changes: 5 additions & 5 deletions objfile/scanner.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ func findModuleInitPCHeader(data []byte, sectionBase uint64) []SignatureMatch {
}

for _, match := range FindRegex(data, x64reg) {
sigPtr := uint64(match) // from int
sigPtr := uint64(match[0]) // from int

// this is the pointer offset stored in the instruction
// 0x44E06A: 48 8D 0D 4F F0 24 00 lea rcx, off_69D0C0 (result: 0x24f04f)
Expand All @@ -119,7 +119,7 @@ func findModuleInitPCHeader(data []byte, sectionBase uint64) []SignatureMatch {
}

for _, match := range FindRegex(data, x86reg) {
sigPtr := uint64(match) // from int
sigPtr := uint64(match[0]) // from int

moduleDataPtr := uint64(binary.LittleEndian.Uint32(data[sigPtr+x86sig.moduleDataPtrLoc:][:4]))
matches = append(matches, SignatureMatch{
Expand All @@ -138,7 +138,7 @@ func findModuleInitPCHeader(data []byte, sectionBase uint64) []SignatureMatch {
}

for _, match := range FindRegex(data, arm64reg) {
sigPtr := uint64(match) // from int
sigPtr := uint64(match[0]) // from int

adrp := binary.LittleEndian.Uint32(data[sigPtr+ARM64_sig.moduleDataPtrADRP:][:4])
add := binary.LittleEndian.Uint32(data[sigPtr+ARM64_sig.moduleDataPtrADD:][:4])
Expand Down Expand Up @@ -169,7 +169,7 @@ func findModuleInitPCHeader(data []byte, sectionBase uint64) []SignatureMatch {
}

for _, match := range FindRegex(data, arm32reg) {
sigPtr := uint64(match) // from int
sigPtr := uint64(match[0]) // from int
ldr := binary.LittleEndian.Uint32(data[sigPtr+ARM32_sig.moduleDataPtrLDR:][:4])
// ARM PC relative is always +8 due to legacy nonsense
ldr_pointer_stub := uint64((ldr & 0x00000FFF) + 8)
Expand All @@ -190,7 +190,7 @@ func findModuleInitPCHeader(data []byte, sectionBase uint64) []SignatureMatch {
}

for _, match := range FindRegex(data, ppcBEreg) {
sigPtr := uint64(match) // from int
sigPtr := uint64(match[0]) // from int
moduleDataPtrHi := int64(binary.BigEndian.Uint16(data[sigPtr+PPC_BE_sig.moduleDataPtrHi:][:2]))
// addi takes a signed immediate
moduleDataPtrLo := int64(int16(binary.BigEndian.Uint16(data[sigPtr+PPC_BE_sig.moduleDataPtrLo:][:2])))
Expand Down

0 comments on commit 6f4badc

Please sign in to comment.