From 1a943f15836b682f7cee841856b1f091ac6d674b Mon Sep 17 00:00:00 2001 From: Victor Date: Sun, 25 Aug 2024 18:11:09 +0100 Subject: [PATCH] Optimize and omit duplicate pattern matches --- objfile/patterns.go | 70 +++++++++++++++++++++++++++++++--------- objfile/patterns_test.go | 26 +++++++++++++++ objfile/scanner.go | 10 +++--- 3 files changed, 86 insertions(+), 20 deletions(-) diff --git a/objfile/patterns.go b/objfile/patterns.go index f5ec652..2843e0b 100644 --- a/objfile/patterns.go +++ b/objfile/patterns.go @@ -2,6 +2,7 @@ package objfile import ( "errors" + "sort" "strconv" "strings" @@ -238,11 +239,48 @@ func RegexpPatternFromYaraPattern(pattern string) (*RegexAndNeedle, error) { return &RegexAndNeedle{patLen, regex_pattern, r, needleOffset, needle}, nil } -func FindRegex(data []byte, regexInfo *RegexAndNeedle) []int { +func getOrSetRegion(regionMap map[int]map[int]bool, start, end int) bool { + if ends, ok := regionMap[start]; ok { + if ends[end] { + return true + } else { + ends[end] = true + return false + } + } else { + regionMap[start] = map[int]bool{end: true} + return false + } +} + +func regionMapToSlices(regionMap map[int]map[int]bool) [][]int { + totalSize := 0 + keys := make([]int, 0, len(regionMap)) + for key, valueMap := range regionMap { + keys = append(keys, key) + totalSize += len(valueMap) + } + sort.Ints(keys) + result := make([][]int, 0, totalSize) + for _, key := range keys { + values := make([]int, 0, len(regionMap[key])) + for value := range regionMap[key] { + values = append(values, value) + } + sort.Ints(values) + for _, value := range values { + result = append(result, []int{key, value}) + } + } + return result +} + +func FindRegex(data []byte, regexInfo *RegexAndNeedle) [][]int { data_len := len(data) - matches := make([]int, 0) + matchMap := make(map[int]map[int]bool) + cacheMap := make(map[int]map[int]bool) - // use an optimized memscan to find some candidates chunks from the much larger haystack + // use an optimized memscan to find all candidates chunks from the much larger haystack needleMatches := findAllOccurrences(data, [][]byte{regexInfo.needle}) for _, needleMatch := range needleMatches { // adjust the window to the pattern start and end @@ -258,35 +296,37 @@ func FindRegex(data []byte, regexInfo *RegexAndNeedle) []int { data_end = data_len } + // don't repeat previously scanned chunks + if getOrSetRegion(cacheMap, data_start, data_end) { + continue + } // do the full regex scan on a very small chunk for _, reMatch := range regexInfo.re.FindAllIndex(data[data_start:data_end], -1) { // the match offset is the start index of the chunk + reMatch index start := reMatch[0] + data_start + end := reMatch[1] + data_start + getOrSetRegion(matchMap, start, end) - //end := reMatch[1] + data_start - matches = append(matches, start) - - // special case to handle sub-matches, which are skipped by regex but matched by YARA: - // AA AA BB CC - // { AA [0-1] BB CC } - // must produce: - // AA AA BB CC - // AA BB CC + // handle sub-matches, which are skipped by regex but matched by YARA subStart := start + 1 for { + // don't repeat previously scanned chunks + if getOrSetRegion(cacheMap, subStart, data_end) { + break + } subMatches := regexInfo.re.FindAllIndex(data[subStart:data_end], -1) if len(subMatches) == 0 { break } - for _, match := range subMatches { - matches = append(matches, match[0]+subStart) + getOrSetRegion(matchMap, match[0]+subStart, match[1]+subStart) } subStart += subMatches[0][0] + 1 } } } - return matches + + return regionMapToSlices(matchMap) } type RegexAndNeedle struct { diff --git a/objfile/patterns_test.go b/objfile/patterns_test.go index a13d425..7e043c6 100644 --- a/objfile/patterns_test.go +++ b/objfile/patterns_test.go @@ -2,6 +2,7 @@ package objfile import ( "bytes" + "reflect" "testing" "rsc.io/binaryregexp" @@ -265,4 +266,29 @@ func TestRegexpPatternFromYaraPattern(t *testing.T) { t.Errorf("incorrect needle") } }) + + t.Run("Repeat", func(t *testing.T) { + reg, err := RegexpPatternFromYaraPattern("{ AA [0-512] BB }") + + if err != nil { + t.Errorf("pattern errored") + } + + if reg.len != 514 { + t.Errorf("incorrect pattern length") + } + + if reg.needleOffset != 0 { + t.Errorf("incorrect needle offset") + } + + if !bytes.Equal(reg.needle, []byte{0xAA}) { + t.Errorf("incorrect needle") + } + + results := FindRegex([]byte{0xAA, 0xAA, 0xAA, 0xBB, 0xAA, 0xAA, 0xBB, 0xAA, 0xBB, 0xCC}, reg) + if !reflect.DeepEqual(results, [][]int{{0, 4}, {1, 4}, {2, 4}, {4, 7}, {5, 7}, {7, 9}}) { + t.Errorf("incorrect match indexes") + } + }) } diff --git a/objfile/scanner.go b/objfile/scanner.go index a4b33da..359c20f 100644 --- a/objfile/scanner.go +++ b/objfile/scanner.go @@ -94,7 +94,7 @@ func findModuleInitPCHeader(data []byte, sectionBase uint64) []SignatureMatch { } for _, match := range FindRegex(data, x64reg) { - sigPtr := uint64(match) // from int + sigPtr := uint64(match[0]) // from int // this is the pointer offset stored in the instruction // 0x44E06A: 48 8D 0D 4F F0 24 00 lea rcx, off_69D0C0 (result: 0x24f04f) @@ -119,7 +119,7 @@ func findModuleInitPCHeader(data []byte, sectionBase uint64) []SignatureMatch { } for _, match := range FindRegex(data, x86reg) { - sigPtr := uint64(match) // from int + sigPtr := uint64(match[0]) // from int moduleDataPtr := uint64(binary.LittleEndian.Uint32(data[sigPtr+x86sig.moduleDataPtrLoc:][:4])) matches = append(matches, SignatureMatch{ @@ -138,7 +138,7 @@ func findModuleInitPCHeader(data []byte, sectionBase uint64) []SignatureMatch { } for _, match := range FindRegex(data, arm64reg) { - sigPtr := uint64(match) // from int + sigPtr := uint64(match[0]) // from int adrp := binary.LittleEndian.Uint32(data[sigPtr+ARM64_sig.moduleDataPtrADRP:][:4]) add := binary.LittleEndian.Uint32(data[sigPtr+ARM64_sig.moduleDataPtrADD:][:4]) @@ -169,7 +169,7 @@ func findModuleInitPCHeader(data []byte, sectionBase uint64) []SignatureMatch { } for _, match := range FindRegex(data, arm32reg) { - sigPtr := uint64(match) // from int + sigPtr := uint64(match[0]) // from int ldr := binary.LittleEndian.Uint32(data[sigPtr+ARM32_sig.moduleDataPtrLDR:][:4]) // ARM PC relative is always +8 due to legacy nonsense ldr_pointer_stub := uint64((ldr & 0x00000FFF) + 8) @@ -190,7 +190,7 @@ func findModuleInitPCHeader(data []byte, sectionBase uint64) []SignatureMatch { } for _, match := range FindRegex(data, ppcBEreg) { - sigPtr := uint64(match) // from int + sigPtr := uint64(match[0]) // from int moduleDataPtrHi := int64(binary.BigEndian.Uint16(data[sigPtr+PPC_BE_sig.moduleDataPtrHi:][:2])) // addi takes a signed immediate moduleDataPtrLo := int64(int16(binary.BigEndian.Uint16(data[sigPtr+PPC_BE_sig.moduleDataPtrLo:][:2])))