From a203f56bdb800e50faf6d2d7b2bf11e0f99b3146 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 28 Mar 2022 12:14:07 -0600 Subject: [PATCH 01/43] rules: add new scope "instruction" --- capa/rules.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/capa/rules.py b/capa/rules.py index be6906666..e2c449d84 100644 --- a/capa/rules.py +++ b/capa/rules.py @@ -74,11 +74,13 @@ class Scope(str, Enum): FILE = "file" FUNCTION = "function" BASIC_BLOCK = "basic block" + INSTRUCTION = "instruction" FILE_SCOPE = Scope.FILE.value FUNCTION_SCOPE = Scope.FUNCTION.value BASIC_BLOCK_SCOPE = Scope.BASIC_BLOCK.value +INSTRUCTION_SCOPE = Scope.INSTRUCTION.value SUPPORTED_FEATURES = { From c8fedb0f7038428998bfccd16e143d129e1722dc Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 28 Mar 2022 12:39:58 -0600 Subject: [PATCH 02/43] gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index cc331b545..7793d351e 100644 --- a/.gitignore +++ b/.gitignore @@ -118,3 +118,5 @@ rule-linter-output.log scripts/perf/*.txt scripts/perf/*.svg scripts/perf/*.zip +.direnv +.envrc From 9da9c3aceb0511844cda6c8ae5a6ed63d0de7d7d Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 28 Mar 2022 12:40:10 -0600 Subject: [PATCH 03/43] rules: add valid features for insn scope --- capa/rules.py | 5 ++++ tests/test_rules_insn_scope.py | 45 ++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 tests/test_rules_insn_scope.py diff --git a/capa/rules.py b/capa/rules.py index e2c449d84..18bfa0612 100644 --- a/capa/rules.py +++ b/capa/rules.py @@ -126,6 +126,11 @@ class Scope(str, Enum): capa.features.common.OS, capa.features.common.Arch, }, + INSTRUCTION_SCOPE: { + capa.features.common.Arch, + capa.features.common.OS, + capa.features.insn.Mnemonic, + }, } # all basic block scope features are also function scope features diff --git a/tests/test_rules_insn_scope.py b/tests/test_rules_insn_scope.py new file mode 100644 index 000000000..3e2a3baf6 --- /dev/null +++ b/tests/test_rules_insn_scope.py @@ -0,0 +1,45 @@ +# Copyright (C) 2022 FireEye, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. + +import textwrap + +import pytest + +import capa.rules + + +def test_rule_scope_instruction(): + capa.rules.Rule.from_yaml( + textwrap.dedent( + """ + rule: + meta: + name: test rule + scope: instruction + features: + - and: + - mnemonic: mov + - arch: i386 + - os: windows + """ + ) + ) + + with pytest.raises(capa.rules.InvalidRule): + capa.rules.Rule.from_yaml( + textwrap.dedent( + """ + rule: + meta: + name: test rule + scope: instruction + features: + - characteristic: embedded pe + """ + ) + ) From 890870bf452a70fe83eb227c84ed933599a4a2a0 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 28 Mar 2022 12:54:54 -0600 Subject: [PATCH 04/43] rules: let subscope blocks have descriptions --- capa/engine.py | 4 ++-- capa/rules.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/capa/engine.py b/capa/engine.py index 19207afc9..b28a15a2c 100644 --- a/capa/engine.py +++ b/capa/engine.py @@ -235,8 +235,8 @@ class Subscope(Statement): the engine should preprocess rules to extract subscope statements into their own rules. """ - def __init__(self, scope, child): - super(Subscope, self).__init__() + def __init__(self, scope, child, description=None): + super(Subscope, self).__init__(description=description) self.scope = scope self.child = child diff --git a/capa/rules.py b/capa/rules.py index 18bfa0612..b57e7b51f 100644 --- a/capa/rules.py +++ b/capa/rules.py @@ -426,7 +426,7 @@ def build_statements(d, scope: str): if len(d[key]) != 1: raise InvalidRule("subscope must have exactly one child statement") - return ceng.Subscope(FUNCTION_SCOPE, build_statements(d[key][0], FUNCTION_SCOPE)) + return ceng.Subscope(FUNCTION_SCOPE, build_statements(d[key][0], FUNCTION_SCOPE), description=description) elif key == "basic block": if scope != FUNCTION_SCOPE: @@ -435,7 +435,7 @@ def build_statements(d, scope: str): if len(d[key]) != 1: raise InvalidRule("subscope must have exactly one child statement") - return ceng.Subscope(BASIC_BLOCK_SCOPE, build_statements(d[key][0], BASIC_BLOCK_SCOPE)) + return ceng.Subscope(BASIC_BLOCK_SCOPE, build_statements(d[key][0], BASIC_BLOCK_SCOPE), description=description) elif key.startswith("count(") and key.endswith(")"): # e.g.: From 2baf05acdb995ed22900dff8fefd240985d84ca2 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 28 Mar 2022 12:55:09 -0600 Subject: [PATCH 05/43] rules: parse instruction subscope with implied AND --- capa/rules.py | 23 ++++++++++ tests/test_rules_insn_scope.py | 78 ++++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+) diff --git a/capa/rules.py b/capa/rules.py index b57e7b51f..34520018b 100644 --- a/capa/rules.py +++ b/capa/rules.py @@ -437,6 +437,29 @@ def build_statements(d, scope: str): return ceng.Subscope(BASIC_BLOCK_SCOPE, build_statements(d[key][0], BASIC_BLOCK_SCOPE), description=description) + elif key == "instruction": + if scope not in (FUNCTION_SCOPE, BASIC_BLOCK_SCOPE): + raise InvalidRule("instruction subscope supported only for function and basic block scope") + + if len(d[key]) == 1: + statements = build_statements(d[key][0], INSTRUCTION_SCOPE) + else: + # for instruction subscopes, we support a shorthand in which the top level AND is implied. + # the following are equivalent: + # + # - instruction: + # - and: + # - arch: i386 + # - mnemonic: cmp + # + # - instruction: + # - arch: i386 + # - mnemonic: cmp + # + statements = ceng.And([build_statements(dd, INSTRUCTION_SCOPE) for dd in d[key]]) + + return ceng.Subscope(INSTRUCTION_SCOPE, statements, description=description) + elif key.startswith("count(") and key.endswith(")"): # e.g.: # diff --git a/tests/test_rules_insn_scope.py b/tests/test_rules_insn_scope.py index 3e2a3baf6..25be1bf3a 100644 --- a/tests/test_rules_insn_scope.py +++ b/tests/test_rules_insn_scope.py @@ -43,3 +43,81 @@ def test_rule_scope_instruction(): """ ) ) + + +def test_rule_subscope_instruction(): + capa.rules.Rule.from_yaml( + textwrap.dedent( + """ + rule: + meta: + name: test rule + scope: function + features: + - and: + - instruction: + - and: + - mnemonic: mov + - arch: i386 + - os: windows + """ + ) + ) + + +def test_scope_instruction_implied_and(): + capa.rules.Rule.from_yaml( + textwrap.dedent( + """ + rule: + meta: + name: test rule + scope: function + features: + - and: + - instruction: + - mnemonic: mov + - arch: i386 + - os: windows + """ + ) + ) + + +def test_scope_instruction_description(): + capa.rules.Rule.from_yaml( + textwrap.dedent( + """ + rule: + meta: + name: test rule + scope: function + features: + - and: + - instruction: + - description: foo + - mnemonic: mov + - arch: i386 + - os: windows + """ + ) + ) + + capa.rules.Rule.from_yaml( + textwrap.dedent( + """ + rule: + meta: + name: test rule + scope: function + features: + - and: + - instruction: + - description: foo + - mnemonic: mov + - arch: i386 + - os: windows + """ + ) + ) + From b0619f4f0174edc0f0858befcd873b7531abf876 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 28 Mar 2022 13:03:58 -0600 Subject: [PATCH 06/43] rules: index instruction rules in ruleset --- capa/rules.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/capa/rules.py b/capa/rules.py index 34520018b..6fb90bc44 100644 --- a/capa/rules.py +++ b/capa/rules.py @@ -1008,6 +1008,7 @@ def __init__(self, rules: List[Rule]): self.file_rules = self._get_rules_for_scope(rules, FILE_SCOPE) self.function_rules = self._get_rules_for_scope(rules, FUNCTION_SCOPE) self.basic_block_rules = self._get_rules_for_scope(rules, BASIC_BLOCK_SCOPE) + self.instruction_rules = self._get_rules_for_scope(rules, INSTRUCTION_SCOPE) self.rules = {rule.name: rule for rule in rules} self.rules_by_namespace = index_rules_by_namespace(rules) @@ -1019,6 +1020,9 @@ def __init__(self, rules: List[Rule]): (self._easy_basic_block_rules_by_feature, self._hard_basic_block_rules) = self._index_rules_by_feature( self.basic_block_rules ) + (self._easy_instruction_rules_by_feature, self._hard_instruction_rules) = self._index_rules_by_feature( + self.instruction_rules + ) def __len__(self): return len(self.rules) @@ -1244,6 +1248,9 @@ def match(self, scope: Scope, features: FeatureSet, va: int) -> Tuple[FeatureSet elif scope is Scope.BASIC_BLOCK: easy_rules_by_feature = self._easy_basic_block_rules_by_feature hard_rule_names = self._hard_basic_block_rules + elif scope is Scope.INSTRUCTION: + easy_rules_by_feature = self._easy_instruction_rules_by_feature + hard_rule_names = self._hard_instruction_rules else: assert_never(scope) From 46cc681eba2a3772c69f53c88af86a71884f5b1a Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 28 Mar 2022 13:04:13 -0600 Subject: [PATCH 07/43] tests: demonstrate instruct subscope rule extraction --- tests/test_rules_insn_scope.py | 43 +++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/tests/test_rules_insn_scope.py b/tests/test_rules_insn_scope.py index 25be1bf3a..c4bbc768a 100644 --- a/tests/test_rules_insn_scope.py +++ b/tests/test_rules_insn_scope.py @@ -46,23 +46,34 @@ def test_rule_scope_instruction(): def test_rule_subscope_instruction(): - capa.rules.Rule.from_yaml( - textwrap.dedent( - """ - rule: - meta: - name: test rule - scope: function - features: - - and: - - instruction: - - and: - - mnemonic: mov - - arch: i386 - - os: windows - """ - ) + rules = capa.rules.RuleSet( + [ + capa.rules.Rule.from_yaml( + textwrap.dedent( + """ + rule: + meta: + name: test rule + scope: function + features: + - and: + - instruction: + - and: + - mnemonic: mov + - arch: i386 + - os: windows + """ + ) + ) + ] ) + # the function rule scope will have one rules: + # - `test rule` + assert len(rules.function_rules) == 1 + + # the insn rule scope have one rule: + # - the rule on which `test rule` depends + assert len(rules.instruction_rules) == 1 def test_scope_instruction_implied_and(): From dde52f2bc89dc076459f9ee89fe4c752e609c490 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 28 Mar 2022 13:04:44 -0600 Subject: [PATCH 08/43] pep8 --- capa/rules.py | 2 +- tests/test_rules_insn_scope.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/capa/rules.py b/capa/rules.py index 6fb90bc44..97ff3689d 100644 --- a/capa/rules.py +++ b/capa/rules.py @@ -446,7 +446,7 @@ def build_statements(d, scope: str): else: # for instruction subscopes, we support a shorthand in which the top level AND is implied. # the following are equivalent: - # + # # - instruction: # - and: # - arch: i386 diff --git a/tests/test_rules_insn_scope.py b/tests/test_rules_insn_scope.py index c4bbc768a..27d07489f 100644 --- a/tests/test_rules_insn_scope.py +++ b/tests/test_rules_insn_scope.py @@ -131,4 +131,3 @@ def test_scope_instruction_description(): """ ) ) - From 031ea167e8b12a44a2082cd50ef4e97dd7b12b78 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 28 Mar 2022 13:17:18 -0600 Subject: [PATCH 09/43] add pycodestyle config --- .github/workflows/tests.yml | 2 ++ setup.cfg | 21 +++++++++++++++++++++ 2 files changed, 23 insertions(+) create mode 100644 setup.cfg diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index cb8b8c364..cf12c6055 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -37,6 +37,8 @@ jobs: run: isort --profile black --length-sort --line-width 120 -c . - name: Lint with black run: black -l 120 --check . + - name: Lint with pycodestyle + run: pycodestyle --show-source capa/ scripts/ tests/ - name: Check types with mypy run: mypy --config-file .github/mypy/mypy.ini capa/ scripts/ tests/ diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 000000000..5e0292f42 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,21 @@ +[bdist_wheel] +universal = 1 + +[aliases] +test = pytest + +[pycodestyle] +# the following suppress lints that conflict with the project's style: +# +# E203 Whitespace before : +# E302 expected 2 blank lines, found 1 +# E402 module level import not at top of file +# E501 line too long (209 > 180 characters) +# E712 comparison to False should be 'if cond is False:' or 'if not cond:' +# E722 do not use bare 'except' +# E731 do not assign a lambda expression, use a def +# W291 trailing whitespace +# W503 line break before binary operator +ignore = E203, E302, E402, E501, E712, E722, E731, W291, W503 +max-line-length = 180 +statistics = True From 963cfbf38051e579536bd5304da9e94df5aec1f1 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 28 Mar 2022 13:17:35 -0600 Subject: [PATCH 10/43] pep8 --- capa/features/extractors/smda/insn.py | 2 +- capa/rules.py | 2 +- scripts/capa2yara.py | 12 ++++++------ scripts/lint.py | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/capa/features/extractors/smda/insn.py b/capa/features/extractors/smda/insn.py index c15de2a48..e4b921325 100644 --- a/capa/features/extractors/smda/insn.py +++ b/capa/features/extractors/smda/insn.py @@ -218,7 +218,7 @@ def extract_insn_offset_features(f, bb, insn): # mov eax, [esi + ecx + 16384] operands = [o.strip() for o in insn.operands.split(",")] for operand in operands: - if not "ptr" in operand: + if "ptr" not in operand: continue if "esp" in operand or "ebp" in operand or "rbp" in operand: continue diff --git a/capa/rules.py b/capa/rules.py index 97ff3689d..b9f58f269 100644 --- a/capa/rules.py +++ b/capa/rules.py @@ -83,7 +83,7 @@ class Scope(str, Enum): INSTRUCTION_SCOPE = Scope.INSTRUCTION.value -SUPPORTED_FEATURES = { +SUPPORTED_FEATURES: Dict[str, Set] = { FILE_SCOPE: { capa.features.common.MatchedRule, capa.features.file.Export, diff --git a/scripts/capa2yara.py b/scripts/capa2yara.py index a3ca323dc..bdc4521ae 100644 --- a/scripts/capa2yara.py +++ b/scripts/capa2yara.py @@ -311,7 +311,7 @@ def do_statement(s_type, kid): return yara_strings, yara_condition - ############################## end def do_statement + # end: def do_statement yara_strings_list = [] yara_condition_list = [] @@ -390,7 +390,9 @@ def do_statement(s_type, kid): logger.info("kid coming: " + repr(kid.name)) # logger.info("grandchildren: " + repr(kid.children)) - ##### here we go into RECURSION ################################################################################## + # + # here we go into RECURSION + # yara_strings_sub, yara_condition_sub, rule_comment_sub, incomplete_sub = convert_rule( kid, rulename, cround, depth ) @@ -496,9 +498,7 @@ def do_statement(s_type, kid): yara_condition = "\n\t" + yara_condition_list[0] - logger.info( - f"################# end of convert_rule() #strings: {len(yara_strings_list)} #conditions: {len(yara_condition_list)}" - ) + logger.info(f"# end of convert_rule() #strings: {len(yara_strings_list)} #conditions: {len(yara_condition_list)}") logger.info(f"strings: {yara_strings} conditions: {yara_condition}") return yara_strings, yara_condition, rule_comment, incomplete @@ -617,7 +617,7 @@ def convert_rules(rules, namespaces, cround): # examples in capa can contain the same hash several times with different offset, so check if it's already there: # (keeping the offset might be interessting for some but breaks yara-ci for checking of the final rules - if not value in seen_hashes: + if value not in seen_hashes: yara_meta += "\t" + meta_name + ' = "' + value + '"\n' seen_hashes.append(value) diff --git a/scripts/lint.py b/scripts/lint.py index 3cd2686e7..7c13793b5 100644 --- a/scripts/lint.py +++ b/scripts/lint.py @@ -247,7 +247,7 @@ def __init__(self): self.enabled_frameworks = [] # This regex matches the format defined in the recommendation attribute - self.reg = re.compile("^([\w\s-]+)::(.+) \[([A-Za-z0-9.]+)\]$") + self.reg = re.compile("^([\\w\\s-]+)::(.+) \\[([A-Za-z0-9.]+)\\]$") def _entry_check(self, framework, category, entry, eid): if category not in self.data[framework].keys(): From 00d439f681c998cb00724da7705898b0397ce72e Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 28 Mar 2022 13:29:06 -0600 Subject: [PATCH 11/43] main: rename find_code_capabilities --- capa/main.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/capa/main.py b/capa/main.py index 8bef30db7..74c96b811 100644 --- a/capa/main.py +++ b/capa/main.py @@ -85,16 +85,18 @@ def set_vivisect_log_level(level): logging.getLogger("envi.codeflow").setLevel(level) -def find_function_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, f: FunctionHandle): +def find_code_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, f: FunctionHandle) -> Tuple[MatchResults, MatchResults, int]: + """ + find matches for the given rules within the given function. + + returns: tuple containing (match results for function, match results for basic blocks, number of features) + """ # contains features from: # - insns # - function function_features = collections.defaultdict(set) # type: FeatureSet bb_matches = collections.defaultdict(list) # type: MatchResults - for feature, va in itertools.chain(extractor.extract_function_features(f), extractor.extract_global_features()): - function_features[feature].add(va) - for bb in extractor.get_basic_blocks(f): # contains features from: # - insns @@ -122,6 +124,9 @@ def find_function_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, f: for va, _ in res: capa.engine.index_rule_matches(function_features, rule, [va]) + for feature, va in itertools.chain(extractor.extract_function_features(f), extractor.extract_global_features()): + function_features[feature].add(va) + _, function_matches = ruleset.match(Scope.FUNCTION, function_features, int(f)) return function_matches, bb_matches, len(function_features) @@ -182,7 +187,7 @@ def find_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, disable_pro pb.set_postfix_str("skipped %d library functions (%d%%)" % (n_libs, percentage)) continue - function_matches, bb_matches, feature_count = find_function_capabilities(ruleset, extractor, f) + function_matches, bb_matches, feature_count = find_code_capabilities(ruleset, extractor, f) meta["feature_counts"]["functions"][function_address] = feature_count logger.debug("analyzed function 0x%x and extracted %d features", function_address, feature_count) From b76930d2a39d34cb6084682a0228b05c1b857ba7 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 28 Mar 2022 13:47:53 -0600 Subject: [PATCH 12/43] main: split out basic block feature, match extraction --- capa/main.py | 84 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 57 insertions(+), 27 deletions(-) diff --git a/capa/main.py b/capa/main.py index 74c96b811..841c9b2d7 100644 --- a/capa/main.py +++ b/capa/main.py @@ -45,7 +45,7 @@ from capa.rules import Rule, Scope, RuleSet from capa.engine import FeatureSet, MatchResults from capa.helpers import get_file_taste -from capa.features.extractors.base_extractor import FunctionHandle, FeatureExtractor +from capa.features.extractors.base_extractor import BBHandle, FunctionHandle, FeatureExtractor RULES_PATH_DEFAULT_STRING = "(embedded rules)" SIGNATURES_PATH_DEFAULT_STRING = "(embedded signatures)" @@ -85,44 +85,74 @@ def set_vivisect_log_level(level): logging.getLogger("envi.codeflow").setLevel(level) -def find_code_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, f: FunctionHandle) -> Tuple[MatchResults, MatchResults, int]: +def find_basic_block_capabilities( + ruleset: RuleSet, extractor: FeatureExtractor, f: FunctionHandle, bb: BBHandle +) -> Tuple[FeatureSet, MatchResults]: """ - find matches for the given rules within the given function. + find matches for the given rules within the given basic block. - returns: tuple containing (match results for function, match results for basic blocks, number of features) + returns: tuple containing (features for basic block, match results for basic block) """ # contains features from: # - insns - # - function - function_features = collections.defaultdict(set) # type: FeatureSet - bb_matches = collections.defaultdict(list) # type: MatchResults - - for bb in extractor.get_basic_blocks(f): - # contains features from: - # - insns - # - basic blocks - bb_features = collections.defaultdict(set) - + # - basic blocks + + # all features found within this basic block, + # includes features found within instructions. + features = collections.defaultdict(set) # type: FeatureSet + + for feature, va in itertools.chain( + extractor.extract_basic_block_features(f, bb), extractor.extract_global_features() + ): + features[feature].add(va) + + for insn in extractor.get_instructions(f, bb): + # these are instruction features, which are associated with basic blocks, + # not instruction scope features. + # + # yes: characteristic: fs access + # no: instruction: ... + # + # instruction scope features are handled specially to avoid evaluating too many scopes. for feature, va in itertools.chain( - extractor.extract_basic_block_features(f, bb), extractor.extract_global_features() + extractor.extract_insn_features(f, bb, insn), extractor.extract_global_features() ): - bb_features[feature].add(va) - function_features[feature].add(va) + features[feature].add(va) + + # matches found at within this basic block. + _, matches = ruleset.match(Scope.BASIC_BLOCK, features, int(bb)) + + for rule_name, res in matches.items(): + rule = ruleset[rule_name] + for va, _ in res: + capa.engine.index_rule_matches(features, rule, [va]) + + return features, matches - for insn in extractor.get_instructions(f, bb): - for feature, va in itertools.chain( - extractor.extract_insn_features(f, bb, insn), extractor.extract_global_features() - ): - bb_features[feature].add(va) - function_features[feature].add(va) - _, matches = ruleset.match(Scope.BASIC_BLOCK, bb_features, int(bb)) +def find_code_capabilities( + ruleset: RuleSet, extractor: FeatureExtractor, f: FunctionHandle +) -> Tuple[MatchResults, MatchResults, int]: + """ + find matches for the given rules within the given function. + + returns: tuple containing (match results for function, match results for basic blocks, number of features) + """ + # all features found within this function, + # includes features found within basic blocks (and instructions). + function_features = collections.defaultdict(set) # type: FeatureSet + + # matches found at the basic block scope. + # might be found at different basic blocks, thats ok. + bb_matches = collections.defaultdict(list) # type: MatchResults + + for bb in extractor.get_basic_blocks(f): + features, matches = find_basic_block_capabilities(ruleset, extractor, f, bb) + for feature, vas in features.items(): + function_features[feature].update(vas) for rule_name, res in matches.items(): bb_matches[rule_name].extend(res) - rule = ruleset[rule_name] - for va, _ in res: - capa.engine.index_rule_matches(function_features, rule, [va]) for feature, va in itertools.chain(extractor.extract_function_features(f), extractor.extract_global_features()): function_features[feature].add(va) From bd6e62e9bf5d70bde71582222b33cb035318c8c4 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 29 Mar 2022 11:26:21 -0600 Subject: [PATCH 13/43] Update scripts/lint.py Co-authored-by: Moritz --- scripts/lint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/lint.py b/scripts/lint.py index 7c13793b5..f3f161640 100644 --- a/scripts/lint.py +++ b/scripts/lint.py @@ -247,7 +247,7 @@ def __init__(self): self.enabled_frameworks = [] # This regex matches the format defined in the recommendation attribute - self.reg = re.compile("^([\\w\\s-]+)::(.+) \\[([A-Za-z0-9.]+)\\]$") + self.reg = re.compile(r"^([\w\s-]+)::(.+) \[([A-Za-z0-9.]+)\]$") def _entry_check(self, framework, category, entry, eid): if category not in self.data[framework].keys(): From c1b28f58d0e535da4ea6126a0bed7ea011e814cc Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 29 Mar 2022 12:25:27 -0600 Subject: [PATCH 14/43] rules: don't use global features to downselect rules closes #931 --- capa/rules.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/capa/rules.py b/capa/rules.py index b9f58f269..3e51347f4 100644 --- a/capa/rules.py +++ b/capa/rules.py @@ -1086,9 +1086,21 @@ def rec(rule_name: str, node: Union[Feature, Statement]): # hard feature: requires scan or match lookup rules_with_hard_features.add(rule_name) elif isinstance(node, capa.features.common.Feature): - # easy feature: hash lookup - rules_with_easy_features.add(rule_name) - rules_by_feature[node].add(rule_name) + if capa.features.common.is_global_feature(node): + # we don't want to index global features + # because they're not very selective. + # + # they're global, so if they match at one location in a file, + # they'll match at every location in a file. + # so thats not helpful to decide how to downselect. + # + # and, a global rule will never be the sole selector in a rule. + # TODO: probably want a lint for this. + pass + else: + # easy feature: hash lookup + rules_with_easy_features.add(rule_name) + rules_by_feature[node].add(rule_name) elif isinstance(node, (ceng.Not)): # `not:` statements are tricky to deal with. # From 1a28c324f1358b6ccbdf036c5fbbfebc13e967a7 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 29 Mar 2022 12:26:39 -0600 Subject: [PATCH 15/43] rules: doc --- capa/rules.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/capa/rules.py b/capa/rules.py index 3e51347f4..b60b26fa9 100644 --- a/capa/rules.py +++ b/capa/rules.py @@ -1048,6 +1048,9 @@ def _index_rules_by_feature(rules) -> Tuple[Dict[Feature, Set[str]], List[str]]: at this time, a rule evaluator can't do anything special with the "hard rules". it must still do a full top-down match of each rule, in topological order. + + this does not index global features, because these are not selective, and + won't be used as the sole feature used to match. """ # we'll do a couple phases: From 1839746bf8dbb3c6c231aa81e50002ecf2400f2e Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 29 Mar 2022 12:29:54 -0600 Subject: [PATCH 16/43] main: factor out matching at instruction scope --- capa/main.py | 95 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 65 insertions(+), 30 deletions(-) diff --git a/capa/main.py b/capa/main.py index 841c9b2d7..007cb2411 100644 --- a/capa/main.py +++ b/capa/main.py @@ -45,7 +45,7 @@ from capa.rules import Rule, Scope, RuleSet from capa.engine import FeatureSet, MatchResults from capa.helpers import get_file_taste -from capa.features.extractors.base_extractor import BBHandle, FunctionHandle, FeatureExtractor +from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, FeatureExtractor RULES_PATH_DEFAULT_STRING = "(embedded rules)" SIGNATURES_PATH_DEFAULT_STRING = "(embedded signatures)" @@ -85,41 +85,63 @@ def set_vivisect_log_level(level): logging.getLogger("envi.codeflow").setLevel(level) +def find_instruction_capabilities( + ruleset: RuleSet, extractor: FeatureExtractor, f: FunctionHandle, bb: BBHandle, insn: InsnHandle +) -> Tuple[FeatureSet, MatchResults]: + """ + find matches for the given rules for the given instruction. + + returns: tuple containing (features for instruction, match results for instruction) + """ + # all features found for the instruction. + features = collections.defaultdict(set) # type: FeatureSet + + for feature, va in itertools.chain( + extractor.extract_insn_features(f, bb, insn), extractor.extract_global_features() + ): + features[feature].add(va) + + # matches found at this instruction. + _, matches = ruleset.match(Scope.INSTRUCTION, features, int(insn)) + + for rule_name, res in matches.items(): + rule = ruleset[rule_name] + for va, _ in res: + capa.engine.index_rule_matches(features, rule, [va]) + + return features, matches + + def find_basic_block_capabilities( ruleset: RuleSet, extractor: FeatureExtractor, f: FunctionHandle, bb: BBHandle -) -> Tuple[FeatureSet, MatchResults]: +) -> Tuple[FeatureSet, MatchResults, MatchResults]: """ find matches for the given rules within the given basic block. - returns: tuple containing (features for basic block, match results for basic block) + returns: tuple containing (features for basic block, match results for basic block, match results for instructions) """ - # contains features from: - # - insns - # - basic blocks - # all features found within this basic block, # includes features found within instructions. features = collections.defaultdict(set) # type: FeatureSet + # matches found at the instruction scope. + # might be found at different instructions, thats ok. + insn_matches = collections.defaultdict(list) # type: MatchResults + + for insn in extractor.get_instructions(f, bb): + ifeatures, imatches = find_instruction_capabilities(ruleset, extractor, f, bb, insn) + for feature, vas in ifeatures.items(): + features[feature].update(vas) + + for rule_name, res in imatches.items(): + insn_matches[rule_name].extend(res) + for feature, va in itertools.chain( extractor.extract_basic_block_features(f, bb), extractor.extract_global_features() ): features[feature].add(va) - for insn in extractor.get_instructions(f, bb): - # these are instruction features, which are associated with basic blocks, - # not instruction scope features. - # - # yes: characteristic: fs access - # no: instruction: ... - # - # instruction scope features are handled specially to avoid evaluating too many scopes. - for feature, va in itertools.chain( - extractor.extract_insn_features(f, bb, insn), extractor.extract_global_features() - ): - features[feature].add(va) - - # matches found at within this basic block. + # matches found within this basic block. _, matches = ruleset.match(Scope.BASIC_BLOCK, features, int(bb)) for rule_name, res in matches.items(): @@ -127,16 +149,16 @@ def find_basic_block_capabilities( for va, _ in res: capa.engine.index_rule_matches(features, rule, [va]) - return features, matches + return features, matches, insn_matches def find_code_capabilities( ruleset: RuleSet, extractor: FeatureExtractor, f: FunctionHandle -) -> Tuple[MatchResults, MatchResults, int]: +) -> Tuple[MatchResults, MatchResults, MatchResults, int]: """ find matches for the given rules within the given function. - returns: tuple containing (match results for function, match results for basic blocks, number of features) + returns: tuple containing (match results for function, match results for basic blocks, match results for instructions, number of features) """ # all features found within this function, # includes features found within basic blocks (and instructions). @@ -146,19 +168,26 @@ def find_code_capabilities( # might be found at different basic blocks, thats ok. bb_matches = collections.defaultdict(list) # type: MatchResults + # matches found at the instruction scope. + # might be found at different instructions, thats ok. + insn_matches = collections.defaultdict(list) # type: MatchResults + for bb in extractor.get_basic_blocks(f): - features, matches = find_basic_block_capabilities(ruleset, extractor, f, bb) + features, bmatches, imatches = find_basic_block_capabilities(ruleset, extractor, f, bb) for feature, vas in features.items(): function_features[feature].update(vas) - for rule_name, res in matches.items(): + for rule_name, res in bmatches.items(): bb_matches[rule_name].extend(res) + for rule_name, res in imatches.items(): + insn_matches[rule_name].extend(res) + for feature, va in itertools.chain(extractor.extract_function_features(f), extractor.extract_global_features()): function_features[feature].add(va) _, function_matches = ruleset.match(Scope.FUNCTION, function_features, int(f)) - return function_matches, bb_matches, len(function_features) + return function_matches, bb_matches, insn_matches, len(function_features) def find_file_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, function_features: FeatureSet): @@ -185,6 +214,7 @@ def find_file_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, functi def find_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, disable_progress=None) -> Tuple[MatchResults, Any]: all_function_matches = collections.defaultdict(list) # type: MatchResults all_bb_matches = collections.defaultdict(list) # type: MatchResults + all_insn_matches = collections.defaultdict(list) # type: MatchResults meta = { "feature_counts": { @@ -217,7 +247,7 @@ def find_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, disable_pro pb.set_postfix_str("skipped %d library functions (%d%%)" % (n_libs, percentage)) continue - function_matches, bb_matches, feature_count = find_code_capabilities(ruleset, extractor, f) + function_matches, bb_matches, insn_matches, feature_count = find_code_capabilities(ruleset, extractor, f) meta["feature_counts"]["functions"][function_address] = feature_count logger.debug("analyzed function 0x%x and extracted %d features", function_address, feature_count) @@ -225,11 +255,15 @@ def find_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, disable_pro all_function_matches[rule_name].extend(res) for rule_name, res in bb_matches.items(): all_bb_matches[rule_name].extend(res) + for rule_name, res in insn_matches.items(): + all_insn_matches[rule_name].extend(res) - # collection of features that captures the rule matches within function and BB scopes. + # collection of features that captures the rule matches within function, BB, and instruction scopes. # mapping from feature (matched rule) to set of addresses at which it matched. function_and_lower_features: FeatureSet = collections.defaultdict(set) - for rule_name, results in itertools.chain(all_function_matches.items(), all_bb_matches.items()): + for rule_name, results in itertools.chain( + all_function_matches.items(), all_bb_matches.items(), all_insn_matches.items() + ): locations = set(map(lambda p: p[0], results)) rule = ruleset[rule_name] capa.engine.index_rule_matches(function_and_lower_features, rule, locations) @@ -243,6 +277,7 @@ def find_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, disable_pro # each rule exists in exactly one scope, # so there won't be any overlap among these following MatchResults, # and we can merge the dictionaries naively. + all_insn_matches.items(), all_bb_matches.items(), all_function_matches.items(), all_file_matches.items(), From 0effb5f8b033423924938b8495959ff3b0f9aca1 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 29 Mar 2022 12:33:55 -0600 Subject: [PATCH 17/43] changelog --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9833732b8..8e44e681f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,8 +4,13 @@ ### New Features + - add new scope "instruction" for matching mnemonics and operands #767 @williballenthin + - add new feature "operand[...]" for matching instruction operands #767 @williballenthin + ### Breaking Changes + - instruction scope and operand feature are new and are not backwards compatible with older versions of capa + ### New Rules (4) - data-manipulation/encryption/aes/manually-build-aes-constants huynh.t.nhan@gmail.com From e0fca277f2689f5a3c8dac747b45cff754b67c9c Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 29 Mar 2022 12:58:27 -0600 Subject: [PATCH 18/43] rules: update valid features per scope --- capa/rules.py | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/capa/rules.py b/capa/rules.py index b60b26fa9..672563436 100644 --- a/capa/rules.py +++ b/capa/rules.py @@ -81,9 +81,17 @@ class Scope(str, Enum): FUNCTION_SCOPE = Scope.FUNCTION.value BASIC_BLOCK_SCOPE = Scope.BASIC_BLOCK.value INSTRUCTION_SCOPE = Scope.INSTRUCTION.value +# used only to specify supported features per scope. +# not used to validate rules. +GLOBAL_SCOPE = "global" SUPPORTED_FEATURES: Dict[str, Set] = { + GLOBAL_SCOPE: { + # these will be added to other scopes, see below. + capa.features.common.OS, + capa.features.common.Arch, + }, FILE_SCOPE: { capa.features.common.MatchedRule, capa.features.file.Export, @@ -93,20 +101,23 @@ class Scope(str, Enum): capa.features.common.Characteristic("embedded pe"), capa.features.common.String, capa.features.common.Format, - capa.features.common.OS, - capa.features.common.Arch, - }, + }, FUNCTION_SCOPE: { - # plus basic block scope features, see below + capa.features.common.MatchedRule, capa.features.basicblock.BasicBlock, capa.features.common.Characteristic("calls from"), capa.features.common.Characteristic("calls to"), capa.features.common.Characteristic("loop"), capa.features.common.Characteristic("recursive call"), - capa.features.common.OS, - capa.features.common.Arch, + # plus basic block scope features, see below }, BASIC_BLOCK_SCOPE: { + capa.features.common.MatchedRule, + capa.features.common.Characteristic("tight loop"), + capa.features.common.Characteristic("stack string"), + # plus instruction scope features, see below + }, + INSTRUCTION_SCOPE: { capa.features.common.MatchedRule, capa.features.insn.API, capa.features.insn.Number, @@ -118,21 +129,19 @@ class Scope(str, Enum): capa.features.common.Characteristic("peb access"), capa.features.common.Characteristic("fs access"), capa.features.common.Characteristic("gs access"), - capa.features.common.Characteristic("cross section flow"), - capa.features.common.Characteristic("tight loop"), - capa.features.common.Characteristic("stack string"), capa.features.common.Characteristic("indirect call"), capa.features.common.Characteristic("call $+5"), - capa.features.common.OS, - capa.features.common.Arch, - }, - INSTRUCTION_SCOPE: { - capa.features.common.Arch, - capa.features.common.OS, - capa.features.insn.Mnemonic, + capa.features.common.Characteristic("cross section flow"), }, } +# global scope features are available in all other scopes +SUPPORTED_FEATURES[INSTRUCTION_SCOPE].update(SUPPORTED_FEATURES[GLOBAL_SCOPE]) +SUPPORTED_FEATURES[BASIC_BLOCK_SCOPE].update(SUPPORTED_FEATURES[GLOBAL_SCOPE]) +SUPPORTED_FEATURES[FUNCTION_SCOPE].update(SUPPORTED_FEATURES[GLOBAL_SCOPE]) + +# all instruction scope features are also basic block features +SUPPORTED_FEATURES[BASIC_BLOCK_SCOPE].update(SUPPORTED_FEATURES[INSTRUCTION_SCOPE]) # all basic block scope features are also function scope features SUPPORTED_FEATURES[FUNCTION_SCOPE].update(SUPPORTED_FEATURES[BASIC_BLOCK_SCOPE]) From fb6b60bee32988602ccdcd5fc6836bc63a62fdfe Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 29 Mar 2022 12:58:38 -0600 Subject: [PATCH 19/43] tests: add tests demonstrating instruction (sub)scope matching --- tests/test_main.py | 56 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/tests/test_main.py b/tests/test_main.py index ad0af4870..69782fc3d 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -326,6 +326,62 @@ def test_count_bb(z9324d_extractor): assert "count bb" in capabilities +def test_instruction_scope(z9324d_extractor): + # .text:004071A4 68 E8 03 00 00 push 3E8h + rules = capa.rules.RuleSet( + [ + capa.rules.Rule.from_yaml( + textwrap.dedent( + """ + rule: + meta: + name: push 1000 + namespace: test + scope: instruction + features: + - and: + - mnemonic: push + - number: 1000 + """ + ) + ) + ] + ) + capabilities, meta = capa.main.find_capabilities(rules, z9324d_extractor) + assert "push 1000" in capabilities + assert 0x4071A4 in set(map(lambda result: result[0], capabilities["push 1000"])) + + +def test_instruction_subscope(z9324d_extractor): + # .text:00406F60 sub_406F60 proc near + # [...] + # .text:004071A4 68 E8 03 00 00 push 3E8h + rules = capa.rules.RuleSet( + [ + capa.rules.Rule.from_yaml( + textwrap.dedent( + """ + rule: + meta: + name: push 1000 on i386 + namespace: test + scope: function + features: + - and: + - arch: i386 + - instruction: + - mnemonic: push + - number: 1000 + """ + ) + ) + ] + ) + capabilities, meta = capa.main.find_capabilities(rules, z9324d_extractor) + assert "push 1000 on i386" in capabilities + assert 0x406F60 in set(map(lambda result: result[0], capabilities["push 1000 on i386"])) + + def test_fix262(pma16_01_extractor, capsys): # tests rules can be loaded successfully and all output modes path = pma16_01_extractor.path From 49adb8de0cc3298603b8a3e950caeaed87bcb4fc Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 29 Mar 2022 13:00:28 -0600 Subject: [PATCH 20/43] pep8 --- capa/rules.py | 2 +- tests/test_main.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/capa/rules.py b/capa/rules.py index 672563436..11feeaf8c 100644 --- a/capa/rules.py +++ b/capa/rules.py @@ -101,7 +101,7 @@ class Scope(str, Enum): capa.features.common.Characteristic("embedded pe"), capa.features.common.String, capa.features.common.Format, - }, + }, FUNCTION_SCOPE: { capa.features.common.MatchedRule, capa.features.basicblock.BasicBlock, diff --git a/tests/test_main.py b/tests/test_main.py index 69782fc3d..a4ab05513 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -327,7 +327,7 @@ def test_count_bb(z9324d_extractor): def test_instruction_scope(z9324d_extractor): - # .text:004071A4 68 E8 03 00 00 push 3E8h + # .text:004071A4 68 E8 03 00 00 push 3E8h rules = capa.rules.RuleSet( [ capa.rules.Rule.from_yaml( @@ -355,7 +355,7 @@ def test_instruction_scope(z9324d_extractor): def test_instruction_subscope(z9324d_extractor): # .text:00406F60 sub_406F60 proc near # [...] - # .text:004071A4 68 E8 03 00 00 push 3E8h + # .text:004071A4 68 E8 03 00 00 push 3E8h rules = capa.rules.RuleSet( [ capa.rules.Rule.from_yaml( From d8d671e36fd34fc562153c5cc8059de92763ac03 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 30 Mar 2022 09:40:43 -0600 Subject: [PATCH 21/43] rules: add global scope features to file scope --- capa/rules.py | 1 + 1 file changed, 1 insertion(+) diff --git a/capa/rules.py b/capa/rules.py index 11feeaf8c..f15e157e0 100644 --- a/capa/rules.py +++ b/capa/rules.py @@ -139,6 +139,7 @@ class Scope(str, Enum): SUPPORTED_FEATURES[INSTRUCTION_SCOPE].update(SUPPORTED_FEATURES[GLOBAL_SCOPE]) SUPPORTED_FEATURES[BASIC_BLOCK_SCOPE].update(SUPPORTED_FEATURES[GLOBAL_SCOPE]) SUPPORTED_FEATURES[FUNCTION_SCOPE].update(SUPPORTED_FEATURES[GLOBAL_SCOPE]) +SUPPORTED_FEATURES[FILE_SCOPE].update(SUPPORTED_FEATURES[GLOBAL_SCOPE]) # all instruction scope features are also basic block features SUPPORTED_FEATURES[BASIC_BLOCK_SCOPE].update(SUPPORTED_FEATURES[INSTRUCTION_SCOPE]) From b942050c4ed9f73b2458f215e36ab798ceaf4500 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 30 Mar 2022 09:58:08 -0600 Subject: [PATCH 22/43] features: viv: factor out operand feature extraction --- capa/features/extractors/viv/insn.py | 216 ++++++++++++++------------- 1 file changed, 113 insertions(+), 103 deletions(-) diff --git a/capa/features/extractors/viv/insn.py b/capa/features/extractors/viv/insn.py index 921f7c694..f9c339f6c 100644 --- a/capa/features/extractors/viv/insn.py +++ b/capa/features/extractors/viv/insn.py @@ -171,37 +171,6 @@ def extract_insn_api_features(f, bb, insn): yield API(name), insn.va -def extract_insn_number_features(f, bb, insn): - """parse number features from the given instruction.""" - # example: - # - # push 3136B0h ; dwControlCode - for oper in insn.opers: - # this is for both x32 and x64 - if not isinstance(oper, (envi.archs.i386.disasm.i386ImmOper, envi.archs.i386.disasm.i386ImmMemOper)): - continue - - if isinstance(oper, envi.archs.i386.disasm.i386ImmOper): - v = oper.getOperValue(oper) - else: - v = oper.getOperAddr(oper) - - if f.vw.probeMemory(v, 1, envi.memory.MM_READ): - # this is a valid address - # assume its not also a constant. - continue - - if insn.mnem == "add" and insn.opers[0].isReg() and insn.opers[0].reg == envi.archs.i386.regs.REG_ESP: - # skip things like: - # - # .text:00401140 call sub_407E2B - # .text:00401145 add esp, 0Ch - return - - yield Number(v), insn.va - yield Number(v, bitness=get_bitness(f.vw)), insn.va - - def derefs(vw, p): """ recursively follow the given pointer, yielding the valid memory addresses along the way. @@ -340,75 +309,6 @@ def read_string(vw, offset: int) -> str: raise ValueError("not a string", offset) -def extract_insn_string_features(f, bb, insn): - """parse string features from the given instruction.""" - # example: - # - # push offset aAcr ; "ACR > " - - for oper in insn.opers: - if isinstance(oper, envi.archs.i386.disasm.i386ImmOper): - v = oper.getOperValue(oper) - elif isinstance(oper, envi.archs.i386.disasm.i386ImmMemOper): - # like 0x10056CB4 in `lea eax, dword [0x10056CB4]` - v = oper.imm - elif isinstance(oper, envi.archs.i386.disasm.i386SibOper): - # like 0x401000 in `mov eax, 0x401000[2 * ebx]` - v = oper.imm - elif isinstance(oper, envi.archs.amd64.disasm.Amd64RipRelOper): - v = oper.getOperAddr(insn) - else: - continue - - for v in derefs(f.vw, v): - try: - s = read_string(f.vw, v) - except ValueError: - continue - else: - yield String(s.rstrip("\x00")), insn.va - - -def extract_insn_offset_features(f, bb, insn): - """parse structure offset features from the given instruction.""" - # example: - # - # .text:0040112F cmp [esi+4], ebx - for oper in insn.opers: - - # this is for both x32 and x64 - # like [esi + 4] - # reg ^ - # disp - if isinstance(oper, envi.archs.i386.disasm.i386RegMemOper): - if oper.reg == envi.archs.i386.regs.REG_ESP: - continue - - if oper.reg == envi.archs.i386.regs.REG_EBP: - continue - - # TODO: do x64 support for real. - if oper.reg == envi.archs.amd64.regs.REG_RBP: - continue - - # viv already decodes offsets as signed - v = oper.disp - - yield Offset(v), insn.va - yield Offset(v, bitness=get_bitness(f.vw)), insn.va - - # like: [esi + ecx + 16384] - # reg ^ ^ - # index ^ - # disp - elif isinstance(oper, envi.archs.i386.disasm.i386SibOper): - # viv already decodes offsets as signed - v = oper.disp - - yield Offset(v), insn.va - yield Offset(v, bitness=get_bitness(f.vw)), insn.va - - def is_security_cookie(f, bb, insn) -> bool: """ check if an instruction is related to security cookie checks @@ -625,6 +525,118 @@ def extract_function_indirect_call_characteristic_features(f, bb, insn): yield Characteristic("indirect call"), insn.va +def extract_op_number_features(f, bb, insn, i, oper): + """parse number features from the given operand.""" + # example: + # + # push 3136B0h ; dwControlCode + + # this is for both x32 and x64 + if not isinstance(oper, (envi.archs.i386.disasm.i386ImmOper, envi.archs.i386.disasm.i386ImmMemOper)): + return + + if isinstance(oper, envi.archs.i386.disasm.i386ImmOper): + v = oper.getOperValue(oper) + else: + v = oper.getOperAddr(oper) + + if f.vw.probeMemory(v, 1, envi.memory.MM_READ): + # this is a valid address + # assume its not also a constant. + return + + if insn.mnem == "add" and insn.opers[0].isReg() and insn.opers[0].reg == envi.archs.i386.regs.REG_ESP: + # skip things like: + # + # .text:00401140 call sub_407E2B + # .text:00401145 add esp, 0Ch + return + + yield Number(v), insn.va + yield Number(v, bitness=get_bitness(f.vw)), insn.va + + +def extract_op_offset_features(f, bb, insn, i, oper): + """parse structure offset features from the given operand.""" + # example: + # + # .text:0040112F cmp [esi+4], ebx + + # this is for both x32 and x64 + # like [esi + 4] + # reg ^ + # disp + if isinstance(oper, envi.archs.i386.disasm.i386RegMemOper): + if oper.reg == envi.archs.i386.regs.REG_ESP: + return + + if oper.reg == envi.archs.i386.regs.REG_EBP: + return + + # TODO: do x64 support for real. + if oper.reg == envi.archs.amd64.regs.REG_RBP: + return + + # viv already decodes offsets as signed + v = oper.disp + + yield Offset(v), insn.va + yield Offset(v, bitness=get_bitness(f.vw)), insn.va + + # like: [esi + ecx + 16384] + # reg ^ ^ + # index ^ + # disp + elif isinstance(oper, envi.archs.i386.disasm.i386SibOper): + # viv already decodes offsets as signed + v = oper.disp + + yield Offset(v), insn.va + yield Offset(v, bitness=get_bitness(f.vw)), insn.va + + +def extract_op_string_features(f, bb, insn, i, oper): + """parse string features from the given operand.""" + # example: + # + # push offset aAcr ; "ACR > " + + if isinstance(oper, envi.archs.i386.disasm.i386ImmOper): + v = oper.getOperValue(oper) + elif isinstance(oper, envi.archs.i386.disasm.i386ImmMemOper): + # like 0x10056CB4 in `lea eax, dword [0x10056CB4]` + v = oper.imm + elif isinstance(oper, envi.archs.i386.disasm.i386SibOper): + # like 0x401000 in `mov eax, 0x401000[2 * ebx]` + v = oper.imm + elif isinstance(oper, envi.archs.amd64.disasm.Amd64RipRelOper): + v = oper.getOperAddr(insn) + else: + return + + for v in derefs(f.vw, v): + try: + s = read_string(f.vw, v) + except ValueError: + continue + else: + yield String(s.rstrip("\x00")), insn.va + + +def extract_operand_features(f, bb, insn): + for i, oper in enumerate(insn.opers): + for op_handler in OPERAND_HANDLERS: + for feature, va in op_handler(f, bb, insn, i, oper): + yield feature, va + + +OPERAND_HANDLERS = ( + extract_op_number_features, + extract_op_offset_features, + extract_op_string_features, +) + + def extract_features(f, bb, insn): """ extract features from the given insn. @@ -644,10 +656,7 @@ def extract_features(f, bb, insn): INSTRUCTION_HANDLERS = ( extract_insn_api_features, - extract_insn_number_features, - extract_insn_string_features, extract_insn_bytes_features, - extract_insn_offset_features, extract_insn_nzxor_characteristic_features, extract_insn_mnemonic_features, extract_insn_obfs_call_plus_5_characteristic_features, @@ -656,4 +665,5 @@ def extract_features(f, bb, insn): extract_insn_segment_access_features, extract_function_calls_from, extract_function_indirect_call_characteristic_features, + extract_operand_features, ) From cce1e41519c2ff322050f93c8b3e159052a86425 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 30 Mar 2022 13:12:49 -0600 Subject: [PATCH 23/43] formatting --- capa/features/common.py | 1 - 1 file changed, 1 deletion(-) diff --git a/capa/features/common.py b/capa/features/common.py index 6b8677661..f9a9a9277 100644 --- a/capa/features/common.py +++ b/capa/features/common.py @@ -178,7 +178,6 @@ def __init__(self, value: str, description=None): class Characteristic(Feature): def __init__(self, value: str, description=None): - super(Characteristic, self).__init__(value, description=description) From e4c5ec278d830915ee817ff5076e651bb4f5de4d Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 30 Mar 2022 13:13:07 -0600 Subject: [PATCH 24/43] features: insn: define OperandImmediate and OperandOffset --- capa/features/insn.py | 49 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/capa/features/insn.py b/capa/features/insn.py index 85ef9a399..cb5b1afc1 100644 --- a/capa/features/insn.py +++ b/capa/features/insn.py @@ -5,7 +5,6 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. - import capa.render.utils from capa.features.common import Feature @@ -39,3 +38,51 @@ def get_value_str(self): class Mnemonic(Feature): def __init__(self, value: str, description=None): super(Mnemonic, self).__init__(value, description=description) + + +MAX_OPERAND_INDEX = 3 + + +class _Operand(Feature): + # superclass: don't use directly + # subclasses should set self.name and provide the value string formatter + def __init__(self, index: int, value: int, description=None): + super(_Operand, self).__init__(value, description=description) + self.index = index + + def __hash__(self): + return hash((self.name, self.value, self.bitness)) + + def __eq__(self, other): + return super().__eq__(other) and self.index == other.index + + def freeze_serialize(self): + return (self.__class__.__name__, [self.index, self.value]) + + +class OperandImmediate(_Operand): + # cached names so we don't do extra string formatting every ctor + NAMES = ["operand[%d].immediate" % i for i in range(MAX_OPERAND_INDEX)] + + # operand[i].immediate: 0x12 + def __init__(self, index: int, value: int, description=None): + super(OperandImmediate, self).__init__(index, value, description=description) + self.name = self.NAMES[index] + + def get_value_str(self) -> str: + assert isinstance(self.value, int) + return capa.render.utils.hex(self.value) + + +class OperandOffset(_Operand): + # cached names so we don't do extra string formatting every ctor + NAMES = ["operand[%d].offset" % i for i in range(MAX_OPERAND_INDEX)] + + # operand[i].offset: 0x12 + def __init__(self, index: int, value: int, description=None): + super(OperandOffset, self).__init__(index, value, description=description) + self.name = self.NAMES[index] + + def get_value_str(self) -> str: + assert isinstance(self.value, int) + return capa.render.utils.hex(self.value) From 6cbbd4d97fafff4eba154726355a5ca3e985dd52 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 30 Mar 2022 13:13:30 -0600 Subject: [PATCH 25/43] rules: parse OperandOffset and OperandImmediate features --- capa/rules.py | 42 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/capa/rules.py b/capa/rules.py index f15e157e0..f3b031715 100644 --- a/capa/rules.py +++ b/capa/rules.py @@ -125,6 +125,8 @@ class Scope(str, Enum): capa.features.common.Bytes, capa.features.insn.Offset, capa.features.insn.Mnemonic, + capa.features.insn.OperandImmediate, + capa.features.insn.OperandOffset, capa.features.common.Characteristic("nzxor"), capa.features.common.Characteristic("peb access"), capa.features.common.Characteristic("fs access"), @@ -358,7 +360,14 @@ def parse_description(s: Union[str, int, bytes], value_type: str, description=No # the string "10" that needs to become the number 10. if value_type == "bytes": value = parse_bytes(value) - elif value_type in ("number", "offset") or value_type.startswith(("number/", "offset/")): + elif ( + value_type in ("number", "offset") + or value_type.startswith(("number/", "offset/")) + or ( + value_type.startswith("operand[") + and (value_type.endswith("].immediate") or value_type.endswith("].offset")) + ) + ): try: value = parse_int(value) except ValueError: @@ -525,6 +534,37 @@ def build_statements(d, scope: str): raise InvalidRule("unexpected range: %s" % (count)) elif key == "string" and not isinstance(d[key], str): raise InvalidRule("ambiguous string value %s, must be defined as explicit string" % d[key]) + + elif key.startswith("operand[") and key.endswith("].immediate"): + index = key[len("operand[") : -len("].immediate")] + try: + index = int(index) + except ValueError: + raise InvalidRule("operand index must be an integer") + + value, description = parse_description(d[key], key, d.get("description")) + try: + feature = capa.features.insn.OperandImmediate(index, value, description=description) + except ValueError as e: + raise InvalidRule(str(e)) + ensure_feature_valid_for_scope(scope, feature) + return feature + + elif key.startswith("operand[") and key.endswith("].offset"): + index = key[len("operand[") : -len("].offset")] + try: + index = int(index) + except ValueError: + raise InvalidRule("operand index must be an integer") + + value, description = parse_description(d[key], key, d.get("description")) + try: + feature = capa.features.insn.OperandOffset(index, value, description=description) + except ValueError as e: + raise InvalidRule(str(e)) + ensure_feature_valid_for_scope(scope, feature) + return feature + elif ( (key == "os" and d[key] not in capa.features.common.VALID_OS) or (key == "format" and d[key] not in capa.features.common.VALID_FORMAT) From c7aadca25ce59bf99e5c19fa420f9f334bb5ed1f Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 30 Mar 2022 13:13:50 -0600 Subject: [PATCH 26/43] tests: demonstrate OperandOffset and OperandImmediate --- tests/test_match.py | 54 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/tests/test_match.py b/tests/test_match.py index 7f69dc345..11551fde9 100644 --- a/tests/test_match.py +++ b/tests/test_match.py @@ -531,3 +531,57 @@ def test_match_not_not(): _, matches = match([r], {capa.features.insn.Number(100): {1, 2}}, 0x0) assert "test rule" in matches + + +def test_match_operand_immediate(): + rule = textwrap.dedent( + """ + rule: + meta: + name: test rule + features: + - and: + - operand[0].immediate: 0x10 + """ + ) + r = capa.rules.Rule.from_yaml(rule) + + assert capa.features.insn.OperandImmediate(0, 0x10) in {capa.features.insn.OperandImmediate(0, 0x10)} + + _, matches = match([r], {capa.features.insn.OperandImmediate(0, 0x10): {1, 2}}, 0x0) + assert "test rule" in matches + + # mismatching index + _, matches = match([r], {capa.features.insn.OperandImmediate(1, 0x10): {1, 2}}, 0x0) + assert "test rule" not in matches + + # mismatching value + _, matches = match([r], {capa.features.insn.OperandImmediate(0, 0x11): {1, 2}}, 0x0) + assert "test rule" not in matches + + +def test_match_operand_offset(): + rule = textwrap.dedent( + """ + rule: + meta: + name: test rule + features: + - and: + - operand[0].offset: 0x10 + """ + ) + r = capa.rules.Rule.from_yaml(rule) + + assert capa.features.insn.OperandOffset(0, 0x10) in {capa.features.insn.OperandOffset(0, 0x10)} + + _, matches = match([r], {capa.features.insn.OperandOffset(0, 0x10): {1, 2}}, 0x0) + assert "test rule" in matches + + # mismatching index + _, matches = match([r], {capa.features.insn.OperandOffset(1, 0x10): {1, 2}}, 0x0) + assert "test rule" not in matches + + # mismatching value + _, matches = match([r], {capa.features.insn.OperandOffset(0, 0x11): {1, 2}}, 0x0) + assert "test rule" not in matches From 997daf537e3087802c5fac37044a0958454b27ca Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 30 Mar 2022 13:14:08 -0600 Subject: [PATCH 27/43] viv: insn: extract OperandOffset and OperandImmediate --- capa/features/extractors/viv/insn.py | 5 ++++- tests/fixtures.py | 6 ++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/capa/features/extractors/viv/insn.py b/capa/features/extractors/viv/insn.py index f9c339f6c..0620a3a41 100644 --- a/capa/features/extractors/viv/insn.py +++ b/capa/features/extractors/viv/insn.py @@ -17,7 +17,7 @@ import capa.features.extractors.helpers import capa.features.extractors.viv.helpers -from capa.features.insn import API, Number, Offset, Mnemonic +from capa.features.insn import API, Number, Offset, Mnemonic, OperandOffset, OperandImmediate from capa.features.common import ( BITNESS_X32, BITNESS_X64, @@ -554,6 +554,7 @@ def extract_op_number_features(f, bb, insn, i, oper): yield Number(v), insn.va yield Number(v, bitness=get_bitness(f.vw)), insn.va + yield OperandImmediate(i, v), insn.va def extract_op_offset_features(f, bb, insn, i, oper): @@ -582,6 +583,7 @@ def extract_op_offset_features(f, bb, insn, i, oper): yield Offset(v), insn.va yield Offset(v, bitness=get_bitness(f.vw)), insn.va + yield OperandOffset(i, v), insn.va # like: [esi + ecx + 16384] # reg ^ ^ @@ -593,6 +595,7 @@ def extract_op_offset_features(f, bb, insn, i, oper): yield Offset(v), insn.va yield Offset(v, bitness=get_bitness(f.vw)), insn.va + yield OperandOffset(i, v), insn.va def extract_op_string_features(f, bb, insn, i, oper): diff --git a/tests/fixtures.py b/tests/fixtures.py index fc35a16bf..ef1935f07 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -418,6 +418,12 @@ def parametrize(params, values, **kwargs): ("mimikatz", "function=0x40105D", capa.features.insn.Mnemonic("xor"), True), ("mimikatz", "function=0x40105D", capa.features.insn.Mnemonic("in"), False), ("mimikatz", "function=0x40105D", capa.features.insn.Mnemonic("out"), False), + # insn/operand.immediate + ("mimikatz", "function=0x40105D,bb=0x401073", capa.features.insn.OperandImmediate(1, 0xFF), True), + ("mimikatz", "function=0x40105D,bb=0x401073", capa.features.insn.OperandImmediate(0, 0xFF), False), + # insn/operand.offset + ("mimikatz", "function=0x40105D,bb=0x4010B0", capa.features.insn.OperandOffset(0, 4), True), + ("mimikatz", "function=0x40105D,bb=0x4010B0", capa.features.insn.OperandOffset(1, 4), False), # insn/number ("mimikatz", "function=0x40105D", capa.features.insn.Number(0xFF), True), ("mimikatz", "function=0x40105D", capa.features.insn.Number(0x3136B0), True), From 76831e9b9de3a39d0ae832243a80747274bc34f0 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 30 Mar 2022 13:20:51 -0600 Subject: [PATCH 28/43] changelog --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8e44e681f..57ae132b8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,8 @@ ### New Features - add new scope "instruction" for matching mnemonics and operands #767 @williballenthin - - add new feature "operand[...]" for matching instruction operands #767 @williballenthin + - add new feature "operand[{0, 1, 2}].immediate" for matching instruction operand immediate values #767 @williballenthin + - add new feature "operand[{0, 1, 2}].offset" for matching instruction operand offsets #767 @williballenthin ### Breaking Changes From 9da4ff10da8642348de0ad02558a15ce28daf9ff Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Thu, 31 Mar 2022 10:37:06 -0600 Subject: [PATCH 29/43] *: rename OperandImmediate to OperandNumber --- CHANGELOG.md | 2 +- capa/features/insn.py | 8 ++++---- capa/rules.py | 10 +++++----- tests/fixtures.py | 6 +++--- tests/test_match.py | 12 ++++++------ 5 files changed, 19 insertions(+), 19 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 57ae132b8..9047b3fa6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ ### New Features - add new scope "instruction" for matching mnemonics and operands #767 @williballenthin - - add new feature "operand[{0, 1, 2}].immediate" for matching instruction operand immediate values #767 @williballenthin + - add new feature "operand[{0, 1, 2}].number" for matching instruction operand immediate values #767 @williballenthin - add new feature "operand[{0, 1, 2}].offset" for matching instruction operand offsets #767 @williballenthin ### Breaking Changes diff --git a/capa/features/insn.py b/capa/features/insn.py index cb5b1afc1..b5873a72a 100644 --- a/capa/features/insn.py +++ b/capa/features/insn.py @@ -60,13 +60,13 @@ def freeze_serialize(self): return (self.__class__.__name__, [self.index, self.value]) -class OperandImmediate(_Operand): +class OperandNumber(_Operand): # cached names so we don't do extra string formatting every ctor - NAMES = ["operand[%d].immediate" % i for i in range(MAX_OPERAND_INDEX)] + NAMES = ["operand[%d].number" % i for i in range(MAX_OPERAND_INDEX)] - # operand[i].immediate: 0x12 + # operand[i].number: 0x12 def __init__(self, index: int, value: int, description=None): - super(OperandImmediate, self).__init__(index, value, description=description) + super(OperandNumber, self).__init__(index, value, description=description) self.name = self.NAMES[index] def get_value_str(self) -> str: diff --git a/capa/rules.py b/capa/rules.py index f3b031715..d776df4ea 100644 --- a/capa/rules.py +++ b/capa/rules.py @@ -125,7 +125,7 @@ class Scope(str, Enum): capa.features.common.Bytes, capa.features.insn.Offset, capa.features.insn.Mnemonic, - capa.features.insn.OperandImmediate, + capa.features.insn.OperandNumber, capa.features.insn.OperandOffset, capa.features.common.Characteristic("nzxor"), capa.features.common.Characteristic("peb access"), @@ -365,7 +365,7 @@ def parse_description(s: Union[str, int, bytes], value_type: str, description=No or value_type.startswith(("number/", "offset/")) or ( value_type.startswith("operand[") - and (value_type.endswith("].immediate") or value_type.endswith("].offset")) + and (value_type.endswith("].number") or value_type.endswith("].offset")) ) ): try: @@ -535,8 +535,8 @@ def build_statements(d, scope: str): elif key == "string" and not isinstance(d[key], str): raise InvalidRule("ambiguous string value %s, must be defined as explicit string" % d[key]) - elif key.startswith("operand[") and key.endswith("].immediate"): - index = key[len("operand[") : -len("].immediate")] + elif key.startswith("operand[") and key.endswith("].number"): + index = key[len("operand[") : -len("].number")] try: index = int(index) except ValueError: @@ -544,7 +544,7 @@ def build_statements(d, scope: str): value, description = parse_description(d[key], key, d.get("description")) try: - feature = capa.features.insn.OperandImmediate(index, value, description=description) + feature = capa.features.insn.OperandNumber(index, value, description=description) except ValueError as e: raise InvalidRule(str(e)) ensure_feature_valid_for_scope(scope, feature) diff --git a/tests/fixtures.py b/tests/fixtures.py index ef1935f07..8630bccf1 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -418,9 +418,9 @@ def parametrize(params, values, **kwargs): ("mimikatz", "function=0x40105D", capa.features.insn.Mnemonic("xor"), True), ("mimikatz", "function=0x40105D", capa.features.insn.Mnemonic("in"), False), ("mimikatz", "function=0x40105D", capa.features.insn.Mnemonic("out"), False), - # insn/operand.immediate - ("mimikatz", "function=0x40105D,bb=0x401073", capa.features.insn.OperandImmediate(1, 0xFF), True), - ("mimikatz", "function=0x40105D,bb=0x401073", capa.features.insn.OperandImmediate(0, 0xFF), False), + # insn/operand.number + ("mimikatz", "function=0x40105D,bb=0x401073", capa.features.insn.OperandNumber(1, 0xFF), True), + ("mimikatz", "function=0x40105D,bb=0x401073", capa.features.insn.OperandNumber(0, 0xFF), False), # insn/operand.offset ("mimikatz", "function=0x40105D,bb=0x4010B0", capa.features.insn.OperandOffset(0, 4), True), ("mimikatz", "function=0x40105D,bb=0x4010B0", capa.features.insn.OperandOffset(1, 4), False), diff --git a/tests/test_match.py b/tests/test_match.py index 11551fde9..e3ec17e54 100644 --- a/tests/test_match.py +++ b/tests/test_match.py @@ -533,7 +533,7 @@ def test_match_not_not(): assert "test rule" in matches -def test_match_operand_immediate(): +def test_match_operand_number(): rule = textwrap.dedent( """ rule: @@ -541,22 +541,22 @@ def test_match_operand_immediate(): name: test rule features: - and: - - operand[0].immediate: 0x10 + - operand[0].number: 0x10 """ ) r = capa.rules.Rule.from_yaml(rule) - assert capa.features.insn.OperandImmediate(0, 0x10) in {capa.features.insn.OperandImmediate(0, 0x10)} + assert capa.features.insn.OperandNumber(0, 0x10) in {capa.features.insn.OperandNumber(0, 0x10)} - _, matches = match([r], {capa.features.insn.OperandImmediate(0, 0x10): {1, 2}}, 0x0) + _, matches = match([r], {capa.features.insn.OperandNumber(0, 0x10): {1, 2}}, 0x0) assert "test rule" in matches # mismatching index - _, matches = match([r], {capa.features.insn.OperandImmediate(1, 0x10): {1, 2}}, 0x0) + _, matches = match([r], {capa.features.insn.OperandNumber(1, 0x10): {1, 2}}, 0x0) assert "test rule" not in matches # mismatching value - _, matches = match([r], {capa.features.insn.OperandImmediate(0, 0x11): {1, 2}}, 0x0) + _, matches = match([r], {capa.features.insn.OperandNumber(0, 0x11): {1, 2}}, 0x0) assert "test rule" not in matches From 856443319c45397f3493a83ce12b0d571d9e89f6 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Thu, 31 Mar 2022 10:39:18 -0600 Subject: [PATCH 30/43] viv: insn: fix OperandNumber reference --- capa/features/extractors/viv/insn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/capa/features/extractors/viv/insn.py b/capa/features/extractors/viv/insn.py index 0620a3a41..3cfa25c7d 100644 --- a/capa/features/extractors/viv/insn.py +++ b/capa/features/extractors/viv/insn.py @@ -17,7 +17,7 @@ import capa.features.extractors.helpers import capa.features.extractors.viv.helpers -from capa.features.insn import API, Number, Offset, Mnemonic, OperandOffset, OperandImmediate +from capa.features.insn import API, Number, Offset, Mnemonic, OperandOffset, OperandNumber from capa.features.common import ( BITNESS_X32, BITNESS_X64, @@ -554,7 +554,7 @@ def extract_op_number_features(f, bb, insn, i, oper): yield Number(v), insn.va yield Number(v, bitness=get_bitness(f.vw)), insn.va - yield OperandImmediate(i, v), insn.va + yield OperandNumber(i, v), insn.va def extract_op_offset_features(f, bb, insn, i, oper): From 85b1d50945df0203b857b0be9c48845df834754c Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Thu, 31 Mar 2022 10:40:48 -0600 Subject: [PATCH 31/43] isort --- capa/features/extractors/viv/insn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/features/extractors/viv/insn.py b/capa/features/extractors/viv/insn.py index 3cfa25c7d..3ae065dbd 100644 --- a/capa/features/extractors/viv/insn.py +++ b/capa/features/extractors/viv/insn.py @@ -17,7 +17,7 @@ import capa.features.extractors.helpers import capa.features.extractors.viv.helpers -from capa.features.insn import API, Number, Offset, Mnemonic, OperandOffset, OperandNumber +from capa.features.insn import API, Number, Offset, Mnemonic, OperandNumber, OperandOffset from capa.features.common import ( BITNESS_X32, BITNESS_X64, From 2989af0a3fe20566854de3e8ccf5d14961692042 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 4 Apr 2022 11:49:51 -0600 Subject: [PATCH 32/43] features: use ABC to denote abstract classes --- capa/features/common.py | 3 ++- capa/features/insn.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/capa/features/common.py b/capa/features/common.py index f9a9a9277..e209cb0e5 100644 --- a/capa/features/common.py +++ b/capa/features/common.py @@ -7,6 +7,7 @@ # See the License for the specific language governing permissions and limitations under the License. import re +import abc import codecs import logging import collections @@ -96,7 +97,7 @@ def __nonzero__(self): return self.success -class Feature: +class Feature(abc.ABC): def __init__(self, value: Union[str, int, bytes], bitness=None, description=None): """ Args: diff --git a/capa/features/insn.py b/capa/features/insn.py index b5873a72a..957450a3f 100644 --- a/capa/features/insn.py +++ b/capa/features/insn.py @@ -5,6 +5,8 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. +import abc + import capa.render.utils from capa.features.common import Feature @@ -43,7 +45,7 @@ def __init__(self, value: str, description=None): MAX_OPERAND_INDEX = 3 -class _Operand(Feature): +class _Operand(Feature, abc.ABC): # superclass: don't use directly # subclasses should set self.name and provide the value string formatter def __init__(self, index: int, value: int, description=None): From b318b0a2888f897bd9407b07baf78e2b13aadc7e Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 4 Apr 2022 11:56:47 -0600 Subject: [PATCH 33/43] freeze: fix freeze_deserialize for features with multiple args --- .DS_Store | Bin 0 -> 6148 bytes capa/features/common.py | 2 ++ 2 files changed, 2 insertions(+) create mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..96f147eec4586e0682dc5976dc129980517f8e78 GIT binary patch literal 6148 zcmeHK&2G~`7@W-yaT-980I3%wOI$+;Eg&i`Nty^M@gp?|4uC>ZyTOWUNBJQ|QKX#V z9e4$9TMT;wZ@yH&d)F9K+{0hH#nHmkJD1nlC9$F`)_I;WVZAnNnQSiZOfX zuIJ7ETnzZT%?tq<{Qhn_hHSS24vYrb!i@0=~xlE8>vc23mNgG-o>( zSL89qj9$2yBy%hG;>-djoXwcyA8{;cw)8E|E|Tq%HzT`B-7yBK{L;6$!b&Jc_>|-; zhvNJk`5qljjYN7PuEspca}=E2=9s~|=saeBJ{jqC=6iqSdy{Fc_CsW{x%{PqRj^9d zyUvGj-kJ8Mvq`fzek0FbhN0hIANTsNJa^tLU*8XdY0nGXu@ZQWOUm2VUf_iDW;hF+ ziPCnW3s%u8cFWs~#qPtsihX~lwyfBTy`5UczPI~exhz^YZryow*d6*OLHL>eF$C8| z6<0K_LH;ALxqs}813wJpeYRGtnHPt;C#!UEU6{VSMd*enw38dnjjR|hM# z1t8|wtb)ht)1PR|24Y>~Dnhm}L?k6fQlYmPLhl?zBVLk+u4W=><{80yf14(KEwg3PC literal 0 HcmV?d00001 diff --git a/capa/features/common.py b/capa/features/common.py index e209cb0e5..7f25e8d4b 100644 --- a/capa/features/common.py +++ b/capa/features/common.py @@ -169,6 +169,8 @@ def freeze_deserialize(cls, args): kwargs = args[-1] args = args[:-1] return cls(*args, **kwargs) + else: + return cls(*args) class MatchedRule(Feature): From 750803c3ccf587fa98fb2b2566c05f3f3d2ab781 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 4 Apr 2022 11:57:02 -0600 Subject: [PATCH 34/43] freeze: register operand features --- capa/features/freeze.py | 1 + 1 file changed, 1 insertion(+) diff --git a/capa/features/freeze.py b/capa/features/freeze.py index ca0b729b5..f90959f52 100644 --- a/capa/features/freeze.py +++ b/capa/features/freeze.py @@ -67,6 +67,7 @@ def serialize_feature(feature): KNOWN_FEATURES = {F.__name__: F for F in capa.features.common.Feature.__subclasses__()} +KNOWN_FEATURES.update({F.__name__: F for F in capa.features.insn._Operand.__subclasses__()}) def deserialize_feature(doc): From 0af60d9a7e0efc0bd5421234074658195826e864 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 4 Apr 2022 12:01:13 -0600 Subject: [PATCH 35/43] freeze: fix mypy --- capa/features/freeze.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/capa/features/freeze.py b/capa/features/freeze.py index f90959f52..bfa92460c 100644 --- a/capa/features/freeze.py +++ b/capa/features/freeze.py @@ -51,6 +51,7 @@ import json import zlib import logging +from typing import Dict, Type import capa.features.file import capa.features.insn @@ -58,6 +59,7 @@ import capa.features.basicblock import capa.features.extractors.base_extractor from capa.helpers import hex +from capa.features.common import Feature logger = logging.getLogger(__name__) @@ -66,8 +68,8 @@ def serialize_feature(feature): return feature.freeze_serialize() -KNOWN_FEATURES = {F.__name__: F for F in capa.features.common.Feature.__subclasses__()} -KNOWN_FEATURES.update({F.__name__: F for F in capa.features.insn._Operand.__subclasses__()}) +KNOWN_FEATURES: Dict[str, Type[Feature]] = {F.__name__: F for F in capa.features.common.Feature.__subclasses__()} +KNOWN_FEATURES.update({F.__name__: F for F in capa.features.insn._Operand.__subclasses__()}) # type: ignore def deserialize_feature(doc): From ef93fcc89e52e193b9295822d69beb314f23bb70 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 4 Apr 2022 12:05:15 -0600 Subject: [PATCH 36/43] tests: smda: xfail operand number/offset features --- tests/test_smda_features.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/test_smda_features.py b/tests/test_smda_features.py index 6614c24dd..3873d71b9 100644 --- a/tests/test_smda_features.py +++ b/tests/test_smda_features.py @@ -22,6 +22,14 @@ def test_smda_features(sample, scope, feature, expected): if scope.__name__ == "file" and isinstance(feature, capa.features.file.FunctionName) and expected is True: pytest.xfail("SMDA has no function ID") + if "bb=" in scope.__name__ and isinstance(feature, capa.features.insn.OperandNumber) and expected is True: + # SMDA not currently maintained, see: https://github.com/mandiant/capa/issues/937 + pytest.xfail("SMDA doesn't support operand numbers") + + if "bb=" in scope.__name__ and isinstance(feature, capa.features.insn.OperandOffset) and expected is True: + # SMDA not currently maintained, see: https://github.com/mandiant/capa/issues/937 + pytest.xfail("SMDA doesn't support operand offsets") + fixtures.do_test_feature_presence(fixtures.get_smda_extractor, sample, scope, feature, expected) From 5ffb73c5f5ecb9cc26d422915b6cd22d5bdab864 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 4 Apr 2022 15:13:43 -0600 Subject: [PATCH 37/43] ida: insn: extract operand number and offset features --- capa/features/extractors/ida/insn.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/capa/features/extractors/ida/insn.py b/capa/features/extractors/ida/insn.py index c8ccb2ee1..306bb9542 100644 --- a/capa/features/extractors/ida/insn.py +++ b/capa/features/extractors/ida/insn.py @@ -12,7 +12,7 @@ import capa.features.extractors.helpers import capa.features.extractors.ida.helpers -from capa.features.insn import API, Number, Offset, Mnemonic +from capa.features.insn import API, Number, Offset, Mnemonic, OperandNumber, OperandOffset from capa.features.common import ( BITNESS_X32, BITNESS_X64, @@ -143,7 +143,11 @@ def extract_insn_number_features(f, bb, insn): # .text:00401145 add esp, 0Ch return - for op in capa.features.extractors.ida.helpers.get_insn_ops(insn, target_ops=(idaapi.o_imm, idaapi.o_mem)): + for i, op in enumerate(insn.ops): + if op.type == idaapi.o_void: + break + if op.type not in (idaapi.o_imm, idaapi.o_mem): + continue # skip things like: # .text:00401100 shr eax, offset loc_C if capa.features.extractors.ida.helpers.is_op_offset(insn, op): @@ -156,6 +160,7 @@ def extract_insn_number_features(f, bb, insn): yield Number(const), insn.ea yield Number(const, bitness=get_bitness(f.ctx)), insn.ea + yield OperandNumber(i, const), insn.ea def extract_insn_bytes_features(f, bb, insn): @@ -208,9 +213,14 @@ def extract_insn_offset_features(f, bb, insn): example: .text:0040112F cmp [esi+4], ebx """ - for op in capa.features.extractors.ida.helpers.get_insn_ops(insn, target_ops=(idaapi.o_phrase, idaapi.o_displ)): + for i, op in enumerate(insn.ops): + if op.type == idaapi.o_void: + break + if op.type not in (idaapi.o_phrase, idaapi.o_displ): + continue if capa.features.extractors.ida.helpers.is_op_stack_var(insn.ea, op.n): continue + p_info = capa.features.extractors.ida.helpers.get_op_phrase_info(op) op_off = p_info.get("offset", 0) if idaapi.is_mapped(op_off): @@ -225,6 +235,7 @@ def extract_insn_offset_features(f, bb, insn): yield Offset(op_off), insn.ea yield Offset(op_off, bitness=get_bitness(f.ctx)), insn.ea + yield OperandOffset(i, op_off), insn.ea def contains_stack_cookie_keywords(s): From df03932f89b0476a79e7b120369cb2a6b5e9e516 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 4 Apr 2022 16:54:51 -0600 Subject: [PATCH 38/43] gitignore --- .DS_Store | Bin 6148 -> 0 bytes .gitignore | 2 ++ 2 files changed, 2 insertions(+) delete mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index 96f147eec4586e0682dc5976dc129980517f8e78..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK&2G~`7@W-yaT-980I3%wOI$+;Eg&i`Nty^M@gp?|4uC>ZyTOWUNBJQ|QKX#V z9e4$9TMT;wZ@yH&d)F9K+{0hH#nHmkJD1nlC9$F`)_I;WVZAnNnQSiZOfX zuIJ7ETnzZT%?tq<{Qhn_hHSS24vYrb!i@0=~xlE8>vc23mNgG-o>( zSL89qj9$2yBy%hG;>-djoXwcyA8{;cw)8E|E|Tq%HzT`B-7yBK{L;6$!b&Jc_>|-; zhvNJk`5qljjYN7PuEspca}=E2=9s~|=saeBJ{jqC=6iqSdy{Fc_CsW{x%{PqRj^9d zyUvGj-kJ8Mvq`fzek0FbhN0hIANTsNJa^tLU*8XdY0nGXu@ZQWOUm2VUf_iDW;hF+ ziPCnW3s%u8cFWs~#qPtsihX~lwyfBTy`5UczPI~exhz^YZryow*d6*OLHL>eF$C8| z6<0K_LH;ALxqs}813wJpeYRGtnHPt;C#!UEU6{VSMd*enw38dnjjR|hM# z1t8|wtb)ht)1PR|24Y>~Dnhm}L?k6fQlYmPLhl?zBVLk+u4W=><{80yf14(KEwg3PC diff --git a/.gitignore b/.gitignore index 7793d351e..10269009f 100644 --- a/.gitignore +++ b/.gitignore @@ -120,3 +120,5 @@ scripts/perf/*.svg scripts/perf/*.zip .direnv .envrc +.DS_Store +*/.DS_Store From 78c0afe0066ba7651f18e96c50578ceb438daac3 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 5 Apr 2022 10:18:55 -0600 Subject: [PATCH 39/43] setup: min python version is now 3.7 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 3934513bb..f49a07c3b 100644 --- a/setup.py +++ b/setup.py @@ -98,5 +98,5 @@ "Programming Language :: Python :: 3", "Topic :: Security", ], - python_requires=">=3.6", + python_requires=">=3.7", ) From 715ac64ae694fef79b0ec5c071af97711d07de07 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 5 Apr 2022 10:19:04 -0600 Subject: [PATCH 40/43] changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9047b3fa6..474b80be9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ ### Breaking Changes - instruction scope and operand feature are new and are not backwards compatible with older versions of capa + - Python 3.7 is now the minimum supported Python version #866 @williballenthin ### New Rules (4) From 0617b87f36938a3e1bf2c082061fd5301fbfe2ec Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 5 Apr 2022 10:19:09 -0600 Subject: [PATCH 41/43] ci: no longer test against py3.6 --- .github/workflows/publish.yml | 2 +- .github/workflows/tests.yml | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index a5119e4c7..42525df9a 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -15,7 +15,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: '3.6' + python-version: '3.7' - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index cf12c6055..1f3cc7f78 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -67,11 +67,9 @@ jobs: matrix: os: [ubuntu-20.04, windows-2019, macos-10.15] # across all operating systems - python-version: ["3.6", "3.10"] + python-version: ["3.7", "3.10"] include: # on Ubuntu run these as well - - os: ubuntu-20.04 - python-version: "3.7" - os: ubuntu-20.04 python-version: "3.8" - os: ubuntu-20.04 From b5be876e617dbec8f34c0d59138efd3ce43d3d89 Mon Sep 17 00:00:00 2001 From: Moritz Date: Wed, 6 Apr 2022 11:24:05 +0200 Subject: [PATCH 42/43] feat: start dotnet detection (#955) * feat: start dotnet detection * Apply suggestions from code review Co-authored-by: Willi Ballenthin * refactor: dn instead of dotnet * refactor: format branches, extractor reorg * refactor: format selection and dotnet detect * feat: get format, arch, os * refactor: log errors and exceptions * ci: also test and build for dotnet-main dev * fix: import path * fix: circular dep * fix: remove buf argument feat: get runtime meta data * fix: log unsupported runtime error * fix: type ignore Co-authored-by: Willi Ballenthin --- .github/workflows/build.yml | 2 +- .github/workflows/tests.yml | 4 +- CHANGELOG.md | 11 +- capa/exceptions.py | 14 ++ capa/features/common.py | 17 +- capa/features/extractors/common.py | 5 +- capa/features/extractors/dnfile_.py | 105 +++++++++++ capa/features/freeze.py | 4 +- capa/helpers.py | 81 ++++++++- capa/main.py | 212 +++++++++++------------ capa/render/json.py | 4 +- capa/render/result_document.py | 1 - scripts/lint.py | 13 +- scripts/show-capabilities-by-function.py | 24 +-- scripts/show-features.py | 24 +-- setup.py | 1 + tests/fixtures.py | 21 ++- tests/test_dotnet_features.py | 25 +++ 18 files changed, 400 insertions(+), 168 deletions(-) create mode 100644 capa/exceptions.py create mode 100644 capa/features/extractors/dnfile_.py create mode 100644 tests/test_dotnet_features.py diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ddc8e2d00..7be4cdc9e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -2,7 +2,7 @@ name: build on: push: - branches: [master] + branches: [master, dotnet-main] release: types: [edited, published] diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 1f3cc7f78..34eda0e0b 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -2,9 +2,9 @@ name: CI on: push: - branches: [ master ] + branches: [ master, dotnet-main ] pull_request: - branches: [ master ] + branches: [ master, dotnet-main ] # save workspaces to speed up testing env: diff --git a/CHANGELOG.md b/CHANGELOG.md index 474b80be9..84b6b3d17 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,14 +4,15 @@ ### New Features - - add new scope "instruction" for matching mnemonics and operands #767 @williballenthin - - add new feature "operand[{0, 1, 2}].number" for matching instruction operand immediate values #767 @williballenthin - - add new feature "operand[{0, 1, 2}].offset" for matching instruction operand offsets #767 @williballenthin +- add new scope "instruction" for matching mnemonics and operands #767 @williballenthin +- add new feature "operand[{0, 1, 2}].number" for matching instruction operand immediate values #767 @williballenthin +- add new feature "operand[{0, 1, 2}].offset" for matching instruction operand offsets #767 @williballenthin +- main: detect dotnet binaries #955 @mr-tz ### Breaking Changes - - instruction scope and operand feature are new and are not backwards compatible with older versions of capa - - Python 3.7 is now the minimum supported Python version #866 @williballenthin +- instruction scope and operand feature are new and are not backwards compatible with older versions of capa +- Python 3.7 is now the minimum supported Python version #866 @williballenthin ### New Rules (4) diff --git a/capa/exceptions.py b/capa/exceptions.py new file mode 100644 index 000000000..8c9399977 --- /dev/null +++ b/capa/exceptions.py @@ -0,0 +1,14 @@ +class UnsupportedRuntimeError(RuntimeError): + pass + + +class UnsupportedFormatError(ValueError): + pass + + +class UnsupportedArchError(ValueError): + pass + + +class UnsupportedOSError(ValueError): + pass diff --git a/capa/features/common.py b/capa/features/common.py index 7f25e8d4b..bff1138ca 100644 --- a/capa/features/common.py +++ b/capa/features/common.py @@ -410,7 +410,9 @@ def freeze_deserialize(cls, args): # other candidates here: https://docs.microsoft.com/en-us/windows/win32/debug/pe-format#machine-types ARCH_I386 = "i386" ARCH_AMD64 = "amd64" -VALID_ARCH = (ARCH_I386, ARCH_AMD64) +# dotnet +ARCH_ANY = "any" +VALID_ARCH = (ARCH_I386, ARCH_AMD64, ARCH_ANY) class Arch(Feature): @@ -422,8 +424,10 @@ def __init__(self, value: str, description=None): OS_WINDOWS = "windows" OS_LINUX = "linux" OS_MACOS = "macos" +# dotnet +OS_ANY = "any" VALID_OS = {os.value for os in capa.features.extractors.elf.OS} -VALID_OS.update({OS_WINDOWS, OS_LINUX, OS_MACOS}) +VALID_OS.update({OS_WINDOWS, OS_LINUX, OS_MACOS, OS_ANY}) class OS(Feature): @@ -434,7 +438,14 @@ def __init__(self, value: str, description=None): FORMAT_PE = "pe" FORMAT_ELF = "elf" -VALID_FORMAT = (FORMAT_PE, FORMAT_ELF) +FORMAT_DOTNET = "dotnet" +VALID_FORMAT = (FORMAT_PE, FORMAT_ELF, FORMAT_DOTNET) +# internal only, not to be used in rules +FORMAT_AUTO = "auto" +FORMAT_SC32 = "sc32" +FORMAT_SC64 = "sc64" +FORMAT_FREEZE = "freeze" +FORMAT_UNKNOWN = "unknown" class Format(Feature): diff --git a/capa/features/extractors/common.py b/capa/features/extractors/common.py index 99f0ea08a..786e4faf5 100644 --- a/capa/features/extractors/common.py +++ b/capa/features/extractors/common.py @@ -8,7 +8,8 @@ import capa.features import capa.features.extractors.elf import capa.features.extractors.pefile -from capa.features.common import OS, FORMAT_PE, FORMAT_ELF, OS_WINDOWS, Arch, Format, String +from capa.features.common import OS, FORMAT_PE, FORMAT_ELF, OS_WINDOWS, FORMAT_FREEZE, Arch, Format, String +from capa.features.freeze import is_freeze logger = logging.getLogger(__name__) @@ -29,6 +30,8 @@ def extract_format(buf): yield Format(FORMAT_PE), 0x0 elif buf.startswith(b"\x7fELF"): yield Format(FORMAT_ELF), 0x0 + elif is_freeze(buf): + yield Format(FORMAT_FREEZE), 0x0 else: # we likely end up here: # 1. handling a file format (e.g. macho) diff --git a/capa/features/extractors/dnfile_.py b/capa/features/extractors/dnfile_.py new file mode 100644 index 000000000..c20fd32b5 --- /dev/null +++ b/capa/features/extractors/dnfile_.py @@ -0,0 +1,105 @@ +import logging +from typing import Tuple, Iterator + +import dnfile + +from capa.features.common import OS, OS_ANY, ARCH_ANY, ARCH_I386, ARCH_AMD64, FORMAT_DOTNET, Arch, Format, Feature +from capa.features.extractors.base_extractor import FeatureExtractor + +logger = logging.getLogger(__name__) + + +def extract_file_format(**kwargs): + yield Format(FORMAT_DOTNET), 0x0 + + +def extract_file_os(**kwargs): + yield OS(OS_ANY), 0x0 + + +def extract_file_arch(pe, **kwargs): + # TODO differences for versions < 4.5? + # via https://stackoverflow.com/a/23614024/10548020 + if pe.net.Flags.CLR_32BITREQUIRED and pe.net.Flags.CLR_PREFER_32BIT: + yield Arch(ARCH_I386), 0x0 + elif not pe.net.Flags.CLR_32BITREQUIRED and not pe.net.Flags.CLR_PREFER_32BIT: + yield Arch(ARCH_AMD64), 0x0 + else: + yield Arch(ARCH_ANY), 0x0 + + +def extract_file_features(pe: dnfile.dnPE) -> Iterator[Tuple[Feature, int]]: + for file_handler in FILE_HANDLERS: + for feature, va in file_handler(pe=pe): # type: ignore + yield feature, va + + +FILE_HANDLERS = ( + # extract_file_export_names, + # extract_file_import_names, + # extract_file_section_names, + # extract_file_strings, + # extract_file_function_names, + extract_file_format, +) + + +def extract_global_features(pe: dnfile.dnPE) -> Iterator[Tuple[Feature, int]]: + for handler in GLOBAL_HANDLERS: + for feature, va in handler(pe=pe): # type: ignore + yield feature, va + + +GLOBAL_HANDLERS = ( + extract_file_os, + extract_file_arch, +) + + +class DnfileFeatureExtractor(FeatureExtractor): + def __init__(self, path: str): + super(DnfileFeatureExtractor, self).__init__() + self.path: str = path + self.pe: dnfile.dnPE = dnfile.dnPE(path) + + def get_base_address(self) -> int: + return self.pe.net.struct.EntryPointTokenOrRva + + def extract_global_features(self): + yield from extract_global_features(self.pe) + + def extract_file_features(self): + yield from extract_file_features(self.pe) + + def is_dotnet_file(self) -> bool: + return bool(self.pe.net) + + def get_runtime_version(self) -> Tuple[int, int]: + return self.pe.net.struct.MajorRuntimeVersion, self.pe.net.struct.MinorRuntimeVersion + + def get_meta_version_string(self) -> str: + return self.pe.net.metadata.struct.Version.decode("utf-8") + + def get_functions(self): + raise NotImplementedError("DnfileFeatureExtractor can only be used to extract file features") + + def extract_function_features(self, f): + raise NotImplementedError("DnfileFeatureExtractor can only be used to extract file features") + + def get_basic_blocks(self, f): + raise NotImplementedError("DnfileFeatureExtractor can only be used to extract file features") + + def extract_basic_block_features(self, f, bb): + raise NotImplementedError("DnfileFeatureExtractor can only be used to extract file features") + + def get_instructions(self, f, bb): + raise NotImplementedError("DnfileFeatureExtractor can only be used to extract file features") + + def extract_insn_features(self, f, bb, insn): + raise NotImplementedError("DnfileFeatureExtractor can only be used to extract file features") + + def is_library_function(self, va): + raise NotImplementedError("DnfileFeatureExtractor can only be used to extract file features") + + def get_function_name(self, va): + raise NotImplementedError("DnfileFeatureExtractor can only be used to extract file features") diff --git a/capa/features/freeze.py b/capa/features/freeze.py index bfa92460c..c86d9165d 100644 --- a/capa/features/freeze.py +++ b/capa/features/freeze.py @@ -53,13 +53,12 @@ import logging from typing import Dict, Type +import capa.helpers import capa.features.file import capa.features.insn import capa.features.common import capa.features.basicblock import capa.features.extractors.base_extractor -from capa.helpers import hex -from capa.features.common import Feature logger = logging.getLogger(__name__) @@ -87,6 +86,7 @@ def dumps(extractor): returns: str: the serialized features. """ + hex = capa.helpers.hex ret = { "version": 1, "base address": extractor.get_base_address(), diff --git a/capa/helpers.py b/capa/helpers.py index e36ca3ac2..5c0bcfd69 100644 --- a/capa/helpers.py +++ b/capa/helpers.py @@ -5,10 +5,20 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. - import os +import logging from typing import NoReturn +from capa.exceptions import UnsupportedFormatError +from capa.features.common import FORMAT_SC32, FORMAT_SC64, FORMAT_UNKNOWN +from capa.features.extractors.common import extract_format + +EXTENSIONS_SHELLCODE_32 = ("sc32", "raw32") +EXTENSIONS_SHELLCODE_64 = ("sc64", "raw64") + + +logger = logging.getLogger("capa") + _hex = hex @@ -35,3 +45,72 @@ def is_runtime_ida(): def assert_never(value: NoReturn) -> NoReturn: assert False, f"Unhandled value: {value} ({type(value).__name__})" + + +def get_format_from_extension(sample: str) -> str: + if sample.endswith(EXTENSIONS_SHELLCODE_32): + return FORMAT_SC32 + elif sample.endswith(EXTENSIONS_SHELLCODE_64): + return FORMAT_SC64 + return FORMAT_UNKNOWN + + +def get_auto_format(path: str) -> str: + format_ = get_format(path) + if format_ == FORMAT_UNKNOWN: + format_ = get_format_from_extension(path) + if format_ == FORMAT_UNKNOWN: + raise UnsupportedFormatError() + return format_ + + +def get_format(sample: str) -> str: + with open(sample, "rb") as f: + buf = f.read() + + for feature, _ in extract_format(buf): + assert isinstance(feature.value, str) + return feature.value + + return FORMAT_UNKNOWN + + +def log_unsupported_format_error(): + logger.error("-" * 80) + logger.error(" Input file does not appear to be a PE or ELF file.") + logger.error(" ") + logger.error( + " capa currently only supports analyzing PE and ELF files (or shellcode, when using --format sc32|sc64)." + ) + logger.error(" If you don't know the input file type, you can try using the `file` utility to guess it.") + logger.error("-" * 80) + + +def log_unsupported_os_error(): + logger.error("-" * 80) + logger.error(" Input file does not appear to target a supported OS.") + logger.error(" ") + logger.error( + " capa currently only supports analyzing executables for some operating systems (including Windows and Linux)." + ) + logger.error("-" * 80) + + +def log_unsupported_arch_error(): + logger.error("-" * 80) + logger.error(" Input file does not appear to target a supported architecture.") + logger.error(" ") + logger.error(" capa currently only supports analyzing x86 (32- and 64-bit).") + logger.error("-" * 80) + + +def log_unsupported_runtime_error(): + logger.error("-" * 80) + logger.error(" Unsupported runtime or Python interpreter.") + logger.error(" ") + logger.error(" capa supports running under Python 3.7 and higher.") + logger.error(" ") + logger.error( + " If you're seeing this message on the command line, please ensure you're running a supported Python version." + ) + logger.error("-" * 80) diff --git a/capa/main.py b/capa/main.py index 007cb2411..eee2c2946 100644 --- a/capa/main.py +++ b/capa/main.py @@ -41,18 +41,35 @@ import capa.features.extractors import capa.features.extractors.common import capa.features.extractors.pefile +import capa.features.extractors.dnfile_ import capa.features.extractors.elffile from capa.rules import Rule, Scope, RuleSet from capa.engine import FeatureSet, MatchResults -from capa.helpers import get_file_taste -from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, FeatureExtractor +from capa.helpers import ( + get_format, + get_file_taste, + get_auto_format, + log_unsupported_os_error, + log_unsupported_arch_error, + log_unsupported_format_error, +) +from capa.exceptions import UnsupportedOSError, UnsupportedArchError, UnsupportedFormatError, UnsupportedRuntimeError +from capa.features.common import ( + FORMAT_PE, + FORMAT_ELF, + FORMAT_AUTO, + FORMAT_SC32, + FORMAT_SC64, + FORMAT_DOTNET, + FORMAT_FREEZE, +) +from capa.features.extractors.base_extractor import FunctionHandle, FeatureExtractor RULES_PATH_DEFAULT_STRING = "(embedded rules)" SIGNATURES_PATH_DEFAULT_STRING = "(embedded signatures)" BACKEND_VIV = "vivisect" BACKEND_SMDA = "smda" -EXTENSIONS_SHELLCODE_32 = ("sc32", "raw32") -EXTENSIONS_SHELLCODE_64 = ("sc64", "raw64") +BACKEND_DOTNET = "dotnet" E_MISSING_RULES = -10 E_MISSING_FILE = -11 @@ -287,6 +304,7 @@ def find_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, disable_pro return matches, meta +# TODO move all to helpers? def has_rule_with_namespace(rules, capabilities, rule_cat): for rule_name in capabilities.keys(): if rules.rules[rule_name].meta.get("namespace", "").startswith(rule_cat): @@ -334,17 +352,6 @@ def is_supported_format(sample: str) -> bool: return len(list(capa.features.extractors.common.extract_format(taste))) == 1 -def get_format(sample: str) -> str: - with open(sample, "rb") as f: - buf = f.read() - - for feature, _ in capa.features.extractors.common.extract_format(buf): - assert isinstance(feature.value, str) - return feature.value - - return "unknown" - - def is_supported_arch(sample: str) -> bool: with open(sample, "rb") as f: buf = f.read() @@ -433,19 +440,7 @@ def get_default_signatures() -> List[str]: return ret -class UnsupportedFormatError(ValueError): - pass - - -class UnsupportedArchError(ValueError): - pass - - -class UnsupportedOSError(ValueError): - pass - - -def get_workspace(path, format, sigpaths): +def get_workspace(path, format_, sigpaths): """ load the program at the given path into a vivisect workspace using the given format. also apply the given FLIRT signatures. @@ -465,21 +460,22 @@ def get_workspace(path, format, sigpaths): import viv_utils logger.debug("generating vivisect workspace for: %s", path) - if format == "auto": + # TODO should not be auto at this point, anymore + if format_ == FORMAT_AUTO: if not is_supported_format(path): raise UnsupportedFormatError() # don't analyze, so that we can add our Flirt function analyzer first. vw = viv_utils.getWorkspace(path, analyze=False, should_save=False) - elif format in {"pe", "elf"}: + elif format_ in {FORMAT_PE, FORMAT_ELF}: vw = viv_utils.getWorkspace(path, analyze=False, should_save=False) - elif format == "sc32": + elif format_ == FORMAT_SC32: # these are not analyzed nor saved. vw = viv_utils.getShellcodeWorkspaceFromFile(path, arch="i386", analyze=False) - elif format == "sc64": + elif format_ == FORMAT_SC64: vw = viv_utils.getShellcodeWorkspaceFromFile(path, arch="amd64", analyze=False) else: - raise ValueError("unexpected format: " + format) + raise ValueError("unexpected format: " + format_) viv_utils.flirt.register_flirt_signature_analyzers(vw, sigpaths) @@ -489,12 +485,9 @@ def get_workspace(path, format, sigpaths): return vw -class UnsupportedRuntimeError(RuntimeError): - pass - - +# TODO get_extractors -> List[FeatureExtractor]? def get_extractor( - path: str, format: str, backend: str, sigpaths: List[str], should_save_workspace=False, disable_progress=False + path: str, format_: str, backend: str, sigpaths: List[str], should_save_workspace=False, disable_progress=False ) -> FeatureExtractor: """ raises: @@ -502,7 +495,7 @@ def get_extractor( UnsupportedArchError UnsupportedOSError """ - if format not in ("sc32", "sc64"): + if format_ not in (FORMAT_SC32, FORMAT_SC64): if not is_supported_format(path): raise UnsupportedFormatError() @@ -512,6 +505,10 @@ def get_extractor( if not is_supported_os(path): raise UnsupportedOSError() + if format_ == FORMAT_DOTNET: + # TODO return capa.features.extractors.dotnet.extractor.DnFeatureExtractor(...) + raise NotImplementedError("DnFeatureExtractor") + if backend == "smda": from smda.SmdaConfig import SmdaConfig from smda.Disassembler import Disassembler @@ -530,7 +527,7 @@ def get_extractor( import capa.features.extractors.viv.extractor with halo.Halo(text="analyzing program", spinner="simpleDots", stream=sys.stderr, enabled=not disable_progress): - vw = get_workspace(path, format, sigpaths) + vw = get_workspace(path, format_, sigpaths) if should_save_workspace: logger.debug("saving workspace") @@ -545,6 +542,22 @@ def get_extractor( return capa.features.extractors.viv.extractor.VivisectFeatureExtractor(vw, path) +def get_file_extractors(sample: str, format_: str) -> List[FeatureExtractor]: + file_extractors: List[FeatureExtractor] = list() + + if format_ == capa.features.extractors.common.FORMAT_PE: + file_extractors.append(capa.features.extractors.pefile.PefileFeatureExtractor(sample)) + + dnfile_extractor = capa.features.extractors.dnfile_.DnfileFeatureExtractor(sample) + if dnfile_extractor.is_dotnet_file(): + file_extractors.append(dnfile_extractor) + + elif format_ == capa.features.extractors.common.FORMAT_ELF: + file_extractors.append(capa.features.extractors.elffile.ElfFeatureExtractor(sample)) + + return file_extractors + + def is_nursery_rule_path(path: str) -> bool: """ The nursery is a spot for rules that have not yet been fully polished. @@ -652,7 +665,7 @@ def collect_metadata(argv, sample_path, rules_path, extractor): if rules_path != RULES_PATH_DEFAULT_STRING: rules_path = os.path.abspath(os.path.normpath(rules_path)) - format = get_format(sample_path) + format_ = get_format(sample_path) arch = get_arch(sample_path) os_ = get_os(sample_path) @@ -667,7 +680,7 @@ def collect_metadata(argv, sample_path, rules_path, extractor): "path": os.path.normpath(sample_path), }, "analysis": { - "format": format, + "format": format_, "arch": arch, "os": os_, "extractor": extractor.__class__.__name__, @@ -782,19 +795,20 @@ def install_common_args(parser, wanted=None): if "format" in wanted: formats = [ - ("auto", "(default) detect file type automatically"), - ("pe", "Windows PE file"), - ("elf", "Executable and Linkable Format"), - ("sc32", "32-bit shellcode"), - ("sc64", "64-bit shellcode"), - ("freeze", "features previously frozen by capa"), + (FORMAT_AUTO, "(default) detect file type automatically"), + (FORMAT_PE, "Windows PE file"), + (FORMAT_DOTNET, ".NET PE file"), + (FORMAT_ELF, "Executable and Linkable Format"), + (FORMAT_SC32, "32-bit shellcode"), + (FORMAT_SC64, "64-bit shellcode"), + (FORMAT_FREEZE, "features previously frozen by capa"), ] format_help = ", ".join(["%s: %s" % (f[0], f[1]) for f in formats]) parser.add_argument( "-f", "--format", choices=[f[0] for f in formats], - default="auto", + default=FORMAT_AUTO, help="select sample format, %s" % format_help, ) @@ -963,13 +977,21 @@ def main(argv=None): return ret try: - taste = get_file_taste(args.sample) + _ = get_file_taste(args.sample) except IOError as e: # per our research there's not a programmatic way to render the IOError with non-ASCII filename unless we # handle the IOError separately and reach into the args logger.error("%s", e.args[0]) return E_MISSING_FILE + format_ = args.format + if format_ == FORMAT_AUTO: + try: + format_ = get_auto_format(args.sample) + except UnsupportedFormatError: + log_unsupported_format_error() + return E_INVALID_FILE_TYPE + try: rules = get_rules(args.rules, disable_progress=args.quiet) rules = capa.rules.RuleSet(rules) @@ -991,26 +1013,23 @@ def main(argv=None): logger.error("%s", str(e)) return E_INVALID_RULE - file_extractor = None - if args.format == "pe" or (args.format == "auto" and taste.startswith(b"MZ")): - # these pefile and elffile file feature extractors are pretty light weight: they don't do any code analysis. - # so we can fairly quickly determine if the given file has "pure" file-scope rules - # that indicate a limitation (like "file is packed based on section names") - # and avoid doing a full code analysis on difficult/impossible binaries. - try: - file_extractor = capa.features.extractors.pefile.PefileFeatureExtractor(args.sample) - except PEFormatError as e: - logger.error("Input file '%s' is not a valid PE file: %s", args.sample, str(e)) - return E_CORRUPT_FILE - - elif args.format == "elf" or (args.format == "auto" and taste.startswith(b"\x7fELF")): - try: - file_extractor = capa.features.extractors.elffile.ElfFeatureExtractor(args.sample) - except (ELFError, OverflowError) as e: - logger.error("Input file '%s' is not a valid ELF file: %s", args.sample, str(e)) - return E_CORRUPT_FILE - - if file_extractor: + # file feature extractors are pretty lightweight: they don't do any code analysis. + # so we can fairly quickly determine if the given file has "pure" file-scope rules + # that indicate a limitation (like "file is packed based on section names") + # and avoid doing a full code analysis on difficult/impossible binaries. + # + # this pass can inspect multiple file extractors, e.g., dotnet and pe to identify + # various limitations + try: + file_extractors = get_file_extractors(args.sample, format_) + except PEFormatError as e: + logger.error("Input file '%s' is not a valid PE file: %s", args.sample, str(e)) + return E_CORRUPT_FILE + except (ELFError, OverflowError) as e: + logger.error("Input file '%s' is not a valid ELF file: %s", args.sample, str(e)) + return E_CORRUPT_FILE + + for file_extractor in file_extractors: try: pure_file_capabilities, _ = find_file_capabilities(rules, file_extractor, {}) except PEFormatError as e: @@ -1029,58 +1048,37 @@ def main(argv=None): logger.debug("file limitation short circuit, won't analyze fully.") return E_FILE_LIMITATION - try: - if args.format == "pe" or (args.format == "auto" and taste.startswith(b"MZ")): - sig_paths = get_signatures(args.signatures) - else: - sig_paths = [] - logger.debug("skipping library code matching: only have PE signatures") - except (IOError) as e: - logger.error("%s", str(e)) - return E_INVALID_SIG + if isinstance(file_extractor, capa.features.extractors.dnfile_.DnfileFeatureExtractor): + format_ = FORMAT_DOTNET - if (args.format == "freeze") or (args.format == "auto" and capa.features.freeze.is_freeze(taste)): - format = "freeze" + if format_ == FORMAT_FREEZE: with open(args.sample, "rb") as f: extractor = capa.features.freeze.load(f.read()) else: - format = args.format - if format == "auto" and args.sample.endswith(EXTENSIONS_SHELLCODE_32): - format = "sc32" - elif format == "auto" and args.sample.endswith(EXTENSIONS_SHELLCODE_64): - format = "sc64" + try: + if format_ == FORMAT_PE: + sig_paths = get_signatures(args.signatures) + else: + sig_paths = [] + logger.debug("skipping library code matching: only have native PE signatures") + except IOError as e: + logger.error("%s", str(e)) + return E_INVALID_SIG should_save_workspace = os.environ.get("CAPA_SAVE_WORKSPACE") not in ("0", "no", "NO", "n", None) try: extractor = get_extractor( - args.sample, format, args.backend, sig_paths, should_save_workspace, disable_progress=args.quiet + args.sample, format_, args.backend, sig_paths, should_save_workspace, disable_progress=args.quiet ) except UnsupportedFormatError: - logger.error("-" * 80) - logger.error(" Input file does not appear to be a PE or ELF file.") - logger.error(" ") - logger.error( - " capa currently only supports analyzing PE and ELF files (or shellcode, when using --format sc32|sc64)." - ) - logger.error(" If you don't know the input file type, you can try using the `file` utility to guess it.") - logger.error("-" * 80) + log_unsupported_format_error() return E_INVALID_FILE_TYPE except UnsupportedArchError: - logger.error("-" * 80) - logger.error(" Input file does not appear to target a supported architecture.") - logger.error(" ") - logger.error(" capa currently only supports analyzing x86 (32- and 64-bit).") - logger.error("-" * 80) + log_unsupported_arch_error() return E_INVALID_FILE_ARCH except UnsupportedOSError: - logger.error("-" * 80) - logger.error(" Input file does not appear to target a supported OS.") - logger.error(" ") - logger.error( - " capa currently only supports analyzing executables for some operating systems (including Windows and Linux)." - ) - logger.error("-" * 80) + log_unsupported_os_error() return E_INVALID_FILE_OS meta = collect_metadata(argv, args.sample, args.rules, extractor) diff --git a/capa/render/json.py b/capa/render/json.py index a70f9122d..9f595d4a0 100644 --- a/capa/render/json.py +++ b/capa/render/json.py @@ -7,9 +7,9 @@ # See the License for the specific language governing permissions and limitations under the License. import json -import capa.render.result_document from capa.rules import RuleSet from capa.engine import MatchResults +from capa.render.result_document import convert_capabilities_to_result_document class CapaJsonObjectEncoder(json.JSONEncoder): @@ -27,7 +27,7 @@ def default(self, obj): def render(meta, rules: RuleSet, capabilities: MatchResults) -> str: return json.dumps( - capa.render.result_document.convert_capabilities_to_result_document(meta, rules, capabilities), + convert_capabilities_to_result_document(meta, rules, capabilities), cls=CapaJsonObjectEncoder, sort_keys=True, ) diff --git a/capa/render/result_document.py b/capa/render/result_document.py index 1a0bde69b..33e083fa4 100644 --- a/capa/render/result_document.py +++ b/capa/render/result_document.py @@ -7,7 +7,6 @@ # See the License for the specific language governing permissions and limitations under the License. import copy -import capa.rules import capa.engine import capa.render.utils import capa.features.common diff --git a/scripts/lint.py b/scripts/lint.py index f3f161640..f4ba39ed9 100644 --- a/scripts/lint.py +++ b/scripts/lint.py @@ -41,6 +41,7 @@ import capa.main import capa.rules import capa.engine +import capa.helpers import capa.features.insn import capa.features.common from capa.rules import Rule, RuleSet @@ -286,16 +287,16 @@ def get_sample_capabilities(ctx: Context, path: Path) -> Set[str]: logger.debug("found cached results: %s: %d capabilities", nice_path, len(ctx.capabilities_by_sample[path])) return ctx.capabilities_by_sample[path] - if nice_path.endswith(capa.main.EXTENSIONS_SHELLCODE_32): - format = "sc32" - elif nice_path.endswith(capa.main.EXTENSIONS_SHELLCODE_64): - format = "sc64" + if nice_path.endswith(capa.helpers.EXTENSIONS_SHELLCODE_32): + format_ = "sc32" + elif nice_path.endswith(capa.helpers.EXTENSIONS_SHELLCODE_64): + format_ = "sc64" else: - format = "auto" + format_ = "auto" logger.debug("analyzing sample: %s", nice_path) extractor = capa.main.get_extractor( - nice_path, format, capa.main.BACKEND_VIV, DEFAULT_SIGNATURES, False, disable_progress=True + nice_path, format_, capa.main.BACKEND_VIV, DEFAULT_SIGNATURES, False, disable_progress=True ) capabilities, _ = capa.main.find_capabilities(ctx.rules, extractor, disable_progress=True) diff --git a/scripts/show-capabilities-by-function.py b/scripts/show-capabilities-by-function.py index 4f5761b6b..b2af9446a 100644 --- a/scripts/show-capabilities-by-function.py +++ b/scripts/show-capabilities-by-function.py @@ -59,7 +59,9 @@ import capa.main import capa.rules import capa.engine +import capa.helpers import capa.features +import capa.exceptions import capa.render.utils as rutils import capa.features.freeze import capa.render.result_document @@ -162,25 +164,11 @@ def main(argv=None): extractor = capa.main.get_extractor( args.sample, args.format, args.backend, sig_paths, should_save_workspace ) - except capa.main.UnsupportedFormatError: - logger.error("-" * 80) - logger.error(" Input file does not appear to be a PE file.") - logger.error(" ") - logger.error( - " capa currently only supports analyzing PE files (or shellcode, when using --format sc32|sc64)." - ) - logger.error(" If you don't know the input file type, you can try using the `file` utility to guess it.") - logger.error("-" * 80) + except capa.exceptions.UnsupportedFormatError: + capa.helpers.log_unsupported_format_error() return -1 - except capa.main.UnsupportedRuntimeError: - logger.error("-" * 80) - logger.error(" Unsupported runtime or Python interpreter.") - logger.error(" ") - logger.error(" capa supports running under Python 2.7 using Vivisect for binary analysis.") - logger.error(" It can also run within IDA Pro, using either Python 2.7 or 3.5+.") - logger.error(" ") - logger.error(" If you're seeing this message on the command line, please ensure you're running Python 2.7.") - logger.error("-" * 80) + except capa.exceptions.UnsupportedRuntimeError: + capa.helpers.log_unsupported_runtime_error() return -1 meta = capa.main.collect_metadata(argv, args.sample, args.rules, extractor) diff --git a/scripts/show-features.py b/scripts/show-features.py index a4f7f3b21..a070f653b 100644 --- a/scripts/show-features.py +++ b/scripts/show-features.py @@ -75,8 +75,10 @@ import capa.engine import capa.helpers import capa.features +import capa.exceptions import capa.features.common import capa.features.freeze +from capa.helpers import log_unsupported_runtime_error logger = logging.getLogger("capa.show-features") @@ -113,25 +115,11 @@ def main(argv=None): extractor = capa.main.get_extractor( args.sample, args.format, args.backend, sig_paths, should_save_workspace ) - except capa.main.UnsupportedFormatError: - logger.error("-" * 80) - logger.error(" Input file does not appear to be a PE file.") - logger.error(" ") - logger.error( - " capa currently only supports analyzing PE files (or shellcode, when using --format sc32|sc64)." - ) - logger.error(" If you don't know the input file type, you can try using the `file` utility to guess it.") - logger.error("-" * 80) + except capa.exceptions.UnsupportedFormatError: + capa.helpers.log_unsupported_format_error() return -1 - except capa.main.UnsupportedRuntimeError: - logger.error("-" * 80) - logger.error(" Unsupported runtime or Python interpreter.") - logger.error(" ") - logger.error(" capa supports running under Python 2.7 using Vivisect for binary analysis.") - logger.error(" It can also run within IDA Pro, using either Python 2.7 or 3.5+.") - logger.error(" ") - logger.error(" If you're seeing this message on the command line, please ensure you're running Python 2.7.") - logger.error("-" * 80) + except capa.exceptions.UnsupportedRuntimeError: + log_unsupported_runtime_error() return -1 if not args.function: diff --git a/setup.py b/setup.py index f49a07c3b..f07ae6c98 100644 --- a/setup.py +++ b/setup.py @@ -26,6 +26,7 @@ "smda==1.7.1", "pefile==2021.9.3", "pyelftools==0.28", + "dnfile==0.10.0", ] # this sets __version__ diff --git a/tests/fixtures.py b/tests/fixtures.py index 8630bccf1..b544304b4 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -224,6 +224,8 @@ def get_data_path_by_name(name): return os.path.join(CD, "data", "79abd17391adc6251ecdc58d13d76baf.dll_") elif name.startswith("946a9"): return os.path.join(CD, "data", "946a99f36a46d335dec080d9a4371940.dll_") + elif name.startswith("b9f5b"): + return os.path.join(CD, "data", "b9f5bd514485fb06da39beff051b9fdc.exe_") else: raise ValueError("unexpected sample fixture: %s" % name) @@ -276,7 +278,9 @@ def get_sample_md5_by_name(name): elif name.startswith("79abd"): return "79abd17391adc6251ecdc58d13d76baf" elif name.startswith("946a9"): - return "946a99f36a46d335dec080d9a4371940.dll_" + return "946a99f36a46d335dec080d9a4371940" + elif name.startswith("b9f5b"): + return "b9f5bd514485fb06da39beff051b9fdc" else: raise ValueError("unexpected sample fixture: %s" % name) @@ -583,6 +587,16 @@ def parametrize(params, values, **kwargs): key=lambda t: (t[0], t[1]), ) +FEATURE_PRESENCE_TESTS_DOTNET = sorted( + [ + ("b9f5b", "file", Arch(ARCH_I386), True), + ("b9f5b", "file", Arch(ARCH_AMD64), False), + ], + # order tests by (file, item) + # so that our LRU cache is most effective. + key=lambda t: (t[0], t[1]), +) + FEATURE_PRESENCE_TESTS_IDA = [ # file/imports # IDA can recover more names of APIs imported by ordinal @@ -695,3 +709,8 @@ def al_khaser_x86_extractor(): @pytest.fixture def pingtaest_extractor(): return get_extractor(get_data_path_by_name("pingtaest")) + + +@pytest.fixture +def b9f5b_extractor(): + return get_extractor(get_data_path_by_name("b9f5b")) diff --git a/tests/test_dotnet_features.py b/tests/test_dotnet_features.py new file mode 100644 index 000000000..449d7b555 --- /dev/null +++ b/tests/test_dotnet_features.py @@ -0,0 +1,25 @@ +# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. + +# b9f5bd514485fb06da39beff051b9fdc + +import pytest +import fixtures +from fixtures import * +from fixtures import parametrize + +import capa.features.file + + +@parametrize( + "sample,scope,feature,expected", + fixtures.FEATURE_PRESENCE_TESTS_DOTNET, + indirect=["sample", "scope"], +) +def test_dnfile_features(sample, scope, feature, expected): + fixtures.do_test_feature_presence(fixtures.get_pefile_extractor, sample, scope, feature, expected) From 97e76a88e3a4fbab51403dbafa0d441ed2dcff9b Mon Sep 17 00:00:00 2001 From: Moritz Raabe Date: Wed, 6 Apr 2022 17:29:12 +0200 Subject: [PATCH 43/43] fix: imports and add tests --- capa/features/extractors/dnfile_.py | 14 +++++++++----- capa/features/freeze.py | 1 + capa/main.py | 2 +- tests/fixtures.py | 12 +++++++++++- tests/test_dotnet_features.py | 16 +++++++++++++++- 5 files changed, 37 insertions(+), 8 deletions(-) diff --git a/capa/features/extractors/dnfile_.py b/capa/features/extractors/dnfile_.py index c20fd32b5..715e8a5f2 100644 --- a/capa/features/extractors/dnfile_.py +++ b/capa/features/extractors/dnfile_.py @@ -2,6 +2,7 @@ from typing import Tuple, Iterator import dnfile +import pefile from capa.features.common import OS, OS_ANY, ARCH_ANY, ARCH_I386, ARCH_AMD64, FORMAT_DOTNET, Arch, Format, Feature from capa.features.extractors.base_extractor import FeatureExtractor @@ -18,11 +19,11 @@ def extract_file_os(**kwargs): def extract_file_arch(pe, **kwargs): - # TODO differences for versions < 4.5? - # via https://stackoverflow.com/a/23614024/10548020 - if pe.net.Flags.CLR_32BITREQUIRED and pe.net.Flags.CLR_PREFER_32BIT: + # to distinguish in more detail, see https://stackoverflow.com/a/23614024/10548020 + # .NET 4.5 added option: any CPU, 32-bit preferred + if pe.net.Flags.CLR_32BITREQUIRED and pe.PE_TYPE == pefile.OPTIONAL_HEADER_MAGIC_PE: yield Arch(ARCH_I386), 0x0 - elif not pe.net.Flags.CLR_32BITREQUIRED and not pe.net.Flags.CLR_PREFER_32BIT: + elif not pe.net.Flags.CLR_32BITREQUIRED and pe.PE_TYPE == pefile.OPTIONAL_HEADER_MAGIC_PE_PLUS: yield Arch(ARCH_AMD64), 0x0 else: yield Arch(ARCH_ANY), 0x0 @@ -63,6 +64,9 @@ def __init__(self, path: str): self.pe: dnfile.dnPE = dnfile.dnPE(path) def get_base_address(self) -> int: + return 0x0 + + def get_entry_point(self) -> int: return self.pe.net.struct.EntryPointTokenOrRva def extract_global_features(self): @@ -78,7 +82,7 @@ def get_runtime_version(self) -> Tuple[int, int]: return self.pe.net.struct.MajorRuntimeVersion, self.pe.net.struct.MinorRuntimeVersion def get_meta_version_string(self) -> str: - return self.pe.net.metadata.struct.Version.decode("utf-8") + return self.pe.net.metadata.struct.Version.rstrip(b"\x00").decode("utf-8") def get_functions(self): raise NotImplementedError("DnfileFeatureExtractor can only be used to extract file features") diff --git a/capa/features/freeze.py b/capa/features/freeze.py index c86d9165d..ff465f778 100644 --- a/capa/features/freeze.py +++ b/capa/features/freeze.py @@ -59,6 +59,7 @@ import capa.features.common import capa.features.basicblock import capa.features.extractors.base_extractor +from capa.features.common import Feature logger = logging.getLogger(__name__) diff --git a/capa/main.py b/capa/main.py index eee2c2946..7499afc3a 100644 --- a/capa/main.py +++ b/capa/main.py @@ -63,7 +63,7 @@ FORMAT_DOTNET, FORMAT_FREEZE, ) -from capa.features.extractors.base_extractor import FunctionHandle, FeatureExtractor +from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, FeatureExtractor RULES_PATH_DEFAULT_STRING = "(embedded rules)" SIGNATURES_PATH_DEFAULT_STRING = "(embedded signatures)" diff --git a/tests/fixtures.py b/tests/fixtures.py index b544304b4..c236d43f9 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -24,6 +24,7 @@ import capa.features.basicblock from capa.features.common import ( OS, + OS_ANY, OS_LINUX, ARCH_I386, FORMAT_PE, @@ -32,6 +33,7 @@ OS_WINDOWS, BITNESS_X32, BITNESS_X64, + FORMAT_DOTNET, Arch, Format, ) @@ -134,6 +136,12 @@ def get_pefile_extractor(path): return capa.features.extractors.pefile.PefileFeatureExtractor(path) +def get_dnfile_extractor(path): + import capa.features.extractors.dnfile_ + + return capa.features.extractors.dnfile_.DnfileFeatureExtractor(path) + + def extract_global_features(extractor): features = collections.defaultdict(set) for feature, va in extractor.extract_global_features(): @@ -591,6 +599,8 @@ def parametrize(params, values, **kwargs): [ ("b9f5b", "file", Arch(ARCH_I386), True), ("b9f5b", "file", Arch(ARCH_AMD64), False), + ("b9f5b", "file", OS(OS_ANY), True), + ("b9f5b", "file", Format(FORMAT_DOTNET), True), ], # order tests by (file, item) # so that our LRU cache is most effective. @@ -713,4 +723,4 @@ def pingtaest_extractor(): @pytest.fixture def b9f5b_extractor(): - return get_extractor(get_data_path_by_name("b9f5b")) + return get_dnfile_extractor(get_data_path_by_name("b9f5b")) diff --git a/tests/test_dotnet_features.py b/tests/test_dotnet_features.py index 449d7b555..10b5183f8 100644 --- a/tests/test_dotnet_features.py +++ b/tests/test_dotnet_features.py @@ -22,4 +22,18 @@ indirect=["sample", "scope"], ) def test_dnfile_features(sample, scope, feature, expected): - fixtures.do_test_feature_presence(fixtures.get_pefile_extractor, sample, scope, feature, expected) + fixtures.do_test_feature_presence(fixtures.get_dnfile_extractor, sample, scope, feature, expected) + + +@parametrize( + "function,expected", + [ + ("is_dotnet_file", True), + ("get_entry_point", 0x6000007), + ("get_runtime_version", (2, 5)), + ("get_meta_version_string", "v2.0.50727"), + ], +) +def test_dnfile_extractor(b9f5b_extractor, function, expected): + func = getattr(b9f5b_extractor, function) + assert func() == expected