diff --git a/capa/features/extractors/dotnet/helpers.py b/capa/features/extractors/dotnet/helpers.py
index 8bd1ef9c80..a7511a5b46 100644
--- a/capa/features/extractors/dotnet/helpers.py
+++ b/capa/features/extractors/dotnet/helpers.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Dict, Tuple, Generator, Any
+from itertools import chain
 
 if TYPE_CHECKING:
     from dnfile.mdtable import MemberRefRow
@@ -10,7 +11,7 @@
 import dnfile
 from dnfile.enums import MetadataTables
 from dncil.cil.body import CilMethodBody
-from dncil.clr.token import Token, InvalidToken
+from dncil.clr.token import Token, StringToken, InvalidToken
 from dncil.cil.body.reader import CilMethodBodyReaderBase
 
 # key indexes to dotnet metadata tables
@@ -18,54 +19,128 @@
 
 
 class DnfileMethodBodyReader(CilMethodBodyReaderBase):
-    def __init__(self, pe: dnfile.dnPE, row: MethodDefRow):
+    def __init__(self, pe: dnPE, row: MethodDefRow):
         """ """
-        self.pe = pe
-        self.rva = self.pe.get_offset_from_rva(row.Rva)
+        self.pe: dnPE = pe
+        self.offset: int = self.pe.get_offset_from_rva(row.Rva)
 
-    def read(self, n):
+    def read(self, n: int) -> bytes:
         """ """
-        data = self.pe.get_data(self.pe.get_rva_from_offset(self.rva), n)
-        self.rva += n
+        data: bytes = self.pe.get_data(self.pe.get_rva_from_offset(self.offset), n)
+        self.offset += n
         return data
 
-    def tell(self):
+    def tell(self) -> int:
         """ """
-        return self.rva
+        return self.offset
 
-    def seek(self, rva):
+    def seek(self, offset: int) -> int:
         """ """
-        self.rva = rva
+        self.offset = offset
+        return self.offset
 
-    def get_token(self, value, is_str=False):
-        """ """
-        token = Token(value)
 
-        if is_str:
-            return self.pe.net.user_strings.get_us(token.rid).value
+def make_token(table: int, rid: int) -> int:
+    """build a metadata token value from a table index and a row id (rid)"""
+    return ((table & 0xFF) << Token.TABLE_SHIFT) | (rid & Token.RID_MASK)
+
 
-        table_name = DOTNET_META_TABLES_BY_INDEX.get(token.table, "")
-        if not table_name:
-            # table_index is not valid
-            return InvalidToken(token.value)
+def resolve_token(pe: dnPE, token: Token) -> Any:
+    """resolve token to its user string or metadata table row; InvalidToken when it cannot be resolved"""
+    if isinstance(token, StringToken):
+        return pe.net.user_strings.get_us(token.rid).value
 
-        table = getattr(self.pe.net.mdtables, table_name, None)
-        if table is None:
-            # table index is valid but table is not present
-            return InvalidToken(token.value)
+    table_name: str = DOTNET_META_TABLES_BY_INDEX.get(token.table, "")
+    if not table_name:
+        # table_index is not valid
+        return InvalidToken(token.value)
 
-        try:
-            return table.rows[token.rid - 1]
-        except IndexError:
-            # table index is valid but row index is not valid
-            return InvalidToken(token.value)
+    table: Any = getattr(pe.net.mdtables, table_name, None)
+    if table is None:
+        # table index is valid but table is not present
+        return InvalidToken(token.value)
 
+    try:
+        return table.rows[token.rid - 1]
+    except IndexError:
+        # table index is valid but row index is not valid
+        return InvalidToken(token.value)
 
-def read_dotnet_method_body(pe: dnPE, row: MethodDefRow) -> CilMethodBody:
+
+def get_method_body(pe: dnPE, row: MethodDefRow) -> CilMethodBody:
     """ """
     return CilMethodBody(DnfileMethodBodyReader(pe, row))
 
 
-def get_imported_class_name(row: MemberRefRow) -> str:
+def get_class_import_name(row: MemberRefRow) -> str:
     """ """
     return f"{row.Class.row.TypeNamespace}.{row.Class.row.TypeName}"
+
+
+def get_class_imports(pe: dnPE) -> Generator[Tuple[int, str], None, None]:
+    """parse class imports
+
+    see https://www.ntcore.com/files/dotnetformat.htm
+
+    10 - MemberRef Table
+        Each row represents an imported method
+            Class (index into the TypeRef, ModuleRef, MethodDef, TypeSpec or TypeDef tables)
+            Name (index into String heap)
+    01 - TypeRef Table
+        Each row represents an imported class, its namespace and the assembly which contains it
+            TypeName (index into String heap)
+            TypeNamespace (index into String heap)
+    """
+    if not hasattr(pe.net.mdtables, "MemberRef"):
+        return
+
+    for (rid, row) in enumerate(pe.net.mdtables.MemberRef):
+        if not isinstance(row.Class.row, (dnfile.mdtable.TypeRefRow,)):
+            continue
+
+        class_imp = f"{get_class_import_name(row)}::{row.Name}"
+        token = make_token(MetadataTables.MemberRef.value, rid + 1)
+
+        yield token, class_imp
+
+
+def get_native_imports(pe: dnPE) -> Generator[Tuple[int, str], None, None]:
+    """parse native imports
+
+    see https://www.ntcore.com/files/dotnetformat.htm
+
+    28 - ImplMap Table
+        ImplMap table holds information about unmanaged methods that can be reached from managed code, using PInvoke dispatch
+            MemberForwarded (index into the Field or MethodDef table; more precisely, a MemberForwarded coded index)
+            ImportName (index into the String heap)
+            ImportScope (index into the ModuleRef table)
+    """
+    if not hasattr(pe.net.mdtables, "ImplMap"):
+        return
+
+    for row in pe.net.mdtables.ImplMap:
+        dll: str = row.ImportScope.row.Name
+        symbol: str = row.ImportName
+
+        # like Kernel32.dll
+        if dll and "." in dll:
+            dll = dll.split(".")[0].lower()
+
+        # like kernel32.CreateFileA
+        native_imp: str = f"{dll}.{symbol}"
+
+        # ECMA says "Each row of the ImplMap table associates a row in the MethodDef table (MemberForwarded) with the
+        # name of a routine (ImportName) in some unmanaged DLL (ImportScope)"; so we calculate and map the MemberForwarded
+        # MethodDef table token to help us later record native import method calls made from CIL
+        member_forwarded_token = make_token(row.MemberForwarded.table.number, row.MemberForwarded.row_index)
+
+        yield member_forwarded_token, native_imp
+
+
+def get_imports(pe: dnPE) -> Dict[int, str]:
+    """map import tokens to import names, combining class imports and native imports"""
+    imps: Dict[int, str] = {}
+
+    for (token, imp) in chain(get_class_imports(pe), get_native_imports(pe)):
+        imps[token] = imp
+    return imps
diff --git a/capa/features/extractors/dotnet/insn.py b/capa/features/extractors/dotnet/insn.py
index 3272d2d411..417c7234f9 100644
--- a/capa/features/extractors/dotnet/insn.py
+++ b/capa/features/extractors/dotnet/insn.py
@@ -1,6 +1,6 @@
from __future__ import annotations -from typing import TYPE_CHECKING, List, Tuple, Union, Callable, Generator +from typing import TYPE_CHECKING, Dict, List, Tuple, Union, Callable, Generator, Any if TYPE_CHECKING: from dncil.cil.instruction import Instruction @@ -17,25 +17,28 @@ from capa.features.common import String +def get_imports(ctx): + """ """ + if "imports_cache" not in ctx: + ctx["imports_cache"] = capa.features.extractors.dotnet.helpers.get_imports(ctx["pe"]) + return ctx["imports_cache"] + + def extract_insn_api_features(f: CilMethodBody, insn: Instruction) -> Generator[Tuple[API, int], None, None]: - """parse instruction API features - - see https://www.ntcore.com/files/dotnetformat.htm - - 10 - MemberRef Table - Each row represents an imported method. - Class (index into the TypeRef, ModuleRef, MethodDef, TypeSpec or TypeDef tables) - 01 - TypeRef Table - Each row represents an imported class, its namespace and the assembly which contains it. - TypeName (index into String heap) - TypeNamespace (index into String heap) - """ - if insn.opcode in (OpCodes.Call, OpCodes.Callvirt, OpCodes.Jmp, OpCodes.Calli): - if isinstance(insn.operand, dnfile.mdtable.MemberRefRow): - if isinstance(insn.operand.Class.row, (dnfile.mdtable.TypeRefRow,)): - class_name = capa.features.extractors.dotnet.helpers.get_imported_class_name(insn.operand) - method_name = insn.operand.Name - yield API(f"{class_name}::{method_name}"), insn.offset + """parse instruction API features""" + if insn.opcode not in (OpCodes.Call, OpCodes.Callvirt, OpCodes.Jmp, OpCodes.Calli): + return + + name = get_imports(f.ctx).get(insn.operand.value, "") + if not name: + return + + if "::" in name: + yield API(name), insn.offset + else: + dll, _, symbol = name.rpartition(".") + for name_variant in capa.features.extractors.helpers.generate_symbols(dll, symbol): + yield API(name_variant), insn.offset def extract_insn_number_features(f: CilMethodBody, insn: Instruction) -> Generator[Tuple[Number, int], None, 
None]: @@ -47,7 +50,8 @@ def extract_insn_number_features(f: CilMethodBody, insn: Instruction) -> Generat def extract_insn_string_features(f: CilMethodBody, insn: Instruction) -> Generator[Tuple[String, int], None, None]: """parse instruction string features""" if insn.is_ldstr(): - yield String(insn.operand), insn.offset + user_string = capa.features.extractors.dotnet.helpers.resolve_token(f.ctx["pe"], insn.operand) + yield String(user_string), insn.offset def extract_features( @@ -68,16 +72,25 @@ def extract_features( def main(args): """ """ - dn = dnfile.dnPE(args.path) + pe: dnPE = dnfile.dnPE(args.path) + + # data structure shared across functions yielded here. + # useful for caching analysis relevant across a single workspace. + ctx = {} + ctx["pe"] = pe + + features: List[Any] = [] + for row in pe.net.mdtables.MethodDef: + if not row.ImplFlags.miIL or any((row.Flags.mdAbstract, row.Flags.mdPinvokeImpl)): + continue + + try: + body: CilMethodBody = get_method_body(pe, row) + except MethodBodyFormatError as e: + print(e) + continue - features = [] - for row in dn.net.mdtables.MethodDef: - if row.ImplFlags.miIL: - try: - body = read_dotnet_method_body(dn, row) - except MethodBodyFormatError as e: - print(e) - continue + setattr(body, "ctx", ctx) for insn in body.instructions: features.extend(list(extract_features(body, insn))) @@ -91,7 +104,7 @@ def main(args): """ """ import argparse - from capa.features.extractors.dotnet.helpers import read_dotnet_method_body + from capa.features.extractors.dotnet.helpers import get_method_body parser = argparse.ArgumentParser(prog="parse instruction features from .NET PE") parser.add_argument("path", type=str, help="full path to .NET PE")