diff --git a/.gitattributes b/.gitattributes index 486a2325..3e80b98a 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1 +1,3 @@ *.zip filter=lfs diff=lfs merge=lfs -text +*.pdf filter=lfs diff=lfs merge=lfs -text +*.png filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore index 8b2342e7..6b697b88 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ samples/** dump.bin *.yara *.traits +dump.json diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..4450283f --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "docs/theme"] + path = docs/theme + url = https://github.com/jothepro/doxygen-awesome-css.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 00bc6159..4d813065 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,38 @@ -cmake_minimum_required(VERSION 3.16.3) +cmake_minimum_required(VERSION 3.5) project(binlex) -add_executable(${PROJECT_NAME} main.cpp) +set(SOURCES_BINLEX + src/binlex.cpp + src/args.cpp + src/raw.cpp + src/common.cpp + src/blelf.cpp + src/pe.cpp + src/decompiler.cpp + src/decompiler_rev.cpp +) -target_link_libraries(${PROJECT_NAME} -lcapstone) +add_executable(binlex ${SOURCES_BINLEX}) + +target_link_libraries(binlex -lcapstone -lcrypto -lm) + +target_include_directories(binlex + PRIVATE + ${PROJECT_SOURCE_DIR}/include +) + +project(blyara) + +set(SOURCES_BLYARA + src/blyara.cpp +) + +add_executable(blyara ${SOURCES_BLYARA}) + +target_link_libraries(blyara -lcapstone -lcrypto -lm) + +target_include_directories(blyara + PRIVATE + ${PROJECT_SOURCE_DIR}/include +) diff --git a/Doxyfile b/Doxyfile new file mode 100644 index 00000000..7a50b674 --- /dev/null +++ b/Doxyfile @@ -0,0 +1,385 @@ +# Doxyfile 1.8.17 + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- +DOXYFILE_ENCODING = UTF-8 +PROJECT_NAME = "binlex" +PROJECT_NUMBER = 
+PROJECT_BRIEF = +PROJECT_LOGO = +OUTPUT_DIRECTORY = build/docs/ +CREATE_SUBDIRS = NO +ALLOW_UNICODE_NAMES = NO +OUTPUT_LANGUAGE = English +OUTPUT_TEXT_DIRECTION = None +BRIEF_MEMBER_DESC = YES +REPEAT_BRIEF = YES +ABBREVIATE_BRIEF = "The $name class" \ + "The $name widget" \ + "The $name file" \ + is \ + provides \ + specifies \ + contains \ + represents \ + a \ + an \ + the +ALWAYS_DETAILED_SEC = NO +INLINE_INHERITED_MEMB = NO +FULL_PATH_NAMES = YES +STRIP_FROM_PATH = +STRIP_FROM_INC_PATH = +SHORT_NAMES = NO +JAVADOC_AUTOBRIEF = NO +JAVADOC_BANNER = NO +QT_AUTOBRIEF = NO +MULTILINE_CPP_IS_BRIEF = NO +INHERIT_DOCS = YES +SEPARATE_MEMBER_PAGES = NO +TAB_SIZE = 4 +ALIASES = +TCL_SUBST = +OPTIMIZE_OUTPUT_FOR_C = NO +OPTIMIZE_OUTPUT_JAVA = NO +OPTIMIZE_FOR_FORTRAN = NO +OPTIMIZE_OUTPUT_VHDL = NO +OPTIMIZE_OUTPUT_SLICE = NO +EXTENSION_MAPPING = +MARKDOWN_SUPPORT = YES +TOC_INCLUDE_HEADINGS = 5 +AUTOLINK_SUPPORT = YES +BUILTIN_STL_SUPPORT = NO +CPP_CLI_SUPPORT = NO +SIP_SUPPORT = NO +IDL_PROPERTY_SUPPORT = YES +DISTRIBUTE_GROUP_DOC = NO +GROUP_NESTED_COMPOUNDS = NO +SUBGROUPING = YES +INLINE_GROUPED_CLASSES = NO +INLINE_SIMPLE_STRUCTS = NO +TYPEDEF_HIDES_STRUCT = NO +LOOKUP_CACHE_SIZE = 0 +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- +EXTRACT_ALL = NO +EXTRACT_PRIVATE = NO +EXTRACT_PRIV_VIRTUAL = NO +EXTRACT_PACKAGE = NO +EXTRACT_STATIC = NO +EXTRACT_LOCAL_CLASSES = YES +EXTRACT_LOCAL_METHODS = NO +EXTRACT_ANON_NSPACES = NO +HIDE_UNDOC_MEMBERS = NO +HIDE_UNDOC_CLASSES = NO +HIDE_FRIEND_COMPOUNDS = NO +HIDE_IN_BODY_DOCS = NO +INTERNAL_DOCS = NO +CASE_SENSE_NAMES = YES +HIDE_SCOPE_NAMES = NO +HIDE_COMPOUND_REFERENCE= NO +SHOW_INCLUDE_FILES = YES +SHOW_GROUPED_MEMB_INC = NO +FORCE_LOCAL_INCLUDES = NO +INLINE_INFO = YES +SORT_MEMBER_DOCS = YES +SORT_BRIEF_DOCS = NO +SORT_MEMBERS_CTORS_1ST = NO +SORT_GROUP_NAMES = NO 
+SORT_BY_SCOPE_NAME = NO +STRICT_PROTO_MATCHING = NO +GENERATE_TODOLIST = YES +GENERATE_TESTLIST = YES +GENERATE_BUGLIST = YES +GENERATE_DEPRECATEDLIST= YES +ENABLED_SECTIONS = +MAX_INITIALIZER_LINES = 30 +SHOW_USED_FILES = YES +SHOW_FILES = YES +SHOW_NAMESPACES = YES +FILE_VERSION_FILTER = +LAYOUT_FILE = +CITE_BIB_FILES = +#--------------------------------------------------------------------------- +# Configuration options related to warning and progress messages +#--------------------------------------------------------------------------- +QUIET = NO +WARNINGS = YES +WARN_IF_UNDOCUMENTED = YES +WARN_IF_DOC_ERROR = YES +WARN_NO_PARAMDOC = NO +WARN_AS_ERROR = NO +WARN_FORMAT = "$file:$line: $text" +WARN_LOGFILE = +#--------------------------------------------------------------------------- +# Configuration options related to the input files +#--------------------------------------------------------------------------- +INPUT = README.md include/ +INPUT_ENCODING = UTF-8 +FILE_PATTERNS = *.c \ + *.cc \ + *.cxx \ + *.cpp \ + *.c++ \ + *.java \ + *.ii \ + *.ixx \ + *.ipp \ + *.i++ \ + *.inl \ + *.idl \ + *.ddl \ + *.odl \ + *.h \ + *.hh \ + *.hxx \ + *.hpp \ + *.h++ \ + *.cs \ + *.d \ + *.php \ + *.php4 \ + *.php5 \ + *.phtml \ + *.inc \ + *.m \ + *.markdown \ + *.md \ + *.mm \ + *.dox \ + *.doc \ + *.txt \ + *.py \ + *.pyw \ + *.f90 \ + *.f95 \ + *.f03 \ + *.f08 \ + *.f \ + *.for \ + *.tcl \ + *.vhd \ + *.vhdl \ + *.ucf \ + *.qsf \ + *.ice +RECURSIVE = NO +EXCLUDE = include/json.h +EXCLUDE_SYMLINKS = NO +EXCLUDE_PATTERNS = +EXCLUDE_SYMBOLS = +EXAMPLE_PATH = +EXAMPLE_PATTERNS = * +EXAMPLE_RECURSIVE = NO +IMAGE_PATH = +INPUT_FILTER = +FILTER_PATTERNS = +FILTER_SOURCE_FILES = NO +FILTER_SOURCE_PATTERNS = +USE_MDFILE_AS_MAINPAGE = README.md +#--------------------------------------------------------------------------- +# Configuration options related to source browsing +#--------------------------------------------------------------------------- +SOURCE_BROWSER = NO 
+INLINE_SOURCES = NO +STRIP_CODE_COMMENTS = YES +REFERENCED_BY_RELATION = NO +REFERENCES_RELATION = NO +REFERENCES_LINK_SOURCE = YES +SOURCE_TOOLTIPS = YES +USE_HTAGS = NO +VERBATIM_HEADERS = YES +CLANG_ASSISTED_PARSING = NO +CLANG_OPTIONS = +CLANG_DATABASE_PATH = +#--------------------------------------------------------------------------- +# Configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- +ALPHABETICAL_INDEX = YES +COLS_IN_ALPHA_INDEX = 5 +IGNORE_PREFIX = +#--------------------------------------------------------------------------- +# Configuration options related to the HTML output +#--------------------------------------------------------------------------- +GENERATE_HTML = YES +HTML_OUTPUT = html +HTML_FILE_EXTENSION = .html +HTML_HEADER = +HTML_FOOTER = +HTML_STYLESHEET = +HTML_EXTRA_STYLESHEET = docs/theme/doxygen-awesome.css +HTML_EXTRA_FILES = +HTML_COLORSTYLE_HUE = 220 +HTML_COLORSTYLE_SAT = 100 +HTML_COLORSTYLE_GAMMA = 80 +HTML_TIMESTAMP = NO +HTML_DYNAMIC_MENUS = YES +HTML_DYNAMIC_SECTIONS = NO +HTML_INDEX_NUM_ENTRIES = 100 +GENERATE_DOCSET = NO +DOCSET_FEEDNAME = "Doxygen generated docs" +DOCSET_BUNDLE_ID = org.doxygen.Project +DOCSET_PUBLISHER_ID = org.doxygen.Publisher +DOCSET_PUBLISHER_NAME = Publisher +GENERATE_HTMLHELP = NO +CHM_FILE = +HHC_LOCATION = +GENERATE_CHI = NO +CHM_INDEX_ENCODING = +BINARY_TOC = NO +TOC_EXPAND = NO +GENERATE_QHP = NO +QCH_FILE = +QHP_NAMESPACE = org.doxygen.Project +QHP_VIRTUAL_FOLDER = doc +QHP_CUST_FILTER_NAME = +QHP_CUST_FILTER_ATTRS = +QHP_SECT_FILTER_ATTRS = +QHG_LOCATION = +GENERATE_ECLIPSEHELP = NO +ECLIPSE_DOC_ID = org.doxygen.Project +DISABLE_INDEX = NO +GENERATE_TREEVIEW = YES +ENUM_VALUES_PER_LINE = 4 +TREEVIEW_WIDTH = 250 +EXT_LINKS_IN_WINDOW = NO +FORMULA_FONTSIZE = 10 +FORMULA_TRANSPARENT = YES +FORMULA_MACROFILE = +USE_MATHJAX = NO +MATHJAX_FORMAT = HTML-CSS +MATHJAX_RELPATH = 
https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/ +MATHJAX_EXTENSIONS = +MATHJAX_CODEFILE = +SEARCHENGINE = YES +SERVER_BASED_SEARCH = NO +EXTERNAL_SEARCH = NO +SEARCHENGINE_URL = +SEARCHDATA_FILE = searchdata.xml +EXTERNAL_SEARCH_ID = +EXTRA_SEARCH_MAPPINGS = +#--------------------------------------------------------------------------- +# Configuration options related to the LaTeX output +#--------------------------------------------------------------------------- +GENERATE_LATEX = NO +LATEX_OUTPUT = latex +LATEX_CMD_NAME = +MAKEINDEX_CMD_NAME = makeindex +LATEX_MAKEINDEX_CMD = makeindex +COMPACT_LATEX = NO +PAPER_TYPE = a4 +EXTRA_PACKAGES = +LATEX_HEADER = +LATEX_FOOTER = +LATEX_EXTRA_STYLESHEET = +LATEX_EXTRA_FILES = +PDF_HYPERLINKS = YES +USE_PDFLATEX = YES +LATEX_BATCHMODE = NO +LATEX_HIDE_INDICES = NO +LATEX_SOURCE_CODE = NO +LATEX_BIB_STYLE = plain +LATEX_TIMESTAMP = NO +LATEX_EMOJI_DIRECTORY = +#--------------------------------------------------------------------------- +# Configuration options related to the RTF output +#--------------------------------------------------------------------------- +GENERATE_RTF = NO +RTF_OUTPUT = rtf +COMPACT_RTF = NO +RTF_HYPERLINKS = NO +RTF_STYLESHEET_FILE = +RTF_EXTENSIONS_FILE = +RTF_SOURCE_CODE = NO +#--------------------------------------------------------------------------- +# Configuration options related to the man page output +#--------------------------------------------------------------------------- +GENERATE_MAN = NO +MAN_OUTPUT = man +MAN_EXTENSION = .3 +MAN_SUBDIR = +MAN_LINKS = NO +#--------------------------------------------------------------------------- +# Configuration options related to the XML output +#--------------------------------------------------------------------------- +GENERATE_XML = NO +XML_OUTPUT = xml +XML_PROGRAMLISTING = YES +XML_NS_MEMB_FILE_SCOPE = NO +#--------------------------------------------------------------------------- +# Configuration options related to the DOCBOOK 
output +#--------------------------------------------------------------------------- +GENERATE_DOCBOOK = NO +DOCBOOK_OUTPUT = docbook +DOCBOOK_PROGRAMLISTING = NO +#--------------------------------------------------------------------------- +# Configuration options for the AutoGen Definitions output +#--------------------------------------------------------------------------- +GENERATE_AUTOGEN_DEF = NO +#--------------------------------------------------------------------------- +# Configuration options related to the Perl module output +#--------------------------------------------------------------------------- +GENERATE_PERLMOD = NO +PERLMOD_LATEX = NO +PERLMOD_PRETTY = YES +PERLMOD_MAKEVAR_PREFIX = +#--------------------------------------------------------------------------- +# Configuration options related to the preprocessor +#--------------------------------------------------------------------------- +ENABLE_PREPROCESSING = YES +MACRO_EXPANSION = NO +EXPAND_ONLY_PREDEF = NO +SEARCH_INCLUDES = YES +INCLUDE_PATH = +INCLUDE_FILE_PATTERNS = +PREDEFINED = +EXPAND_AS_DEFINED = +SKIP_FUNCTION_MACROS = YES +#--------------------------------------------------------------------------- +# Configuration options related to external references +#--------------------------------------------------------------------------- +TAGFILES = +GENERATE_TAGFILE = +ALLEXTERNALS = NO +EXTERNAL_GROUPS = YES +EXTERNAL_PAGES = YES +#--------------------------------------------------------------------------- +# Configuration options related to the dot tool +#--------------------------------------------------------------------------- +CLASS_DIAGRAMS = YES +DIA_PATH = +HIDE_UNDOC_RELATIONS = YES +HAVE_DOT = NO +DOT_NUM_THREADS = 0 +DOT_FONTNAME = Helvetica +DOT_FONTSIZE = 10 +DOT_FONTPATH = +CLASS_GRAPH = YES +COLLABORATION_GRAPH = YES +GROUP_GRAPHS = YES +UML_LOOK = NO +UML_LIMIT_NUM_FIELDS = 10 +TEMPLATE_RELATIONS = NO +INCLUDE_GRAPH = YES +INCLUDED_BY_GRAPH = YES +CALL_GRAPH = NO 
+CALLER_GRAPH = NO +GRAPHICAL_HIERARCHY = YES +DIRECTORY_GRAPH = YES +DOT_IMAGE_FORMAT = png +INTERACTIVE_SVG = NO +DOT_PATH = +DOTFILE_DIRS = +MSCFILE_DIRS = +DIAFILE_DIRS = +PLANTUML_JAR_PATH = +PLANTUML_CFG_FILE = +PLANTUML_INCLUDE_PATH = +DOT_GRAPH_MAX_NODES = 50 +MAX_DOT_GRAPH_DEPTH = 0 +DOT_TRANSPARENT = NO +DOT_MULTI_TARGETS = NO +GENERATE_LEGEND = YES +DOT_CLEANUP = YES diff --git a/Makefile b/Makefile index 6e2e1923..4df7ae11 100644 --- a/Makefile +++ b/Makefile @@ -1,3 +1,5 @@ +.PHONY: docs + threads=1 all: @@ -6,18 +8,28 @@ all: cmake -S ../ -B . && \ make -j ${threads} +docs: + mkdir -p build/docs/ + (cat Doxyfile; echo "NUM_PROC_THREADS=${threads}") | doxygen - + +docs-update: + rm -rf docs/html/ + cp -r build/docs/html/ docs/ + install: cp build/binlex /usr/bin/ + cp build/blyara /usr/bin/ uninstall: rm -f /usr/bin/binlex + rm -f /usr/bin/blyara traits: check-parameter-source check-parameter-dest check-parameter-type check-parameter-format check-parameter-arch @echo "[-] building traits..." @find ${source} -type f | while read i; do \ mkdir -p ${dest}/${type}/${format}/${arch}/; \ filename=`basename $${i}`; \ - echo "binlex -m ${format}:${arch} --input $${i} --output ${dest}/${type}/${format}/${arch}/$${filename}.traits"; \ + echo "binlex -m ${format}:${arch} -i $${i} | jq '.[] | .trait' > ${dest}/${type}/${format}/${arch}/$${filename}.traits"; \ done | parallel -u --progress -j ${threads} {} @echo "[*] trait build complete" diff --git a/README.md b/README.md index fdc75fa9..3baa5f02 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,34 @@ # binlex -
+ + + +
+ +Get slides [here](docs/oalabs.pdf). # Use Cases - YARA Signature Creation/Automation - Identifying Code-Reuse - Threat Hunting +- Building Goodware Trait Corpus +- Building Malware Trait Corpus +- Genetic Programming - Machine Learning Malware Detection # Installing @@ -17,8 +36,8 @@ Most projects attempting this use Python to generate traits, but it's slow. When **From Source:** ```bash -sudo apt install -y git libcapstone-dev cmake make parallel -git clone https://github.com/c3rb3ru5d3d53c/binlex.git +sudo apt install -y git build-essential libcapstone-dev cmake make parallel doxygen git-lfs +git clone --recursive https://github.com/c3rb3ru5d3d53c/binlex.git cd binlex/ make threads=4 sudo make install @@ -30,15 +49,16 @@ binlex -m elf:x86 -i tests/elf/elf.x86 **NOTE:** - ZIP files in the `tests/` directory can be extracted using the password `infected` -# Usage +# Basic Usage ```text -binlex v1.0.1 - A Binary Genetic Traits Lexer +binlex v1.1.0 - A Binary Genetic Traits Lexer -i --input input file (required) -m --mode set mode (required) -lm --list-modes list modes -h --help display help -o --output output file (optional) + -p --pretty pretty output (optional) -v --version display version Author: @c3rb3ru5d3d53c ``` @@ -51,11 +71,82 @@ Author: @c3rb3ru5d3d53c - `pe:x86_64` - `raw:x86` - `raw:x86_64` -- `raw:cil` +- `raw:cil` (experimental) __NOTE:__ The `raw` formats can be used on shellcode -**Advanced Usage:** +# Advanced Usage + +If you are hunting using `binlex` you can use `jq` to your advantage for advanced searches. 
+ +```bash +binlex -m raw:x86 -i tests/raw/raw.x86 | jq -r '.[] | select(.type == "block" and .size < 32 and .size > 0) | .bytes' +2c 20 c1 cf 0d 01 c7 49 75 ef +52 57 8b 52 10 8b 42 3c 01 d0 8b 40 78 85 c0 74 4c +01 d0 50 8b 58 20 8b 48 18 01 d3 85 c9 74 3c +49 8b 34 8b 01 d6 31 ff 31 c0 c1 cf 0d ac 01 c7 38 e0 75 f4 +03 7d f8 3b 7d 24 75 e0 +58 5f 5a 8b 12 e9 80 ff ff ff +ff 4e 08 75 ec +e8 67 00 00 00 6a 00 6a 04 56 57 68 02 d9 c8 5f ff d5 83 f8 00 7e 36 +e9 9b ff ff ff +01 c3 29 c6 75 c1 +``` + +Other queries you can do: +```bash +# Block traits with a size between 0 and 32 bytes +jq -r '[.[] | select(.type == "block" and .size < 32 and .size > 0)]' +# Function traits with a cyclomatic complexity greater than 32 (maybe obfuscation) +jq -r '[.[] | select(.type == "function" and .cyclomatic_complexity > 32)]' +# Traits where bytes have high entropy +jq -r '[.[] | select(.bytes_entropy > 7)]' +# Output all trait strings only +jq -r '.[] | .trait' +# Output only trait hashes +jq -r '.[] | .trait_sha256' +``` + +If you output just traits you want to `stdout` you can do build a `yara` signature on the fly with the included tool `blyara`: + +```bash +build/binlex -m raw:x86 -i tests/raw/raw.x86 | jq -r '.[] | select(.size > 16 and .size < 32) | .trait' | build/blyara --name example_0 -m author example -m tlp white -c 1 +rule example_0 { + metadata: + author = "example" + tlp = "white" + strings: + trait_0 = {52 57 8b 52 ?? 8b 42 ?? 01 d0 8b 40 ?? 85 c0 74 4c} + trait_1 = {49 8b 34 8b 01 d6 31 ff 31 c0 c1 cf ?? ac 01 c7 38 e0 75 f4} + trait_2 = {e8 67 00 00 00 6a 00 6a ?? 56 57 68 ?? ?? ?? ?? ff d5 83 f8 00 7e 36} + condition: + 1 of them +} +``` + +You can also use the switch `--pretty` to output `json` to identify more properies to query. 
+ +```bash +binlex -m pe:x86 -i tests/pe/pe.trickbot.x86 --pretty +[ + { + "average_instructions_per_block": 29, + "blocks": 1, + "bytes": "ae 32 c3 32 1a 33 25 34 85 39 ae 3b b4 3b c8 3b 35 3c 3a 3c 6b 3c 71 3c 85 3c aa 3d b0 3d 6a 3e a5 3e b8 3e fd 3e 38 3f 4b 3f 87 3f 00 20 00 00 58 00 00 00 4f 30 aa 30 01 31 1d 31 ac 31 d6 31 e5 31 f5 31 1c 32 31 32 75 34", + "bytes_entropy": 5.070523738861084, + "bytes_sha256": "67a966fe573ef678feaea6229271bb374304b418fe63f464b71af1fbe2a87f37", + "cyclomatic_complexity": 3, + "edges": 2, + "instructions": 29, + "offset": 11589, + "size": 74, + "trait": "ae 32 c3 32 1a 33 25 ?? ?? ?? ?? 3b b4 3b ?? ?? ?? ?? 3a 3c 6b 3c 71 3c 85 3c aa 3d b0 3d 6a 3e a5 3e b8 3e fd 3e 38 3f 4b 3f 87 3f 00 20 00 00 58 00 00 00 4f ?? aa 30 01 31 1d ?? ?? ?? ?? 31 e5 31 f5 31 1c 32 31 32 75 34", + "trait_entropy": 4.9164042472839355, + "trait_sha256": "a00fcb2b23a916192990665d8a5f53b2adfa74ec98991277e571542aee94c3a5", + "type": "block" + } +] +``` If you have terabytes of executable files, we can leverage the power of `parallel` to generate traits for us. @@ -91,18 +182,78 @@ Binlex is designed to do one thing and one thing only, extract genetic traits fr Again, **it's up to you to implement your own algorithms for detection based on the genetic traits you extract**. # Trait Format + Traits will contain binary code represented in hexadecimal form and will use `??` as wild cards for memory operands or other operands subject to change. -Trait files will contain a list of traits ordered by size and use the sha256 of the sample as the file name. +They will also contain additional properties about the trait including its `offset`, `edges`, `blocks`, `cyclomatic_complexity`, `average_instruction_per_block`, `bytes`, `trait`, `trait_sha256`, `bytes_sha256`, `trait_entropy`, `bytes_entropy`, `type`, `size`, and `instructions`. ``` -# Example Trait File -12 34 56 ?? ?? 11 12 13 -14 15 16 17 18 ?? ?? 21 22 23 -# ... 
More traits to follow +[ + { + "average_instructions_per_block": 29, + "blocks": 1, + "bytes": "ae 32 c3 32 1a 33 25 34 85 39 ae 3b b4 3b c8 3b 35 3c 3a 3c 6b 3c 71 3c 85 3c aa 3d b0 3d 6a 3e a5 3e b8 3e fd 3e 38 3f 4b 3f 87 3f 00 20 00 00 58 00 00 00 4f 30 aa 30 01 31 1d 31 ac 31 d6 31 e5 31 f5 31 1c 32 31 32 75 34", + "bytes_entropy": 5.070523738861084, + "bytes_sha256": "67a966fe573ef678feaea6229271bb374304b418fe63f464b71af1fbe2a87f37", + "cyclomatic_complexity": 3, + "edges": 2, + "instructions": 29, + "offset": 11589, + "size": 74, + "trait": "ae 32 c3 32 1a 33 25 ?? ?? ?? ?? 3b b4 3b ?? ?? ?? ?? 3a 3c 6b 3c 71 3c 85 3c aa 3d b0 3d 6a 3e a5 3e b8 3e fd 3e 38 3f 4b 3f 87 3f 00 20 00 00 58 00 00 00 4f ?? aa 30 01 31 1d ?? ?? ?? ?? 31 e5 31 f5 31 1c 32 31 32 75 34", + "trait_entropy": 4.9164042472839355, + "trait_sha256": "a00fcb2b23a916192990665d8a5f53b2adfa74ec98991277e571542aee94c3a5", + "type": "block" + } +] ``` +# Building Docs + +You can access the C++ API Documentation and everything else by building the documents using `doxygen`. + +```bash +make docs threads=4 +``` + +The documents will be available at `build/docs/html/index.html`. + +# C++ API Example Code + +It couldn't be any easier to leverage `binlex` and its C++ API to build your own applications. + +See example code below: + +```cpp +#include
+ binlex
+
+ |
+
▼Nbinlex | Binlex namespace |
CArgs | |
CCommon | |
CDecompiler | |
▼CDecompilerREV | |
CSection | |
CElf | |
CIMAGE_DATA_DIRECTORY | |
CIMAGE_DOS_HEADER | |
CIMAGE_EXPORT_DIRECTORY | |
CIMAGE_FILE_HEADER | |
CIMAGE_OPTIONAL_HEADER | |
CIMAGE_OPTIONAL_HEADER_32 | |
CIMAGE_OPTIONAL_HEADER_64 | |
CIMAGE_ROM_OPTIONAL_HEADER | |
CIMAGE_SECTION_HEADER | |
CIMAGE_TLS_DIRECTORY32 | |
CIMAGE_TLS_DIRECTORY64 | |
CPe | |
CRaw |
+ binlex
+
+ |
+
+ binlex
+
+ |
+
+ binlex
+
+ |
+
+ binlex
+
+ |
+
This is the complete list of members for binlex::Args, including all inherited members.
+Args() (defined in binlex::Args) | binlex::Args | |
check_mode(char *mode) (defined in binlex::Args) | binlex::Args | |
help (defined in binlex::Args) | binlex::Args | |
input (defined in binlex::Args) | binlex::Args | |
io_type (defined in binlex::Args) | binlex::Args | |
is_dir(const char *path) (defined in binlex::Args) | binlex::Args | |
is_file(const char *path) (defined in binlex::Args) | binlex::Args | |
list_modes (defined in binlex::Args) | binlex::Args | |
mode (defined in binlex::Args) | binlex::Args | |
modes (defined in binlex::Args) | binlex::Args | |
options (defined in binlex::Args) | binlex::Args | |
output (defined in binlex::Args) | binlex::Args | |
parse(int argc, char **argv) (defined in binlex::Args) | binlex::Args | |
pretty (defined in binlex::Args) | binlex::Args | |
print_help() (defined in binlex::Args) | binlex::Args | |
set_io_type(char *input) (defined in binlex::Args) | binlex::Args | |
SetDefault() (defined in binlex::Args) | binlex::Args | |
threads (defined in binlex::Args) | binlex::Args | |
version (defined in binlex::Args) | binlex::Args | |
~Args() (defined in binlex::Args) | binlex::Args |
+ binlex
+
+ |
+
+ binlex
+
+ |
+
This is the complete list of members for binlex::Common, including all inherited members.
+GetByteSize(string s) | binlex::Common | |
Hexdump(const char *desc, const void *addr, const int len) | binlex::Common | |
HexdumpBE(const void *data, size_t size) | binlex::Common | |
RemoveSpaces(string s) | binlex::Common | |
RemoveWildcards(string trait) | binlex::Common | |
SHA256(const char *trait) | binlex::Common | |
TraitToChar(string trait) | binlex::Common | |
TrimRight(const std::string &s) | binlex::Common | |
WildcardTrait(string trait, string bytes) | binlex::Common |
+ binlex
+
+ |
+
+Public Member Functions | |
string | SHA256 (const char *trait) |
vector< char > | TraitToChar (string trait) |
string | RemoveWildcards (string trait) |
uint | GetByteSize (string s) |
string | RemoveSpaces (string s) |
string | WildcardTrait (string trait, string bytes) |
string | TrimRight (const std::string &s) |
string | HexdumpBE (const void *data, size_t size) |
void | Hexdump (const char *desc, const void *addr, const int len) |
uint binlex::Common::GetByteSize | +( | +string | +s | ) | ++ |
This method gets the size in bytes of a trait string (includes wildcards).
s | input trait string. |
void binlex::Common::Hexdump | +( | +const char * | +desc, | +
+ | + | const void * | +addr, | +
+ | + | const int | +len | +
+ | ) | ++ |
This method prints hexdump to stdout.
desc | A description of the data. |
addr | A pointer to the data |
len | The size of the data to collect |
string binlex::Common::HexdumpBE | +( | +const void * | +data, | +
+ | + | size_t | +size | +
+ | ) | ++ |
This method creates a byte string based on a pointer and its size.
data | A pointer to the data |
size | The size of the data to collect |
string binlex::Common::RemoveSpaces | +( | +string | +s | ) | ++ |
This method removes spaces from a string.
s | input string |
string binlex::Common::RemoveWildcards | +( | +string | +trait | ) | ++ |
This method removes wildcards from a trait string.
trait | input trait string. |
string binlex::Common::SHA256 | +( | +const char * | +trait | ) | ++ |
This class contains methods common to binlex. This method takes an input string and returns its sha256 hash.
trait | input string. |
vector<char> binlex::Common::TraitToChar | +( | +string | +trait | ) | ++ |
This method takes an input trait string and returns a char vector of bytes (ignores wildcards).
trait | input string. |
string binlex::Common::TrimRight | +( | +const std::string & | +s | ) | ++ |
This method removes whitespace on the right.
s | input string |
string binlex::Common::WildcardTrait | +( | +string | +trait, | +
+ | + | string | +bytes | +
+ | ) | ++ |
This method wildcards byte strings for traits.
trait | input trait string |
bytes | byte string to wildcard |
+ binlex
+
+ |
+
This is the complete list of members for binlex::Decompiler, including all inherited members.
+Decompiler() (defined in binlex::Decompiler) | binlex::Decompiler | |
handle (defined in binlex::Decompiler) | binlex::Decompiler | |
pc (defined in binlex::Decompiler) | binlex::Decompiler | |
PrintTraits(bool pretty) (defined in binlex::Decompiler) | binlex::Decompiler | |
sections (defined in binlex::Decompiler) | binlex::Decompiler | |
Setup(cs_arch arch, cs_mode mode) (defined in binlex::Decompiler) | binlex::Decompiler | |
status (defined in binlex::Decompiler) | binlex::Decompiler | |
WriteTraits(char *file_path, bool pretty) (defined in binlex::Decompiler) | binlex::Decompiler | |
x86_64(void *data, size_t data_size, size_t data_offset, uint index) (defined in binlex::Decompiler) | binlex::Decompiler | |
~Decompiler() (defined in binlex::Decompiler) | binlex::Decompiler |
+ binlex
+
+ |
+
+Public Attributes | |
+csh | handle |
+cs_err | status |
+uint64_t | pc |
+struct Section | sections [DECOMPILER_MAX_SECTIONS] |
+ binlex
+
+ |
+
This is the complete list of members for binlex::DecompilerREV, including all inherited members.
+AddEdges(uint count, uint index) (defined in binlex::DecompilerREV) | binlex::DecompilerREV | |
ClearBlock(uint index) (defined in binlex::DecompilerREV) | binlex::DecompilerREV | |
ClearTrait(uint index) (defined in binlex::DecompilerREV) | binlex::DecompilerREV | |
CollectBlockTrait(uint index) (defined in binlex::DecompilerREV) | binlex::DecompilerREV | |
CollectFunctionTrait(uint index) (defined in binlex::DecompilerREV) | binlex::DecompilerREV | |
common (defined in binlex::DecompilerREV) | binlex::DecompilerREV | |
Decompile(void *data, size_t data_size, size_t data_offset, uint index) (defined in binlex::DecompilerREV) | binlex::DecompilerREV | |
DecompilerREV() (defined in binlex::DecompilerREV) | binlex::DecompilerREV | |
PrintTraits(bool pretty) (defined in binlex::DecompilerREV) | binlex::DecompilerREV | |
sections (defined in binlex::DecompilerREV) | binlex::DecompilerREV | |
Seek(uint offset, uint index) (defined in binlex::DecompilerREV) | binlex::DecompilerREV | |
Setup(cs_arch arch, cs_mode mode, uint index) (defined in binlex::DecompilerREV) | binlex::DecompilerREV | |
WriteTraits(char *file_path, bool pretty) (defined in binlex::DecompilerREV) | binlex::DecompilerREV | |
~DecompilerREV() (defined in binlex::DecompilerREV) | binlex::DecompilerREV |
+ binlex
+
+ |
+
+Classes | |
struct | Section |
+Public Attributes | |
+struct Section | sections [DECOMPILER_REV_MAX_SECTIONS] |
+Common | common |
+ binlex
+
+ |
+
This is the complete list of members for binlex::Elf, including all inherited members.
+Elf() (defined in binlex::Elf) | binlex::Elf | |
fd (defined in binlex::Elf) | binlex::Elf | |
header (defined in binlex::Elf) | binlex::Elf | |
magic (defined in binlex::Elf) | binlex::Elf | |
mode (defined in binlex::Elf) | binlex::Elf | |
ReadFile(char *file_path) (defined in binlex::Elf) | binlex::Elf | |
sections (defined in binlex::Elf) | binlex::Elf | |
Setup(int input_mode) (defined in binlex::Elf) | binlex::Elf | |
sh_str (defined in binlex::Elf) | binlex::Elf | |
sh_table (defined in binlex::Elf) | binlex::Elf | |
~Elf() (defined in binlex::Elf) | binlex::Elf |
+ binlex
+
+ |
+
+Public Member Functions | |
+bool | Setup (int input_mode) |
+bool | ReadFile (char *file_path) |
+ binlex
+
+ |
+
This is the complete list of members for binlex::Pe, including all inherited members.
+coff_header (defined in binlex::Pe) | binlex::Pe | |
dos_header (defined in binlex::Pe) | binlex::Pe | |
fd (defined in binlex::Pe) | binlex::Pe | |
magic_mz (defined in binlex::Pe) | binlex::Pe | |
magic_pe (defined in binlex::Pe) | binlex::Pe | |
mode (defined in binlex::Pe) | binlex::Pe | |
optional_header_32 (defined in binlex::Pe) | binlex::Pe | |
optional_header_64 (defined in binlex::Pe) | binlex::Pe | |
Pe() (defined in binlex::Pe) | binlex::Pe | |
pe_header_ptr (defined in binlex::Pe) | binlex::Pe | |
ReadFile(char *file_path) (defined in binlex::Pe) | binlex::Pe | |
section_header (defined in binlex::Pe) | binlex::Pe | |
sections (defined in binlex::Pe) | binlex::Pe | |
Setup(int input_mode) (defined in binlex::Pe) | binlex::Pe | |
~Pe() (defined in binlex::Pe) | binlex::Pe |
+ binlex
+
+ |
+
+Public Member Functions | |
+bool | Setup (int input_mode) |
+bool | ReadFile (char *file_path) |
+Public Attributes | |
+char | magic_mz [2] = {0x5a, 0x4d} |
+char | magic_pe [4] = {0x00, 0x00, 0x45, 0x50} |
+FILE * | fd = NULL |
+PIMAGE_DOS_HEADER | dos_header = NULL |
+PIMAGE_COFF_HEADER | coff_header = NULL |
+uint32_t | pe_header_ptr = 0 |
+PIMAGE_OPTIONAL_HEADER_32 | optional_header_32 = NULL |
+PIMAGE_OPTIONAL_HEADER_64 | optional_header_64 = NULL |
+PIMAGE_SECTION_HEADER | section_header = NULL |
+int | mode = PE_MODE_UNSET |
+struct Section | sections [PE_MAX_SECTIONS] |
+ binlex
+
+ |
+
This is the complete list of members for binlex::Raw, including all inherited members.
+Raw() (defined in binlex::Raw) | binlex::Raw | |
ReadFile(char *file_path, int section_index) (defined in binlex::Raw) | binlex::Raw | |
sections (defined in binlex::Raw) | binlex::Raw | |
~Raw() (defined in binlex::Raw) | binlex::Raw |
+ binlex
+
+ |
+
+Public Member Functions | |
+bool | ReadFile (char *file_path, int section_index) |
+Public Attributes | |
+struct Section | sections [RAW_MAX_SECTIONS] |
+ binlex
+
+ |
+
|
+DecompilerREV (binlex) | +IMAGE_DOS_HEADER (binlex) | +IMAGE_SECTION_HEADER (binlex) | +
|
+||
|
+IMAGE_EXPORT_DIRECTORY (binlex) | +IMAGE_TLS_DIRECTORY32 (binlex) | +||||
Args (binlex) | +IMAGE_FILE_HEADER (binlex) | +IMAGE_TLS_DIRECTORY64 (binlex) | +DecompilerREV::Section (binlex) | +|||
|
+Elf (binlex) | +IMAGE_OPTIONAL_HEADER (binlex) | +
|
+|||
|
+IMAGE_OPTIONAL_HEADER_32 (binlex) | +|||||
Common (binlex) | +IMAGE_OPTIONAL_HEADER_64 (binlex) | +Pe (binlex) | +||||
|
+IMAGE_DATA_DIRECTORY (binlex) | +IMAGE_ROM_OPTIONAL_HEADER (binlex) | +
|
+|||
Decompiler (binlex) | +Raw (binlex) | +|||||
+ binlex
+
+ |
+
+ binlex
+
+ |
+
+ binlex
+
+ |
+
+ binlex
+
+ |
+
+ binlex
+
+ |
+
▼ include | |
args.h | |
blelf.h | |
cil.h | |
common.h | |
decompiler.h | |
decompiler_rev.h | |
jvm.h | |
pe.h | |
raw.h |
+ binlex
+
+ |
+
+ binlex
+
+ |
+
+ binlex
+
+ |
+
The purpose of binlex
is to extract basic blocks and functions as traits from binaries for malware research, hunting and detection.
Most projects attempting this use Python to generate traits, but it is very slow.
+The design philosophy behind binlex
is to keep it simple and extensible.
The simple command-line interface allows malware researchers and analysts to hunt traits across hundreds or thousands of potentially similar malware samples, saving time and money in production environments.
+While the C++ API allows developers to get creative with their own detection solutions.
+Get slides here.
+From Source:
+Binary Release: See the releases
page.
NOTE:
tests/
directory can be extracted using the password infected
Currently Supported Modes
+elf:x86
elf:x86_64
pe:x86
pe:x86_64
raw:x86
raw:x86_64
raw:cil
(experimental)NOTE: The raw
formats can be used on shellcode
If you are hunting using binlex
you can use jq
to your advantage for advanced searches.
Other queries you can do:
If you output just traits you want to stdout
you can build a yara
signature on the fly with the included tool blyara
:
You can also use the switch --pretty
to output json
to identify more properties to query.
If you have terabytes of executable files, we can leverage the power of parallel
to generate traits for us.
It also allows you to name your type of dataset, i.e. goodware/malware/riskware/pua etc...
+With binlex
it is up to you to remove goodware traits from your extracted traits.
There have been many questions about removing "library code", there is a make target shown below to help you with this.
+With binlex
the power is in your hands, "With great power comes great responsibility", it is up to you!
Plugins:
+There has been some interest in making IDA, Ghidra and Cutter plugins for binlex
.
This is something that will be started soon.
+This README.md
will be updated when they are ready to use.
General Usage Information:
+Binlex is designed to do one thing and one thing only, extract genetic traits from executable code in files. This means it is up to you "the researcher" / "the data scientist" to determine which traits are good and which traits are bad. To accomplish this, you need to use your own fitness function. I encourage you to read about genetic programming to gain a better understanding of this in practice. Perhaps watching this introductory video will help your understanding.
+Again, it's up to you to implement your own algorithms for detection based on the genetic traits you extract.
+Traits will contain binary code represented in hexadecimal form and will use ??
as wild cards for memory operands or other operands subject to change.
They will also contain additional properties about the trait including its offset
, edges
, blocks
, cyclomatic_complexity
, average_instructions_per_block
, bytes
, trait
, trait_sha256
, bytes_sha256
, trait_entropy
, bytes_entropy
, type
, size
, and instructions
.
You can access the C++ API Documentation and everything else by building the documents using doxygen
.
The documents will be available at build/docs/html/index.html
.
It couldn't be any easier to leverage binlex
and its C++ API to build your own applications.
See example code below:
+We hope this encourages people to build their own detection solutions based on binary genetic traits.
+jq
to improve your searchesTraits will be compared amongst their common malware family, any traits not common to all samples will be discarded.
+Once completed, all remaining traits will be compared to traits from a goodware set, any traits that match the goodware set will be discarded.
+To further differentiate the traits from other malware families, the remaining population will be compared to other malware families; any that match will be discarded.
+The remaining population of traits will be unique to the malware family tested and not legitimate binaries or other malware families.
+This fitness model allows for accurate classification of the tested malware family.
+raw:jvm
, java:jvm
pe:cil
and raw:cil
macho:x86_64
, macho:x86
If you wish to contribute to Binlex, DM me on Twitter https://twitter.com/c3rb3ru5d3d53c.
+Currently looking for help on:
+ binlex
+
+ |
+