From 9fd489276181e1ca699d8735c08adec0b50f2944 Mon Sep 17 00:00:00 2001 From: hbrodin <90325907+hbrodin@users.noreply.github.com> Date: Wed, 17 May 2023 13:26:05 +0200 Subject: [PATCH 1/6] Adding the main components. No real integration yet. --- .../polytracker/passes/tainted_control_flow.h | 68 ++++++ .../include/taintdag/control_flow_log.h | 88 ++++++++ polytracker/include/taintdag/polytracker.h | 1 + polytracker/src/passes/CMakeLists.txt | 2 +- .../src/passes/tainted_control_flow.cpp | 198 ++++++++++++++++++ unittests/src/taintdag/CMakeLists.txt | 3 +- unittests/src/taintdag/control_flow_log.cpp | 57 +++++ 7 files changed, 415 insertions(+), 2 deletions(-) create mode 100644 polytracker/include/polytracker/passes/tainted_control_flow.h create mode 100644 polytracker/include/taintdag/control_flow_log.h create mode 100644 polytracker/src/passes/tainted_control_flow.cpp create mode 100644 unittests/src/taintdag/control_flow_log.cpp diff --git a/polytracker/include/polytracker/passes/tainted_control_flow.h b/polytracker/include/polytracker/passes/tainted_control_flow.h new file mode 100644 index 00000000..d0acf509 --- /dev/null +++ b/polytracker/include/polytracker/passes/tainted_control_flow.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2023-present, Trail of Bits, Inc. + * All rights reserved. + * + * This source code is licensed in accordance with the terms specified in + * the LICENSE file found in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include + +namespace polytracker { +namespace detail { +struct FunctionMappingJSONWriter; +} + +class TaintedControlFlowPass + : public llvm::PassInfoMixin, + public llvm::InstVisitor { + // + llvm::IntegerType *label_ty{nullptr}; + // Taint tracking startup + llvm::FunctionCallee taint_start_fn; + // Log taint label affecting control flow + llvm::FunctionCallee cond_br_log_fn; + // Log enter/leave functions + llvm::FunctionCallee fn_enter_log_fn; + llvm::FunctionCallee fn_leave_log_fn; + + // Helpers + void insertCondBrLogCall(llvm::Instruction &inst, llvm::Value *val); + void insertTaintStartupCall(llvm::Module &mod); + void declareLoggingFunctions(llvm::Module &mod); + + + llvm::ConstantInt *get_function_id_const(llvm::Function &f); + llvm::ConstantInt *get_function_id_const(llvm::Instruction &i); + +public: + + using function_id = uint32_t; + + TaintedControlFlowPass(); + TaintedControlFlowPass(TaintedControlFlowPass &&); + ~TaintedControlFlowPass(); + + llvm::PreservedAnalyses run(llvm::Module &mod, + llvm::ModuleAnalysisManager &mam); + void visitGetElementPtrInst(llvm::GetElementPtrInst &gep); + void visitBranchInst(llvm::BranchInst &bi); + void visitSwitchInst(llvm::SwitchInst &si); + void visitSelectInst(llvm::SelectInst &si); + + void instrumentFunctionEnter(llvm::Function &func); + void visitReturnInst(llvm::ReturnInst &ri); + + function_id function_mapping(llvm::Function &func); + + std::unordered_map function_ids_; + function_id function_counter_{0}; + + std::unique_ptr function_mapping_writer_; +}; + +} // namespace polytracker \ No newline at end of file diff --git a/polytracker/include/taintdag/control_flow_log.h b/polytracker/include/taintdag/control_flow_log.h new file mode 100644 index 00000000..3eaf3523 --- /dev/null +++ b/polytracker/include/taintdag/control_flow_log.h @@ -0,0 +1,88 @@ + +/* + * Copyright (c) 2022-present, Trail of Bits, Inc. + * All rights reserved. + * + * This source code is licensed in accordance with the terms specified in + * the LICENSE file found in the root directory of this source tree. + */ + +#pragma once + +#include "taintdag/outputfile.h" +#include "taintdag/section.h" +#include "taintdag/taint.h" +#include "taintdag/util.h" + +namespace taintdag { + +namespace details { +// A uint32_t varint encoded by setting highest bit for all but the final byte. +// Requires up to 5 bytes of storage as each output byte uses 7 input bits. +// Total maximum need is floor(32/7) = 5. Returns number of bytes required. +size_t varint_encode(uint32_t val, uint8_t *buffer) { + auto orig_buffer = buffer; + while (val >= 0x80) { + *buffer++ = 0x80 | (val & 0x7f); + val >>= 7; + } + *buffer++ = val & 0x7f; + return buffer - orig_buffer; +} +// TODO (hbrodin): Should probably used std::span +} // namespace details + +struct ControlFlowLog : public SectionBase { + enum EventType { + EnterFunction = 0, + LeaveFunction = 1, + TaintedControlFlow = 2, + }; + + static constexpr uint8_t tag{8}; + static constexpr size_t align_of{1}; + static constexpr size_t allocation_size{1024 * 1024 * 1024}; + + template + ControlFlowLog(SectionArg of) : SectionBase(of.range) {} + + void function_event(EventType evt, uint32_t function_id) { + uint8_t buffer[6]; + buffer[0] = static_cast(evt); + auto used = details::varint_encode(function_id, &buffer[1]); + auto total = used + 1; + + if (auto wctx = write(total)) { + std::copy(&buffer[0], &buffer[total], wctx->mem.begin()); + } else { + error_exit("Failed to write ", total, + " bytes of output to the ControlFlowLog Section."); + } + } + void enter_function(uint32_t function_id) { + function_event(EnterFunction, function_id); + } + + void leave_function(uint32_t function_id) { + function_event(LeaveFunction, function_id); + } + + void tainted_control_flow(label_t label, uint32_t function_id) { + // 1 byte event, <= 5 bytes function id, <= 5 bytes label + uint8_t buffer[11]; + buffer[0] = static_cast(TaintedControlFlow); + auto used = details::varint_encode(function_id, &buffer[1]); + auto total = used + 1; + used = details::varint_encode(label, &buffer[total]); + total += used; + + if (auto wctx = write(total)) { + std::copy(&buffer[0], &buffer[total], wctx->mem.begin()); + } else { + error_exit("Failed to write ", total, + " bytes of output to the ControlFlowLog Section."); + } + } +}; + +} // namespace taintdag diff --git a/polytracker/include/taintdag/polytracker.h b/polytracker/include/taintdag/polytracker.h index 08211fef..da159026 100644 --- a/polytracker/include/taintdag/polytracker.h +++ b/polytracker/include/taintdag/polytracker.h @@ -12,6 +12,7 @@ #include #include "taintdag/bitmap_section.h" +#include "taintdag/control_flow_log.h" #include "taintdag/fnmapping.h" #include "taintdag/fntrace.h" #include "taintdag/labels.h" diff --git a/polytracker/src/passes/CMakeLists.txt b/polytracker/src/passes/CMakeLists.txt index 87080e67..af6aaa9d 100644 --- a/polytracker/src/passes/CMakeLists.txt +++ b/polytracker/src/passes/CMakeLists.txt @@ -6,7 +6,7 @@ endif(APPLE) add_library( PolytrackerPass SHARED - taint_tracking.cpp remove_fn_attr.cpp function_tracing.cpp + taint_tracking.cpp remove_fn_attr.cpp function_tracing.cpp tainted_control_flow.cpp DataFlowSanitizer.cpp utils.cpp pass_plugin.cpp) target_link_libraries( diff --git a/polytracker/src/passes/tainted_control_flow.cpp b/polytracker/src/passes/tainted_control_flow.cpp new file mode 100644 index 00000000..d8142794 --- /dev/null +++ b/polytracker/src/passes/tainted_control_flow.cpp @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2022-present, Trail of Bits, Inc. + * All rights reserved. + * + * This source code is licensed in accordance with the terms specified in + * the LICENSE file found in the root directory of this source tree. + */ + +#include "polytracker/passes/tainted_control_flow.h" + +#include +#include +#include +#include + +#include + +#include "polytracker/dfsan_types.h" +#include "polytracker/passes/utils.h" + +#include + +namespace polytracker { + +namespace detail { +// Helper type to produce the json file of function names by functionid +class FunctionMappingJSONWriter { +public: + FunctionMappingJSONWriter(std::string_view filename) + : file(filename.data(), std::ios::binary) { + file << "["; + } + + ~FunctionMappingJSONWriter() { + // Back up and erase the last ",\n" + file.seekp(-2, std::ios::cur); + file << "\n]\n"; + } + + void append(std::string_view name) { + // Will cause an additional ',' but don't care about that right now... + // The destructor will back up two steps and replace the ',' with a newline + // and array termination. + file << "\"" << name << "\",\n"; + } + +private: + std::ofstream file; +}; +} // namespace detail + +namespace { +uint32_t +get_or_add_mapping(uintptr_t key, std::unordered_map &m, + uint32_t &counter, std::string_view name, + polytracker::detail::FunctionMappingJSONWriter &js) { + if (auto it = m.find(key); it != m.end()) { + return it->second; + } else { + js.append(name); + return m[key] = counter++; + } +} + +} // namespace +void TaintedControlFlowPass::insertCondBrLogCall(llvm::Instruction &inst, + llvm::Value *val) { + llvm::IRBuilder<> ir(&inst); + auto dummy_val{val}; + if (inst.getType()->isVectorTy()) { + dummy_val = ir.CreateExtractElement(val, uint64_t(0)); + } + ir.CreateCall(cond_br_log_fn, {ir.CreateSExtOrTrunc(dummy_val, label_ty)}); +} + +llvm::ConstantInt * +TaintedControlFlowPass::get_function_id_const(llvm::Function &func) { + auto func_address = reinterpret_cast(&func); + std::string_view name = func.getName(); + auto fid = get_or_add_mapping(func_address, function_ids_, function_counter_, + name, *function_mapping_writer_); + return llvm::ConstantInt::get(func.getContext(), llvm::APInt(32, fid, false)); +} + +llvm::ConstantInt * +TaintedControlFlowPass::get_function_id_const(llvm::Instruction &i) { + return get_function_id_const(*(i.getParent()->getParent())); +} + +void TaintedControlFlowPass::visitGetElementPtrInst( + llvm::GetElementPtrInst &gep) { + llvm::IRBuilder<> ir(&gep); + for (auto &idx : gep.indices()) { + if (llvm::isa(idx)) { + continue; + } + + auto callret = ir.CreateCall(cond_br_log_fn, + {ir.CreateSExtOrTrunc(idx, ir.getInt64Ty()), + get_function_id_const(gep)}); + + idx = ir.CreateSExtOrTrunc(callret, idx->getType()); + } +} + +void TaintedControlFlowPass::visitBranchInst(llvm::BranchInst &bi) { + if (bi.isUnconditional()) { + return; + } + + llvm::IRBuilder<> ir(&bi); + auto cond = bi.getCondition(); + + auto callret = ir.CreateCall( + cond_br_log_fn, + {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(bi)}); + + bi.setCondition(ir.CreateSExtOrTrunc(callret, cond->getType())); +} + +void TaintedControlFlowPass::visitSwitchInst(llvm::SwitchInst &si) { + llvm::IRBuilder<> ir(&si); + auto cond = si.getCondition(); + + auto callret = ir.CreateCall( + cond_br_log_fn, + {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(si)}); + + si.setCondition(ir.CreateSExtOrTrunc(callret, cond->getType())); +} + +void TaintedControlFlowPass::visitSelectInst(llvm::SelectInst &si) { + // TODO(hbrodin): Can't handle atm. + if (si.getType()->isVectorTy()) { + return; + } + llvm::IRBuilder<> ir(&si); + auto cond = si.getCondition(); + + auto callret = ir.CreateCall( + cond_br_log_fn, + {ir.CreateSExtOrTrunc(cond, ir.getInt64Ty()), get_function_id_const(si)}); + + si.setCondition(ir.CreateSExtOrTrunc(callret, cond->getType())); +} + +void TaintedControlFlowPass::declareLoggingFunctions(llvm::Module &mod) { + llvm::IRBuilder<> ir(mod.getContext()); + cond_br_log_fn = mod.getOrInsertFunction( + "__polytracker_log_tainted_control_flow", + llvm::AttributeList::get( + mod.getContext(), + {{llvm::AttributeList::FunctionIndex, + llvm::Attribute::get(mod.getContext(), + llvm::Attribute::ReadNone)}}), + ir.getInt64Ty(), ir.getInt64Ty(), ir.getInt32Ty()); + + fn_enter_log_fn = mod.getOrInsertFunction("__polytracker_enter_function", + ir.getVoidTy(), ir.getInt32Ty()); + + fn_leave_log_fn = mod.getOrInsertFunction("__polytracker_leave_function", + ir.getVoidTy(), ir.getInt32Ty()); +} + +void TaintedControlFlowPass::instrumentFunctionEnter(llvm::Function &func) { + if (func.isDeclaration()) { + return; + } + llvm::IRBuilder<> ir(&*func.getEntryBlock().begin()); + ir.CreateCall(fn_enter_log_fn, get_function_id_const(func)); +} + +void TaintedControlFlowPass::visitReturnInst(llvm::ReturnInst &ri) { + llvm::IRBuilder<> ir(&ri); + ir.CreateCall(fn_leave_log_fn, get_function_id_const(ri)); +} + +llvm::PreservedAnalyses +TaintedControlFlowPass::run(llvm::Module &mod, + llvm::ModuleAnalysisManager &mam) { + label_ty = llvm::IntegerType::get(mod.getContext(), DFSAN_LABEL_BITS); + declareLoggingFunctions(mod); + for (auto &fn : mod) { + instrumentFunctionEnter(fn); + visit(fn); + } + return llvm::PreservedAnalyses::none(); +} + +TaintedControlFlowPass::TaintedControlFlowPass() + : function_mapping_writer_( + std::make_unique( + "functionid.json")) {} + +TaintedControlFlowPass::~TaintedControlFlowPass() = default; +TaintedControlFlowPass::TaintedControlFlowPass(TaintedControlFlowPass &&) = + default; +} // namespace polytracker \ No newline at end of file diff --git a/unittests/src/taintdag/CMakeLists.txt b/unittests/src/taintdag/CMakeLists.txt index fee4b776..b620b84a 100644 --- a/unittests/src/taintdag/CMakeLists.txt +++ b/unittests/src/taintdag/CMakeLists.txt @@ -11,7 +11,8 @@ add_executable( fntrace.cpp union.cpp labeldeq.cpp - stream_offset.cpp) + stream_offset.cpp + control_flow_log.cpp) target_include_directories(${TAINTDAG_UNITTEST} PRIVATE ${CMAKE_SOURCE_DIR}/polytracker/include) diff --git a/unittests/src/taintdag/control_flow_log.cpp b/unittests/src/taintdag/control_flow_log.cpp new file mode 100644 index 00000000..95ec95b5 --- /dev/null +++ b/unittests/src/taintdag/control_flow_log.cpp @@ -0,0 +1,57 @@ + +/* + * Copyright (c) 2022-present, Trail of Bits, Inc. + * All rights reserved. + * + * This source code is licensed in accordance with the terms specified in + * the LICENSE file found in the root directory of this source tree. + */ + +#include "taintdag/control_flow_log.h" +#include "taintdag/section.h" +#include + +TEST_CASE("Simple varint encoding") { + using namespace taintdag::details; + uint8_t buffer[5]; + + SECTION("Encode 0") { + auto n = varint_encode(0, buffer); + REQUIRE(n == 1); + REQUIRE(buffer[0] == 0); + } + + SECTION("Encode 1") { + auto n = varint_encode(1, buffer); + REQUIRE(n == 1); + REQUIRE(buffer[0] == 1); + } + + SECTION("Encode 0x7f") { + auto n = varint_encode(0x7f, buffer); + REQUIRE(n == 1); + REQUIRE(buffer[0] == 0x7f); + } + + SECTION("Encode 0x80") { + auto n = varint_encode(0x80, buffer); + REQUIRE(n == 2); + REQUIRE(buffer[0] == 0x80); + REQUIRE(buffer[1] == 0x01); + } + SECTION("Encode 0x3ffe") { + auto n = varint_encode(0x3ffe, buffer); + REQUIRE(n == 2); + REQUIRE(buffer[0] == 0xfe); + REQUIRE(buffer[1] == 0x7f); + } + SECTION("Encode 0xffffffff") { + auto n = varint_encode(0xffffffff, buffer); + REQUIRE(n == 5); + REQUIRE(buffer[0] == 0xff); + REQUIRE(buffer[1] == 0xff); + REQUIRE(buffer[2] == 0xff); + REQUIRE(buffer[3] == 0xff); + REQUIRE(buffer[4] == 0x0f); + } +} From bdd3158871f556c30e8ddd2b34cd784c46ccca29 Mon Sep 17 00:00:00 2001 From: hbrodin <90325907+hbrodin@users.noreply.github.com> Date: Mon, 22 May 2023 13:35:39 +0200 Subject: [PATCH 2/6] Integrate control affecting dataflow logging Enables logging of control affecting data flow by specifying the --cflog argument to the `instrument-targets` command. Adds support for a new section in the TDAG, adds the instrumentation step and instrumentation callback handlers. --- polytracker/build.py | 37 +++++- polytracker/custom_abi/dfsan_abilist.txt | 6 + polytracker/include/taintdag/polytracker.h | 18 ++- polytracker/src/passes/pass_plugin.cpp | 5 + polytracker/src/polytracker/polytracker.cpp | 18 +++ polytracker/src/taintdag/polytracker.cpp | 14 ++ polytracker/taint_dag.py | 139 ++++++++++++++++++++ tests/conftest.py | 2 +- tests/test_cf_log.cpp | 71 ++++++++++ tests/test_cf_log.py | 69 ++++++++++ 10 files changed, 375 insertions(+), 4 deletions(-) create mode 100644 tests/test_cf_log.cpp create mode 100644 tests/test_cf_log.py diff --git a/polytracker/build.py b/polytracker/build.py index d350671c..c4789075 100644 --- a/polytracker/build.py +++ b/polytracker/build.py @@ -146,6 +146,24 @@ def _optimize_bitcode(input_bitcode: Path, output_bitcode: Path) -> None: cmd = ["opt", "-O3", str(input_bitcode), "-o", str(output_bitcode)] subprocess.check_call(cmd) +def _preopt_instrument_bitcode(input_bitcode: Path, output_bitcode: Path) -> None: + POLY_PASS_PATH: Path = _ensure_path_exists( + _compiler_dir_path() / "pass" / "libPolytrackerPass.so" + ) + + cmd = [ + "opt", + "-load", + str(POLY_PASS_PATH), + "-load-pass-plugin", + str(POLY_PASS_PATH), + "-passes=pt-tcf", + str(input_bitcode), + "-o", + str(output_bitcode), + ] + # execute `cmd` + subprocess.check_call(cmd) def _instrument_bitcode( input_bitcode: Path, @@ -398,16 +416,31 @@ def __init_arguments__(self, parser: argparse.ArgumentParser): help="specify additional ignore lists to polytracker", ) + parser.add_argument( + "--cflog", + action="store_true", + help="instrument with control affecting dataflow logging", + ) + + def run(self, args: argparse.Namespace): for target in args.targets: blight_cmds = _read_blight_journal(args.journal_path) target_cmd, target_path = _find_target(target, blight_cmds) bc_path = target_path.with_suffix(".bc") + opt_bc = bc_path.with_suffix(".opt.bc") _extract_bitcode(target_path, bc_path) - _optimize_bitcode(bc_path, bc_path) + if args.cflog: + # Control affecting data flow logging happens before optimization + pre_opt = bc_path.with_suffix(".preopt.bc") + _preopt_instrument_bitcode(bc_path, pre_opt) + + _optimize_bitcode(pre_opt, opt_bc) + else: + _optimize_bitcode(bc_path, opt_bc) inst_bc_path = Path(f"{bc_path.stem}.instrumented.bc") _instrument_bitcode( - bc_path, + opt_bc, inst_bc_path, args.ignore_lists, args.taint, diff --git a/polytracker/custom_abi/dfsan_abilist.txt b/polytracker/custom_abi/dfsan_abilist.txt index 6ddf8a97..bfe49d35 100644 --- a/polytracker/custom_abi/dfsan_abilist.txt +++ b/polytracker/custom_abi/dfsan_abilist.txt @@ -30,6 +30,10 @@ fun:open64=custom ########################################## # Polytracker functions ######################################### +fun:__polytracker_leave_function=uninstrumented +fun:__polytracker_leave_function=discard +fun:__polytracker_enter_function=uninstrumented +fun:__polytracker_enter_function=discard fun:__polytracker_log_func_entry=uninstrumented fun:__polytracker_log_func_entry=discard fun:__polytracker_log_func_exit=uninstrumented @@ -43,6 +47,8 @@ fun:__polytracker_log_taint_op=uninstrumented fun:__polytracker_log_taint_op=custom fun:__polytracker_log_conditional_branch=uninstrumented fun:__polytracker_log_conditional_branch=custom +fun:__polytracker_log_tainted_control_flow=uninstrumented +fun:__polytracker_log_tainted_control_flow=custom # -- end fun:__polytracker_dump=uninstrumented fun:__polytracker_dump=discard diff --git a/polytracker/include/taintdag/polytracker.h b/polytracker/include/taintdag/polytracker.h index da159026..2bdb5dfe 100644 --- a/polytracker/include/taintdag/polytracker.h +++ b/polytracker/include/taintdag/polytracker.h @@ -55,6 +55,21 @@ class PolyTracker { // Update the label, it affects control flow void affects_control_flow(label_t taint_label); + // Instrumentation callback for when control flow is influenced by a + // a tainted value + void log_tainted_control_flow(label_t taint_label, uint32_t function_id); + + // Instrumentation callback for when execution enters a function + // NOTE: There is a overlap in functionality between this and `function_entry` + // they will co-exist for now as they operate slightly different. The underlying + // reason is that this was developed separately to support the Tainted Control + // Flow logging mechanism. + void enter_function(uint32_t function_id); + + // Instrumentation callback for when execution leaves a function + // NOTE: Se `enter_function` comment about overlap. + void leave_function(uint32_t function_id); + // Log tainted data flowed into the sink void taint_sink(int fd, util::Offset offset, void const *mem, size_t length); // Same as before, but use same label for all data @@ -80,7 +95,7 @@ class PolyTracker { // sections and in which order they appear. using ConcreteOutputFile = OutputFile; + SourceLabelIndexSection, Functions, Events, ControlFlowLog>; ConcreteOutputFile output_file_; // Tracking source offsets for streams (where offsets can be determined by @@ -88,6 +103,7 @@ class PolyTracker { static constexpr size_t offset_capacity = size_t{max_source_index} + 1; StreamOffset stream_read_offsets_; StreamOffset stream_write_offsets_; + }; } // namespace taintdag diff --git a/polytracker/src/passes/pass_plugin.cpp b/polytracker/src/passes/pass_plugin.cpp index c026d5ba..e8ad4a1e 100644 --- a/polytracker/src/passes/pass_plugin.cpp +++ b/polytracker/src/passes/pass_plugin.cpp @@ -13,6 +13,7 @@ #include "polytracker/passes/function_tracing.h" #include "polytracker/passes/remove_fn_attr.h" #include "polytracker/passes/taint_tracking.h" +#include "polytracker/passes/tainted_control_flow.h" llvm::PassPluginLibraryInfo getPolyTrackerPluginInfo() { return {LLVM_PLUGIN_API_VERSION, "PolyTracker", "", @@ -36,6 +37,10 @@ llvm::PassPluginLibraryInfo getPolyTrackerPluginInfo() { mpm.addPass(polytracker::FunctionTracingPass()); return true; } + if (name == "pt-tcf") { + mpm.addPass(polytracker::TaintedControlFlowPass()); + return true; + } return false; }); }}; diff --git a/polytracker/src/polytracker/polytracker.cpp b/polytracker/src/polytracker/polytracker.cpp index 477c7e33..ee030ce7 100644 --- a/polytracker/src/polytracker/polytracker.cpp +++ b/polytracker/src/polytracker/polytracker.cpp @@ -41,4 +41,22 @@ extern "C" void __taint_start() { taint_start(); } extern "C" void __polytracker_taint_argv(int argc, char *argv[]) { polytracker::taint_argv(argc, argv); +} + +extern "C" uint64_t __dfsw___polytracker_log_tainted_control_flow( + uint64_t conditional, uint32_t functionid, dfsan_label conditional_label, + dfsan_label function_label, dfsan_label *ret_label) { + if (conditional_label > 0) { + get_polytracker_tdag().log_tainted_control_flow(conditional_label, functionid); + } + *ret_label = conditional_label; + return conditional; +} + +extern "C" void __polytracker_enter_function(uint32_t function_id) { + get_polytracker_tdag().enter_function(function_id); +} + +extern "C" void __polytracker_leave_function(uint32_t function_id) { + get_polytracker_tdag().leave_function(function_id); } \ No newline at end of file diff --git a/polytracker/src/taintdag/polytracker.cpp b/polytracker/src/taintdag/polytracker.cpp index 89f41bad..7f7b69b0 100644 --- a/polytracker/src/taintdag/polytracker.cpp +++ b/polytracker/src/taintdag/polytracker.cpp @@ -175,6 +175,20 @@ void PolyTracker::affects_control_flow(label_t lbl) { output_file_.section().affects_control_flow(lbl); } +void PolyTracker::log_tainted_control_flow(label_t lbl, uint32_t function_id) { + output_file_.section().tainted_control_flow(lbl, + function_id); +} + +void PolyTracker::enter_function(uint32_t function_id) { + output_file_.section().enter_function(function_id); +} + +void PolyTracker::leave_function(uint32_t function_id) { + output_file_.section().leave_function(function_id); +} + + Functions::index_t PolyTracker::function_entry(std::string_view name) { auto &functions{output_file_.section()}; auto maybe_index{functions.add_mapping(name)}; diff --git a/polytracker/taint_dag.py b/polytracker/taint_dag.py index 2e8a7bd6..cbd1a22f 100644 --- a/polytracker/taint_dag.py +++ b/polytracker/taint_dag.py @@ -129,6 +129,128 @@ def read_raw(self, label): def count(self): return len(self.section) // sizeof(c_uint64) +class TDEnterFunctionEvent: + """Emitted whenever execution enters a function. + The callstack member is the callstack right before entering the function, + having the function just entered as the last member of the callstack. + """ + + def __init__(self, callstack): + """Callstack after entering function""" + self.callstack = callstack + + def __repr__(self) -> str: + return f"Enter: {self.callstack}" + + def __eq__(self, __o: object) -> bool: + return self.callstack == __o.callstack + + +class TDLeaveFunctionEvent: + """Emitted whenever execution leaves a function. + The callstack member is the callstack right before leaving the function, + having the function about to leave as the last member of the callstack. + """ + + def __init__(self, callstack): + """Callstack before leaving function""" + self.callstack = callstack + + def __repr__(self) -> str: + return f"Leave: {self.callstack}" + + def __eq__(self, __o: object) -> bool: + return self.callstack == __o.callstack + +class TDTaintedControlFlowEvent: + """Emitted whenever a control flow change is influenced by tainted data. + The label that influenced the control flow is available in the `label` member. + Current callstack (including the function the control flow happened in) is available + in the `callstack` member.""" + + def __init__(self, callstack, label): + self.callstack = callstack + self.label = label + + def __repr__(self) -> str: + return f"TaintedControlFlow label {self.label} callstack {self.callstack}" + + def __eq__(self, __o: object) -> bool: + return self.label == __o.label and self.callstack == __o.callstack + +class TDControlFlowLogSection: + """TDAG Control flow log section + + Interprets the control flow log section in a TDAG file. + Enables enumeration/random access of items + """ + + # NOTE: MUST correspond to the members in the `ControlFlowLog::EventType`` in `control_flog_log.h`. + ENTER_FUNCTION = 0 + LEAVE_FUNCTION = 1 + TAINTED_CONTROL_FLOW = 2 + + @staticmethod + def _decode_varint(buffer): + shift = 0 + val = 0 + while buffer: + curr = c_uint8.from_buffer_copy(buffer, 0).value + val |= (curr & 0x7F) << shift + shift += 7 + buffer = buffer[1:] + if curr & 0x80 == 0: + break + + return val, buffer + + @staticmethod + def _align_callstack(target_function_id, callstack): + while callstack and callstack[-1] != target_function_id: + yield TDLeaveFunctionEvent(callstack[:]) + callstack.pop() + + def __init__(self, mem, hdr): + self.section = mem[hdr.offset : hdr.offset + hdr.size] + self.funcmapping = None + + def __iter__(self): + buffer = self.section + callstack = [] + while buffer: + event = c_uint8.from_buffer_copy(buffer,0).value + buffer = buffer[1:] + function_id, buffer = TDControlFlowLogSection._decode_varint(buffer) + if self.funcmapping != None: + function_id = self.funcmapping[function_id] + + if event == TDControlFlowLogSection.ENTER_FUNCTION: + callstack.append(function_id) + yield TDEnterFunctionEvent(callstack[:]) + elif event == TDControlFlowLogSection.LEAVE_FUNCTION: + # Align call stack, if needed + yield from TDControlFlowLogSection._align_callstack( + function_id, callstack + ) + + # TODO(hbrodin): If the callstack doesn't contain function_id at all, this will break. + yield TDLeaveFunctionEvent(callstack[:]) + callstack.pop() + else: + # Align call stack, if needed + yield from TDControlFlowLogSection._align_callstack( + function_id, callstack + ) + + label, buffer = TDControlFlowLogSection._decode_varint(buffer) + yield TDTaintedControlFlowEvent(callstack[:], label) + + # Drain callstack with artifical TDLeaveFunction events (using a dummy function id that doesn't exist) + yield from TDControlFlowLogSection._align_callstack(-1, callstack) + + def function_id_mapping(self, id_to_name_array): + """This method stores an array used to translate from function id to symbolic names""" + self.funcmapping = id_to_name_array class TDSinkSection: """TDAG Sinks section @@ -300,6 +422,7 @@ def __repr__(self) -> str: TDSourceIndexSection, TDFunctionsSection, TDEventsSection, + TDControlFlowLogSection, ] @@ -344,6 +467,9 @@ def __init__(self, file: BinaryIO) -> None: elif hdr.tag == 7: self.sections.append(TDEventsSection(self.buffer, hdr)) self.sections_by_type[TDEventsSection] = self.sections[-1] + elif hdr.tag == 8: + self.sections.append(TDControlFlowLogSection(self.buffer, hdr)) + self.sections_by_type[TDControlFlowLogSection] = self.sections[-1] else: raise NotImplementedError("Unsupported section tag") @@ -716,6 +842,13 @@ def __init_arguments__(self, parser): help="print function trace events", ) + parser.add_argument( + "--print-control-flow-log", + "-c", + action="store_true", + help="print function trace events", + ) + def run(self, args): with open(args.POLYTRACKER_TF, "rb") as f: tdfile = TDFile(f) @@ -742,3 +875,9 @@ def run(self, args): if args.print_function_trace: for e in tdfile.events: print(f"{e}") + + if args.print_control_flow_log: + cflog = tdfile._get_section(TDControlFlowLogSection) + assert isinstance(cflog, TDControlFlowLogSection) + for obj in cflog: + print(f"{obj}") \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index 6f5f9df8..8114bafc 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -35,7 +35,7 @@ def build(target: Path, binary: Path) -> None: def instrument(target: str) -> None: - cmd = ["instrument-targets", "--taint", "--ftrace", target] + cmd = ["instrument-targets", "--taint", "--ftrace", "--cflog", target] run_polytracker(cmd) diff --git a/tests/test_cf_log.cpp b/tests/test_cf_log.cpp new file mode 100644 index 00000000..5b992a9e --- /dev/null +++ b/tests/test_cf_log.cpp @@ -0,0 +1,71 @@ +#include +#include +#include +#include + +int f2(uint8_t val) { + if (val & 1) { + return val + 4; + } else { + return val + 3; + } +} + +int f3(uint8_t val) { + if (val > 0) { + return val - 1; + } else { + return val; + } +} + +int f1(uint8_t val) { + if (val > 5) { + return f2(val); + } else { + return 2 + f3(val + 2); + } +} + +// Some dummy control flow to test that control flow logging works as expected +int main(int argc, char *argv[]) { + uint8_t buffer[sizeof(uint64_t)]; + if (sizeof(buffer) != read(0, buffer, sizeof(buffer))) { + exit(EXIT_FAILURE); + } + + bool good = true; + + // Control flow 1, affects label 1 + if (buffer[0] == 'a') { + // Control flow 2, affects label 2 + if (buffer[1] != 'c') { + // Control flow labels 3 trough 8 + for (size_t i = 2; i < sizeof(buffer); i++) { + if (buffer[i] == '\0') { + good = false; + } + } + } + } + + if (good) { + // Union/range + uint64_t val = 0; + for (auto v : buffer) { + val = (val << 8) | v; + } + // Control flow label 15. The range node covering the full input buffer + if (val == 1) { + printf("Wow, that was unexpected\n"); + } + + // Control flow label 3 (again) + if (buffer[2] < 16) { + printf("OK, buffer[2] < 16\n"); + } + + auto v = f1(buffer[6]); + } + exit(EXIT_SUCCESS); +} \ No newline at end of file diff --git a/tests/test_cf_log.py b/tests/test_cf_log.py new file mode 100644 index 00000000..5316fac7 --- /dev/null +++ b/tests/test_cf_log.py @@ -0,0 +1,69 @@ +import cxxfilt +import json +import pytest +import subprocess + +import polytracker +from pathlib import Path + +from polytracker.taint_dag import ( + TDEnterFunctionEvent, + TDLeaveFunctionEvent, + TDTaintedControlFlowEvent, +) + + +@pytest.mark.program_trace("test_cf_log.cpp") +def test_cf_log(instrumented_binary: Path, trace_file: Path): + # Data to write to stdin, one byte at a time + stdin_data = "abcdefgh" + + subprocess.run( + [str(instrumented_binary)], + input=stdin_data.encode("utf-8"), + env={ + "POLYDB": str(trace_file), + "POLYTRACKER_STDIN_SOURCE": "1", + "POLYTRACKER_LOG_CONTROL_FLOW": "1", + }, + ) + + program_trace = polytracker.PolyTrackerTrace.load(trace_file) + + cflog = program_trace.tdfile._get_section( + polytracker.taint_dag.TDControlFlowLogSection + ) + + # The functionid mapping is available next to the built binary + with open(instrumented_binary.parent / "functionid.json", "rb") as f: + functionid_mapping = list(map(cxxfilt.demangle, json.load(f))) + + # Apply the id to function mappign + cflog.function_id_mapping(functionid_mapping) + + expected_seq = [ + TDEnterFunctionEvent(["main"]), + TDTaintedControlFlowEvent(["main"], 1), + TDTaintedControlFlowEvent(["main"], 2), + TDTaintedControlFlowEvent(["main"], 3), + TDTaintedControlFlowEvent(["main"], 4), + TDTaintedControlFlowEvent(["main"], 5), + TDTaintedControlFlowEvent(["main"], 6), + TDTaintedControlFlowEvent(["main"], 7), + TDTaintedControlFlowEvent(["main"], 8), + TDTaintedControlFlowEvent(["main"], 15), + TDTaintedControlFlowEvent(["main"], 3), + TDEnterFunctionEvent(["main", "f1(unsigned char)"]), + TDTaintedControlFlowEvent(["main", "f1(unsigned char)"], 7), + TDEnterFunctionEvent(["main", "f1(unsigned char)", "f2(unsigned char)"]), + TDTaintedControlFlowEvent( + ["main", "f1(unsigned char)", "f2(unsigned char)"], 7 + ), + TDLeaveFunctionEvent(["main", "f1(unsigned char)", "f2(unsigned char)"]), + TDLeaveFunctionEvent(["main", "f1(unsigned char)"]), + TDLeaveFunctionEvent(["main"]), # This is artifical as there is a call to exit + ] + + # NOTE(hbrodin): Could have done assert list(cflog) == expected_seq, but this provides the failed element + for got, expected in zip(cflog, expected_seq): + assert got == expected From f1fe70b867fe24f7169f7c52b90906e95687938a Mon Sep 17 00:00:00 2001 From: hbrodin <90325907+hbrodin@users.noreply.github.com> Date: Mon, 22 May 2023 14:08:45 +0200 Subject: [PATCH 3/6] formatting --- polytracker/build.py | 3 ++- .../include/polytracker/passes/tainted_control_flow.h | 2 -- polytracker/include/taintdag/polytracker.h | 7 +++---- polytracker/src/polytracker/polytracker.cpp | 3 ++- polytracker/src/taintdag/polytracker.cpp | 4 +--- polytracker/taint_dag.py | 8 ++++++-- 6 files changed, 14 insertions(+), 13 deletions(-) diff --git a/polytracker/build.py b/polytracker/build.py index c4789075..4b28f0da 100644 --- a/polytracker/build.py +++ b/polytracker/build.py @@ -146,6 +146,7 @@ def _optimize_bitcode(input_bitcode: Path, output_bitcode: Path) -> None: cmd = ["opt", "-O3", str(input_bitcode), "-o", str(output_bitcode)] subprocess.check_call(cmd) + def _preopt_instrument_bitcode(input_bitcode: Path, output_bitcode: Path) -> None: POLY_PASS_PATH: Path = _ensure_path_exists( _compiler_dir_path() / "pass" / "libPolytrackerPass.so" @@ -165,6 +166,7 @@ def _preopt_instrument_bitcode(input_bitcode: Path, output_bitcode: Path) -> Non # execute `cmd` subprocess.check_call(cmd) + def _instrument_bitcode( input_bitcode: Path, output_bitcode: Path, @@ -422,7 +424,6 @@ def __init_arguments__(self, parser: argparse.ArgumentParser): help="instrument with control affecting dataflow logging", ) - def run(self, args: argparse.Namespace): for target in args.targets: blight_cmds = _read_blight_journal(args.journal_path) diff --git a/polytracker/include/polytracker/passes/tainted_control_flow.h b/polytracker/include/polytracker/passes/tainted_control_flow.h index d0acf509..b9d22f6a 100644 --- a/polytracker/include/polytracker/passes/tainted_control_flow.h +++ b/polytracker/include/polytracker/passes/tainted_control_flow.h @@ -35,12 +35,10 @@ class TaintedControlFlowPass void insertTaintStartupCall(llvm::Module &mod); void declareLoggingFunctions(llvm::Module &mod); - llvm::ConstantInt *get_function_id_const(llvm::Function &f); llvm::ConstantInt *get_function_id_const(llvm::Instruction &i); public: - using function_id = uint32_t; TaintedControlFlowPass(); diff --git a/polytracker/include/taintdag/polytracker.h b/polytracker/include/taintdag/polytracker.h index 2bdb5dfe..751f6d17 100644 --- a/polytracker/include/taintdag/polytracker.h +++ b/polytracker/include/taintdag/polytracker.h @@ -61,9 +61,9 @@ class PolyTracker { // Instrumentation callback for when execution enters a function // NOTE: There is a overlap in functionality between this and `function_entry` - // they will co-exist for now as they operate slightly different. The underlying - // reason is that this was developed separately to support the Tainted Control - // Flow logging mechanism. + // they will co-exist for now as they operate slightly different. The + // underlying reason is that this was developed separately to support the + // Tainted Control Flow logging mechanism. void enter_function(uint32_t function_id); // Instrumentation callback for when execution leaves a function @@ -103,7 +103,6 @@ class PolyTracker { static constexpr size_t offset_capacity = size_t{max_source_index} + 1; StreamOffset stream_read_offsets_; StreamOffset stream_write_offsets_; - }; } // namespace taintdag diff --git a/polytracker/src/polytracker/polytracker.cpp b/polytracker/src/polytracker/polytracker.cpp index ee030ce7..1f30d4f4 100644 --- a/polytracker/src/polytracker/polytracker.cpp +++ b/polytracker/src/polytracker/polytracker.cpp @@ -47,7 +47,8 @@ extern "C" uint64_t __dfsw___polytracker_log_tainted_control_flow( uint64_t conditional, uint32_t functionid, dfsan_label conditional_label, dfsan_label function_label, dfsan_label *ret_label) { if (conditional_label > 0) { - get_polytracker_tdag().log_tainted_control_flow(conditional_label, functionid); + get_polytracker_tdag().log_tainted_control_flow(conditional_label, + functionid); } *ret_label = conditional_label; return conditional; diff --git a/polytracker/src/taintdag/polytracker.cpp b/polytracker/src/taintdag/polytracker.cpp index 7f7b69b0..65b683a8 100644 --- a/polytracker/src/taintdag/polytracker.cpp +++ b/polytracker/src/taintdag/polytracker.cpp @@ -176,8 +176,7 @@ void PolyTracker::affects_control_flow(label_t lbl) { } void PolyTracker::log_tainted_control_flow(label_t lbl, uint32_t function_id) { - output_file_.section().tainted_control_flow(lbl, - function_id); + output_file_.section().tainted_control_flow(lbl, function_id); } void PolyTracker::enter_function(uint32_t function_id) { @@ -188,7 +187,6 @@ void PolyTracker::leave_function(uint32_t function_id) { output_file_.section().leave_function(function_id); } - Functions::index_t PolyTracker::function_entry(std::string_view name) { auto &functions{output_file_.section()}; auto maybe_index{functions.add_mapping(name)}; diff --git a/polytracker/taint_dag.py b/polytracker/taint_dag.py index cbd1a22f..3f75c490 100644 --- a/polytracker/taint_dag.py +++ b/polytracker/taint_dag.py @@ -129,6 +129,7 @@ def read_raw(self, label): def count(self): return len(self.section) // sizeof(c_uint64) + class TDEnterFunctionEvent: """Emitted whenever execution enters a function. The callstack member is the callstack right before entering the function, @@ -162,6 +163,7 @@ def __repr__(self) -> str: def __eq__(self, __o: object) -> bool: return self.callstack == __o.callstack + class TDTaintedControlFlowEvent: """Emitted whenever a control flow change is influenced by tainted data. The label that influenced the control flow is available in the `label` member. @@ -178,6 +180,7 @@ def __repr__(self) -> str: def __eq__(self, __o: object) -> bool: return self.label == __o.label and self.callstack == __o.callstack + class TDControlFlowLogSection: """TDAG Control flow log section @@ -218,7 +221,7 @@ def __iter__(self): buffer = self.section callstack = [] while buffer: - event = c_uint8.from_buffer_copy(buffer,0).value + event = c_uint8.from_buffer_copy(buffer, 0).value buffer = buffer[1:] function_id, buffer = TDControlFlowLogSection._decode_varint(buffer) if self.funcmapping != None: @@ -252,6 +255,7 @@ def function_id_mapping(self, id_to_name_array): """This method stores an array used to translate from function id to symbolic names""" self.funcmapping = id_to_name_array + class TDSinkSection: """TDAG Sinks section @@ -880,4 +884,4 @@ def run(self, args): cflog = tdfile._get_section(TDControlFlowLogSection) assert isinstance(cflog, TDControlFlowLogSection) for obj in cflog: - print(f"{obj}") \ No newline at end of file + print(f"{obj}") From 1e5759dc1011995dd60adb6f8305209ee37b8205 Mon Sep 17 00:00:00 2001 From: hbrodin <90325907+hbrodin@users.noreply.github.com> Date: Mon, 22 May 2023 14:18:47 +0200 Subject: [PATCH 4/6] Python linting --- polytracker/taint_dag.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/polytracker/taint_dag.py b/polytracker/taint_dag.py index 3f75c490..744e5761 100644 --- a/polytracker/taint_dag.py +++ b/polytracker/taint_dag.py @@ -144,7 +144,9 @@ def __repr__(self) -> str: return f"Enter: {self.callstack}" def __eq__(self, __o: object) -> bool: - return self.callstack == __o.callstack + if isinstance(__o, TDEnterFunctionEvent): + return self.callstack == __o.callstack + return False class TDLeaveFunctionEvent: @@ -161,7 +163,9 @@ def __repr__(self) -> str: return f"Leave: {self.callstack}" def __eq__(self, __o: object) -> bool: - return self.callstack == __o.callstack + if isinstance(__o, TDLeaveFunctionEvent): + return self.callstack == __o.callstack + return False class TDTaintedControlFlowEvent: @@ -178,7 +182,9 @@ def __repr__(self) -> str: return f"TaintedControlFlow label {self.label} callstack {self.callstack}" def __eq__(self, __o: object) -> bool: - return self.label == __o.label and self.callstack == __o.callstack + if isinstance(__o, TDTaintedControlFlowEvent): + return self.label == __o.label and self.callstack == __o.callstack + return False class TDControlFlowLogSection: From 993efd1f41337af4921526a4532b2a375e00603d Mon Sep 17 00:00:00 2001 From: hbrodin <90325907+hbrodin@users.noreply.github.com> Date: Wed, 24 May 2023 13:21:13 +0200 Subject: [PATCH 5/6] Unify naming of 'detail' (not 'details') --- polytracker/include/taintdag/control_flow_log.h | 10 +++++----- unittests/src/taintdag/control_flow_log.cpp | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/polytracker/include/taintdag/control_flow_log.h b/polytracker/include/taintdag/control_flow_log.h index 3eaf3523..1bcff380 100644 --- a/polytracker/include/taintdag/control_flow_log.h +++ b/polytracker/include/taintdag/control_flow_log.h @@ -16,7 +16,7 @@ namespace taintdag { -namespace details { +namespace detail { // A uint32_t varint encoded by setting highest bit for all but the final byte. // Requires up to 5 bytes of storage as each output byte uses 7 input bits. // Total maximum need is floor(32/7) = 5. Returns number of bytes required. @@ -30,7 +30,7 @@ size_t varint_encode(uint32_t val, uint8_t *buffer) { return buffer - orig_buffer; } // TODO (hbrodin): Should probably used std::span -} // namespace details +} // namespace detail struct ControlFlowLog : public SectionBase { enum EventType { @@ -49,7 +49,7 @@ struct ControlFlowLog : public SectionBase { void function_event(EventType evt, uint32_t function_id) { uint8_t buffer[6]; buffer[0] = static_cast(evt); - auto used = details::varint_encode(function_id, &buffer[1]); + auto used = detail::varint_encode(function_id, &buffer[1]); auto total = used + 1; if (auto wctx = write(total)) { @@ -71,9 +71,9 @@ struct ControlFlowLog : public SectionBase { // 1 byte event, <= 5 bytes function id, <= 5 bytes label uint8_t buffer[11]; buffer[0] = static_cast(TaintedControlFlow); - auto used = details::varint_encode(function_id, &buffer[1]); + auto used = detail::varint_encode(function_id, &buffer[1]); auto total = used + 1; - used = details::varint_encode(label, &buffer[total]); + used = detail::varint_encode(label, &buffer[total]); total += used; if (auto wctx = write(total)) { diff --git a/unittests/src/taintdag/control_flow_log.cpp b/unittests/src/taintdag/control_flow_log.cpp index 95ec95b5..fcafe61b 100644 --- a/unittests/src/taintdag/control_flow_log.cpp +++ b/unittests/src/taintdag/control_flow_log.cpp @@ -12,7 +12,7 @@ #include TEST_CASE("Simple varint encoding") { - using namespace taintdag::details; + using namespace taintdag::detail; uint8_t buffer[5]; SECTION("Encode 0") { From 9590a15b05fd4ea4506f481486d137944cbc82f9 Mon Sep 17 00:00:00 2001 From: hbrodin <90325907+hbrodin@users.noreply.github.com> Date: Thu, 25 May 2023 14:03:39 +0200 Subject: [PATCH 6/6] Reviewer feedback: Simplify build step --- polytracker/build.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/polytracker/build.py b/polytracker/build.py index 4b28f0da..eeded59c 100644 --- a/polytracker/build.py +++ b/polytracker/build.py @@ -433,12 +433,9 @@ def run(self, args: argparse.Namespace): _extract_bitcode(target_path, bc_path) if args.cflog: # Control affecting data flow logging happens before optimization - pre_opt = bc_path.with_suffix(".preopt.bc") - _preopt_instrument_bitcode(bc_path, pre_opt) + _preopt_instrument_bitcode(bc_path, bc_path) - _optimize_bitcode(pre_opt, opt_bc) - else: - _optimize_bitcode(bc_path, opt_bc) + _optimize_bitcode(bc_path, opt_bc) inst_bc_path = Path(f"{bc_path.stem}.instrumented.bc") _instrument_bitcode( opt_bc,