From 25a0ef3239af35d197b29567bc6718803f5c2a97 Mon Sep 17 00:00:00 2001 From: Xiaozhou Liu Date: Mon, 14 Jan 2019 14:14:43 +0800 Subject: [PATCH 001/135] trace.py: fix syscall parameter rewriting on x64 (#2138) On x64 from kernel v4.17 onwards, a indirect table is used to rewrite syscall parameters in trace.py. However, it only works for arg1. This patch fixes it for arg2-arg6 too. Fixes: 2da34267fcae ("generate indirect parameter assignment if arch uses syscall wrapper (#1816)") Signed-off-by: Xiaozhou Liu --- tools/trace.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/trace.py b/tools/trace.py index e1845da18..ff53319d6 100755 --- a/tools/trace.py +++ b/tools/trace.py @@ -218,15 +218,15 @@ def _parse_action(self, action): aliases_indarg = { "arg1": "({u64 _val; struct pt_regs *_ctx = (struct pt_regs *)PT_REGS_PARM1(ctx);" " bpf_probe_read(&_val, sizeof(_val), &(PT_REGS_PARM1(_ctx))); _val;})", - "arg2": "({u64 _val; struct pt_regs *_ctx = (struct pt_regs *)PT_REGS_PARM2(ctx);" + "arg2": "({u64 _val; struct pt_regs *_ctx = (struct pt_regs *)PT_REGS_PARM1(ctx);" " bpf_probe_read(&_val, sizeof(_val), &(PT_REGS_PARM2(_ctx))); _val;})", - "arg3": "({u64 _val; struct pt_regs *_ctx = (struct pt_regs *)PT_REGS_PARM3(ctx);" + "arg3": "({u64 _val; struct pt_regs *_ctx = (struct pt_regs *)PT_REGS_PARM1(ctx);" " bpf_probe_read(&_val, sizeof(_val), &(PT_REGS_PARM3(_ctx))); _val;})", - "arg4": "({u64 _val; struct pt_regs *_ctx = (struct pt_regs *)PT_REGS_PARM4(ctx);" + "arg4": "({u64 _val; struct pt_regs *_ctx = (struct pt_regs *)PT_REGS_PARM1(ctx);" " bpf_probe_read(&_val, sizeof(_val), &(PT_REGS_PARM4(_ctx))); _val;})", - "arg5": "({u64 _val; struct pt_regs *_ctx = (struct pt_regs *)PT_REGS_PARM5(ctx);" + "arg5": "({u64 _val; struct pt_regs *_ctx = (struct pt_regs *)PT_REGS_PARM1(ctx);" " bpf_probe_read(&_val, sizeof(_val), &(PT_REGS_PARM5(_ctx))); _val;})", - "arg6": "({u64 _val; struct pt_regs *_ctx = (struct pt_regs 
*)PT_REGS_PARM6(ctx);" + "arg6": "({u64 _val; struct pt_regs *_ctx = (struct pt_regs *)PT_REGS_PARM1(ctx);" " bpf_probe_read(&_val, sizeof(_val), &(PT_REGS_PARM6(_ctx))); _val;})", } From 2ddbc07782c3ca4115a0840882e42dfb0c88686d Mon Sep 17 00:00:00 2001 From: vijunag Date: Tue, 15 Jan 2019 10:48:38 +0530 Subject: [PATCH 002/135] Add build_id support for BPF stackmap A separate build_id stackmap can be created with the help of new macro BPF_STACK_TRACE_BUILDID. The kernel BPF reports stacktrace in the structure bpf_stack_build_id. Changes have been made to BPF modules to support resolving symbols mentioned in the above format. An example tool is also available in examples/tracing/stack_buildid_example.py. Both python and C++ test cases are added. --- examples/tracing/stack_buildid_example.py | 105 ++++++++++++++++++ src/cc/api/BPF.cc | 17 +++ src/cc/api/BPF.h | 16 +++ src/cc/api/BPFTable.cc | 63 +++++++++++ src/cc/api/BPFTable.h | 20 ++++ src/cc/bcc_elf.c | 14 +++ src/cc/bcc_elf.h | 1 + src/cc/bcc_syms.cc | 126 ++++++++++++++++++++++ src/cc/bcc_syms.h | 8 ++ src/cc/export/helpers.h | 7 ++ src/cc/syms.h | 39 +++++++ src/python/bcc/__init__.py | 42 +++++++- src/python/bcc/libbcc.py | 25 +++++ src/python/bcc/table.py | 18 +++- tests/cc/test_bpf_table.cc | 56 ++++++++++ tests/python/test_stackid.py | 39 +++++++ 16 files changed, 591 insertions(+), 5 deletions(-) create mode 100644 examples/tracing/stack_buildid_example.py diff --git a/examples/tracing/stack_buildid_example.py b/examples/tracing/stack_buildid_example.py new file mode 100644 index 000000000..505697cbe --- /dev/null +++ b/examples/tracing/stack_buildid_example.py @@ -0,0 +1,105 @@ +#!/usr/bin/python +# +# An example usage of stack_build_id +# Most of the code here is borrowed from tools/profile.py +# +# Steps for using this code +# 1) Start ping program in one terminal eg invocation: ping google.com -i0.001 +# 2) Change the path of libc specified in b.add_module() below +# 3) Invoke the script as 'python 
stack_buildid_example.py' +# 4) o/p of the tool is as shown below +# python example/tracing/stack_buildid_example.py +# sendto +# - ping (5232) +# 2 +# +# REQUIRES: Linux 4.17+ (BPF_BUILD_ID support) +# Licensed under the Apache License, Version 2.0 (the "License") +# 03-Jan-2019 Vijay Nag + +from __future__ import print_function +from bcc import BPF, PerfType, PerfSWConfig +from sys import stderr +from time import sleep +import argparse +import signal +import os +import subprocess +import errno +import multiprocessing +import ctypes as ct + +def Get_libc_path(): + # A small helper function that returns full path + # of libc in the system + cmd = 'cat /proc/self/maps | grep libc | awk \'{print $6}\' | uniq' + output = subprocess.check_output(cmd, shell=True) + if not isinstance(output, str): + output = output.decode() + return output.split('\n')[0] + +bpf_text = """ +#include +#include +#include + +struct key_t { + u32 pid; + int user_stack_id; + char name[TASK_COMM_LEN]; +}; +BPF_HASH(counts, struct key_t); +BPF_STACK_TRACE_BUILDID(stack_traces, 128); + +int do_perf_event(struct bpf_perf_event_data *ctx) { + u32 pid = bpf_get_current_pid_tgid() >> 32; + + // create map key + struct key_t key = {.pid = pid}; + bpf_get_current_comm(&key.name, sizeof(key.name)); + + key.user_stack_id = stack_traces.get_stackid(&ctx->regs, BPF_F_USER_STACK); + + if (key.user_stack_id >= 0) { + counts.increment(key); + } + return 0; +} +""" + +b = BPF(text=bpf_text) +b.attach_perf_event(ev_type=PerfType.SOFTWARE, + ev_config=PerfSWConfig.CPU_CLOCK, fn_name="do_perf_event", + sample_period=0, sample_freq=49, cpu=0) + +# Add the list of libraries/executables to the build sym cache for sym resolution +# Change the libc path if it is different on a different machine. +# libc.so and ping are added here so that any symbols pertaining to +# libc or ping are resolved. More executables/libraries can be added here. 
+b.add_module(Get_libc_path()) +b.add_module("/usr/sbin/sshd") +b.add_module("/bin/ping") +counts = b.get_table("counts") +stack_traces = b.get_table("stack_traces") +duration = 2 + +def signal_handler(signal, frame): + print() + +try: + sleep(duration) +except KeyboardInterrupt: + # as cleanup can take some time, trap Ctrl-C: + signal.signal(signal.SIGINT, signal_ignore) + +user_stack=[] +for k,v in sorted(counts.items(), key=lambda counts: counts[1].value): + user_stack = [] if k.user_stack_id < 0 else \ + stack_traces.walk(k.user_stack_id) + + user_stack=list(user_stack) + for addr in user_stack: + print(" %s" % b.sym(addr, k.pid).decode('utf-8', 'replace')) + print(" %-16s %s (%d)" % ("-", k.name.decode('utf-8', 'replace'), k.pid)) + print(" %d\n" % v.value) + diff --git a/src/cc/api/BPF.cc b/src/cc/api/BPF.cc index c6f843f2f..e0502fbf3 100644 --- a/src/cc/api/BPF.cc +++ b/src/cc/api/BPF.cc @@ -86,6 +86,8 @@ BPF::~BPF() { if (res.code() != 0) std::cerr << "Failed to detach all probes on destruction: " << std::endl << res.msg() << std::endl; + bcc_free_buildsymcache(bsymcache_); + bsymcache_ = NULL; } StatusTuple BPF::detach_all() { @@ -650,6 +652,21 @@ BPFStackTable BPF::get_stack_table(const std::string& name, bool use_debug_file, return BPFStackTable({}, use_debug_file, check_debug_file_crc); } +BPFStackBuildIdTable BPF::get_stackbuildid_table(const std::string &name, bool use_debug_file, + bool check_debug_file_crc) { + TableStorage::iterator it; + + if (bpf_module_->table_storage().Find(Path({bpf_module_->id(), name}), it)) + return BPFStackBuildIdTable(it->second, use_debug_file, check_debug_file_crc, get_bsymcache()); + return BPFStackBuildIdTable({}, use_debug_file, check_debug_file_crc, get_bsymcache()); +} + +bool BPF::add_module(std::string module) +{ + return bcc_buildsymcache_add_module(get_bsymcache(), module.c_str()) != 0 ? 
+ false : true; +} + std::string BPF::get_uprobe_event(const std::string& binary_path, uint64_t offset, bpf_probe_attach_type type, pid_t pid) { diff --git a/src/cc/api/BPF.h b/src/cc/api/BPF.h index fcf0db400..ae3ad9199 100644 --- a/src/cc/api/BPF.h +++ b/src/cc/api/BPF.h @@ -49,6 +49,7 @@ class BPF { explicit BPF(unsigned int flag = 0, TableStorage* ts = nullptr, bool rw_engine_enabled = bpf_module_rw_engine_enabled(), const std::string &maps_ns = "") : flag_(flag), + bsymcache_(NULL), bpf_module_(new BPFModule(flag, ts, rw_engine_enabled, maps_ns)) {} StatusTuple init(const std::string& bpf_program, const std::vector& cflags = {}, @@ -137,6 +138,13 @@ class BPF { return BPFPercpuHashTable({}); } + void* get_bsymcache(void) { + if (bsymcache_ == NULL) { + bsymcache_ = bcc_buildsymcache_new(); + } + return bsymcache_; + } + BPFProgTable get_prog_table(const std::string& name); BPFCgroupArray get_cgroup_array(const std::string& name); @@ -147,6 +155,12 @@ class BPF { bool use_debug_file = true, bool check_debug_file_crc = true); + BPFStackBuildIdTable get_stackbuildid_table(const std::string &name, + bool use_debug_file = true, + bool check_debug_file_crc = true); + + bool add_module(std::string module); + StatusTuple open_perf_event(const std::string& name, uint32_t type, uint64_t config); @@ -225,6 +239,8 @@ class BPF { int flag_; + void *bsymcache_; + std::unique_ptr syscall_prefix_; std::unique_ptr bpf_module_; diff --git a/src/cc/api/BPFTable.cc b/src/cc/api/BPFTable.cc index 64fe77ca0..67e8a8f2a 100644 --- a/src/cc/api/BPFTable.cc +++ b/src/cc/api/BPFTable.cc @@ -316,6 +316,69 @@ std::vector BPFStackTable::get_stack_symbol(int stack_id, return res; } +BPFStackBuildIdTable::BPFStackBuildIdTable(const TableDesc& desc, bool use_debug_file, + bool check_debug_file_crc, + void *bsymcache) + : BPFTableBase(desc), + bsymcache_(bsymcache) { + if (desc.type != BPF_MAP_TYPE_STACK_TRACE) + throw std::invalid_argument("Table '" + desc.name + + "' is not a stack table"); 
+ + symbol_option_ = {.use_debug_file = use_debug_file, + .check_debug_file_crc = check_debug_file_crc, + .use_symbol_type = (1 << STT_FUNC) | (1 << STT_GNU_IFUNC)}; +} + +void BPFStackBuildIdTable::clear_table_non_atomic() { + for (int i = 0; size_t(i) < capacity(); i++) { + remove(&i); + } +} + +std::vector BPFStackBuildIdTable::get_stack_addr(int stack_id) { + std::vector res; + struct stacktrace_buildid_t stack; + if (stack_id < 0) + return res; + if (!lookup(&stack_id, &stack)) + return res; + for (int i = 0; (i < BPF_MAX_STACK_DEPTH) && \ + (stack.trace[i].status == BPF_STACK_BUILD_ID_VALID); + i++) { + /* End of stack marker is BCC_STACK_BUILD_ID_EMPTY or + * BCC_STACK_BUILD_IP(fallback) mechanism. + * We do not support fallback mechanism + */ + res.push_back(stack.trace[i]); + } + return res; +} + +std::vector BPFStackBuildIdTable::get_stack_symbol(int stack_id) +{ + auto addresses = get_stack_addr(stack_id); + std::vector res; + if (addresses.empty()) + return res; + res.reserve(addresses.size()); + + bcc_symbol symbol; + struct bpf_stack_build_id trace; + for (auto addr : addresses) { + memcpy(trace.build_id, addr.build_id, sizeof(trace.build_id)); + trace.status = addr.status; + trace.offset = addr.offset; + if (bcc_buildsymcache_resolve(bsymcache_,&trace,&symbol) != 0) { + res.emplace_back("[UNKNOWN]"); + } else { + res.push_back(symbol.name); + bcc_symbol_free_demangle_name(&symbol); + } + } + return res; +} + BPFPerfBuffer::BPFPerfBuffer(const TableDesc& desc) : BPFTableBase(desc), epfd_(-1) { if (desc.type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) diff --git a/src/cc/api/BPFTable.h b/src/cc/api/BPFTable.h index 3a183f4f2..7bd243964 100644 --- a/src/cc/api/BPFTable.h +++ b/src/cc/api/BPFTable.h @@ -315,6 +315,26 @@ class BPFStackTable : public BPFTableBase { std::map pid_sym_; }; +// from src/cc/export/helpers.h +struct stacktrace_buildid_t { + struct bpf_stack_build_id trace[BPF_MAX_STACK_DEPTH]; +}; + +class BPFStackBuildIdTable : public BPFTableBase { + 
public: + BPFStackBuildIdTable(const TableDesc& desc, bool use_debug_file, + bool check_debug_file_crc, void *bsymcache); + ~BPFStackBuildIdTable() = default; + + void clear_table_non_atomic(); + std::vector get_stack_addr(int stack_id); + std::vector get_stack_symbol(int stack_id); + + private: + void *bsymcache_; + bcc_symbol_option symbol_option_; +}; + class BPFPerfBuffer : public BPFTableBase { public: BPFPerfBuffer(const TableDesc& desc); diff --git a/src/cc/bcc_elf.c b/src/cc/bcc_elf.c index c6745a2bb..be1ef82fe 100644 --- a/src/cc/bcc_elf.c +++ b/src/cc/bcc_elf.c @@ -882,6 +882,20 @@ int bcc_free_memory() { return err; } +int bcc_elf_get_buildid(const char *path, char *buildid) +{ + Elf *e; + int fd; + + if (openelf(path, &e, &fd) < 0) + return -1; + + if (!find_buildid(e, buildid)) + return -1; + + return 0; +} + #if 0 #include diff --git a/src/cc/bcc_elf.h b/src/cc/bcc_elf.h index 0d102592c..314a2e3c6 100644 --- a/src/cc/bcc_elf.h +++ b/src/cc/bcc_elf.h @@ -69,6 +69,7 @@ int bcc_elf_is_shared_obj(const char *path); int bcc_elf_is_exe(const char *path); int bcc_elf_is_vdso(const char *name); int bcc_free_memory(); +int bcc_elf_get_buildid(const char *path, char *buildid); #ifdef __cplusplus } diff --git a/src/cc/bcc_syms.cc b/src/cc/bcc_syms.cc index 116cf0d50..b74003d3b 100644 --- a/src/cc/bcc_syms.cc +++ b/src/cc/bcc_syms.cc @@ -378,6 +378,93 @@ bool ProcSyms::Module::find_addr(uint64_t offset, struct bcc_symbol *sym) { return false; } +bool BuildSyms::Module::load_sym_table() +{ + if (loaded_) + return true; + + symbol_option_ = { + .use_debug_file = 1, + .check_debug_file_crc = 1, + .use_symbol_type = (1 << STT_FUNC) | (1 << STT_GNU_IFUNC) + }; + + bcc_elf_foreach_sym(module_name_.c_str(), _add_symbol, &symbol_option_, this); + std::sort(syms_.begin(), syms_.end()); + + for(std::vector::iterator it = syms_.begin(); + it != syms_.end(); ++it++) { + } + loaded_ = true; + return true; +} + +int BuildSyms::Module::_add_symbol(const char *symname, uint64_t 
start, + uint64_t size, void *p) +{ + BuildSyms::Module *m = static_cast (p); + auto res = m->symnames_.emplace(symname); + m->syms_.emplace_back(&*(res.first), start, size); + return 0; +} + +bool BuildSyms::Module::resolve_addr(uint64_t offset, struct bcc_symbol* sym, + bool demangle) +{ + std::vector::iterator it; + + load_sym_table(); + + if (syms_.empty()) + goto unknown_symbol; + + it = std::upper_bound(syms_.begin(), syms_.end(), Symbol(nullptr, offset, 0)); + if (it != syms_.begin()) { + it--; + sym->name = (*it).name->c_str(); + if (demangle) + sym->demangle_name = sym->name; + sym->offset = (*it).start; + sym->module = module_name_.c_str(); + return true; + } + +unknown_symbol: + memset(sym, 0, sizeof(struct bcc_symbol)); + return false; +} + +bool BuildSyms::add_module(const std::string module_name) +{ + struct stat s; + char buildid[BPF_BUILD_ID_SIZE*2+1]; + + if (stat(module_name.c_str(), &s) < 0) + return false; + + if (bcc_elf_get_buildid(module_name.c_str(), buildid) < 0) + return false; + + std::string elf_buildid(buildid); + std::unique_ptr ptr(new BuildSyms::Module(module_name.c_str())); + buildmap_[elf_buildid] = std::move(ptr); + return true; +} + +bool BuildSyms::resolve_addr(std::string build_id, uint64_t offset, + struct bcc_symbol *sym, bool demangle) +{ + std::unordered_map >::iterator it; + + it = buildmap_.find(build_id); + if (it == buildmap_.end()) + /*build-id not added to the BuildSym*/ + return false; + + BuildSyms::Module *mod = it->second.get(); + return mod->resolve_addr(offset, sym, demangle); +} + extern "C" { void *bcc_symcache_new(int pid, struct bcc_symbol_option *option) { @@ -421,6 +508,45 @@ void bcc_symcache_refresh(void *resolver) { cache->refresh(); } +void *bcc_buildsymcache_new(void) { + return static_cast(new BuildSyms()); +} + +void bcc_free_buildsymcache(void *symcache) { + delete static_cast(symcache); +} + +int bcc_buildsymcache_add_module(void *resolver, const char *module_name) +{ + BuildSyms *bsym = 
static_cast(resolver); + return bsym->add_module(module_name) ? 0 : -1; +} + +int bcc_buildsymcache_resolve(void *resolver, + struct bpf_stack_build_id *trace, + struct bcc_symbol *sym) +{ + std::string build_id; + unsigned char *c = &trace->build_id[0]; + int idx = 0; + + /*cannot resolve in case of fallback*/ + if (trace->status == BPF_STACK_BUILD_ID_EMPTY || + trace->status == BPF_STACK_BUILD_ID_IP) + return 0; + + while( idx < 20) { + int nib1 = (c[idx]&0xf0)>>4; + int nib2 = (c[idx]&0x0f); + build_id += "0123456789abcdef"[nib1]; + build_id += "0123456789abcdef"[nib2]; + idx++; + } + + BuildSyms *bsym = static_cast(resolver); + return bsym->resolve_addr(build_id, trace->offset, sym) ? 0 : -1; +} + struct mod_st { const char *name; uint64_t start; diff --git a/src/cc/bcc_syms.h b/src/cc/bcc_syms.h index 42a1cf375..2cf432b42 100644 --- a/src/cc/bcc_syms.h +++ b/src/cc/bcc_syms.h @@ -21,6 +21,7 @@ extern "C" { #endif #include +#include struct bcc_symbol { const char *name; @@ -67,6 +68,13 @@ void bcc_symcache_refresh(void *resolver); int bcc_resolve_global_addr(int pid, const char *module, const uint64_t address, uint64_t *global); +/*bcc APIs for build_id stackmap support*/ +void *bcc_buildsymcache_new(void); +void bcc_free_buildsymcache(void *symcache); +int bcc_buildsymcache_add_module(void *resolver, const char *module_name); +int bcc_buildsymcache_resolve(void *resolver, + struct bpf_stack_build_id *trace, + struct bcc_symbol *sym); // Call cb on every function symbol in the specified module. Uses simpler // SYM_CB callback mainly for easier to use in Python API. // Will prefer use debug file and check debug file CRC when reading the module. 
diff --git a/src/cc/export/helpers.h b/src/cc/export/helpers.h index 00b916034..9b2c04caa 100755 --- a/src/cc/export/helpers.h +++ b/src/cc/export/helpers.h @@ -216,9 +216,16 @@ struct bpf_stacktrace { u64 ip[BPF_MAX_STACK_DEPTH]; }; +struct bpf_stacktrace_buildid { + struct bpf_stack_build_id trace[BPF_MAX_STACK_DEPTH]; +}; + #define BPF_STACK_TRACE(_name, _max_entries) \ BPF_TABLE("stacktrace", int, struct bpf_stacktrace, _name, roundup_pow_of_two(_max_entries)) +#define BPF_STACK_TRACE_BUILDID(_name, _max_entries) \ + BPF_F_TABLE("stacktrace", int, struct bpf_stacktrace_buildid, _name, roundup_pow_of_two(_max_entries), BPF_F_STACK_BUILD_ID) + #define BPF_PROG_ARRAY(_name, _max_entries) \ BPF_TABLE("prog", u32, u32, _name, _max_entries) diff --git a/src/cc/syms.h b/src/cc/syms.h index d7dabfa37..09caa9ff3 100644 --- a/src/cc/syms.h +++ b/src/cc/syms.h @@ -147,3 +147,42 @@ class ProcSyms : SymbolCache { virtual bool resolve_name(const char *module, const char *name, uint64_t *addr); }; + +class BuildSyms { + struct Symbol { + Symbol(const std::string *name, uint64_t start, uint64_t size) + :name(name), start(start), size(size) {} + const std::string *name; + uint64_t start; + uint64_t size; + + bool operator<(const struct Symbol &rhs) const { + return start < rhs.start; + } + }; + + struct Module { + Module(const char *module_name): + module_name_(module_name), + loaded_(false) {} + const std::string module_name_; + const std::string build_id_; + bool loaded_; + std::unordered_set symnames_; + std::vector syms_; + bcc_symbol_option symbol_option_; + + bool load_sym_table(); + static int _add_symbol(const char *symname, uint64_t start, uint64_t size, + void *p); + bool resolve_addr(uint64_t offset, struct bcc_symbol*, bool demangle=true); + }; + + std::unordered_map > buildmap_; + +public: + BuildSyms() {} + virtual ~BuildSyms() = default; + virtual bool add_module(const std::string module_name); + virtual bool resolve_addr(std::string build_id, uint64_t offset, 
struct bcc_symbol *sym, bool demangle = true); +}; diff --git a/src/python/bcc/__init__.py b/src/python/bcc/__init__.py index 1d99afd5e..353fa5e90 100644 --- a/src/python/bcc/__init__.py +++ b/src/python/bcc/__init__.py @@ -24,7 +24,7 @@ import sys basestring = (unicode if sys.version_info[0] < 3 else str) -from .libbcc import lib, bcc_symbol, bcc_symbol_option, _SYM_CB_TYPE +from .libbcc import lib, bcc_symbol, bcc_symbol_option, bcc_stacktrace_build_id, _SYM_CB_TYPE from .table import Table, PerfEventArray from .perf import Perf from .syscall import syscall_name @@ -69,6 +69,7 @@ def resolve(self, addr, demangle): module. If we don't even know the module, return the absolute address as the offset. """ + sym = bcc_symbol() if demangle: res = lib.bcc_symcache_resolve(self.cache, addr, ct.byref(sym)) @@ -96,6 +97,7 @@ def resolve_name(self, module, name): return -1 return addr.value + class PerfType: # From perf_type_id in uapi/linux/perf_event.h HARDWARE = 0 @@ -158,6 +160,7 @@ class BPF(object): _probe_repl = re.compile(b"[^a-zA-Z0-9_]") _sym_caches = {} + _bsymcache = lib.bcc_buildsymcache_new() _auto_includes = { "linux/time.h": ["time"], @@ -1189,7 +1192,31 @@ def sym(addr, pid, show_module=False, show_offset=False, demangle=True): Example output when both show_module and show_offset are False: "start_thread" """ - name, offset, module = BPF._sym_cache(pid).resolve(addr, demangle) + + #addr is of type stacktrace_build_id + #so invoke the bsym address resolver + typeofaddr = str(type(addr)) + if typeofaddr.find('bpf_stack_build_id') != -1: + sym = bcc_symbol() + b = bcc_stacktrace_build_id() + b.status = addr.status + b.build_id = addr.build_id + b.u.offset = addr.offset; + res = lib.bcc_buildsymcache_resolve(BPF._bsymcache, + ct.byref(b), + ct.byref(sym)) + if res < 0: + if sym.module and sym.offset: + name,offset,module = (None, sym.offset, + ct.cast(sym.module, ct.c_char_p).value) + else: + name, offset, module = (None, addr, None) + else: + name, offset, 
module = (sym.name, sym.offset, + ct.cast(sym.module, ct.c_char_p).value) + else: + name, offset, module = BPF._sym_cache(pid).resolve(addr, demangle) + offset = b"+0x%x" % offset if show_offset and name is not None else b"" name = name or b"[unknown]" name = name + offset @@ -1262,6 +1289,17 @@ def kprobe_poll(self, timeout = -1): def free_bcc_memory(self): return lib.bcc_free_memory() + @staticmethod + def add_module(modname): + """add_module(modname) + + Add a library or exe to buildsym cache + """ + try: + lib.bcc_buildsymcache_add_module(BPF._bsymcache, modname.encode()) + except Exception as e: + print("Error adding module to build sym cache"+str(e)) + def donothing(self): """the do nothing exit handler""" diff --git a/src/python/bcc/libbcc.py b/src/python/bcc/libbcc.py index 6b5be77eb..341ed5bec 100644 --- a/src/python/bcc/libbcc.py +++ b/src/python/bcc/libbcc.py @@ -133,6 +133,19 @@ class bcc_symbol(ct.Structure): ('offset', ct.c_ulonglong), ] +class bcc_ip_offset_union(ct.Union): + _fields_ = [ + ('offset', ct.c_uint64), + ('ip', ct.c_uint64) + ] + +class bcc_stacktrace_build_id(ct.Structure): + _fields_ = [ + ('status', ct.c_uint32), + ('build_id',ct.c_ubyte*20), + ('u',bcc_ip_offset_union) + ] + class bcc_symbol_option(ct.Structure): _fields_ = [ ('use_debug_file', ct.c_int), @@ -161,6 +174,18 @@ class bcc_symbol_option(ct.Structure): lib.bcc_free_symcache.restype = ct.c_void_p lib.bcc_free_symcache.argtypes = [ct.c_void_p, ct.c_int] +lib.bcc_buildsymcache_new.restype = ct.c_void_p +lib.bcc_buildsymcache_new.argtypes = None + +lib.bcc_free_buildsymcache.restype = None +lib.bcc_free_buildsymcache.argtypes = [ct.c_void_p] + +lib.bcc_buildsymcache_add_module.restype = ct.c_int +lib.bcc_buildsymcache_add_module.argtypes = [ct.c_void_p, ct.c_char_p] + +lib.bcc_buildsymcache_resolve.restype = ct.c_int +lib.bcc_buildsymcache_resolve.argtypes = [ct.c_void_p, ct.POINTER(bcc_stacktrace_build_id), ct.POINTER(bcc_symbol)] + lib.bcc_symbol_free_demangle_name.restype 
= ct.c_void_p lib.bcc_symbol_free_demangle_name.argtypes = [ct.POINTER(bcc_symbol)] diff --git a/src/python/bcc/table.py b/src/python/bcc/table.py index 301afaca5..6f5983532 100644 --- a/src/python/bcc/table.py +++ b/src/python/bcc/table.py @@ -736,15 +736,20 @@ def __len__(self): class StackTrace(TableBase): MAX_DEPTH = 127 + BPF_F_STACK_BUILD_ID = (1<<5) + BPF_STACK_BUILD_ID_EMPTY = 0 #can't get stacktrace + BPF_STACK_BUILD_ID_VALID = 1 #valid build-id,ip + BPF_STACK_BUILD_ID_IP = 2 #fallback to ip def __init__(self, *args, **kwargs): super(StackTrace, self).__init__(*args, **kwargs) class StackWalker(object): - def __init__(self, stack, resolve=None): + def __init__(self, stack, flags, resolve=None): self.stack = stack self.n = -1 self.resolve = resolve + self.flags = flags def __iter__(self): return self @@ -757,14 +762,21 @@ def next(self): if self.n == StackTrace.MAX_DEPTH: raise StopIteration() - addr = self.stack.ip[self.n] + if self.flags & StackTrace.BPF_F_STACK_BUILD_ID: + addr = self.stack.trace[self.n] + if addr.status == StackTrace.BPF_STACK_BUILD_ID_IP or \ + addr.status == StackTrace.BPF_STACK_BUILD_ID_EMPTY: + raise StopIteration() + else: + addr = self.stack.ip[self.n] + if addr == 0 : raise StopIteration() return self.resolve(addr) if self.resolve else addr def walk(self, stack_id, resolve=None): - return StackTrace.StackWalker(self[self.Key(stack_id)], resolve) + return StackTrace.StackWalker(self[self.Key(stack_id)], self.flags, resolve) def __len__(self): i = 0 diff --git a/tests/cc/test_bpf_table.cc b/tests/cc/test_bpf_table.cc index 40ee0af8a..a45c7bc6b 100644 --- a/tests/cc/test_bpf_table.cc +++ b/tests/cc/test_bpf_table.cc @@ -222,3 +222,59 @@ TEST_CASE("test bpf stack table", "[bpf_stack_table]") { REQUIRE(addrs.size() == 0); #endif } + +TEST_CASE("test bpf stack_id table", "[bpf_stack_table]") { +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 17, 0) + const std::string BPF_PROGRAM = R"( + BPF_HASH(id, int, int, 1); + 
BPF_STACK_TRACE_BUILDID(stack_traces, 8); + + int on_sys_getuid(void *ctx) { + int stack_id = stack_traces.get_stackid(ctx, BPF_F_USER_STACK); + int zero = 0, *val; + val = id.lookup_or_init(&zero, &stack_id); + (*val) = stack_id; + + return 0; + } + )"; + + ebpf::BPF bpf; + ebpf::StatusTuple res(0); + res = bpf.init(BPF_PROGRAM); + REQUIRE(res.code() == 0); + std::string getuid_fnname = bpf.get_syscall_fnname("getuid"); + res = bpf.attach_kprobe(getuid_fnname, "on_sys_getuid"); + REQUIRE(res.code() == 0); + REQUIRE(getuid() >= 0); + res = bpf.detach_kprobe(getuid_fnname); + REQUIRE(res.code() == 0); + + auto id = bpf.get_hash_table("id"); + auto stack_traces = bpf.get_stackbuildid_table("stack_traces"); + + /* libc locations on different distributions are added below*/ + bpf.add_module("/lib/x86_64-linux-gnu/libc.so.6"); //Location of libc in ubuntu + bpf.add_module("/lib64/libc.so.6"); //Location of libc fedora machine + + int stack_id = id[0]; + REQUIRE(stack_id >= 0); + + auto addrs = stack_traces.get_stack_addr(stack_id); + auto symbols = stack_traces.get_stack_symbol(stack_id); + REQUIRE(addrs.size() > 0); + REQUIRE(addrs.size() == symbols.size()); + bool found = false; + for (const auto &symbol : symbols) { + if (symbol.find("getuid") != std::string::npos) { + found = true; + break; + } + } + REQUIRE(found); + + stack_traces.clear_table_non_atomic(); + addrs = stack_traces.get_stack_addr(stack_id); + REQUIRE(addrs.size()==0); +#endif +} diff --git a/tests/python/test_stackid.py b/tests/python/test_stackid.py index 25872934b..4bf0808de 100755 --- a/tests/python/test_stackid.py +++ b/tests/python/test_stackid.py @@ -6,6 +6,7 @@ import distutils.version import os import unittest +import subprocess def kernel_version_ge(major, minor): # True if running kernel is >= X.Y @@ -49,6 +50,44 @@ def test_simple(self): stack = stack_traces[stackid].ip self.assertEqual(b.ksym(stack[0]), b"htab_map_lookup_elem") +def Get_libc_path(): + cmd = 'cat /proc/self/maps | grep 
libc | awk \'{print $6}\' | uniq' + output = subprocess.check_output(cmd, shell=True) + if not isinstance(output, str): + output = output.decode() + return output.split('\n')[0] + +@unittest.skipUnless(kernel_version_ge(4,17), "requires kernel >= 4.17") +class TestStackBuildid(unittest.TestCase): + def test_simple(self): + b = bcc.BPF(text=""" +#include +struct bpf_map; +BPF_STACK_TRACE_BUILDID(stack_traces, 10240); +BPF_HASH(stack_entries, int, int); +BPF_HASH(stub); +int kprobe__sys_getuid(struct pt_regs *ctx, struct bpf_map *map, u64 *k) { + int id = stack_traces.get_stackid(ctx, BPF_F_USER_STACK); + if (id < 0) + return 0; + int key = 1; + stack_entries.update(&key, &id); + return 0; +} +""") + os.getuid() + stub = b["stub"] + stack_traces = b["stack_traces"] + stack_entries = b["stack_entries"] + b.add_module(Get_libc_path()) + try: x = stub[stub.Key(1)] + except: pass + k = stack_entries.Key(1) + self.assertIn(k, stack_entries) + stackid = stack_entries[k] + self.assertIsNotNone(stackid) + stack = stack_traces[stackid] + self.assertTrue(b.sym(stack.trace[0], -1).find(b"getuid")!=-1) if __name__ == "__main__": unittest.main() From 98fd5030e71ab4a265ef92f5502d767c8f0809ba Mon Sep 17 00:00:00 2001 From: Joel Date: Tue, 15 Jan 2019 14:13:59 -0500 Subject: [PATCH 003/135] Bcc build fixes for Android (#2142) * Mark unused parameters as unused In Android, we are building libbpf with -Wunused-parameter, mark the parameters in bpf_detach_tracepoint to prevent errors. Change-Id: I2d0011746af80898e55d456b973a95330ce6be71 Signed-off-by: Joel Fernandes * Avoid void pointer arithmetic In Android, we build libbpf with -Wpointer-arith, this causes warnings as below. Avoid void pointer arithmetic to prevent the warning. 
external/bcc/src/cc/perf_reader.c:189:26: error: arithmetic on a pointer to void is a GNU extension [-Werror,-Wpointer-arith] memcpy(reader->buf + len, base, e->size - len); ~~~~~~~~~~~ ^ Change-Id: If06535459473c78799b38119786a91e74a208895 Signed-off-by: Joel Fernandes * Cast correctly for unsigned long format specifier In Android, -Wformat gets passed to the compiler causing a warning. Fix it by casting. external/bcc/src/cc/libbpf.c:972:58: error: format specifies type 'unsigned long' but the argument has type 'uint64_t' (aka 'unsigned lo ng long') [-Werror,-Wformat] Change-Id: I5e70eeff983f20a0b921e81aee7ddbee6d7de2b3 Signed-off-by: Joel Fernandes --- src/cc/libbpf.c | 6 +++++- src/cc/perf_reader.c | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c index 8d7079816..5930b617d 100644 --- a/src/cc/libbpf.c +++ b/src/cc/libbpf.c @@ -84,6 +84,8 @@ #define min(x, y) ((x) < (y) ? (x) : (y)) +#define UNUSED(expr) do { (void)(expr); } while (0) + struct bpf_helper { char *name; char *required_version; @@ -979,7 +981,7 @@ int bpf_attach_uprobe(int progfd, enum bpf_probe_attach_type attach_type, goto error; } res = snprintf(buf, sizeof(buf), "%c:%ss/%s %s:0x%lx", attach_type==BPF_PROBE_ENTRY ? 'p' : 'r', - event_type, event_alias, binary_path, offset); + event_type, event_alias, binary_path, (unsigned long)offset); if (res < 0 || res >= sizeof(buf)) { fprintf(stderr, "Event alias (%s) too long for buffer\n", event_alias); goto error; @@ -1108,6 +1110,8 @@ int bpf_attach_tracepoint(int progfd, const char *tp_category, } int bpf_detach_tracepoint(const char *tp_category, const char *tp_name) { + UNUSED(tp_category); + UNUSED(tp_name); // Right now, there is nothing to do, but it's a good idea to encourage // callers to detach anything they attach. 
return 0; diff --git a/src/cc/perf_reader.c b/src/cc/perf_reader.c index 91817bcad..3cab01532 100644 --- a/src/cc/perf_reader.c +++ b/src/cc/perf_reader.c @@ -186,7 +186,7 @@ void perf_reader_event_read(struct perf_reader *reader) { reader->buf = realloc(reader->buf, e->size); size_t len = sentinel - begin; memcpy(reader->buf, begin, len); - memcpy(reader->buf + len, base, e->size - len); + memcpy((void *)((unsigned long)reader->buf + len), base, e->size - len); ptr = reader->buf; } From cc01a9cd6ab1a2aea2d21a71502882af1add4346 Mon Sep 17 00:00:00 2001 From: Alexey Ivanov Date: Wed, 16 Jan 2019 09:50:46 -0800 Subject: [PATCH 004/135] tools/examples: hardcode #!/usr/bin/python shebang --- examples/hello_world.py | 2 +- examples/networking/distributed_bridge/main.py | 2 +- examples/networking/distributed_bridge/tunnel.py | 2 +- examples/networking/distributed_bridge/tunnel_mesh.py | 2 +- examples/networking/dns_matching/dns_matching.py | 2 +- examples/networking/http_filter/http-parse-complete.py | 2 +- examples/networking/http_filter/http-parse-simple.py | 2 +- examples/networking/neighbor_sharing/tc_neighbor_sharing.py | 2 +- examples/networking/simple_tc.py | 2 +- examples/networking/tc_perf_event.py | 2 +- examples/networking/tunnel_monitor/main.py | 2 +- examples/networking/tunnel_monitor/monitor.py | 2 +- examples/networking/vlan_filter/data-plane-tracing.py | 2 +- examples/networking/vlan_learning/vlan_learning.py | 2 +- examples/networking/xdp/xdp_drop_count.py | 2 +- examples/networking/xdp/xdp_macswap_count.py | 2 +- examples/networking/xdp/xdp_redirect_cpu.py | 2 +- examples/networking/xdp/xdp_redirect_map.py | 2 +- examples/tracing/bitehist.py | 2 +- examples/tracing/disksnoop.py | 2 +- examples/tracing/hello_fields.py | 2 +- examples/tracing/hello_perf_output.py | 2 +- examples/tracing/kvm_hypercall.py | 2 +- examples/tracing/mallocstacks.py | 2 +- examples/tracing/mysqld_query.py | 2 +- examples/tracing/nodejs_http_server.py | 2 +- 
examples/tracing/stacksnoop.py | 2 +- examples/tracing/strlen_count.py | 2 +- examples/tracing/strlen_hist.py | 2 +- examples/tracing/strlen_snoop.py | 2 +- examples/tracing/sync_timing.py | 2 +- examples/tracing/task_switch.py | 2 +- examples/tracing/tcpv4connect.py | 2 +- examples/tracing/trace_fields.py | 2 +- examples/tracing/trace_perf_output.py | 2 +- examples/tracing/urandomread-explicit.py | 2 +- examples/tracing/urandomread.py | 2 +- examples/tracing/vfsreadlat.py | 2 +- examples/usdt_sample/scripts/lat_avg.py | 2 +- examples/usdt_sample/scripts/lat_dist.py | 2 +- examples/usdt_sample/scripts/latency.py | 2 +- tools/argdist.py | 2 +- tools/bashreadline.py | 2 +- tools/biolatency.py | 2 +- tools/biosnoop.py | 2 +- tools/biotop.py | 2 +- tools/bitesize.py | 2 +- tools/bpflist.py | 2 +- tools/btrfsdist.py | 2 +- tools/btrfsslower.py | 2 +- tools/cachestat.py | 2 +- tools/cachetop.py | 2 +- tools/capable.py | 2 +- tools/cpudist.py | 2 +- tools/cpuunclaimed.py | 2 +- tools/criticalstat.py | 2 +- tools/dbslower.py | 2 +- tools/dbstat.py | 2 +- tools/dcsnoop.py | 2 +- tools/dcstat.py | 2 +- tools/deadlock_detector.py | 2 +- tools/execsnoop.py | 2 +- tools/ext4dist.py | 2 +- tools/ext4slower.py | 2 +- tools/filelife.py | 2 +- tools/fileslower.py | 2 +- tools/filetop.py | 2 +- tools/funccount.py | 2 +- tools/funclatency.py | 2 +- tools/funcslower.py | 2 +- tools/gethostlatency.py | 2 +- tools/hardirqs.py | 2 +- tools/inject.py | 2 +- tools/killsnoop.py | 2 +- tools/lib/ucalls.py | 2 +- tools/lib/uflow.py | 2 +- tools/lib/ugc.py | 2 +- tools/lib/uobjnew.py | 2 +- tools/lib/ustat.py | 2 +- tools/lib/uthreads.py | 2 +- tools/llcstat.py | 2 +- tools/mdflush.py | 2 +- tools/memleak.py | 2 +- tools/mountsnoop.py | 2 +- tools/mysqld_qslower.py | 2 +- tools/nfsdist.py | 2 +- tools/nfsslower.py | 2 +- tools/offcputime.py | 2 +- tools/offwaketime.py | 2 +- tools/old/bashreadline.py | 2 +- tools/old/biosnoop.py | 2 +- tools/old/filelife.py | 2 +- tools/old/gethostlatency.py | 
2 +- tools/old/killsnoop.py | 2 +- tools/old/memleak.py | 2 +- tools/old/offcputime.py | 2 +- tools/old/offwaketime.py | 2 +- tools/old/oomkill.py | 2 +- tools/old/opensnoop.py | 2 +- tools/old/profile.py | 2 +- tools/old/softirqs.py | 2 +- tools/old/stackcount.py | 2 +- tools/old/stacksnoop.py | 2 +- tools/old/statsnoop.py | 2 +- tools/old/syncsnoop.py | 2 +- tools/old/tcpaccept.py | 2 +- tools/old/tcpconnect.py | 2 +- tools/old/wakeuptime.py | 2 +- tools/oomkill.py | 2 +- tools/opensnoop.py | 2 +- tools/pidpersec.py | 2 +- tools/profile.py | 2 +- tools/runqlat.py | 2 +- tools/runqlen.py | 2 +- tools/runqslower.py | 2 +- tools/shmsnoop.py | 2 +- tools/slabratetop.py | 2 +- tools/sofdsnoop.py | 2 +- tools/softirqs.py | 2 +- tools/solisten.py | 2 +- tools/sslsniff.py | 2 +- tools/stackcount.py | 2 +- tools/statsnoop.py | 2 +- tools/syncsnoop.py | 2 +- tools/syscount.py | 2 +- tools/tcpaccept.py | 2 +- tools/tcpconnect.py | 2 +- tools/tcpconnlat.py | 2 +- tools/tcpdrop.py | 2 +- tools/tcplife.py | 2 +- tools/tcpretrans.py | 2 +- tools/tcpstates.py | 2 +- tools/tcpsubnet.py | 2 +- tools/tcptop.py | 2 +- tools/tcptracer.py | 2 +- tools/tplist.py | 2 +- tools/trace.py | 2 +- tools/ttysnoop.py | 2 +- tools/vfscount.py | 2 +- tools/vfsstat.py | 2 +- tools/wakeuptime.py | 2 +- tools/xfsdist.py | 2 +- tools/xfsslower.py | 2 +- tools/zfsdist.py | 2 +- tools/zfsslower.py | 2 +- 145 files changed, 145 insertions(+), 145 deletions(-) diff --git a/examples/hello_world.py b/examples/hello_world.py index 49c55353b..bb52f3e76 100755 --- a/examples/hello_world.py +++ b/examples/hello_world.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # Copyright (c) PLUMgrid, Inc. 
# Licensed under the Apache License, Version 2.0 (the "License") diff --git a/examples/networking/distributed_bridge/main.py b/examples/networking/distributed_bridge/main.py index 056443ee5..c72360eaa 100755 --- a/examples/networking/distributed_bridge/main.py +++ b/examples/networking/distributed_bridge/main.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # Copyright (c) PLUMgrid, Inc. # Licensed under the Apache License, Version 2.0 (the "License") diff --git a/examples/networking/distributed_bridge/tunnel.py b/examples/networking/distributed_bridge/tunnel.py index ef942924c..11b59f068 100755 --- a/examples/networking/distributed_bridge/tunnel.py +++ b/examples/networking/distributed_bridge/tunnel.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # Copyright (c) PLUMgrid, Inc. # Licensed under the Apache License, Version 2.0 (the "License") diff --git a/examples/networking/distributed_bridge/tunnel_mesh.py b/examples/networking/distributed_bridge/tunnel_mesh.py index f111ac9a0..54ecfcd83 100644 --- a/examples/networking/distributed_bridge/tunnel_mesh.py +++ b/examples/networking/distributed_bridge/tunnel_mesh.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # Copyright (c) PLUMgrid, Inc. 
# Licensed under the Apache License, Version 2.0 (the "License") diff --git a/examples/networking/dns_matching/dns_matching.py b/examples/networking/dns_matching/dns_matching.py index c8625cd32..943dca597 100755 --- a/examples/networking/dns_matching/dns_matching.py +++ b/examples/networking/dns_matching/dns_matching.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python from __future__ import print_function from bcc import BPF diff --git a/examples/networking/http_filter/http-parse-complete.py b/examples/networking/http_filter/http-parse-complete.py index 1218cb2d4..f1e5e0a26 100644 --- a/examples/networking/http_filter/http-parse-complete.py +++ b/examples/networking/http_filter/http-parse-complete.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # #Bertrone Matteo - Polytechnic of Turin #November 2015 diff --git a/examples/networking/http_filter/http-parse-simple.py b/examples/networking/http_filter/http-parse-simple.py index 1fad0d84d..b702393d1 100644 --- a/examples/networking/http_filter/http-parse-simple.py +++ b/examples/networking/http_filter/http-parse-simple.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # #Bertrone Matteo - Polytechnic of Turin #November 2015 diff --git a/examples/networking/neighbor_sharing/tc_neighbor_sharing.py b/examples/networking/neighbor_sharing/tc_neighbor_sharing.py index 43799c980..8d13c43fb 100755 --- a/examples/networking/neighbor_sharing/tc_neighbor_sharing.py +++ b/examples/networking/neighbor_sharing/tc_neighbor_sharing.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # Copyright (c) PLUMgrid, Inc. # Licensed under the Apache License, Version 2.0 (the "License") diff --git a/examples/networking/simple_tc.py b/examples/networking/simple_tc.py index 4dd8aa5ca..ec0a3e74d 100755 --- a/examples/networking/simple_tc.py +++ b/examples/networking/simple_tc.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # Copyright (c) PLUMgrid, Inc. 
# Licensed under the Apache License, Version 2.0 (the "License") diff --git a/examples/networking/tc_perf_event.py b/examples/networking/tc_perf_event.py index 40e741115..4a1b754e9 100755 --- a/examples/networking/tc_perf_event.py +++ b/examples/networking/tc_perf_event.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # tc_perf_event.py Output skb and meta data through perf event # diff --git a/examples/networking/tunnel_monitor/main.py b/examples/networking/tunnel_monitor/main.py index d3359ef49..7d3acb8e2 100755 --- a/examples/networking/tunnel_monitor/main.py +++ b/examples/networking/tunnel_monitor/main.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # Copyright (c) PLUMgrid, Inc. # Licensed under the Apache License, Version 2.0 (the "License") diff --git a/examples/networking/tunnel_monitor/monitor.py b/examples/networking/tunnel_monitor/monitor.py index bac3420ec..fb3613ee1 100644 --- a/examples/networking/tunnel_monitor/monitor.py +++ b/examples/networking/tunnel_monitor/monitor.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # Copyright (c) PLUMgrid, Inc. # Licensed under the Apache License, Version 2.0 (the "License") diff --git a/examples/networking/vlan_filter/data-plane-tracing.py b/examples/networking/vlan_filter/data-plane-tracing.py index 975552f8c..efaa7f106 100755 --- a/examples/networking/vlan_filter/data-plane-tracing.py +++ b/examples/networking/vlan_filter/data-plane-tracing.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python from __future__ import print_function from bcc import BPF diff --git a/examples/networking/vlan_learning/vlan_learning.py b/examples/networking/vlan_learning/vlan_learning.py index a9023207f..7229176a1 100755 --- a/examples/networking/vlan_learning/vlan_learning.py +++ b/examples/networking/vlan_learning/vlan_learning.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # Copyright (c) PLUMgrid, Inc. 
# Licensed under the Apache License, Version 2.0 (the "License") diff --git a/examples/networking/xdp/xdp_drop_count.py b/examples/networking/xdp/xdp_drop_count.py index f04cb155d..9b228d43f 100755 --- a/examples/networking/xdp/xdp_drop_count.py +++ b/examples/networking/xdp/xdp_drop_count.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # xdp_drop_count.py Drop incoming packets on XDP layer and count for which # protocol type diff --git a/examples/networking/xdp/xdp_macswap_count.py b/examples/networking/xdp/xdp_macswap_count.py index 145d00497..bb4107e00 100755 --- a/examples/networking/xdp/xdp_macswap_count.py +++ b/examples/networking/xdp/xdp_macswap_count.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # xdp_macswap_count.py Swap Source and Destination MAC addresses on # incoming packets and transmit packets back on diff --git a/examples/networking/xdp/xdp_redirect_cpu.py b/examples/networking/xdp/xdp_redirect_cpu.py index f7aa2bcaa..15b0d09b8 100755 --- a/examples/networking/xdp/xdp_redirect_cpu.py +++ b/examples/networking/xdp/xdp_redirect_cpu.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # xdp_redirect_cpu.py Redirect the incoming packet to the specific CPU # diff --git a/examples/networking/xdp/xdp_redirect_map.py b/examples/networking/xdp/xdp_redirect_map.py index e3b90a388..4a6227236 100755 --- a/examples/networking/xdp/xdp_redirect_map.py +++ b/examples/networking/xdp/xdp_redirect_map.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # xdp_redirect_map.py Redirect the incoming packet to another interface # with the helper: bpf_redirect_map() diff --git a/examples/tracing/bitehist.py b/examples/tracing/bitehist.py index 410424b94..c8c7f7a65 100755 --- a/examples/tracing/bitehist.py +++ b/examples/tracing/bitehist.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # bitehist.py Block I/O size histogram. # For Linux, uses BCC, eBPF. Embedded C. 
diff --git a/examples/tracing/disksnoop.py b/examples/tracing/disksnoop.py index c30ac0a7c..ed3dd819d 100755 --- a/examples/tracing/disksnoop.py +++ b/examples/tracing/disksnoop.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # disksnoop.py Trace block device I/O: basic version of iosnoop. # For Linux, uses BCC, eBPF. Embedded C. diff --git a/examples/tracing/hello_fields.py b/examples/tracing/hello_fields.py index bad1a2298..be53e6222 100755 --- a/examples/tracing/hello_fields.py +++ b/examples/tracing/hello_fields.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # This is a Hello World example that formats output as fields. diff --git a/examples/tracing/hello_perf_output.py b/examples/tracing/hello_perf_output.py index eb1e9979b..7decd5808 100755 --- a/examples/tracing/hello_perf_output.py +++ b/examples/tracing/hello_perf_output.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # This is a Hello World example that uses BPF_PERF_OUTPUT. diff --git a/examples/tracing/kvm_hypercall.py b/examples/tracing/kvm_hypercall.py index 322bb8e50..5f1d3d7a2 100755 --- a/examples/tracing/kvm_hypercall.py +++ b/examples/tracing/kvm_hypercall.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # kvm_hypercall.py # diff --git a/examples/tracing/mallocstacks.py b/examples/tracing/mallocstacks.py index 4820447c3..2f3eb2594 100644 --- a/examples/tracing/mallocstacks.py +++ b/examples/tracing/mallocstacks.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # mallocstacks Trace malloc() calls in a process and print the full # stack trace for all callsites. diff --git a/examples/tracing/mysqld_query.py b/examples/tracing/mysqld_query.py index cf877d1c1..15ff297af 100755 --- a/examples/tracing/mysqld_query.py +++ b/examples/tracing/mysqld_query.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # mysqld_query Trace MySQL server queries. Example of USDT tracing. # For Linux, uses BCC, BPF. Embedded C. 
diff --git a/examples/tracing/nodejs_http_server.py b/examples/tracing/nodejs_http_server.py index 367e9d71f..1017de563 100755 --- a/examples/tracing/nodejs_http_server.py +++ b/examples/tracing/nodejs_http_server.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # nodejs_http_server Basic example of node.js USDT tracing. # For Linux, uses BCC, BPF. Embedded C. diff --git a/examples/tracing/stacksnoop.py b/examples/tracing/stacksnoop.py index d16b59d50..bced93f13 100755 --- a/examples/tracing/stacksnoop.py +++ b/examples/tracing/stacksnoop.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # stacksnoop Trace a kernel function and print all kernel stack traces. # For Linux, uses BCC, eBPF, and currently x86_64 only. Inline C. diff --git a/examples/tracing/strlen_count.py b/examples/tracing/strlen_count.py index 103464fe0..49d70809e 100755 --- a/examples/tracing/strlen_count.py +++ b/examples/tracing/strlen_count.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # strlen_count Trace strlen() and print a frequency count of strings. # For Linux, uses BCC, eBPF. Embedded C. diff --git a/examples/tracing/strlen_hist.py b/examples/tracing/strlen_hist.py index 4652c4a45..dda1cb239 100755 --- a/examples/tracing/strlen_hist.py +++ b/examples/tracing/strlen_hist.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # strlen_hist.py Histogram of system-wide strlen return values diff --git a/examples/tracing/strlen_snoop.py b/examples/tracing/strlen_snoop.py index 44be1acad..c3c7199eb 100755 --- a/examples/tracing/strlen_snoop.py +++ b/examples/tracing/strlen_snoop.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # strlen_snoop Trace strlen() library function for a given PID. # For Linux, uses BCC, eBPF. Embedded C. 
diff --git a/examples/tracing/sync_timing.py b/examples/tracing/sync_timing.py index a00bf5a5e..675ad14c8 100755 --- a/examples/tracing/sync_timing.py +++ b/examples/tracing/sync_timing.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # sync_timing.py Trace time between syncs. # For Linux, uses BCC, eBPF. Embedded C. diff --git a/examples/tracing/task_switch.py b/examples/tracing/task_switch.py index 46d43babf..161edfbc4 100755 --- a/examples/tracing/task_switch.py +++ b/examples/tracing/task_switch.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # Copyright (c) PLUMgrid, Inc. # Licensed under the Apache License, Version 2.0 (the "License") diff --git a/examples/tracing/tcpv4connect.py b/examples/tracing/tcpv4connect.py index 5b03717c6..8a89469de 100755 --- a/examples/tracing/tcpv4connect.py +++ b/examples/tracing/tcpv4connect.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # tcpv4connect Trace TCP IPv4 connect()s. # For Linux, uses BCC, eBPF. Embedded C. diff --git a/examples/tracing/trace_fields.py b/examples/tracing/trace_fields.py index 63a7b5310..8b57f9a21 100755 --- a/examples/tracing/trace_fields.py +++ b/examples/tracing/trace_fields.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # Copyright (c) PLUMgrid, Inc. # Licensed under the Apache License, Version 2.0 (the "License") diff --git a/examples/tracing/trace_perf_output.py b/examples/tracing/trace_perf_output.py index 865a45935..26333c896 100755 --- a/examples/tracing/trace_perf_output.py +++ b/examples/tracing/trace_perf_output.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # Copyright (c) PLUMgrid, Inc. 
# Licensed under the Apache License, Version 2.0 (the "License") diff --git a/examples/tracing/urandomread-explicit.py b/examples/tracing/urandomread-explicit.py index 7be545afe..448ffdfc4 100755 --- a/examples/tracing/urandomread-explicit.py +++ b/examples/tracing/urandomread-explicit.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # urandomread-explicit Example of instrumenting a kernel tracepoint. # For Linux, uses BCC, BPF. Embedded C. diff --git a/examples/tracing/urandomread.py b/examples/tracing/urandomread.py index 80ea9debb..319db2ca5 100755 --- a/examples/tracing/urandomread.py +++ b/examples/tracing/urandomread.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # urandomread Example of instrumenting a kernel tracepoint. # For Linux, uses BCC, BPF. Embedded C. diff --git a/examples/tracing/vfsreadlat.py b/examples/tracing/vfsreadlat.py index f4daae57c..b2c4156eb 100755 --- a/examples/tracing/vfsreadlat.py +++ b/examples/tracing/vfsreadlat.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # vfsreadlat.py VFS read latency distribution. # For Linux, uses BCC, eBPF. See .c file. 
diff --git a/examples/usdt_sample/scripts/lat_avg.py b/examples/usdt_sample/scripts/lat_avg.py index 36c4dbb17..48e7db56a 100755 --- a/examples/usdt_sample/scripts/lat_avg.py +++ b/examples/usdt_sample/scripts/lat_avg.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python import argparse from time import sleep, strftime from sys import argv diff --git a/examples/usdt_sample/scripts/lat_dist.py b/examples/usdt_sample/scripts/lat_dist.py index 647f29566..782c960bf 100755 --- a/examples/usdt_sample/scripts/lat_dist.py +++ b/examples/usdt_sample/scripts/lat_dist.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python import argparse from time import sleep, strftime from sys import argv diff --git a/examples/usdt_sample/scripts/latency.py b/examples/usdt_sample/scripts/latency.py index d46f2efb4..8a7ec08c8 100755 --- a/examples/usdt_sample/scripts/latency.py +++ b/examples/usdt_sample/scripts/latency.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python import argparse from time import sleep from sys import argv diff --git a/tools/argdist.py b/tools/argdist.py index bbf627388..695b5b3c8 100755 --- a/tools/argdist.py +++ b/tools/argdist.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # argdist Trace a function and display a distribution of its # parameter values as a histogram or frequency count. diff --git a/tools/bashreadline.py b/tools/bashreadline.py index 3d74c93cf..da9c1b7c1 100755 --- a/tools/bashreadline.py +++ b/tools/bashreadline.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # bashreadline Print entered bash commands from all running shells. # For Linux, uses BCC, eBPF. Embedded C. diff --git a/tools/biolatency.py b/tools/biolatency.py index dcb6d267c..a265c3182 100755 --- a/tools/biolatency.py +++ b/tools/biolatency.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # biolatency Summarize block device I/O latency as a histogram. 
diff --git a/tools/biosnoop.py b/tools/biosnoop.py index 51b3a7fe2..259a81b32 100755 --- a/tools/biosnoop.py +++ b/tools/biosnoop.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # biosnoop Trace block device I/O and print details including issuing PID. diff --git a/tools/biotop.py b/tools/biotop.py index 3fe454cbc..62c295d16 100755 --- a/tools/biotop.py +++ b/tools/biotop.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # biotop block device (disk) I/O by process. diff --git a/tools/bitesize.py b/tools/bitesize.py index e57185dd9..f70f09148 100755 --- a/tools/bitesize.py +++ b/tools/bitesize.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # bitehist.py Block I/O size histogram. # For Linux, uses BCC, eBPF. See .c file. diff --git a/tools/bpflist.py b/tools/bpflist.py index 85220b625..f73e945ac 100755 --- a/tools/bpflist.py +++ b/tools/bpflist.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # bpflist Display processes currently using BPF programs and maps, # pinned BPF programs and maps, and enabled probes. diff --git a/tools/btrfsdist.py b/tools/btrfsdist.py index a0aeb24f3..4659ab46e 100755 --- a/tools/btrfsdist.py +++ b/tools/btrfsdist.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # btrfsdist Summarize btrfs operation latency. diff --git a/tools/btrfsslower.py b/tools/btrfsslower.py index cff61b8f9..0a59820f9 100755 --- a/tools/btrfsslower.py +++ b/tools/btrfsslower.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # btrfsslower Trace slow btrfs operations. diff --git a/tools/cachestat.py b/tools/cachestat.py index 90a55b051..b00c80434 100755 --- a/tools/cachestat.py +++ b/tools/cachestat.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # cachestat Count cache kernel function calls. 
# For Linux, uses BCC, eBPF. See .c file. diff --git a/tools/cachetop.py b/tools/cachetop.py index 51ddace54..59de3912d 100755 --- a/tools/cachetop.py +++ b/tools/cachetop.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # cachetop Count cache kernel function calls per processes diff --git a/tools/capable.py b/tools/capable.py index 368f4b057..65ffa7729 100755 --- a/tools/capable.py +++ b/tools/capable.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # capable Trace security capabilitiy checks (cap_capable()). diff --git a/tools/cpudist.py b/tools/cpudist.py index 9e6134183..4d7c9eb4e 100755 --- a/tools/cpudist.py +++ b/tools/cpudist.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # cpudist Summarize on- and off-CPU time per task as a histogram. diff --git a/tools/cpuunclaimed.py b/tools/cpuunclaimed.py index c899398a8..b862bad27 100755 --- a/tools/cpuunclaimed.py +++ b/tools/cpuunclaimed.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # cpuunclaimed Sample CPU run queues and calculate unclaimed idle CPU. diff --git a/tools/criticalstat.py b/tools/criticalstat.py index 8126b49af..da2859412 100755 --- a/tools/criticalstat.py +++ b/tools/criticalstat.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # criticalstat Trace long critical sections (IRQs or preemption disabled) diff --git a/tools/dbslower.py b/tools/dbslower.py index a42df87ae..24e63948c 100755 --- a/tools/dbslower.py +++ b/tools/dbslower.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # dbslower Trace MySQL and PostgreSQL queries slower than a threshold. 
# diff --git a/tools/dbstat.py b/tools/dbstat.py index 1d9843617..a89b09711 100755 --- a/tools/dbstat.py +++ b/tools/dbstat.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # dbstat Display a histogram of MySQL and PostgreSQL query latencies. # diff --git a/tools/dcsnoop.py b/tools/dcsnoop.py index 145219105..4c3757188 100755 --- a/tools/dcsnoop.py +++ b/tools/dcsnoop.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # dcsnoop Trace directory entry cache (dcache) lookups. diff --git a/tools/dcstat.py b/tools/dcstat.py index 2009a19df..5ecddd1a7 100755 --- a/tools/dcstat.py +++ b/tools/dcstat.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # dcstat Directory entry cache (dcache) stats. diff --git a/tools/deadlock_detector.py b/tools/deadlock_detector.py index cbc06912f..573f8307c 100755 --- a/tools/deadlock_detector.py +++ b/tools/deadlock_detector.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # deadlock_detector Detects potential deadlocks (lock order inversions) # on a running process. For Linux, uses BCC, eBPF. diff --git a/tools/execsnoop.py b/tools/execsnoop.py index e27e50ee7..0c2c0655a 100755 --- a/tools/execsnoop.py +++ b/tools/execsnoop.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # execsnoop Trace new processes via exec() syscalls. diff --git a/tools/ext4dist.py b/tools/ext4dist.py index b71cfda6a..bc797fb03 100755 --- a/tools/ext4dist.py +++ b/tools/ext4dist.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # ext4dist Summarize ext4 operation latency. 
diff --git a/tools/ext4slower.py b/tools/ext4slower.py index 344e68f0c..16b56ec4c 100755 --- a/tools/ext4slower.py +++ b/tools/ext4slower.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # ext4slower Trace slow ext4 operations. diff --git a/tools/filelife.py b/tools/filelife.py index 40952731f..f66f00bb2 100755 --- a/tools/filelife.py +++ b/tools/filelife.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # filelife Trace the lifespan of short-lived files. diff --git a/tools/fileslower.py b/tools/fileslower.py index 219a94a11..e2830e98f 100755 --- a/tools/fileslower.py +++ b/tools/fileslower.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # fileslower Trace slow synchronous file reads and writes. diff --git a/tools/filetop.py b/tools/filetop.py index 91e098e95..03c01f413 100755 --- a/tools/filetop.py +++ b/tools/filetop.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # filetop file reads and writes by process. diff --git a/tools/funccount.py b/tools/funccount.py index fcb96b853..69dd01c8c 100755 --- a/tools/funccount.py +++ b/tools/funccount.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # funccount Count functions, tracepoints, and USDT probes. diff --git a/tools/funclatency.py b/tools/funclatency.py index f23d8f06b..3f08a7e0d 100755 --- a/tools/funclatency.py +++ b/tools/funclatency.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # funclatency Time functions and print latency as a histogram. 
diff --git a/tools/funcslower.py b/tools/funcslower.py index 214358318..283c80182 100755 --- a/tools/funcslower.py +++ b/tools/funcslower.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # funcslower Trace slow kernel or user function calls. diff --git a/tools/gethostlatency.py b/tools/gethostlatency.py index 84c798826..8d07e23ab 100755 --- a/tools/gethostlatency.py +++ b/tools/gethostlatency.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # gethostlatency Show latency for getaddrinfo/gethostbyname[2] calls. # For Linux, uses BCC, eBPF. Embedded C. diff --git a/tools/hardirqs.py b/tools/hardirqs.py index 1f5983ad7..589a890dd 100755 --- a/tools/hardirqs.py +++ b/tools/hardirqs.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # hardirqs Summarize hard IRQ (interrupt) event time. diff --git a/tools/inject.py b/tools/inject.py index 2cf990918..fa2d38875 100755 --- a/tools/inject.py +++ b/tools/inject.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # This script generates a BPF program with structure inspired by trace.py. The # generated program operates on PID-indexed stacks. Generally speaking, diff --git a/tools/killsnoop.py b/tools/killsnoop.py index ce03d3737..16221a2a2 100755 --- a/tools/killsnoop.py +++ b/tools/killsnoop.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # killsnoop Trace signals issued by the kill() syscall. diff --git a/tools/lib/ucalls.py b/tools/lib/ucalls.py index 3b90b91ce..352e4d70b 100755 --- a/tools/lib/ucalls.py +++ b/tools/lib/ucalls.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # ucalls Summarize method calls in high-level languages and/or system calls. 
diff --git a/tools/lib/uflow.py b/tools/lib/uflow.py index 2bfe36bc8..63fab877d 100755 --- a/tools/lib/uflow.py +++ b/tools/lib/uflow.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # uflow Trace method execution flow in high-level languages. diff --git a/tools/lib/ugc.py b/tools/lib/ugc.py index 77f125ebc..8841d5faa 100755 --- a/tools/lib/ugc.py +++ b/tools/lib/ugc.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # ugc Summarize garbage collection events in high-level languages. diff --git a/tools/lib/uobjnew.py b/tools/lib/uobjnew.py index 8159f9ac7..85f576812 100755 --- a/tools/lib/uobjnew.py +++ b/tools/lib/uobjnew.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # uobjnew Summarize object allocations in high-level languages. diff --git a/tools/lib/ustat.py b/tools/lib/ustat.py index ef29d7676..1edc98565 100755 --- a/tools/lib/ustat.py +++ b/tools/lib/ustat.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # ustat Activity stats from high-level languages, including exceptions, diff --git a/tools/lib/uthreads.py b/tools/lib/uthreads.py index 00dd68b71..90d0a745b 100755 --- a/tools/lib/uthreads.py +++ b/tools/lib/uthreads.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # uthreads Trace thread creation/destruction events in high-level languages. diff --git a/tools/llcstat.py b/tools/llcstat.py index 01a63fd21..7b7bc47ad 100755 --- a/tools/llcstat.py +++ b/tools/llcstat.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # llcstat.py Summarize cache references and cache misses by PID. 
# Cache reference and cache miss are corresponding events defined in diff --git a/tools/mdflush.py b/tools/mdflush.py index 507614b80..485635d70 100755 --- a/tools/mdflush.py +++ b/tools/mdflush.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # mdflush Trace md flush events. diff --git a/tools/memleak.py b/tools/memleak.py index 4021bf87f..64c5972d2 100755 --- a/tools/memleak.py +++ b/tools/memleak.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # memleak Trace and display outstanding allocations to detect # memory leaks in user-mode processes and the kernel. diff --git a/tools/mountsnoop.py b/tools/mountsnoop.py index b6f96ca2a..eefb4ec72 100755 --- a/tools/mountsnoop.py +++ b/tools/mountsnoop.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # mountsnoop Trace mount() and umount syscalls. # For Linux, uses BCC, eBPF. Embedded C. diff --git a/tools/mysqld_qslower.py b/tools/mysqld_qslower.py index 5737d1865..ab23b5b1a 100755 --- a/tools/mysqld_qslower.py +++ b/tools/mysqld_qslower.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # mysqld_qslower MySQL server queries slower than a threshold. # For Linux, uses BCC, BPF. Embedded C. 
diff --git a/tools/nfsdist.py b/tools/nfsdist.py index e3317a4dd..ff78506f6 100755 --- a/tools/nfsdist.py +++ b/tools/nfsdist.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # nfsdist Summarize NFS operation latency diff --git a/tools/nfsslower.py b/tools/nfsslower.py index 8113eff32..32e91c7b6 100755 --- a/tools/nfsslower.py +++ b/tools/nfsslower.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # nfsslower Trace slow NFS operations diff --git a/tools/offcputime.py b/tools/offcputime.py index 644026048..d84ae529f 100755 --- a/tools/offcputime.py +++ b/tools/offcputime.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # offcputime Summarize off-CPU time by stack trace # For Linux, uses BCC, eBPF. diff --git a/tools/offwaketime.py b/tools/offwaketime.py index 3c4f0f3fe..38a9ff252 100755 --- a/tools/offwaketime.py +++ b/tools/offwaketime.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # offwaketime Summarize blocked time by kernel off-CPU stack + waker stack # For Linux, uses BCC, eBPF. diff --git a/tools/old/bashreadline.py b/tools/old/bashreadline.py index c4b8ec282..571b6626c 100755 --- a/tools/old/bashreadline.py +++ b/tools/old/bashreadline.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # bashreadline Print entered bash commands from all running shells. # For Linux, uses BCC, eBPF. Embedded C. diff --git a/tools/old/biosnoop.py b/tools/old/biosnoop.py index 96db56b26..37ee3f9cb 100755 --- a/tools/old/biosnoop.py +++ b/tools/old/biosnoop.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # biosnoop Trace block device I/O and print details including issuing PID. 
diff --git a/tools/old/filelife.py b/tools/old/filelife.py index af64b5341..075be087d 100755 --- a/tools/old/filelife.py +++ b/tools/old/filelife.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # filelife Trace the lifespan of short-lived files. diff --git a/tools/old/gethostlatency.py b/tools/old/gethostlatency.py index 4d87c83b2..7d32cb82d 100755 --- a/tools/old/gethostlatency.py +++ b/tools/old/gethostlatency.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # gethostlatency Show latency for getaddrinfo/gethostbyname[2] calls. # For Linux, uses BCC, eBPF. Embedded C. diff --git a/tools/old/killsnoop.py b/tools/old/killsnoop.py index e2d4cb5f8..ddf9d5af0 100755 --- a/tools/old/killsnoop.py +++ b/tools/old/killsnoop.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # killsnoop Trace signals issued by the kill() syscall. diff --git a/tools/old/memleak.py b/tools/old/memleak.py index b962c999c..eca685f83 100755 --- a/tools/old/memleak.py +++ b/tools/old/memleak.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # memleak Trace and display outstanding allocations to detect # memory leaks in user-mode processes and the kernel. diff --git a/tools/old/offcputime.py b/tools/old/offcputime.py index dc8906305..38d12a251 100755 --- a/tools/old/offcputime.py +++ b/tools/old/offcputime.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # offcputime Summarize off-CPU time by kernel stack trace # For Linux, uses BCC, eBPF. diff --git a/tools/old/offwaketime.py b/tools/old/offwaketime.py index b5fdd0fe3..3b5bb36c8 100755 --- a/tools/old/offwaketime.py +++ b/tools/old/offwaketime.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # offwaketime Summarize blocked time by kernel off-CPU stack + waker stack # For Linux, uses BCC, eBPF. 
diff --git a/tools/old/oomkill.py b/tools/old/oomkill.py index b99f85274..0973cfbaa 100755 --- a/tools/old/oomkill.py +++ b/tools/old/oomkill.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # oomkill Trace oom_kill_process(). For Linux, uses BCC, eBPF. # diff --git a/tools/old/opensnoop.py b/tools/old/opensnoop.py index 3736ec2fe..5df3b4178 100755 --- a/tools/old/opensnoop.py +++ b/tools/old/opensnoop.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # opensnoop Trace open() syscalls. diff --git a/tools/old/profile.py b/tools/old/profile.py index f0328d209..e308208ee 100755 --- a/tools/old/profile.py +++ b/tools/old/profile.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # profile Profile CPU usage by sampling stack traces at a timed interval. diff --git a/tools/old/softirqs.py b/tools/old/softirqs.py index 30495bc60..3b40b1acf 100755 --- a/tools/old/softirqs.py +++ b/tools/old/softirqs.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # softirqs Summarize soft IRQ (interrupt) event time. diff --git a/tools/old/stackcount.py b/tools/old/stackcount.py index 6eee27ff3..108c80007 100755 --- a/tools/old/stackcount.py +++ b/tools/old/stackcount.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # stackcount Count kernel function calls and their stack traces. # For Linux, uses BCC, eBPF. diff --git a/tools/old/stacksnoop.py b/tools/old/stacksnoop.py index 238ab8249..9fcc12b01 100755 --- a/tools/old/stacksnoop.py +++ b/tools/old/stacksnoop.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # stacksnoop Trace a kernel function and print all kernel stack traces. # For Linux, uses BCC, eBPF, and currently x86_64 only. Inline C. 
diff --git a/tools/old/statsnoop.py b/tools/old/statsnoop.py index 82128c2af..ad54ac78c 100755 --- a/tools/old/statsnoop.py +++ b/tools/old/statsnoop.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # statsnoop Trace stat() syscalls. diff --git a/tools/old/syncsnoop.py b/tools/old/syncsnoop.py index b14309d19..cae57ea8a 100755 --- a/tools/old/syncsnoop.py +++ b/tools/old/syncsnoop.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # syncsnoop Trace sync() syscall. diff --git a/tools/old/tcpaccept.py b/tools/old/tcpaccept.py index cc0c240f1..8125eaa35 100755 --- a/tools/old/tcpaccept.py +++ b/tools/old/tcpaccept.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # tcpaccept Trace TCP accept()s. diff --git a/tools/old/tcpconnect.py b/tools/old/tcpconnect.py index e0a59e967..579a85f91 100755 --- a/tools/old/tcpconnect.py +++ b/tools/old/tcpconnect.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # tcpconnect Trace TCP connect()s. diff --git a/tools/old/wakeuptime.py b/tools/old/wakeuptime.py index 512e4f41b..783c7ffbb 100644 --- a/tools/old/wakeuptime.py +++ b/tools/old/wakeuptime.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # wakeuptime Summarize sleep to wakeup time by waker kernel stack # For Linux, uses BCC, eBPF. diff --git a/tools/oomkill.py b/tools/oomkill.py index db3a53786..16defe090 100755 --- a/tools/oomkill.py +++ b/tools/oomkill.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # oomkill Trace oom_kill_process(). For Linux, uses BCC, eBPF. 
# diff --git a/tools/opensnoop.py b/tools/opensnoop.py index 4cb4dbb9d..55db352d8 100755 --- a/tools/opensnoop.py +++ b/tools/opensnoop.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # opensnoop Trace open() syscalls. diff --git a/tools/pidpersec.py b/tools/pidpersec.py index aff12196e..c4490043a 100755 --- a/tools/pidpersec.py +++ b/tools/pidpersec.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # pidpersec Count new processes (via fork). diff --git a/tools/profile.py b/tools/profile.py index 084ac633f..d1d3d26ac 100755 --- a/tools/profile.py +++ b/tools/profile.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # profile Profile CPU usage by sampling stack traces at a timed interval. diff --git a/tools/runqlat.py b/tools/runqlat.py index 9c56d22e5..9fd40642b 100755 --- a/tools/runqlat.py +++ b/tools/runqlat.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # runqlat Run queue (scheduler) latency as a histogram. diff --git a/tools/runqlen.py b/tools/runqlen.py index 4ff515dbb..b56a5916a 100755 --- a/tools/runqlen.py +++ b/tools/runqlen.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # runqlen Summarize scheduler run queue length as a histogram. diff --git a/tools/runqslower.py b/tools/runqslower.py index 0b3e1c1e3..bd1138e03 100755 --- a/tools/runqslower.py +++ b/tools/runqslower.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # runqslower Trace long process scheduling delays. 
diff --git a/tools/shmsnoop.py b/tools/shmsnoop.py index 20fcbd702..bb5053547 100755 --- a/tools/shmsnoop.py +++ b/tools/shmsnoop.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # shmsnoop Trace shm*() syscalls. diff --git a/tools/slabratetop.py b/tools/slabratetop.py index ab6c08cf8..101c58568 100755 --- a/tools/slabratetop.py +++ b/tools/slabratetop.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # slabratetop Summarize kmem_cache_alloc() calls. diff --git a/tools/sofdsnoop.py b/tools/sofdsnoop.py index f63310712..77f8f14cd 100755 --- a/tools/sofdsnoop.py +++ b/tools/sofdsnoop.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # sofdsnoop traces file descriptors passed via socket diff --git a/tools/softirqs.py b/tools/softirqs.py index 10ebc38ed..1e2daf5f9 100755 --- a/tools/softirqs.py +++ b/tools/softirqs.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # softirqs Summarize soft IRQ (interrupt) event time. diff --git a/tools/solisten.py b/tools/solisten.py index 81e82e041..bced0a2ab 100755 --- a/tools/solisten.py +++ b/tools/solisten.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # solisten Trace TCP listen events # For Linux, uses BCC, eBPF. Embedded C. diff --git a/tools/sslsniff.py b/tools/sslsniff.py index f7bc11767..265e87f40 100755 --- a/tools/sslsniff.py +++ b/tools/sslsniff.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # sslsniff Captures data on read/recv or write/send functions of OpenSSL, # GnuTLS and NSS diff --git a/tools/stackcount.py b/tools/stackcount.py index 5554014fc..9dfc06f11 100755 --- a/tools/stackcount.py +++ b/tools/stackcount.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # stackcount Count events and their stack traces. # For Linux, uses BCC, eBPF. 
diff --git a/tools/statsnoop.py b/tools/statsnoop.py index 9e585beb2..516eda2d8 100755 --- a/tools/statsnoop.py +++ b/tools/statsnoop.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # statsnoop Trace stat() syscalls. diff --git a/tools/syncsnoop.py b/tools/syncsnoop.py index eb892babd..708fbc4a0 100755 --- a/tools/syncsnoop.py +++ b/tools/syncsnoop.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # syncsnoop Trace sync() syscall. diff --git a/tools/syscount.py b/tools/syscount.py index 58039bee8..486953cc2 100755 --- a/tools/syscount.py +++ b/tools/syscount.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # syscount Summarize syscall counts and latencies. # diff --git a/tools/tcpaccept.py b/tools/tcpaccept.py index 5a7bbb8ea..b12808630 100755 --- a/tools/tcpaccept.py +++ b/tools/tcpaccept.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # tcpaccept Trace TCP accept()s. diff --git a/tools/tcpconnect.py b/tools/tcpconnect.py index 5ca6851da..54364c939 100755 --- a/tools/tcpconnect.py +++ b/tools/tcpconnect.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # tcpconnect Trace TCP connect()s. diff --git a/tools/tcpconnlat.py b/tools/tcpconnlat.py index 92dc2c186..9f25f0f41 100755 --- a/tools/tcpconnlat.py +++ b/tools/tcpconnlat.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # tcpconnlat Trace TCP active connection latency (connect). diff --git a/tools/tcpdrop.py b/tools/tcpdrop.py index 82f66a711..ca89be60c 100755 --- a/tools/tcpdrop.py +++ b/tools/tcpdrop.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # tcpdrop Trace TCP kernel-dropped packets/segments. 
diff --git a/tools/tcplife.py b/tools/tcplife.py index e7d29d446..46395822b 100755 --- a/tools/tcplife.py +++ b/tools/tcplife.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # tcplife Trace the lifespan of TCP sessions and summarize. diff --git a/tools/tcpretrans.py b/tools/tcpretrans.py index 442fd3ea9..47ac8c105 100755 --- a/tools/tcpretrans.py +++ b/tools/tcpretrans.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # tcpretrans Trace or count TCP retransmits and TLPs. diff --git a/tools/tcpstates.py b/tools/tcpstates.py index 736de97ae..4a21f0205 100755 --- a/tools/tcpstates.py +++ b/tools/tcpstates.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # -*- coding: utf-8 -*- # @lint-avoid-python-3-compatibility-imports # diff --git a/tools/tcpsubnet.py b/tools/tcpsubnet.py index bf944e163..5f2a8062b 100755 --- a/tools/tcpsubnet.py +++ b/tools/tcpsubnet.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # tcpsubnet Summarize TCP bytes sent to different subnets. diff --git a/tools/tcptop.py b/tools/tcptop.py index a8451d23b..e1eb24111 100755 --- a/tools/tcptop.py +++ b/tools/tcptop.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # tcptop Summarize TCP send/recv throughput by host. diff --git a/tools/tcptracer.py b/tools/tcptracer.py index 8f272ebb5..cc92c3fc9 100755 --- a/tools/tcptracer.py +++ b/tools/tcptracer.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # tcpv4tracer Trace TCP connections. # For Linux, uses BCC, eBPF. Embedded C. diff --git a/tools/tplist.py b/tools/tplist.py index db4b68b49..6ec2fbe18 100755 --- a/tools/tplist.py +++ b/tools/tplist.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # tplist Display kernel tracepoints or USDT probes and their formats. 
# diff --git a/tools/trace.py b/tools/trace.py index e1845da18..e06624075 100755 --- a/tools/trace.py +++ b/tools/trace.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # trace Trace a function and print a trace message based on its # parameters, with an optional filter. diff --git a/tools/ttysnoop.py b/tools/ttysnoop.py index aa18d2483..07f272fa9 100755 --- a/tools/ttysnoop.py +++ b/tools/ttysnoop.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # ttysnoop Watch live output from a tty or pts device. diff --git a/tools/vfscount.py b/tools/vfscount.py index 285cd8b89..10c6b1eb1 100755 --- a/tools/vfscount.py +++ b/tools/vfscount.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # vfscount Count VFS calls ("vfs_*"). diff --git a/tools/vfsstat.py b/tools/vfsstat.py index 4a55f8c53..1764c6012 100755 --- a/tools/vfsstat.py +++ b/tools/vfsstat.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # vfsstat Count some VFS calls. diff --git a/tools/wakeuptime.py b/tools/wakeuptime.py index 68e885724..18e70e480 100755 --- a/tools/wakeuptime.py +++ b/tools/wakeuptime.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # wakeuptime Summarize sleep to wakeup time by waker kernel stack # For Linux, uses BCC, eBPF. diff --git a/tools/xfsdist.py b/tools/xfsdist.py index 1a7fdd96a..f409f90db 100755 --- a/tools/xfsdist.py +++ b/tools/xfsdist.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # xfsdist Summarize XFS operation latency. diff --git a/tools/xfsslower.py b/tools/xfsslower.py index 5b4e0a2ba..b79527b11 100755 --- a/tools/xfsslower.py +++ b/tools/xfsslower.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # xfsslower Trace slow XFS operations. 
diff --git a/tools/zfsdist.py b/tools/zfsdist.py index 9330739a5..6b29b99ba 100755 --- a/tools/zfsdist.py +++ b/tools/zfsdist.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # zfsdist Summarize ZFS operation latency. diff --git a/tools/zfsslower.py b/tools/zfsslower.py index 6f0382af9..7bf160b7f 100755 --- a/tools/zfsslower.py +++ b/tools/zfsslower.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # # zfsslower Trace slow ZFS operations. From 3664191a6aa2a036d50316c389e1af059b5378d9 Mon Sep 17 00:00:00 2001 From: Alexey Ivanov Date: Wed, 16 Jan 2019 09:52:07 -0800 Subject: [PATCH 005/135] scripts: switch shebang in linter --- scripts/py-style-check.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/py-style-check.sh b/scripts/py-style-check.sh index d8c5ece87..78c8964ef 100755 --- a/scripts/py-style-check.sh +++ b/scripts/py-style-check.sh @@ -6,7 +6,7 @@ set -euo pipefail find tools -type f -name "*.py" | xargs pep8 -r --show-source --ignore=E123,E125,E126,E127,E128,E302 || \ echo "pep8 run failed, please fix it" >&2 -NO_PROPER_SHEBANG="$(find tools examples -type f -executable -name '*.py' | xargs grep -L '#!/usr/bin/env python')" +NO_PROPER_SHEBANG="$(find tools examples -type f -executable -name '*.py' | xargs grep -L '#!/usr/bin/python')" if [ -n "$NO_PROPER_SHEBANG" ]; then echo "bad shebangs found:" echo "$NO_PROPER_SHEBANG" From 76b5d5d9b6991e3831d286fe91f0dc7b59e4c71e Mon Sep 17 00:00:00 2001 From: Alexey Ivanov Date: Wed, 16 Jan 2019 09:56:18 -0800 Subject: [PATCH 006/135] spec: added shebang mangling --- SPECS/bcc.spec | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/SPECS/bcc.spec b/SPECS/bcc.spec index 691ab3d48..f74bb6149 100644 --- a/SPECS/bcc.spec +++ b/SPECS/bcc.spec @@ -88,6 +88,10 @@ popd %install pushd build make install/strip DESTDIR=%{buildroot} +# mangle shebangs +find 
%{buildroot}/usr/share/bcc/{tools,examples} -type f -exec \ + sed -i -e '1 s|^#!/usr/bin/python$|#!'%{__python}'|' \ + -e '1 s|^#!/usr/bin/env python$|#!'%{__python}'|' {} \; %package -n libbcc Summary: Shared Library for BPF Compiler Collection (BCC) From 4c5509fc1664a5e2b0ac9823b3dd09fd9181a7c4 Mon Sep 17 00:00:00 2001 From: Teng Qin Date: Wed, 16 Jan 2019 14:59:23 -0800 Subject: [PATCH 007/135] Add free_bcc_memory to BPFModule (#2147) Some users uses `BPFModule` directly instead of C++ / Python API, and they would like to have similar interface of free BCC `.text` memory --- src/cc/bpf_module.cc | 5 +++++ src/cc/bpf_module.h | 1 + 2 files changed, 6 insertions(+) diff --git a/src/cc/bpf_module.cc b/src/cc/bpf_module.cc index 73235bf53..502bd5d74 100644 --- a/src/cc/bpf_module.cc +++ b/src/cc/bpf_module.cc @@ -35,6 +35,7 @@ #include "common.h" #include "bcc_debug.h" +#include "bcc_elf.h" #include "frontends/b/loader.h" #include "frontends/clang/loader.h" #include "frontends/clang/b_frontend_action.h" @@ -141,6 +142,10 @@ BPFModule::~BPFModule() { ts_->DeletePrefix(Path({id_})); } +int BPFModule::free_bcc_memory() { + return bcc_free_memory(); +} + // load an entire c file as a module int BPFModule::load_cfile(const string &file, bool in_memory, const char *cflags[], int ncflags) { ClangLoader clang_loader(&*ctx_, flags_); diff --git a/src/cc/bpf_module.h b/src/cc/bpf_module.h index d4bc54eed..f4f56390d 100644 --- a/src/cc/bpf_module.h +++ b/src/cc/bpf_module.h @@ -83,6 +83,7 @@ class BPFModule { BPFModule(unsigned flags, TableStorage *ts = nullptr, bool rw_engine_enabled = true, const std::string &maps_ns = ""); ~BPFModule(); + int free_bcc_memory(); int load_b(const std::string &filename, const std::string &proto_filename); int load_c(const std::string &filename, const char *cflags[], int ncflags); int load_string(const std::string &text, const char *cflags[], int ncflags); From cda5f98a7e4340c20124be7bae062f87b8934b6f Mon Sep 17 00:00:00 2001 From: Teng Qin 
Date: Wed, 16 Jan 2019 17:28:34 -0800 Subject: [PATCH 008/135] Fix include style for bcc_syms.h (#2148) Across the repo (see libbpf.h, api/BPF.h) we always use the custom header search include, to make it easier to work with different build environments. --- src/cc/bcc_syms.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cc/bcc_syms.h b/src/cc/bcc_syms.h index 2cf432b42..d6e3b7b74 100644 --- a/src/cc/bcc_syms.h +++ b/src/cc/bcc_syms.h @@ -21,7 +21,7 @@ extern "C" { #endif #include -#include +#include "compat/linux/bpf.h" struct bcc_symbol { const char *name; From dd132421c9edadc6cf8316e70454bedce6830155 Mon Sep 17 00:00:00 2001 From: Sandipan Das <31861871+sandip4n@users.noreply.github.com> Date: Thu, 17 Jan 2019 23:11:01 +0530 Subject: [PATCH 009/135] Fix process map parsing when freeing bcc memory (#2151) This fixes the format string used to parse the major and minor device fields in /proc/self/maps. These fields have hexadecimal values and hence cannot be parsed as unsigned integers. 
Fixes: 51480d05 ("implement free_bcc_memory() API (#2097)") Reported-by: Nageswara R Sastry rnsastry@linux.vnet.ibm.com Signed-off-by: Sandipan Das --- src/cc/bcc_elf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cc/bcc_elf.c b/src/cc/bcc_elf.c index be1ef82fe..991adb95f 100644 --- a/src/cc/bcc_elf.c +++ b/src/cc/bcc_elf.c @@ -863,7 +863,7 @@ int bcc_free_memory() { int path_start = 0, path_end = 0; unsigned int devmajor, devminor; char perms[8]; - if (sscanf(line, "%lx-%lx %7s %lx %u:%u %lu %n%*[^\n]%n", + if (sscanf(line, "%lx-%lx %7s %lx %x:%x %lu %n%*[^\n]%n", &addr_start, &addr_end, perms, &offset, &devmajor, &devminor, &inode, &path_start, &path_end) < 7) From bf5b9a5bf4094fc129c76b95029e6fa034509c80 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 16 Jan 2019 17:32:09 -0800 Subject: [PATCH 010/135] add kernel libbpf as a submodule The kernel libbpf library https://github.com/torvalds/linux/tree/master/tools/lib/bpf provides common functionality for handling bpf programs, maps, ELF parsing, probing, BTF manipulation, etc. The library has been recommended to the community as a common library for bpf/btf handling. A separate repository https://github.com/libbpf/libbpf has been created for this purpose. This patch brings in libbpf to bcc. 
The following command is executed to crate a libbpf submodule at src/cc/libbpf: git submodule add https://github.com/libbpf/libbpf.git src/cc/libbpf Signed-off-by: Yonghong Song --- .gitmodules | 3 +++ src/cc/libbpf | 1 + 2 files changed, 4 insertions(+) create mode 100644 .gitmodules create mode 160000 src/cc/libbpf diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..aeb53483a --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "src/cc/libbpf"] + path = src/cc/libbpf + url = https://github.com/libbpf/libbpf.git diff --git a/src/cc/libbpf b/src/cc/libbpf new file mode 160000 index 000000000..d5b146fec --- /dev/null +++ b/src/cc/libbpf @@ -0,0 +1 @@ +Subproject commit d5b146fec50d7aa126fe98323aeaee688d4af289 From fbe94ddbde49a0b5d78cea0183dcaf10fe9e059e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 16 Jan 2019 15:47:24 -0800 Subject: [PATCH 011/135] get submodules in cmake and package build If the target directory src/cc/libbpf/ does not exist, the top level CMakeLists.txt is changed to fetch submodules with the following command git submodule update --init --recursive The dev/rpm build scripts are also changed to do git submodule update. 
Signed-off-by: Yonghong Song --- CMakeLists.txt | 6 ++++++ scripts/build-deb.sh | 15 ++++++++++++++- scripts/build-release-rpm.sh | 17 ++++++++++++++++- scripts/build-rpm.sh | 16 +++++++++++++++- 4 files changed, 51 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 610e153ae..94aac856e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,6 +9,12 @@ endif() enable_testing() +# populate submodules (libbpf) +if(NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/src/cc/libbpf/src) + execute_process(COMMAND git submodule update --init --recursive + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) +endif() + include(cmake/GetGitRevisionDescription.cmake) include(cmake/version.cmake) include(CMakeDependentOption) diff --git a/scripts/build-deb.sh b/scripts/build-deb.sh index 97bed85eb..1e450b664 100755 --- a/scripts/build-deb.sh +++ b/scripts/build-deb.sh @@ -16,11 +16,24 @@ function cleanup() { } trap cleanup EXIT +# populate submodules +git submodule update --init --recursive + . scripts/git-tag.sh -git archive HEAD --prefix=bcc/ --format=tar.gz -o $TMP/bcc_$revision.orig.tar.gz +git archive HEAD --prefix=bcc/ --format=tar -o $TMP/bcc_$revision.orig.tar + +# archive submodules +pushd src/cc/libbpf +git archive HEAD --prefix=bcc/src/cc/libbpf/ --format=tar -o $TMP/bcc_libbpf_$revision.orig.tar +popd pushd $TMP + +# merge all archives into bcc_$revision.orig.tar.gz +tar -A -f bcc_$revision.orig.tar bcc_libbpf_$revision.orig.tar +gzip bcc_$revision.orig.tar + tar xf bcc_$revision.orig.tar.gz cd bcc diff --git a/scripts/build-release-rpm.sh b/scripts/build-release-rpm.sh index 0fd6b70d5..e1147bf04 100755 --- a/scripts/build-release-rpm.sh +++ b/scripts/build-release-rpm.sh @@ -14,9 +14,24 @@ mkdir $TMP/{BUILD,RPMS,SOURCES,SPECS,SRPMS} llvmver=3.7.1 +# populate submodules +git submodule update --init --recursive + . 
scripts/git-tag.sh -git archive HEAD --prefix=bcc/ --format=tar.gz -o $TMP/SOURCES/$git_tag_latest.tar.gz +git archive HEAD --prefix=bcc/ --format=tar -o $TMP/SOURCES/bcc.tar + +# archive submodules +pushd src/cc/libbpf +git archive HEAD --prefix=bcc/src/cc/libbpf/ --format=tar -o $TMP/SOURCES/bcc_libbpf.tar +popd + +# merge all archives into $git_tag_latest.tar.gz +pushd $TMP/SOURCES +tar -A -f bcc.tar bcc_libbpf.tar +gzip -c bcc.tar > $git_tag_latest.tar.gz +popd + wget -P $TMP/SOURCES http://llvm.org/releases/$llvmver/{cfe,llvm}-$llvmver.src.tar.xz sed \ diff --git a/scripts/build-rpm.sh b/scripts/build-rpm.sh index 061650131..81cc2d124 100755 --- a/scripts/build-rpm.sh +++ b/scripts/build-rpm.sh @@ -14,9 +14,23 @@ mkdir $TMP/{BUILD,RPMS,SOURCES,SPECS,SRPMS} llvmver=3.7.1 +# populate submodules +git submodule update --init --recursive + . scripts/git-tag.sh -git archive HEAD --prefix=bcc/ --format=tar.gz -o $TMP/SOURCES/bcc.tar.gz +git archive HEAD --prefix=bcc/ --format=tar -o $TMP/SOURCES/bcc.tar + +# archive submodules +pushd src/cc/libbpf +git archive HEAD --prefix=bcc/src/cc/libbpf/ --format=tar -o $TMP/SOURCES/bcc_libbpf.tar +popd + +# merge all archives into bcc.tar.gz +pushd $TMP/SOURCES +tar -A -f bcc.tar bcc_libbpf.tar +gzip bcc.tar +popd sed \ -e "s/^\(Version:\s*\)@REVISION@/\1$revision/" \ From 751559e9f6bd7644f4fb68040c60bee71462cf06 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 16 Jan 2019 16:16:23 -0800 Subject: [PATCH 012/135] use linux bpf uapi header from libbpf submodule The uapi header src/cc/libbpf/include/uapi/linux/bpf.h is used instead of the current way src/cc/compat/linux/bpf.h Signed-off-by: Yonghong Song --- examples/cpp/CMakeLists.txt | 1 + introspection/CMakeLists.txt | 1 + src/cc/CMakeLists.txt | 4 +- src/cc/README | 16 + src/cc/api/BPF.h | 2 +- src/cc/bcc_syms.h | 2 +- src/cc/compat/linux/bpf.h | 3057 ------------------------------ src/cc/compat/linux/bpf_common.h | 57 - src/cc/libbpf.h | 2 +- src/cc/shared_table.cc | 2 
+- tests/cc/CMakeLists.txt | 1 + 11 files changed, 25 insertions(+), 3120 deletions(-) create mode 100644 src/cc/README delete mode 100644 src/cc/compat/linux/bpf.h delete mode 100644 src/cc/compat/linux/bpf_common.h diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt index 9d5e5d43e..0b49a6a56 100644 --- a/examples/cpp/CMakeLists.txt +++ b/examples/cpp/CMakeLists.txt @@ -3,6 +3,7 @@ include_directories(${CMAKE_SOURCE_DIR}/src/cc) include_directories(${CMAKE_SOURCE_DIR}/src/cc/api) +include_directories(${CMAKE_SOURCE_DIR}/src/cc/libbpf/include/uapi) option(INSTALL_CPP_EXAMPLES "Install C++ examples. Those binaries are statically linked and can take plenty of disk space" OFF) diff --git a/introspection/CMakeLists.txt b/introspection/CMakeLists.txt index 836bc0aae..88df6e84e 100644 --- a/introspection/CMakeLists.txt +++ b/introspection/CMakeLists.txt @@ -3,6 +3,7 @@ include_directories(${CMAKE_SOURCE_DIR}/src/cc) include_directories(${CMAKE_SOURCE_DIR}/src/cc/api) +include_directories(${CMAKE_SOURCE_DIR}/src/cc/libbpf/include/uapi) option(INSTALL_INTROSPECTION "Install BPF introspection tools" ON) diff --git a/src/cc/CMakeLists.txt b/src/cc/CMakeLists.txt index 242c420dc..8ad765b3d 100644 --- a/src/cc/CMakeLists.txt +++ b/src/cc/CMakeLists.txt @@ -10,7 +10,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/frontends/clang) include_directories(${LLVM_INCLUDE_DIRS}) include_directories(${LIBELF_INCLUDE_DIRS}) # todo: if check for kernel version -include_directories(${CMAKE_CURRENT_SOURCE_DIR}/compat) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/libbpf/include/uapi) add_definitions(${LLVM_DEFINITIONS}) configure_file(libbcc.pc.in ${CMAKE_CURRENT_BINARY_DIR}/libbcc.pc @ONLY) @@ -105,7 +105,7 @@ set(bcc-lua-static ${bcc-lua-static} ${bcc_common_libs_for_lua}) install(TARGETS bcc-shared LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}) install(FILES ${bcc_table_headers} DESTINATION include/bcc) install(FILES ${bcc_api_headers} DESTINATION include/bcc) 
-install(DIRECTORY compat/linux/ DESTINATION include/bcc/compat/linux FILES_MATCHING PATTERN "*.h") +install(DIRECTORY libbpf/include/uapi/linux/ DESTINATION include/bcc/compat/linux FILES_MATCHING PATTERN "*.h") install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libbcc.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig) endif(ENABLE_CLANG_JIT) install(FILES ${bcc_common_headers} DESTINATION include/bcc) diff --git a/src/cc/README b/src/cc/README new file mode 100644 index 000000000..7f8972353 --- /dev/null +++ b/src/cc/README @@ -0,0 +1,16 @@ +The libbpf directory is a git submodule for repository + https://github.com/libbpf/libbpf + +If you have any change in libbpf directory, please upstream to linux +first as libbpf repo is a mirror of linux/tools/lib/bpf directory. + +If any top-commit update of libbpf submodule contains a uapi header +change, the following are necessary steps to sync properly with +rest of bcc: + 1. sync compat/linux/virtual_bpf.h with libbpf/include/uapi/linux/bpf.h + as virtual_bpf.h has an extra string wrapper for bpf.h. + 2. if new bpf.h has new helpers, add corresponding helper func define + in bcc:src/cc/export/helpers.h and helper entry for error reporting + in bcc:src/cc/libbpf.c. + 3. if new bpf.h has new map types, program types, update + bcc:introspection/bps.c for these new map/program types. 
diff --git a/src/cc/api/BPF.h b/src/cc/api/BPF.h index ae3ad9199..700135f45 100644 --- a/src/cc/api/BPF.h +++ b/src/cc/api/BPF.h @@ -26,7 +26,7 @@ #include "bcc_exception.h" #include "bcc_syms.h" #include "bpf_module.h" -#include "compat/linux/bpf.h" +#include "linux/bpf.h" #include "libbpf.h" #include "table_storage.h" diff --git a/src/cc/bcc_syms.h b/src/cc/bcc_syms.h index d6e3b7b74..c213f10cd 100644 --- a/src/cc/bcc_syms.h +++ b/src/cc/bcc_syms.h @@ -21,7 +21,7 @@ extern "C" { #endif #include -#include "compat/linux/bpf.h" +#include "linux/bpf.h" struct bcc_symbol { const char *name; diff --git a/src/cc/compat/linux/bpf.h b/src/cc/compat/linux/bpf.h deleted file mode 100644 index bc3bcdd7d..000000000 --- a/src/cc/compat/linux/bpf.h +++ /dev/null @@ -1,3057 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. 
- */ -#ifndef _UAPI__LINUX_BPF_H__ -#define _UAPI__LINUX_BPF_H__ - -#include -#include "bpf_common.h" - -/* Extended instruction set based on top of classic BPF */ - -/* instruction classes */ -#define BPF_ALU64 0x07 /* alu mode in double word width */ - -/* ld/ldx fields */ -#define BPF_DW 0x18 /* double word (64-bit) */ -#define BPF_XADD 0xc0 /* exclusive add */ - -/* alu/jmp fields */ -#define BPF_MOV 0xb0 /* mov reg to reg */ -#define BPF_ARSH 0xc0 /* sign extending arithmetic shift right */ - -/* change endianness of a register */ -#define BPF_END 0xd0 /* flags for endianness conversion: */ -#define BPF_TO_LE 0x00 /* convert to little-endian */ -#define BPF_TO_BE 0x08 /* convert to big-endian */ -#define BPF_FROM_LE BPF_TO_LE -#define BPF_FROM_BE BPF_TO_BE - -/* jmp encodings */ -#define BPF_JNE 0x50 /* jump != */ -#define BPF_JLT 0xa0 /* LT is unsigned, '<' */ -#define BPF_JLE 0xb0 /* LE is unsigned, '<=' */ -#define BPF_JSGT 0x60 /* SGT is signed '>', GT in x86 */ -#define BPF_JSGE 0x70 /* SGE is signed '>=', GE in x86 */ -#define BPF_JSLT 0xc0 /* SLT is signed, '<' */ -#define BPF_JSLE 0xd0 /* SLE is signed, '<=' */ -#define BPF_CALL 0x80 /* function call */ -#define BPF_EXIT 0x90 /* function return */ - -/* Register numbers */ -enum { - BPF_REG_0 = 0, - BPF_REG_1, - BPF_REG_2, - BPF_REG_3, - BPF_REG_4, - BPF_REG_5, - BPF_REG_6, - BPF_REG_7, - BPF_REG_8, - BPF_REG_9, - BPF_REG_10, - __MAX_BPF_REG, -}; - -/* BPF has 10 general purpose 64-bit registers and stack frame. 
*/ -#define MAX_BPF_REG __MAX_BPF_REG - -struct bpf_insn { - __u8 code; /* opcode */ - __u8 dst_reg:4; /* dest register */ - __u8 src_reg:4; /* source register */ - __s16 off; /* signed offset */ - __s32 imm; /* signed immediate constant */ -}; - -/* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */ -struct bpf_lpm_trie_key { - __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */ - __u8 data[0]; /* Arbitrary size */ -}; - -struct bpf_cgroup_storage_key { - __u64 cgroup_inode_id; /* cgroup inode id */ - __u32 attach_type; /* program attach type */ -}; - -/* BPF syscall commands, see bpf(2) man-page for details. */ -enum bpf_cmd { - BPF_MAP_CREATE, - BPF_MAP_LOOKUP_ELEM, - BPF_MAP_UPDATE_ELEM, - BPF_MAP_DELETE_ELEM, - BPF_MAP_GET_NEXT_KEY, - BPF_PROG_LOAD, - BPF_OBJ_PIN, - BPF_OBJ_GET, - BPF_PROG_ATTACH, - BPF_PROG_DETACH, - BPF_PROG_TEST_RUN, - BPF_PROG_GET_NEXT_ID, - BPF_MAP_GET_NEXT_ID, - BPF_PROG_GET_FD_BY_ID, - BPF_MAP_GET_FD_BY_ID, - BPF_OBJ_GET_INFO_BY_FD, - BPF_PROG_QUERY, - BPF_RAW_TRACEPOINT_OPEN, - BPF_BTF_LOAD, - BPF_BTF_GET_FD_BY_ID, - BPF_TASK_FD_QUERY, - BPF_MAP_LOOKUP_AND_DELETE_ELEM, -}; - -enum bpf_map_type { - BPF_MAP_TYPE_UNSPEC, - BPF_MAP_TYPE_HASH, - BPF_MAP_TYPE_ARRAY, - BPF_MAP_TYPE_PROG_ARRAY, - BPF_MAP_TYPE_PERF_EVENT_ARRAY, - BPF_MAP_TYPE_PERCPU_HASH, - BPF_MAP_TYPE_PERCPU_ARRAY, - BPF_MAP_TYPE_STACK_TRACE, - BPF_MAP_TYPE_CGROUP_ARRAY, - BPF_MAP_TYPE_LRU_HASH, - BPF_MAP_TYPE_LRU_PERCPU_HASH, - BPF_MAP_TYPE_LPM_TRIE, - BPF_MAP_TYPE_ARRAY_OF_MAPS, - BPF_MAP_TYPE_HASH_OF_MAPS, - BPF_MAP_TYPE_DEVMAP, - BPF_MAP_TYPE_SOCKMAP, - BPF_MAP_TYPE_CPUMAP, - BPF_MAP_TYPE_XSKMAP, - BPF_MAP_TYPE_SOCKHASH, - BPF_MAP_TYPE_CGROUP_STORAGE, - BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, - BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, - BPF_MAP_TYPE_QUEUE, - BPF_MAP_TYPE_STACK, -}; - -/* Note that tracing related programs such as - * BPF_PROG_TYPE_{KPROBE,TRACEPOINT,PERF_EVENT,RAW_TRACEPOINT} - * are not subject to a stable API since kernel internal data - * structures can change 
from release to release and may - * therefore break existing tracing BPF programs. Tracing BPF - * programs correspond to /a/ specific kernel which is to be - * analyzed, and not /a/ specific kernel /and/ all future ones. - */ -enum bpf_prog_type { - BPF_PROG_TYPE_UNSPEC, - BPF_PROG_TYPE_SOCKET_FILTER, - BPF_PROG_TYPE_KPROBE, - BPF_PROG_TYPE_SCHED_CLS, - BPF_PROG_TYPE_SCHED_ACT, - BPF_PROG_TYPE_TRACEPOINT, - BPF_PROG_TYPE_XDP, - BPF_PROG_TYPE_PERF_EVENT, - BPF_PROG_TYPE_CGROUP_SKB, - BPF_PROG_TYPE_CGROUP_SOCK, - BPF_PROG_TYPE_LWT_IN, - BPF_PROG_TYPE_LWT_OUT, - BPF_PROG_TYPE_LWT_XMIT, - BPF_PROG_TYPE_SOCK_OPS, - BPF_PROG_TYPE_SK_SKB, - BPF_PROG_TYPE_CGROUP_DEVICE, - BPF_PROG_TYPE_SK_MSG, - BPF_PROG_TYPE_RAW_TRACEPOINT, - BPF_PROG_TYPE_CGROUP_SOCK_ADDR, - BPF_PROG_TYPE_LWT_SEG6LOCAL, - BPF_PROG_TYPE_LIRC_MODE2, - BPF_PROG_TYPE_SK_REUSEPORT, - BPF_PROG_TYPE_FLOW_DISSECTOR, -}; - -enum bpf_attach_type { - BPF_CGROUP_INET_INGRESS, - BPF_CGROUP_INET_EGRESS, - BPF_CGROUP_INET_SOCK_CREATE, - BPF_CGROUP_SOCK_OPS, - BPF_SK_SKB_STREAM_PARSER, - BPF_SK_SKB_STREAM_VERDICT, - BPF_CGROUP_DEVICE, - BPF_SK_MSG_VERDICT, - BPF_CGROUP_INET4_BIND, - BPF_CGROUP_INET6_BIND, - BPF_CGROUP_INET4_CONNECT, - BPF_CGROUP_INET6_CONNECT, - BPF_CGROUP_INET4_POST_BIND, - BPF_CGROUP_INET6_POST_BIND, - BPF_CGROUP_UDP4_SENDMSG, - BPF_CGROUP_UDP6_SENDMSG, - BPF_LIRC_MODE2, - BPF_FLOW_DISSECTOR, - __MAX_BPF_ATTACH_TYPE -}; - -#define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE - -/* cgroup-bpf attach flags used in BPF_PROG_ATTACH command - * - * NONE(default): No further bpf programs allowed in the subtree. - * - * BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program, - * the program in this cgroup yields to sub-cgroup program. - * - * BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program, - * that cgroup program gets run in addition to the program in this cgroup. - * - * Only one program is allowed to be attached to a cgroup with - * NONE or BPF_F_ALLOW_OVERRIDE flag. 
- * Attaching another program on top of NONE or BPF_F_ALLOW_OVERRIDE will - * release old program and attach the new one. Attach flags has to match. - * - * Multiple programs are allowed to be attached to a cgroup with - * BPF_F_ALLOW_MULTI flag. They are executed in FIFO order - * (those that were attached first, run first) - * The programs of sub-cgroup are executed first, then programs of - * this cgroup and then programs of parent cgroup. - * When children program makes decision (like picking TCP CA or sock bind) - * parent program has a chance to override it. - * - * A cgroup with MULTI or OVERRIDE flag allows any attach flags in sub-cgroups. - * A cgroup with NONE doesn't allow any programs in sub-cgroups. - * Ex1: - * cgrp1 (MULTI progs A, B) -> - * cgrp2 (OVERRIDE prog C) -> - * cgrp3 (MULTI prog D) -> - * cgrp4 (OVERRIDE prog E) -> - * cgrp5 (NONE prog F) - * the event in cgrp5 triggers execution of F,D,A,B in that order. - * if prog F is detached, the execution is E,D,A,B - * if prog F and D are detached, the execution is E,A,B - * if prog F, E and D are detached, the execution is C,A,B - * - * All eligible programs are executed regardless of return code from - * earlier programs. - */ -#define BPF_F_ALLOW_OVERRIDE (1U << 0) -#define BPF_F_ALLOW_MULTI (1U << 1) - -/* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the - * verifier will perform strict alignment checking as if the kernel - * has been built with CONFIG_EFFICIENT_UNALIGNED_ACCESS not set, - * and NET_IP_ALIGN defined to 2. - */ -#define BPF_F_STRICT_ALIGNMENT (1U << 0) - -/* If BPF_F_ANY_ALIGNMENT is used in BPF_PROF_LOAD command, the - * verifier will allow any alignment whatsoever. On platforms - * with strict alignment requirements for loads ands stores (such - * as sparc and mips) the verifier validates that all loads and - * stores provably follow this requirement. This flag turns that - * checking and enforcement off. 
- * - * It is mostly used for testing when we want to validate the - * context and memory access aspects of the verifier, but because - * of an unaligned access the alignment check would trigger before - * the one we are interested in. - */ -#define BPF_F_ANY_ALIGNMENT (1U << 1) - -/* when bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_FD, bpf_ldimm64->imm == fd */ -#define BPF_PSEUDO_MAP_FD 1 - -/* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative - * offset to another bpf function - */ -#define BPF_PSEUDO_CALL 1 - -/* flags for BPF_MAP_UPDATE_ELEM command */ -#define BPF_ANY 0 /* create new element or update existing */ -#define BPF_NOEXIST 1 /* create new element if it didn't exist */ -#define BPF_EXIST 2 /* update existing element */ - -/* flags for BPF_MAP_CREATE command */ -#define BPF_F_NO_PREALLOC (1U << 0) -/* Instead of having one common LRU list in the - * BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list - * which can scale and perform better. - * Note, the LRU nodes (including free nodes) cannot be moved - * across different LRU lists. - */ -#define BPF_F_NO_COMMON_LRU (1U << 1) -/* Specify numa node during map creation */ -#define BPF_F_NUMA_NODE (1U << 2) - -#define BPF_OBJ_NAME_LEN 16U - -/* Flags for accessing BPF object */ -#define BPF_F_RDONLY (1U << 3) -#define BPF_F_WRONLY (1U << 4) - -/* Flag for stack_map, store build_id+offset instead of pointer */ -#define BPF_F_STACK_BUILD_ID (1U << 5) - -/* Zero-initialize hash function seed. This should only be used for testing. 
*/ -#define BPF_F_ZERO_SEED (1U << 6) - -/* flags for BPF_PROG_QUERY */ -#define BPF_F_QUERY_EFFECTIVE (1U << 0) - -enum bpf_stack_build_id_status { - /* user space need an empty entry to identify end of a trace */ - BPF_STACK_BUILD_ID_EMPTY = 0, - /* with valid build_id and offset */ - BPF_STACK_BUILD_ID_VALID = 1, - /* couldn't get build_id, fallback to ip */ - BPF_STACK_BUILD_ID_IP = 2, -}; - -#define BPF_BUILD_ID_SIZE 20 -struct bpf_stack_build_id { - __s32 status; - unsigned char build_id[BPF_BUILD_ID_SIZE]; - union { - __u64 offset; - __u64 ip; - }; -}; - -union bpf_attr { - struct { /* anonymous struct used by BPF_MAP_CREATE command */ - __u32 map_type; /* one of enum bpf_map_type */ - __u32 key_size; /* size of key in bytes */ - __u32 value_size; /* size of value in bytes */ - __u32 max_entries; /* max number of entries in a map */ - __u32 map_flags; /* BPF_MAP_CREATE related - * flags defined above. - */ - __u32 inner_map_fd; /* fd pointing to the inner map */ - __u32 numa_node; /* numa node (effective only if - * BPF_F_NUMA_NODE is set). 
- */ - char map_name[BPF_OBJ_NAME_LEN]; - __u32 map_ifindex; /* ifindex of netdev to create on */ - __u32 btf_fd; /* fd pointing to a BTF type data */ - __u32 btf_key_type_id; /* BTF type_id of the key */ - __u32 btf_value_type_id; /* BTF type_id of the value */ - }; - - struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ - __u32 map_fd; - __aligned_u64 key; - union { - __aligned_u64 value; - __aligned_u64 next_key; - }; - __u64 flags; - }; - - struct { /* anonymous struct used by BPF_PROG_LOAD command */ - __u32 prog_type; /* one of enum bpf_prog_type */ - __u32 insn_cnt; - __aligned_u64 insns; - __aligned_u64 license; - __u32 log_level; /* verbosity level of verifier */ - __u32 log_size; /* size of user buffer */ - __aligned_u64 log_buf; /* user supplied buffer */ - __u32 kern_version; /* not used */ - __u32 prog_flags; - char prog_name[BPF_OBJ_NAME_LEN]; - __u32 prog_ifindex; /* ifindex of netdev to prep for */ - /* For some prog types expected attach type must be known at - * load time to verify attach type specific parts of prog - * (context accesses, allowed helpers, etc). 
- */ - __u32 expected_attach_type; - __u32 prog_btf_fd; /* fd pointing to BTF type data */ - __u32 func_info_rec_size; /* userspace bpf_func_info size */ - __aligned_u64 func_info; /* func info */ - __u32 func_info_cnt; /* number of bpf_func_info records */ - __u32 line_info_rec_size; /* userspace bpf_line_info size */ - __aligned_u64 line_info; /* line info */ - __u32 line_info_cnt; /* number of bpf_line_info records */ - }; - - struct { /* anonymous struct used by BPF_OBJ_* commands */ - __aligned_u64 pathname; - __u32 bpf_fd; - __u32 file_flags; - }; - - struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */ - __u32 target_fd; /* container object to attach to */ - __u32 attach_bpf_fd; /* eBPF program to attach */ - __u32 attach_type; - __u32 attach_flags; - }; - - struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */ - __u32 prog_fd; - __u32 retval; - __u32 data_size_in; /* input: len of data_in */ - __u32 data_size_out; /* input/output: len of data_out - * returns ENOSPC if data_out - * is too small. 
- */ - __aligned_u64 data_in; - __aligned_u64 data_out; - __u32 repeat; - __u32 duration; - } test; - - struct { /* anonymous struct used by BPF_*_GET_*_ID */ - union { - __u32 start_id; - __u32 prog_id; - __u32 map_id; - __u32 btf_id; - }; - __u32 next_id; - __u32 open_flags; - }; - - struct { /* anonymous struct used by BPF_OBJ_GET_INFO_BY_FD */ - __u32 bpf_fd; - __u32 info_len; - __aligned_u64 info; - } info; - - struct { /* anonymous struct used by BPF_PROG_QUERY command */ - __u32 target_fd; /* container object to query */ - __u32 attach_type; - __u32 query_flags; - __u32 attach_flags; - __aligned_u64 prog_ids; - __u32 prog_cnt; - } query; - - struct { - __u64 name; - __u32 prog_fd; - } raw_tracepoint; - - struct { /* anonymous struct for BPF_BTF_LOAD */ - __aligned_u64 btf; - __aligned_u64 btf_log_buf; - __u32 btf_size; - __u32 btf_log_size; - __u32 btf_log_level; - }; - - struct { - __u32 pid; /* input: pid */ - __u32 fd; /* input: fd */ - __u32 flags; /* input: flags */ - __u32 buf_len; /* input/output: buf len */ - __aligned_u64 buf; /* input/output: - * tp_name for tracepoint - * symbol for kprobe - * filename for uprobe - */ - __u32 prog_id; /* output: prod_id */ - __u32 fd_type; /* output: BPF_FD_TYPE_* */ - __u64 probe_offset; /* output: probe_offset */ - __u64 probe_addr; /* output: probe_addr */ - } task_fd_query; -} __attribute__((aligned(8))); - -/* The description below is an attempt at providing documentation to eBPF - * developers about the multiple available eBPF helper functions. It can be - * parsed and used to produce a manual page. 
The workflow is the following, - * and requires the rst2man utility: - * - * $ ./scripts/bpf_helpers_doc.py \ - * --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst - * $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7 - * $ man /tmp/bpf-helpers.7 - * - * Note that in order to produce this external documentation, some RST - * formatting is used in the descriptions to get "bold" and "italics" in - * manual pages. Also note that the few trailing white spaces are - * intentional, removing them would break paragraphs for rst2man. - * - * Start of BPF helper function descriptions: - * - * void *bpf_map_lookup_elem(struct bpf_map *map, const void *key) - * Description - * Perform a lookup in *map* for an entry associated to *key*. - * Return - * Map value associated to *key*, or **NULL** if no entry was - * found. - * - * int bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags) - * Description - * Add or update the value of the entry associated to *key* in - * *map* with *value*. *flags* is one of: - * - * **BPF_NOEXIST** - * The entry for *key* must not exist in the map. - * **BPF_EXIST** - * The entry for *key* must already exist in the map. - * **BPF_ANY** - * No condition on the existence of the entry for *key*. - * - * Flag value **BPF_NOEXIST** cannot be used for maps of types - * **BPF_MAP_TYPE_ARRAY** or **BPF_MAP_TYPE_PERCPU_ARRAY** (all - * elements always exist), the helper would return an error. - * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_map_delete_elem(struct bpf_map *map, const void *key) - * Description - * Delete entry with *key* from *map*. - * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) - * Description - * Push an element *value* in *map*. *flags* is one of: - * - * **BPF_EXIST** - * If the queue/stack is full, the oldest element is removed to - * make room for this. 
- * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_probe_read(void *dst, u32 size, const void *src) - * Description - * For tracing programs, safely attempt to read *size* bytes from - * address *src* and store the data in *dst*. - * Return - * 0 on success, or a negative error in case of failure. - * - * u64 bpf_ktime_get_ns(void) - * Description - * Return the time elapsed since system boot, in nanoseconds. - * Return - * Current *ktime*. - * - * int bpf_trace_printk(const char *fmt, u32 fmt_size, ...) - * Description - * This helper is a "printk()-like" facility for debugging. It - * prints a message defined by format *fmt* (of size *fmt_size*) - * to file *\/sys/kernel/debug/tracing/trace* from DebugFS, if - * available. It can take up to three additional **u64** - * arguments (as an eBPF helpers, the total number of arguments is - * limited to five). - * - * Each time the helper is called, it appends a line to the trace. - * The format of the trace is customizable, and the exact output - * one will get depends on the options set in - * *\/sys/kernel/debug/tracing/trace_options* (see also the - * *README* file under the same directory). However, it usually - * defaults to something like: - * - * :: - * - * telnet-470 [001] .N.. 419421.045894: 0x00000001: - * - * In the above: - * - * * ``telnet`` is the name of the current task. - * * ``470`` is the PID of the current task. - * * ``001`` is the CPU number on which the task is - * running. - * * In ``.N..``, each character refers to a set of - * options (whether irqs are enabled, scheduling - * options, whether hard/softirqs are running, level of - * preempt_disabled respectively). **N** means that - * **TIF_NEED_RESCHED** and **PREEMPT_NEED_RESCHED** - * are set. - * * ``419421.045894`` is a timestamp. - * * ``0x00000001`` is a fake value used by BPF for the - * instruction pointer register. - * * ```` is the message formatted with - * *fmt*. 
- * - * The conversion specifiers supported by *fmt* are similar, but - * more limited than for printk(). They are **%d**, **%i**, - * **%u**, **%x**, **%ld**, **%li**, **%lu**, **%lx**, **%lld**, - * **%lli**, **%llu**, **%llx**, **%p**, **%s**. No modifier (size - * of field, padding with zeroes, etc.) is available, and the - * helper will return **-EINVAL** (but print nothing) if it - * encounters an unknown specifier. - * - * Also, note that **bpf_trace_printk**\ () is slow, and should - * only be used for debugging purposes. For this reason, a notice - * bloc (spanning several lines) is printed to kernel logs and - * states that the helper should not be used "for production use" - * the first time this helper is used (or more precisely, when - * **trace_printk**\ () buffers are allocated). For passing values - * to user space, perf events should be preferred. - * Return - * The number of bytes written to the buffer, or a negative error - * in case of failure. - * - * u32 bpf_get_prandom_u32(void) - * Description - * Get a pseudo-random number. - * - * From a security point of view, this helper uses its own - * pseudo-random internal state, and cannot be used to infer the - * seed of other random functions in the kernel. However, it is - * essential to note that the generator used by the helper is not - * cryptographically secure. - * Return - * A random 32-bit unsigned value. - * - * u32 bpf_get_smp_processor_id(void) - * Description - * Get the SMP (symmetric multiprocessing) processor id. Note that - * all programs run with preemption disabled, which means that the - * SMP processor id is stable during all the execution of the - * program. - * Return - * The SMP id of the processor running the program. - * - * int bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) - * Description - * Store *len* bytes from address *from* into the packet - * associated to *skb*, at *offset*. 
*flags* are a combination of - * **BPF_F_RECOMPUTE_CSUM** (automatically recompute the - * checksum for the packet after storing the bytes) and - * **BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\ - * **->swhash** and *skb*\ **->l4hash** to 0). - * - * A call to this helper is susceptible to change the underlaying - * packet buffer. Therefore, at load time, all checks on pointers - * previously done by the verifier are invalidated and must be - * performed again, if the helper is used in combination with - * direct packet access. - * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size) - * Description - * Recompute the layer 3 (e.g. IP) checksum for the packet - * associated to *skb*. Computation is incremental, so the helper - * must know the former value of the header field that was - * modified (*from*), the new value of this field (*to*), and the - * number of bytes (2 or 4) for this field, stored in *size*. - * Alternatively, it is possible to store the difference between - * the previous and the new values of the header field in *to*, by - * setting *from* and *size* to 0. For both methods, *offset* - * indicates the location of the IP checksum within the packet. - * - * This helper works in combination with **bpf_csum_diff**\ (), - * which does not update the checksum in-place, but offers more - * flexibility and can handle sizes larger than 2 or 4 for the - * checksum to update. - * - * A call to this helper is susceptible to change the underlaying - * packet buffer. Therefore, at load time, all checks on pointers - * previously done by the verifier are invalidated and must be - * performed again, if the helper is used in combination with - * direct packet access. - * Return - * 0 on success, or a negative error in case of failure. 
- * - * int bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags) - * Description - * Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the - * packet associated to *skb*. Computation is incremental, so the - * helper must know the former value of the header field that was - * modified (*from*), the new value of this field (*to*), and the - * number of bytes (2 or 4) for this field, stored on the lowest - * four bits of *flags*. Alternatively, it is possible to store - * the difference between the previous and the new values of the - * header field in *to*, by setting *from* and the four lowest - * bits of *flags* to 0. For both methods, *offset* indicates the - * location of the IP checksum within the packet. In addition to - * the size of the field, *flags* can be added (bitwise OR) actual - * flags. With **BPF_F_MARK_MANGLED_0**, a null checksum is left - * untouched (unless **BPF_F_MARK_ENFORCE** is added as well), and - * for updates resulting in a null checksum the value is set to - * **CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR** indicates - * the checksum is to be computed against a pseudo-header. - * - * This helper works in combination with **bpf_csum_diff**\ (), - * which does not update the checksum in-place, but offers more - * flexibility and can handle sizes larger than 2 or 4 for the - * checksum to update. - * - * A call to this helper is susceptible to change the underlaying - * packet buffer. Therefore, at load time, all checks on pointers - * previously done by the verifier are invalidated and must be - * performed again, if the helper is used in combination with - * direct packet access. - * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index) - * Description - * This special helper is used to trigger a "tail call", or in - * other words, to jump into another eBPF program. 
The same stack - * frame is used (but values on stack and in registers for the - * caller are not accessible to the callee). This mechanism allows - * for program chaining, either for raising the maximum number of - * available eBPF instructions, or to execute given programs in - * conditional blocks. For security reasons, there is an upper - * limit to the number of successive tail calls that can be - * performed. - * - * Upon call of this helper, the program attempts to jump into a - * program referenced at index *index* in *prog_array_map*, a - * special map of type **BPF_MAP_TYPE_PROG_ARRAY**, and passes - * *ctx*, a pointer to the context. - * - * If the call succeeds, the kernel immediately runs the first - * instruction of the new program. This is not a function call, - * and it never returns to the previous program. If the call - * fails, then the helper has no effect, and the caller continues - * to run its subsequent instructions. A call can fail if the - * destination program for the jump does not exist (i.e. *index* - * is superior to the number of entries in *prog_array_map*), or - * if the maximum number of tail calls has been reached for this - * chain of programs. This limit is defined in the kernel by the - * macro **MAX_TAIL_CALL_CNT** (not accessible to user space), - * which is currently set to 32. - * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags) - * Description - * Clone and redirect the packet associated to *skb* to another - * net device of index *ifindex*. Both ingress and egress - * interfaces can be used for redirection. The **BPF_F_INGRESS** - * value in *flags* is used to make the distinction (ingress path - * is selected if the flag is present, egress path otherwise). - * This is the only flag supported for now. 
- * - * In comparison with **bpf_redirect**\ () helper, - * **bpf_clone_redirect**\ () has the associated cost of - * duplicating the packet buffer, but this can be executed out of - * the eBPF program. Conversely, **bpf_redirect**\ () is more - * efficient, but it is handled through an action code where the - * redirection happens only after the eBPF program has returned. - * - * A call to this helper is susceptible to change the underlaying - * packet buffer. Therefore, at load time, all checks on pointers - * previously done by the verifier are invalidated and must be - * performed again, if the helper is used in combination with - * direct packet access. - * Return - * 0 on success, or a negative error in case of failure. - * - * u64 bpf_get_current_pid_tgid(void) - * Return - * A 64-bit integer containing the current tgid and pid, and - * created as such: - * *current_task*\ **->tgid << 32 \|** - * *current_task*\ **->pid**. - * - * u64 bpf_get_current_uid_gid(void) - * Return - * A 64-bit integer containing the current GID and UID, and - * created as such: *current_gid* **<< 32 \|** *current_uid*. - * - * int bpf_get_current_comm(char *buf, u32 size_of_buf) - * Description - * Copy the **comm** attribute of the current task into *buf* of - * *size_of_buf*. The **comm** attribute contains the name of - * the executable (excluding the path) for the current task. The - * *size_of_buf* must be strictly positive. On success, the - * helper makes sure that the *buf* is NUL-terminated. On failure, - * it is filled with zeroes. - * Return - * 0 on success, or a negative error in case of failure. - * - * u32 bpf_get_cgroup_classid(struct sk_buff *skb) - * Description - * Retrieve the classid for the current task, i.e. for the net_cls - * cgroup to which *skb* belongs. - * - * This helper can be used on TC egress path, but not on ingress. 
- * - * The net_cls cgroup provides an interface to tag network packets - * based on a user-provided identifier for all traffic coming from - * the tasks belonging to the related cgroup. See also the related - * kernel documentation, available from the Linux sources in file - * *Documentation/cgroup-v1/net_cls.txt*. - * - * The Linux kernel has two versions for cgroups: there are - * cgroups v1 and cgroups v2. Both are available to users, who can - * use a mixture of them, but note that the net_cls cgroup is for - * cgroup v1 only. This makes it incompatible with BPF programs - * run on cgroups, which is a cgroup-v2-only feature (a socket can - * only hold data for one version of cgroups at a time). - * - * This helper is only available is the kernel was compiled with - * the **CONFIG_CGROUP_NET_CLASSID** configuration option set to - * "**y**" or to "**m**". - * Return - * The classid, or 0 for the default unconfigured classid. - * - * int bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) - * Description - * Push a *vlan_tci* (VLAN tag control information) of protocol - * *vlan_proto* to the packet associated to *skb*, then update - * the checksum. Note that if *vlan_proto* is different from - * **ETH_P_8021Q** and **ETH_P_8021AD**, it is considered to - * be **ETH_P_8021Q**. - * - * A call to this helper is susceptible to change the underlaying - * packet buffer. Therefore, at load time, all checks on pointers - * previously done by the verifier are invalidated and must be - * performed again, if the helper is used in combination with - * direct packet access. - * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_skb_vlan_pop(struct sk_buff *skb) - * Description - * Pop a VLAN header from the packet associated to *skb*. - * - * A call to this helper is susceptible to change the underlaying - * packet buffer. 
Therefore, at load time, all checks on pointers - * previously done by the verifier are invalidated and must be - * performed again, if the helper is used in combination with - * direct packet access. - * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) - * Description - * Get tunnel metadata. This helper takes a pointer *key* to an - * empty **struct bpf_tunnel_key** of **size**, that will be - * filled with tunnel metadata for the packet associated to *skb*. - * The *flags* can be set to **BPF_F_TUNINFO_IPV6**, which - * indicates that the tunnel is based on IPv6 protocol instead of - * IPv4. - * - * The **struct bpf_tunnel_key** is an object that generalizes the - * principal parameters used by various tunneling protocols into a - * single struct. This way, it can be used to easily make a - * decision based on the contents of the encapsulation header, - * "summarized" in this struct. In particular, it holds the IP - * address of the remote end (IPv4 or IPv6, depending on the case) - * in *key*\ **->remote_ipv4** or *key*\ **->remote_ipv6**. Also, - * this struct exposes the *key*\ **->tunnel_id**, which is - * generally mapped to a VNI (Virtual Network Identifier), making - * it programmable together with the **bpf_skb_set_tunnel_key**\ - * () helper. 
- * - * Let's imagine that the following code is part of a program - * attached to the TC ingress interface, on one end of a GRE - * tunnel, and is supposed to filter out all messages coming from - * remote ends with IPv4 address other than 10.0.0.1: - * - * :: - * - * int ret; - * struct bpf_tunnel_key key = {}; - * - * ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0); - * if (ret < 0) - * return TC_ACT_SHOT; // drop packet - * - * if (key.remote_ipv4 != 0x0a000001) - * return TC_ACT_SHOT; // drop packet - * - * return TC_ACT_OK; // accept packet - * - * This interface can also be used with all encapsulation devices - * that can operate in "collect metadata" mode: instead of having - * one network device per specific configuration, the "collect - * metadata" mode only requires a single device where the - * configuration can be extracted from this helper. - * - * This can be used together with various tunnels such as VXLan, - * Geneve, GRE or IP in IP (IPIP). - * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) - * Description - * Populate tunnel metadata for packet associated to *skb.* The - * tunnel metadata is set to the contents of *key*, of *size*. The - * *flags* can be set to a combination of the following values: - * - * **BPF_F_TUNINFO_IPV6** - * Indicate that the tunnel is based on IPv6 protocol - * instead of IPv4. - * **BPF_F_ZERO_CSUM_TX** - * For IPv4 packets, add a flag to tunnel metadata - * indicating that checksum computation should be skipped - * and checksum set to zeroes. - * **BPF_F_DONT_FRAGMENT** - * Add a flag to tunnel metadata indicating that the - * packet should not be fragmented. - * **BPF_F_SEQ_NUMBER** - * Add a flag to tunnel metadata indicating that a - * sequence number should be added to tunnel header before - * sending the packet. 
This flag was added for GRE - * encapsulation, but might be used with other protocols - * as well in the future. - * - * Here is a typical usage on the transmit path: - * - * :: - * - * struct bpf_tunnel_key key; - * populate key ... - * bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0); - * bpf_clone_redirect(skb, vxlan_dev_ifindex, 0); - * - * See also the description of the **bpf_skb_get_tunnel_key**\ () - * helper for additional information. - * Return - * 0 on success, or a negative error in case of failure. - * - * u64 bpf_perf_event_read(struct bpf_map *map, u64 flags) - * Description - * Read the value of a perf event counter. This helper relies on a - * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of - * the perf event counter is selected when *map* is updated with - * perf event file descriptors. The *map* is an array whose size - * is the number of available CPUs, and each cell contains a value - * relative to one CPU. The value to retrieve is indicated by - * *flags*, that contains the index of the CPU to look up, masked - * with **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to - * **BPF_F_CURRENT_CPU** to indicate that the value for the - * current CPU should be retrieved. - * - * Note that before Linux 4.13, only hardware perf event can be - * retrieved. - * - * Also, be aware that the newer helper - * **bpf_perf_event_read_value**\ () is recommended over - * **bpf_perf_event_read**\ () in general. The latter has some ABI - * quirks where error and counter value are used as a return code - * (which is wrong to do since ranges may overlap). This issue is - * fixed with **bpf_perf_event_read_value**\ (), which at the same - * time provides more features over the **bpf_perf_event_read**\ - * () interface. Please refer to the description of - * **bpf_perf_event_read_value**\ () for details. - * Return - * The value of the perf event counter read from the map, or a - * negative error code in case of failure. 
- * - * int bpf_redirect(u32 ifindex, u64 flags) - * Description - * Redirect the packet to another net device of index *ifindex*. - * This helper is somewhat similar to **bpf_clone_redirect**\ - * (), except that the packet is not cloned, which provides - * increased performance. - * - * Except for XDP, both ingress and egress interfaces can be used - * for redirection. The **BPF_F_INGRESS** value in *flags* is used - * to make the distinction (ingress path is selected if the flag - * is present, egress path otherwise). Currently, XDP only - * supports redirection to the egress interface, and accepts no - * flag at all. - * - * The same effect can be attained with the more generic - * **bpf_redirect_map**\ (), which requires specific maps to be - * used but offers better performance. - * Return - * For XDP, the helper returns **XDP_REDIRECT** on success or - * **XDP_ABORTED** on error. For other program types, the values - * are **TC_ACT_REDIRECT** on success or **TC_ACT_SHOT** on - * error. - * - * u32 bpf_get_route_realm(struct sk_buff *skb) - * Description - * Retrieve the realm or the route, that is to say the - * **tclassid** field of the destination for the *skb*. The - * indentifier retrieved is a user-provided tag, similar to the - * one used with the net_cls cgroup (see description for - * **bpf_get_cgroup_classid**\ () helper), but here this tag is - * held by a route (a destination entry), not by a task. - * - * Retrieving this identifier works with the clsact TC egress hook - * (see also **tc-bpf(8)**), or alternatively on conventional - * classful egress qdiscs, but not on TC ingress path. In case of - * clsact TC egress hook, this has the advantage that, internally, - * the destination entry has not been dropped yet in the transmit - * path. Therefore, the destination entry does not need to be - * artificially held via **netif_keep_dst**\ () for a classful - * qdisc until the *skb* is freed. 
- * - * This helper is available only if the kernel was compiled with - * **CONFIG_IP_ROUTE_CLASSID** configuration option. - * Return - * The realm of the route for the packet associated to *skb*, or 0 - * if none was found. - * - * int bpf_perf_event_output(struct pt_regs *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) - * Description - * Write raw *data* blob into a special BPF perf event held by - * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf - * event must have the following attributes: **PERF_SAMPLE_RAW** - * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and - * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. - * - * The *flags* are used to indicate the index in *map* for which - * the value must be put, masked with **BPF_F_INDEX_MASK**. - * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** - * to indicate that the index of the current CPU core should be - * used. - * - * The value to write, of *size*, is passed through eBPF stack and - * pointed by *data*. - * - * The context of the program *ctx* also needs to be passed to the - * helper. - * - * On user space, a program willing to read the values needs to - * call **perf_event_open**\ () on the perf event (either for - * one or for all CPUs) and to store the file descriptor into the - * *map*. This must be done before the eBPF program can send data - * into it. An example is available in file - * *samples/bpf/trace_output_user.c* in the Linux kernel source - * tree (the eBPF program counterpart is in - * *samples/bpf/trace_output_kern.c*). - * - * **bpf_perf_event_output**\ () achieves better performance - * than **bpf_trace_printk**\ () for sharing data with user - * space, and is much better suited for streaming data from eBPF - * programs. - * - * Note that this helper is not restricted to tracing use cases - * and can be used with programs attached to TC or XDP as well, - * where it allows for passing data to user space listeners.
Data - * can be: - * - * * Only custom structs, - * * Only the packet payload, or - * * A combination of both. - * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len) - * Description - * This helper was provided as an easy way to load data from a - * packet. It can be used to load *len* bytes from *offset* from - * the packet associated to *skb*, into the buffer pointed by - * *to*. - * - * Since Linux 4.7, usage of this helper has mostly been replaced - * by "direct packet access", enabling packet data to be - * manipulated with *skb*\ **->data** and *skb*\ **->data_end** - * pointing respectively to the first byte of packet data and to - * the byte after the last byte of packet data. However, it - * remains useful if one wishes to read large quantities of data - * at once from a packet into the eBPF stack. - * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_get_stackid(struct pt_regs *ctx, struct bpf_map *map, u64 flags) - * Description - * Walk a user or a kernel stack and return its id. To achieve - * this, the helper needs *ctx*, which is a pointer to the context - * on which the tracing program is executed, and a pointer to a - * *map* of type **BPF_MAP_TYPE_STACK_TRACE**. - * - * The last argument, *flags*, holds the number of stack frames to - * skip (from 0 to 255), masked with - * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set - * a combination of the following flags: - * - * **BPF_F_USER_STACK** - * Collect a user space stack instead of a kernel stack. - * **BPF_F_FAST_STACK_CMP** - * Compare stacks by hash only. - * **BPF_F_REUSE_STACKID** - * If two different stacks hash into the same *stackid*, - * discard the old one. - * - * The stack id retrieved is a 32 bit long integer handle which - * can be further combined with other data (including other stack - * ids) and used as a key into maps.
This can be useful for - * generating a variety of graphs (such as flame graphs or off-cpu - * graphs). - * - * For walking a stack, this helper is an improvement over - * **bpf_probe_read**\ (), which can be used with unrolled loops - * but is not efficient and consumes a lot of eBPF instructions. - * Instead, **bpf_get_stackid**\ () can collect up to - * **PERF_MAX_STACK_DEPTH** both kernel and user frames. Note that - * this limit can be controlled with the **sysctl** program, and - * that it should be manually increased in order to profile long - * user stacks (such as stacks for Java programs). To do so, use: - * - * :: - * - * # sysctl kernel.perf_event_max_stack= - * Return - * The positive or null stack id on success, or a negative error - * in case of failure. - * - * s64 bpf_csum_diff(__be32 *from, u32 from_size, __be32 *to, u32 to_size, __wsum seed) - * Description - * Compute a checksum difference, from the raw buffer pointed by - * *from*, of length *from_size* (that must be a multiple of 4), - * towards the raw buffer pointed by *to*, of size *to_size* - * (same remark). An optional *seed* can be added to the value - * (this can be cascaded, the seed may come from a previous call - * to the helper). - * - * This is flexible enough to be used in several ways: - * - * * With *from_size* == 0, *to_size* > 0 and *seed* set to - * checksum, it can be used when pushing new data. - * * With *from_size* > 0, *to_size* == 0 and *seed* set to - * checksum, it can be used when removing data from a packet. - * * With *from_size* > 0, *to_size* > 0 and *seed* set to 0, it - * can be used to compute a diff. Note that *from_size* and - * *to_size* do not need to be equal. - * - * This helper can be used in combination with - * **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ (), to - * which one can feed in the difference computed with - * **bpf_csum_diff**\ (). - * Return - * The checksum result, or a negative error code in case of - * failure. 
- * - * int bpf_skb_get_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size) - * Description - * Retrieve tunnel options metadata for the packet associated to - * *skb*, and store the raw tunnel option data to the buffer *opt* - * of *size*. - * - * This helper can be used with encapsulation devices that can - * operate in "collect metadata" mode (please refer to the related - * note in the description of **bpf_skb_get_tunnel_key**\ () for - * more details). A particular example where this can be used is - * in combination with the Geneve encapsulation protocol, where it - * allows for pushing (with **bpf_skb_get_tunnel_opt**\ () helper) - * and retrieving arbitrary TLVs (Type-Length-Value headers) from - * the eBPF program. This allows for full customization of these - * headers. - * Return - * The size of the option data retrieved. - * - * int bpf_skb_set_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size) - * Description - * Set tunnel options metadata for the packet associated to *skb* - * to the option data contained in the raw buffer *opt* of *size*. - * - * See also the description of the **bpf_skb_get_tunnel_opt**\ () - * helper for additional information. - * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags) - * Description - * Change the protocol of the *skb* to *proto*. Currently - * supported are transition from IPv4 to IPv6, and from IPv6 to - * IPv4. The helper takes care of the groundwork for the - * transition, including resizing the socket buffer. The eBPF - * program is expected to fill the new headers, if any, via - * **skb_store_bytes**\ () and to recompute the checksums with - * **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ - * (). The main case for this helper is to perform NAT64 - * operations out of an eBPF program. 
- * - * Internally, the GSO type is marked as dodgy so that headers are - * checked and segments are recalculated by the GSO/GRO engine. - * The size for GSO target is adapted as well. - * - * All values for *flags* are reserved for future usage, and must - * be left at zero. - * - * A call to this helper is susceptible to change the underlaying - * packet buffer. Therefore, at load time, all checks on pointers - * previously done by the verifier are invalidated and must be - * performed again, if the helper is used in combination with - * direct packet access. - * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_skb_change_type(struct sk_buff *skb, u32 type) - * Description - * Change the packet type for the packet associated to *skb*. This - * comes down to setting *skb*\ **->pkt_type** to *type*, except - * the eBPF program does not have a write access to *skb*\ - * **->pkt_type** beside this helper. Using a helper here allows - * for graceful handling of errors. - * - * The major use case is to change incoming *skb*s to - * **PACKET_HOST** in a programmatic way instead of having to - * recirculate via **redirect**\ (..., **BPF_F_INGRESS**), for - * example. - * - * Note that *type* only allows certain values. At this time, they - * are: - * - * **PACKET_HOST** - * Packet is for us. - * **PACKET_BROADCAST** - * Send packet to all. - * **PACKET_MULTICAST** - * Send packet to group. - * **PACKET_OTHERHOST** - * Send packet to someone else. - * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index) - * Description - * Check whether *skb* is a descendant of the cgroup2 held by - * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*. - * Return - * The return value depends on the result of the test, and can be: - * - * * 0, if the *skb* failed the cgroup2 descendant test. - * * 1, if the *skb* succeeded the cgroup2 descendant test. 
- * * A negative error code, if an error occurred. - * - * u32 bpf_get_hash_recalc(struct sk_buff *skb) - * Description - * Retrieve the hash of the packet, *skb*\ **->hash**. If it is - * not set, in particular if the hash was cleared due to mangling, - * recompute this hash. Later accesses to the hash can be done - * directly with *skb*\ **->hash**. - * - * Calling **bpf_set_hash_invalid**\ (), changing a packet - * prototype with **bpf_skb_change_proto**\ (), or calling - * **bpf_skb_store_bytes**\ () with the - * **BPF_F_INVALIDATE_HASH** are actions susceptible to clear - * the hash and to trigger a new computation for the next call to - * **bpf_get_hash_recalc**\ (). - * Return - * The 32-bit hash. - * - * u64 bpf_get_current_task(void) - * Return - * A pointer to the current task struct. - * - * int bpf_probe_write_user(void *dst, const void *src, u32 len) - * Description - * Attempt in a safe way to write *len* bytes from the buffer - * *src* to *dst* in memory. It only works for threads that are in - * user context, and *dst* must be a valid user space address. - * - * This helper should not be used to implement any kind of - * security mechanism because of TOC-TOU attacks, but rather to - * debug, divert, and manipulate execution of semi-cooperative - * processes. - * - * Keep in mind that this feature is meant for experiments, and it - * has a risk of crashing the system and running programs. - * Therefore, when an eBPF program using this helper is attached, - * a warning including PID and process name is printed to kernel - * logs. - * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_current_task_under_cgroup(struct bpf_map *map, u32 index) - * Description - * Check whether the probe is being run in the context of a given - * subset of the cgroup2 hierarchy. The cgroup2 to test is held by - * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*.
- * Return - * The return value depends on the result of the test, and can be: - * - * * 0, if the *skb* task belongs to the cgroup2. - * * 1, if the *skb* task does not belong to the cgroup2. - * * A negative error code, if an error occurred. - * - * int bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) - * Description - * Resize (trim or grow) the packet associated to *skb* to the - * new *len*. The *flags* are reserved for future usage, and must - * be left at zero. - * - * The basic idea is that the helper performs the needed work to - * change the size of the packet, then the eBPF program rewrites - * the rest via helpers like **bpf_skb_store_bytes**\ (), - * **bpf_l3_csum_replace**\ (), **bpf_l3_csum_replace**\ () - * and others. This helper is a slow path utility intended for - * replies with control messages. And because it is targeted for - * slow path, the helper itself can afford to be slow: it - * implicitly linearizes, unclones and drops offloads from the - * *skb*. - * - * A call to this helper is susceptible to change the underlaying - * packet buffer. Therefore, at load time, all checks on pointers - * previously done by the verifier are invalidated and must be - * performed again, if the helper is used in combination with - * direct packet access. - * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_skb_pull_data(struct sk_buff *skb, u32 len) - * Description - * Pull in non-linear data in case the *skb* is non-linear and not - * all of *len* are part of the linear section. Make *len* bytes - * from *skb* readable and writable. If a zero value is passed for - * *len*, then the whole length of the *skb* is pulled. - * - * This helper is only needed for reading and writing with direct - * packet access. 
- * - * For direct packet access, testing that offsets to access - * are within packet boundaries (test on *skb*\ **->data_end**) is - * susceptible to fail if offsets are invalid, or if the requested - * data is in non-linear parts of the *skb*. On failure the - * program can just bail out, or in the case of a non-linear - * buffer, use a helper to make the data available. The - * **bpf_skb_load_bytes**\ () helper is a first solution to access - * the data. Another one consists in using **bpf_skb_pull_data** - * to pull in once the non-linear parts, then retesting and - * eventually access the data. - * - * At the same time, this also makes sure the *skb* is uncloned, - * which is a necessary condition for direct write. As this needs - * to be an invariant for the write part only, the verifier - * detects writes and adds a prologue that is calling - * **bpf_skb_pull_data()** to effectively unclone the *skb* from - * the very beginning in case it is indeed cloned. - * - * A call to this helper is susceptible to change the underlaying - * packet buffer. Therefore, at load time, all checks on pointers - * previously done by the verifier are invalidated and must be - * performed again, if the helper is used in combination with - * direct packet access. - * Return - * 0 on success, or a negative error in case of failure. - * - * s64 bpf_csum_update(struct sk_buff *skb, __wsum csum) - * Description - * Add the checksum *csum* into *skb*\ **->csum** in case the - * driver has supplied a checksum for the entire packet into that - * field. Return an error otherwise. This helper is intended to be - * used in combination with **bpf_csum_diff**\ (), in particular - * when the checksum needs to be updated after data has been - * written into the packet through direct packet access. - * Return - * The checksum on success, or a negative error code in case of - * failure. 
- * - * void bpf_set_hash_invalid(struct sk_buff *skb) - * Description - * Invalidate the current *skb*\ **->hash**. It can be used after - * mangling on headers through direct packet access, in order to - * indicate that the hash is outdated and to trigger a - * recalculation the next time the kernel tries to access this - * hash or when the **bpf_get_hash_recalc**\ () helper is called. - * - * int bpf_get_numa_node_id(void) - * Description - * Return the id of the current NUMA node. The primary use case - * for this helper is the selection of sockets for the local NUMA - * node, when the program is attached to sockets using the - * **SO_ATTACH_REUSEPORT_EBPF** option (see also **socket(7)**), - * but the helper is also available to other eBPF program types, - * similarly to **bpf_get_smp_processor_id**\ (). - * Return - * The id of current NUMA node. - * - * int bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags) - * Description - * Grows headroom of packet associated to *skb* and adjusts the - * offset of the MAC header accordingly, adding *len* bytes of - * space. It automatically extends and reallocates memory as - * required. - * - * This helper can be used on a layer 3 *skb* to push a MAC header - * for redirection into a layer 2 device. - * - * All values for *flags* are reserved for future usage, and must - * be left at zero. - * - * A call to this helper is susceptible to change the underlaying - * packet buffer. Therefore, at load time, all checks on pointers - * previously done by the verifier are invalidated and must be - * performed again, if the helper is used in combination with - * direct packet access. - * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta) - * Description - * Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that - * it is possible to use a negative value for *delta*. 
This helper - * can be used to prepare the packet for pushing or popping - * headers. - * - * A call to this helper is susceptible to change the underlaying - * packet buffer. Therefore, at load time, all checks on pointers - * previously done by the verifier are invalidated and must be - * performed again, if the helper is used in combination with - * direct packet access. - * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr) - * Description - * Copy a NUL terminated string from an unsafe address - * *unsafe_ptr* to *dst*. The *size* should include the - * terminating NUL byte. In case the string length is smaller than - * *size*, the target is not padded with further NUL bytes. If the - * string length is larger than *size*, just *size*-1 bytes are - * copied and the last byte is set to NUL. - * - * On success, the length of the copied string is returned. This - * makes this helper useful in tracing programs for reading - * strings, and more importantly to get its length at runtime. See - * the following snippet: - * - * :: - * - * SEC("kprobe/sys_open") - * void bpf_sys_open(struct pt_regs *ctx) - * { - * char buf[PATHLEN]; // PATHLEN is defined to 256 - * int res = bpf_probe_read_str(buf, sizeof(buf), - * ctx->di); - * - * // Consume buf, for example push it to - * // userspace via bpf_perf_event_output(); we - * // can use res (the string length) as event - * // size, after checking its boundaries. - * } - * - * In comparison, using **bpf_probe_read()** helper here instead - * to read the string would require to estimate the length at - * compile time, and would often result in copying more memory - * than necessary. 
- * - * Another useful use case is when parsing individual process - * arguments or individual environment variables navigating - * *current*\ **->mm->arg_start** and *current*\ - * **->mm->env_start**: using this helper and the return value, - * one can quickly iterate at the right offset of the memory area. - * Return - * On success, the strictly positive length of the string, - * including the trailing NUL character. On error, a negative - * value. - * - * u64 bpf_get_socket_cookie(struct sk_buff *skb) - * Description - * If the **struct sk_buff** pointed by *skb* has a known socket, - * retrieve the cookie (generated by the kernel) of this socket. - * If no cookie has been set yet, generate a new cookie. Once - * generated, the socket cookie remains stable for the life of the - * socket. This helper can be useful for monitoring per socket - * networking traffic statistics as it provides a unique socket - * identifier per namespace. - * Return - * An 8-byte long non-decreasing number on success, or 0 if the - * socket field is missing inside *skb*. - * - * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx) - * Description - * Equivalent to bpf_get_socket_cookie() helper that accepts - * *skb*, but gets socket from **struct bpf_sock_addr** context. - * Return - * An 8-byte long non-decreasing number. - * - * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx) - * Description - * Equivalent to bpf_get_socket_cookie() helper that accepts - * *skb*, but gets socket from **struct bpf_sock_ops** context. - * Return - * An 8-byte long non-decreasing number. - * - * u32 bpf_get_socket_uid(struct sk_buff *skb) - * Return - * The owner UID of the socket associated to *skb*. If the socket - * is **NULL**, or if it is not a full socket (i.e. if it is a - * time-wait or a request socket instead), **overflowuid** value - * is returned (note that **overflowuid** might also be the actual - * UID value for the socket).
- * - * u32 bpf_set_hash(struct sk_buff *skb, u32 hash) - * Description - * Set the full hash for *skb* (set the field *skb*\ **->hash**) - * to value *hash*. - * Return - * 0 - * - * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen) - * Description - * Emulate a call to **setsockopt()** on the socket associated to - * *bpf_socket*, which must be a full socket. The *level* at - * which the option resides and the name *optname* of the option - * must be specified, see **setsockopt(2)** for more information. - * The option value of length *optlen* is pointed by *optval*. - * - * This helper actually implements a subset of **setsockopt()**. - * It supports the following *level*\ s: - * - * * **SOL_SOCKET**, which supports the following *optname*\ s: - * **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**, - * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**. - * * **IPPROTO_TCP**, which supports the following *optname*\ s: - * **TCP_CONGESTION**, **TCP_BPF_IW**, - * **TCP_BPF_SNDCWND_CLAMP**. - * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. - * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. - * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags) - * Description - * Grow or shrink the room for data in the packet associated to - * *skb* by *len_diff*, and according to the selected *mode*. - * - * There is a single supported mode at this time: - * - * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer - * (room space is added or removed below the layer 3 header). - * - * All values for *flags* are reserved for future usage, and must - * be left at zero. - * - * A call to this helper is susceptible to change the underlaying - * packet buffer. 
Therefore, at load time, all checks on pointers - * previously done by the verifier are invalidated and must be - * performed again, if the helper is used in combination with - * direct packet access. - * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags) - * Description - * Redirect the packet to the endpoint referenced by *map* at - * index *key*. Depending on its type, this *map* can contain - * references to net devices (for forwarding packets through other - * ports), or to CPUs (for redirecting XDP frames to another CPU; - * but this is only implemented for native XDP (with driver - * support) as of this writing). - * - * All values for *flags* are reserved for future usage, and must - * be left at zero. - * - * When used to redirect packets to net devices, this helper - * provides a high performance increase over **bpf_redirect**\ (). - * This is due to various implementation details of the underlying - * mechanisms, one of which is the fact that **bpf_redirect_map**\ - * () tries to send packet as a "bulk" to the device. - * Return - * **XDP_REDIRECT** on success, or **XDP_ABORTED** on error. - * - * int bpf_sk_redirect_map(struct bpf_map *map, u32 key, u64 flags) - * Description - * Redirect the packet to the socket referenced by *map* (of type - * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and - * egress interfaces can be used for redirection. The - * **BPF_F_INGRESS** value in *flags* is used to make the - * distinction (ingress path is selected if the flag is present, - * egress path otherwise). This is the only flag supported for now. - * Return - * **SK_PASS** on success, or **SK_DROP** on error. - * - * int bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) - * Description - * Add an entry to, or update a *map* referencing sockets. The - * *skops* is used as a new value for the entry associated to - * *key*. 
*flags* is one of: - * - * **BPF_NOEXIST** - * The entry for *key* must not exist in the map. - * **BPF_EXIST** - * The entry for *key* must already exist in the map. - * **BPF_ANY** - * No condition on the existence of the entry for *key*. - * - * If the *map* has eBPF programs (parser and verdict), those will - * be inherited by the socket being added. If the socket is - * already attached to eBPF programs, this results in an error. - * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta) - * Description - * Adjust the address pointed by *xdp_md*\ **->data_meta** by - * *delta* (which can be positive or negative). Note that this - * operation modifies the address stored in *xdp_md*\ **->data**, - * so the latter must be loaded only after the helper has been - * called. - * - * The use of *xdp_md*\ **->data_meta** is optional and programs - * are not required to use it. The rationale is that when the - * packet is processed with XDP (e.g. as DoS filter), it is - * possible to push further meta data along with it before passing - * to the stack, and to give the guarantee that an ingress eBPF - * program attached as a TC classifier on the same device can pick - * this up for further post-processing. Since TC works with socket - * buffers, it remains possible to set from XDP the **mark** or - * **priority** pointers, or other pointers for the socket buffer. - * Having this scratch space generic and programmable allows for - * more flexibility as the user is free to store whatever meta - * data they need. - * - * A call to this helper is susceptible to change the underlaying - * packet buffer. Therefore, at load time, all checks on pointers - * previously done by the verifier are invalidated and must be - * performed again, if the helper is used in combination with - * direct packet access. - * Return - * 0 on success, or a negative error in case of failure. 
- * - * int bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size) - * Description - * Read the value of a perf event counter, and store it into *buf* - * of size *buf_size*. This helper relies on a *map* of type - * **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of the perf event - * counter is selected when *map* is updated with perf event file - * descriptors. The *map* is an array whose size is the number of - * available CPUs, and each cell contains a value relative to one - * CPU. The value to retrieve is indicated by *flags*, that - * contains the index of the CPU to look up, masked with - * **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to - * **BPF_F_CURRENT_CPU** to indicate that the value for the - * current CPU should be retrieved. - * - * This helper behaves in a way close to - * **bpf_perf_event_read**\ () helper, save that instead of - * just returning the value observed, it fills the *buf* - * structure. This allows for additional data to be retrieved: in - * particular, the enabled and running times (in *buf*\ - * **->enabled** and *buf*\ **->running**, respectively) are - * copied. In general, **bpf_perf_event_read_value**\ () is - * recommended over **bpf_perf_event_read**\ (), which has some - * ABI issues and provides fewer functionalities. - * - * These values are interesting, because hardware PMU (Performance - * Monitoring Unit) counters are limited resources. When there are - * more PMU based perf events opened than available counters, - * kernel will multiplex these events so each event gets certain - * percentage (but not all) of the PMU time. In case that - * multiplexing happens, the number of samples or counter value - * will not reflect the case compared to when no multiplexing - * occurs. This makes comparison between different runs difficult. - * Typically, the counter value should be normalized before - * comparing to other experiments. 
The usual normalization is done - * as follows. - * - * :: - * - * normalized_counter = counter * t_enabled / t_running - * - * Where t_enabled is the time enabled for event and t_running is - * the time running for event since last normalization. The - * enabled and running times are accumulated since the perf event - * open. To achieve scaling factor between two invocations of an - * eBPF program, users can use CPU id as the key (which is - * typical for perf array usage model) to remember the previous - * value and do the calculation inside the eBPF program. - * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size) - * Description - * For an eBPF program attached to a perf event, retrieve the - * value of the event counter associated to *ctx* and store it in - * the structure pointed by *buf* and of size *buf_size*. Enabled - * and running times are also stored in the structure (see - * description of helper **bpf_perf_event_read_value**\ () for - * more details). - * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen) - * Description - * Emulate a call to **getsockopt()** on the socket associated to - * *bpf_socket*, which must be a full socket. The *level* at - * which the option resides and the name *optname* of the option - * must be specified, see **getsockopt(2)** for more information. - * The retrieved value is stored in the structure pointed by - * *optval* and of length *optlen*. - * - * This helper actually implements a subset of **getsockopt()**. - * It supports the following *level*\ s: - * - * * **IPPROTO_TCP**, which supports *optname* - * **TCP_CONGESTION**. - * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. - * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
- * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_override_return(struct pt_regs *regs, u64 rc) - * Description - * Used for error injection, this helper uses kprobes to override - * the return value of the probed function, and to set it to *rc*. - * The first argument is the context *regs* on which the kprobe - * works. - * - * This helper works by setting the PC (program counter) - * to an override function which is run in place of the original - * probed function. This means the probed function is not run at - * all. The replacement function just returns with the required - * value. - * - * This helper has security implications, and thus is subject to - * restrictions. It is only available if the kernel was compiled - * with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration - * option, and in this case it only works on functions tagged with - * **ALLOW_ERROR_INJECTION** in the kernel code. - * - * Also, the helper is only available for the architectures having - * the CONFIG_FUNCTION_ERROR_INJECTION option. As of this writing, - * x86 architecture is the only one to support this feature. - * Return - * 0 - * - * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval) - * Description - * Attempt to set the value of the **bpf_sock_ops_cb_flags** field - * for the full TCP socket associated to *bpf_sock_ops* to - * *argval*. - * - * The primary use of this field is to determine if there should - * be calls to eBPF programs of type - * **BPF_PROG_TYPE_SOCK_OPS** at various points in the TCP - * code. A program of the same type can change its value, per - * connection and as necessary, when the connection is - * established. This field is directly accessible for reading, but - * this helper must be used for updates in order to return an - * error if an eBPF program tries to set a callback that is not - * supported in the current kernel.
- * - * The supported callback values that *argval* can combine are: - * - * * **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out) - * * **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission) - * * **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change) - * - * Here are some examples of where one could call such eBPF - * program: - * - * * When RTO fires. - * * When a packet is retransmitted. - * * When the connection terminates. - * * When a packet is sent. - * * When a packet is received. - * Return - * Code **-EINVAL** if the socket is not a full TCP socket; - * otherwise, a positive number containing the bits that could not - * be set is returned (which comes down to 0 if all bits were set - * as required). - * - * int bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags) - * Description - * This helper is used in programs implementing policies at the - * socket level. If the message *msg* is allowed to pass (i.e. if - * the verdict eBPF program returns **SK_PASS**), redirect it to - * the socket referenced by *map* (of type - * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and - * egress interfaces can be used for redirection. The - * **BPF_F_INGRESS** value in *flags* is used to make the - * distinction (ingress path is selected if the flag is present, - * egress path otherwise). This is the only flag supported for now. - * Return - * **SK_PASS** on success, or **SK_DROP** on error. - * - * int bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes) - * Description - * For socket policies, apply the verdict of the eBPF program to - * the next *bytes* (number of bytes) of message *msg*. - * - * For example, this helper can be used in the following cases: - * - * * A single **sendmsg**\ () or **sendfile**\ () system call - * contains multiple logical messages that the eBPF program is - * supposed to read and for which it should apply a verdict. - * * An eBPF program only cares to read the first *bytes* of a - * *msg*. 
If the message has a large payload, then setting up - * and calling the eBPF program repeatedly for all bytes, even - * though the verdict is already known, would create unnecessary - * overhead. - * - * When called from within an eBPF program, the helper sets a - * counter internal to the BPF infrastructure, that is used to - * apply the last verdict to the next *bytes*. If *bytes* is - * smaller than the current data being processed from a - * **sendmsg**\ () or **sendfile**\ () system call, the first - * *bytes* will be sent and the eBPF program will be re-run with - * the pointer for start of data pointing to byte number *bytes* - * **+ 1**. If *bytes* is larger than the current data being - * processed, then the eBPF verdict will be applied to multiple - * **sendmsg**\ () or **sendfile**\ () calls until *bytes* are - * consumed. - * - * Note that if a socket closes with the internal counter holding - * a non-zero value, this is not a problem because data is not - * being buffered for *bytes* and is sent as it is received. - * Return - * 0 - * - * int bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes) - * Description - * For socket policies, prevent the execution of the verdict eBPF - * program for message *msg* until *bytes* (byte number) have been - * accumulated. - * - * This can be used when one needs a specific number of bytes - * before a verdict can be assigned, even if the data spans - * multiple **sendmsg**\ () or **sendfile**\ () calls. The extreme - * case would be a user calling **sendmsg**\ () repeatedly with - * 1-byte long message segments. Obviously, this is bad for - * performance, but it is still valid. If the eBPF program needs - * *bytes* bytes to validate a header, this helper can be used to - * prevent the eBPF program to be called again until *bytes* have - * been accumulated. 
- * Return - * 0 - * - * int bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags) - * Description - * For socket policies, pull in non-linear data from user space - * for *msg* and set pointers *msg*\ **->data** and *msg*\ - * **->data_end** to *start* and *end* bytes offsets into *msg*, - * respectively. - * - * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a - * *msg* it can only parse data that the (**data**, **data_end**) - * pointers have already consumed. For **sendmsg**\ () hooks this - * is likely the first scatterlist element. But for calls relying - * on the **sendpage** handler (e.g. **sendfile**\ ()) this will - * be the range (**0**, **0**) because the data is shared with - * user space and by default the objective is to avoid allowing - * user space to modify data while (or after) eBPF verdict is - * being decided. This helper can be used to pull in data and to - * set the start and end pointer to given values. Data will be - * copied if necessary (i.e. if data was not linear and if start - * and end pointers do not point to the same chunk). - * - * A call to this helper is susceptible to change the underlaying - * packet buffer. Therefore, at load time, all checks on pointers - * previously done by the verifier are invalidated and must be - * performed again, if the helper is used in combination with - * direct packet access. - * - * All values for *flags* are reserved for future usage, and must - * be left at zero. - * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) - * Description - * Bind the socket associated to *ctx* to the address pointed by - * *addr*, of length *addr_len*. This allows for making outgoing - * connection from the desired IP address, which can be useful for - * example when all processes inside a cgroup should use one - * single IP address on a host that has multiple IP configured. 
- * - * This helper works for IPv4 and IPv6, TCP and UDP sockets. The - * domain (*addr*\ **->sa_family**) must be **AF_INET** (or - * **AF_INET6**). Looking for a free port to bind to can be - * expensive, therefore binding to port is not permitted by the - * helper: *addr*\ **->sin_port** (or **sin6_port**, respectively) - * must be set to zero. - * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) - * Description - * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is - * only possible to shrink the packet as of this writing, - * therefore *delta* must be a negative integer. - * - * A call to this helper is susceptible to change the underlaying - * packet buffer. Therefore, at load time, all checks on pointers - * previously done by the verifier are invalidated and must be - * performed again, if the helper is used in combination with - * direct packet access. - * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags) - * Description - * Retrieve the XFRM state (IP transform framework, see also - * **ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*. - * - * The retrieved value is stored in the **struct bpf_xfrm_state** - * pointed by *xfrm_state* and of length *size*. - * - * All values for *flags* are reserved for future usage, and must - * be left at zero. - * - * This helper is available only if the kernel was compiled with - * **CONFIG_XFRM** configuration option. - * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_get_stack(struct pt_regs *regs, void *buf, u32 size, u64 flags) - * Description - * Return a user or a kernel stack in bpf program provided buffer. - * To achieve this, the helper needs *ctx*, which is a pointer - * to the context on which the tracing program is executed. 
- * To store the stacktrace, the bpf program provides *buf* with - * a nonnegative *size*. - * - * The last argument, *flags*, holds the number of stack frames to - * skip (from 0 to 255), masked with - * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set - * the following flags: - * - * **BPF_F_USER_STACK** - * Collect a user space stack instead of a kernel stack. - * **BPF_F_USER_BUILD_ID** - * Collect buildid+offset instead of ips for user stack, - * only valid if **BPF_F_USER_STACK** is also specified. - * - * **bpf_get_stack**\ () can collect up to - * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject - * to sufficient large buffer size. Note that - * this limit can be controlled with the **sysctl** program, and - * that it should be manually increased in order to profile long - * user stacks (such as stacks for Java programs). To do so, use: - * - * :: - * - * # sysctl kernel.perf_event_max_stack= - * Return - * A non-negative value equal to or less than *size* on success, - * or a negative error in case of failure. - * - * int bpf_skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header) - * Description - * This helper is similar to **bpf_skb_load_bytes**\ () in that - * it provides an easy way to load *len* bytes from *offset* - * from the packet associated to *skb*, into the buffer pointed - * by *to*. The difference to **bpf_skb_load_bytes**\ () is that - * a fifth argument *start_header* exists in order to select a - * base offset to start from. *start_header* can be one of: - * - * **BPF_HDR_START_MAC** - * Base offset to load data from is *skb*'s mac header. - * **BPF_HDR_START_NET** - * Base offset to load data from is *skb*'s network header. 
- * - * In general, "direct packet access" is the preferred method to - * access packet data, however, this helper is in particular useful - * in socket filters where *skb*\ **->data** does not always point - * to the start of the mac header and where "direct packet access" - * is not available. - * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags) - * Description - * Do FIB lookup in kernel tables using parameters in *params*. - * If lookup is successful and result shows packet is to be - * forwarded, the neighbor tables are searched for the nexthop. - * If successful (ie., FIB lookup shows forwarding and nexthop - * is resolved), the nexthop address is returned in ipv4_dst - * or ipv6_dst based on family, smac is set to mac address of - * egress device, dmac is set to nexthop mac address, rt_metric - * is set to metric from route (IPv4/IPv6 only), and ifindex - * is set to the device index of the nexthop from the FIB lookup. - * - * *plen* argument is the size of the passed in struct. - * *flags* argument can be a combination of one or more of the - * following values: - * - * **BPF_FIB_LOOKUP_DIRECT** - * Do a direct table lookup vs full lookup using FIB - * rules. - * **BPF_FIB_LOOKUP_OUTPUT** - * Perform lookup from an egress perspective (default is - * ingress). - * - * *ctx* is either **struct xdp_md** for XDP programs or - * **struct sk_buff** tc cls_act programs. - * Return - * * < 0 if any input argument is invalid - * * 0 on success (packet is forwarded, nexthop neighbor exists) - * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the - * packet is not forwarded or needs assist from full stack - * - * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags) - * Description - * Add an entry to, or update a sockhash *map* referencing sockets. 
- * The *skops* is used as a new value for the entry associated to - * *key*. *flags* is one of: - * - * **BPF_NOEXIST** - * The entry for *key* must not exist in the map. - * **BPF_EXIST** - * The entry for *key* must already exist in the map. - * **BPF_ANY** - * No condition on the existence of the entry for *key*. - * - * If the *map* has eBPF programs (parser and verdict), those will - * be inherited by the socket being added. If the socket is - * already attached to eBPF programs, this results in an error. - * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags) - * Description - * This helper is used in programs implementing policies at the - * socket level. If the message *msg* is allowed to pass (i.e. if - * the verdict eBPF program returns **SK_PASS**), redirect it to - * the socket referenced by *map* (of type - * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and - * egress interfaces can be used for redirection. The - * **BPF_F_INGRESS** value in *flags* is used to make the - * distinction (ingress path is selected if the flag is present, - * egress path otherwise). This is the only flag supported for now. - * Return - * **SK_PASS** on success, or **SK_DROP** on error. - * - * int bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags) - * Description - * This helper is used in programs implementing policies at the - * skb socket level. If the sk_buff *skb* is allowed to pass (i.e. - * if the verdeict eBPF program returns **SK_PASS**), redirect it - * to the socket referenced by *map* (of type - * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and - * egress interfaces can be used for redirection. The - * **BPF_F_INGRESS** value in *flags* is used to make the - * distinction (ingress path is selected if the flag is present, - * egress otherwise). This is the only flag supported for now. 
- * Return - * **SK_PASS** on success, or **SK_DROP** on error. - * - * int bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) - * Description - * Encapsulate the packet associated to *skb* within a Layer 3 - * protocol header. This header is provided in the buffer at - * address *hdr*, with *len* its size in bytes. *type* indicates - * the protocol of the header and can be one of: - * - * **BPF_LWT_ENCAP_SEG6** - * IPv6 encapsulation with Segment Routing Header - * (**struct ipv6_sr_hdr**). *hdr* only contains the SRH, - * the IPv6 header is computed by the kernel. - * **BPF_LWT_ENCAP_SEG6_INLINE** - * Only works if *skb* contains an IPv6 packet. Insert a - * Segment Routing Header (**struct ipv6_sr_hdr**) inside - * the IPv6 header. - * - * A call to this helper is susceptible to change the underlaying - * packet buffer. Therefore, at load time, all checks on pointers - * previously done by the verifier are invalidated and must be - * performed again, if the helper is used in combination with - * direct packet access. - * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len) - * Description - * Store *len* bytes from address *from* into the packet - * associated to *skb*, at *offset*. Only the flags, tag and TLVs - * inside the outermost IPv6 Segment Routing Header can be - * modified through this helper. - * - * A call to this helper is susceptible to change the underlaying - * packet buffer. Therefore, at load time, all checks on pointers - * previously done by the verifier are invalidated and must be - * performed again, if the helper is used in combination with - * direct packet access. - * Return - * 0 on success, or a negative error in case of failure. 
- * - * int bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta) - * Description - * Adjust the size allocated to TLVs in the outermost IPv6 - * Segment Routing Header contained in the packet associated to - * *skb*, at position *offset* by *delta* bytes. Only offsets - * after the segments are accepted. *delta* can be as well - * positive (growing) as negative (shrinking). - * - * A call to this helper is susceptible to change the underlaying - * packet buffer. Therefore, at load time, all checks on pointers - * previously done by the verifier are invalidated and must be - * performed again, if the helper is used in combination with - * direct packet access. - * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len) - * Description - * Apply an IPv6 Segment Routing action of type *action* to the - * packet associated to *skb*. Each action takes a parameter - * contained at address *param*, and of length *param_len* bytes. - * *action* can be one of: - * - * **SEG6_LOCAL_ACTION_END_X** - * End.X action: Endpoint with Layer-3 cross-connect. - * Type of *param*: **struct in6_addr**. - * **SEG6_LOCAL_ACTION_END_T** - * End.T action: Endpoint with specific IPv6 table lookup. - * Type of *param*: **int**. - * **SEG6_LOCAL_ACTION_END_B6** - * End.B6 action: Endpoint bound to an SRv6 policy. - * Type of param: **struct ipv6_sr_hdr**. - * **SEG6_LOCAL_ACTION_END_B6_ENCAP** - * End.B6.Encap action: Endpoint bound to an SRv6 - * encapsulation policy. - * Type of param: **struct ipv6_sr_hdr**. - * - * A call to this helper is susceptible to change the underlaying - * packet buffer. Therefore, at load time, all checks on pointers - * previously done by the verifier are invalidated and must be - * performed again, if the helper is used in combination with - * direct packet access. - * Return - * 0 on success, or a negative error in case of failure. 
- * - * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) - * Description - * This helper is used in programs implementing IR decoding, to - * report a successfully decoded key press with *scancode*, - * *toggle* value in the given *protocol*. The scancode will be - * translated to a keycode using the rc keymap, and reported as - * an input key down event. After a period a key up event is - * generated. This period can be extended by calling either - * **bpf_rc_keydown**\ () again with the same values, or calling - * **bpf_rc_repeat**\ (). - * - * Some protocols include a toggle bit, in case the button was - * released and pressed again between consecutive scancodes. - * - * The *ctx* should point to the lirc sample as passed into - * the program. - * - * The *protocol* is the decoded protocol number (see - * **enum rc_proto** for some predefined values). - * - * This helper is only available is the kernel was compiled with - * the **CONFIG_BPF_LIRC_MODE2** configuration option set to - * "**y**". - * Return - * 0 - * - * int bpf_rc_repeat(void *ctx) - * Description - * This helper is used in programs implementing IR decoding, to - * report a successfully decoded repeat key message. This delays - * the generation of a key up event for previously generated - * key down event. - * - * Some IR protocols like NEC have a special IR message for - * repeating last button, for when a button is held down. - * - * The *ctx* should point to the lirc sample as passed into - * the program. - * - * This helper is only available is the kernel was compiled with - * the **CONFIG_BPF_LIRC_MODE2** configuration option set to - * "**y**". - * Return - * 0 - * - * uint64_t bpf_skb_cgroup_id(struct sk_buff *skb) - * Description - * Return the cgroup v2 id of the socket associated with the *skb*. - * This is roughly similar to the **bpf_get_cgroup_classid**\ () - * helper for cgroup v1 by providing a tag resp. 
identifier that - * can be matched on or used for map lookups e.g. to implement - * policy. The cgroup v2 id of a given path in the hierarchy is - * exposed in user space through the f_handle API in order to get - * to the same 64-bit id. - * - * This helper can be used on TC egress path, but not on ingress, - * and is available only if the kernel was compiled with the - * **CONFIG_SOCK_CGROUP_DATA** configuration option. - * Return - * The id is returned or 0 in case the id could not be retrieved. - * - * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level) - * Description - * Return id of cgroup v2 that is ancestor of cgroup associated - * with the *skb* at the *ancestor_level*. The root cgroup is at - * *ancestor_level* zero and each step down the hierarchy - * increments the level. If *ancestor_level* == level of cgroup - * associated with *skb*, then return value will be same as that - * of **bpf_skb_cgroup_id**\ (). - * - * The helper is useful to implement policies based on cgroups - * that are upper in hierarchy than immediate cgroup associated - * with *skb*. - * - * The format of returned id and helper limitations are same as in - * **bpf_skb_cgroup_id**\ (). - * Return - * The id is returned or 0 in case the id could not be retrieved. - * - * u64 bpf_get_current_cgroup_id(void) - * Return - * A 64-bit integer containing the current cgroup id based - * on the cgroup within which the current task is running. - * - * void* get_local_storage(void *map, u64 flags) - * Description - * Get the pointer to the local storage area. - * The type and the size of the local storage is defined - * by the *map* argument. - * The *flags* meaning is specific for each map type, - * and has to be 0 for cgroup local storage. - * - * Depending on the BPF program type, a local storage area - * can be shared between multiple instances of the BPF program, - * running simultaneously. - * - * A user should care about the synchronization by himself. 
- * For example, by using the **BPF_STX_XADD** instruction to alter - * the shared data. - * Return - * A pointer to the local storage area. - * - * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) - * Description - * Select a **SO_REUSEPORT** socket from a - * **BPF_MAP_TYPE_REUSEPORT_ARRAY** *map*. - * It checks the selected socket is matching the incoming - * request in the socket buffer. - * Return - * 0 on success, or a negative error in case of failure. - * - * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) - * Description - * Look for TCP socket matching *tuple*, optionally in a child - * network namespace *netns*. The return value must be checked, - * and if non-**NULL**, released via **bpf_sk_release**\ (). - * - * The *ctx* should point to the context of the program, such as - * the skb or socket (depending on the hook in use). This is used - * to determine the base network namespace for the lookup. - * - * *tuple_size* must be one of: - * - * **sizeof**\ (*tuple*\ **->ipv4**) - * Look for an IPv4 socket. - * **sizeof**\ (*tuple*\ **->ipv6**) - * Look for an IPv6 socket. - * - * If the *netns* is a negative signed 32-bit integer, then the - * socket lookup table in the netns associated with the *ctx* will - * will be used. For the TC hooks, this is the netns of the device - * in the skb. For socket hooks, this is the netns of the socket. - * If *netns* is any other signed 32-bit value greater than or - * equal to zero then it specifies the ID of the netns relative to - * the netns associated with the *ctx*. *netns* values beyond the - * range of 32-bit integers are reserved for future use. - * - * All values for *flags* are reserved for future usage, and must - * be left at zero. - * - * This helper is available only if the kernel was compiled with - * **CONFIG_NET** configuration option. 
- * Return - * Pointer to **struct bpf_sock**, or **NULL** in case of failure. - * For sockets with reuseport option, the **struct bpf_sock** - * result is from **reuse->socks**\ [] using the hash of the tuple. - * - * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) - * Description - * Look for UDP socket matching *tuple*, optionally in a child - * network namespace *netns*. The return value must be checked, - * and if non-**NULL**, released via **bpf_sk_release**\ (). - * - * The *ctx* should point to the context of the program, such as - * the skb or socket (depending on the hook in use). This is used - * to determine the base network namespace for the lookup. - * - * *tuple_size* must be one of: - * - * **sizeof**\ (*tuple*\ **->ipv4**) - * Look for an IPv4 socket. - * **sizeof**\ (*tuple*\ **->ipv6**) - * Look for an IPv6 socket. - * - * If the *netns* is a negative signed 32-bit integer, then the - * socket lookup table in the netns associated with the *ctx* will - * will be used. For the TC hooks, this is the netns of the device - * in the skb. For socket hooks, this is the netns of the socket. - * If *netns* is any other signed 32-bit value greater than or - * equal to zero then it specifies the ID of the netns relative to - * the netns associated with the *ctx*. *netns* values beyond the - * range of 32-bit integers are reserved for future use. - * - * All values for *flags* are reserved for future usage, and must - * be left at zero. - * - * This helper is available only if the kernel was compiled with - * **CONFIG_NET** configuration option. - * Return - * Pointer to **struct bpf_sock**, or **NULL** in case of failure. - * For sockets with reuseport option, the **struct bpf_sock** - * result is from **reuse->socks**\ [] using the hash of the tuple. - * - * int bpf_sk_release(struct bpf_sock *sock) - * Description - * Release the reference held by *sock*. 
*sock* must be a - * non-**NULL** pointer that was returned from - * **bpf_sk_lookup_xxx**\ (). - * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_map_pop_elem(struct bpf_map *map, void *value) - * Description - * Pop an element from *map*. - * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_map_peek_elem(struct bpf_map *map, void *value) - * Description - * Get an element from *map* without removing it. - * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_msg_push_data(struct sk_buff *skb, u32 start, u32 len, u64 flags) - * Description - * For socket policies, insert *len* bytes into *msg* at offset - * *start*. - * - * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a - * *msg* it may want to insert metadata or options into the *msg*. - * This can later be read and used by any of the lower layer BPF - * hooks. - * - * This helper may fail if under memory pressure (a malloc - * fails) in these cases BPF programs will get an appropriate - * error and BPF programs will need to handle them. - * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 pop, u64 flags) - * Description - * Will remove *pop* bytes from a *msg* starting at byte *start*. - * This may result in **ENOMEM** errors under certain situations if - * an allocation and copy are required due to a full ring buffer. - * However, the helper will try to avoid doing the allocation - * if possible. Other errors can occur if input parameters are - * invalid either due to *start* byte not being valid part of *msg* - * payload and/or *pop* value being to large. - * Return - * 0 on success, or a negative error in case of failure. - * - * int bpf_rc_pointer_rel(void *ctx, s32 rel_x, s32 rel_y) - * Description - * This helper is used in programs implementing IR decoding, to - * report a successfully decoded pointer movement. 
- * - * The *ctx* should point to the lirc sample as passed into - * the program. - * - * This helper is only available is the kernel was compiled with - * the **CONFIG_BPF_LIRC_MODE2** configuration option set to - * "**y**". - * Return - * 0 - */ -#define __BPF_FUNC_MAPPER(FN) \ - FN(unspec), \ - FN(map_lookup_elem), \ - FN(map_update_elem), \ - FN(map_delete_elem), \ - FN(probe_read), \ - FN(ktime_get_ns), \ - FN(trace_printk), \ - FN(get_prandom_u32), \ - FN(get_smp_processor_id), \ - FN(skb_store_bytes), \ - FN(l3_csum_replace), \ - FN(l4_csum_replace), \ - FN(tail_call), \ - FN(clone_redirect), \ - FN(get_current_pid_tgid), \ - FN(get_current_uid_gid), \ - FN(get_current_comm), \ - FN(get_cgroup_classid), \ - FN(skb_vlan_push), \ - FN(skb_vlan_pop), \ - FN(skb_get_tunnel_key), \ - FN(skb_set_tunnel_key), \ - FN(perf_event_read), \ - FN(redirect), \ - FN(get_route_realm), \ - FN(perf_event_output), \ - FN(skb_load_bytes), \ - FN(get_stackid), \ - FN(csum_diff), \ - FN(skb_get_tunnel_opt), \ - FN(skb_set_tunnel_opt), \ - FN(skb_change_proto), \ - FN(skb_change_type), \ - FN(skb_under_cgroup), \ - FN(get_hash_recalc), \ - FN(get_current_task), \ - FN(probe_write_user), \ - FN(current_task_under_cgroup), \ - FN(skb_change_tail), \ - FN(skb_pull_data), \ - FN(csum_update), \ - FN(set_hash_invalid), \ - FN(get_numa_node_id), \ - FN(skb_change_head), \ - FN(xdp_adjust_head), \ - FN(probe_read_str), \ - FN(get_socket_cookie), \ - FN(get_socket_uid), \ - FN(set_hash), \ - FN(setsockopt), \ - FN(skb_adjust_room), \ - FN(redirect_map), \ - FN(sk_redirect_map), \ - FN(sock_map_update), \ - FN(xdp_adjust_meta), \ - FN(perf_event_read_value), \ - FN(perf_prog_read_value), \ - FN(getsockopt), \ - FN(override_return), \ - FN(sock_ops_cb_flags_set), \ - FN(msg_redirect_map), \ - FN(msg_apply_bytes), \ - FN(msg_cork_bytes), \ - FN(msg_pull_data), \ - FN(bind), \ - FN(xdp_adjust_tail), \ - FN(skb_get_xfrm_state), \ - FN(get_stack), \ - FN(skb_load_bytes_relative), \ - 
FN(fib_lookup), \ - FN(sock_hash_update), \ - FN(msg_redirect_hash), \ - FN(sk_redirect_hash), \ - FN(lwt_push_encap), \ - FN(lwt_seg6_store_bytes), \ - FN(lwt_seg6_adjust_srh), \ - FN(lwt_seg6_action), \ - FN(rc_repeat), \ - FN(rc_keydown), \ - FN(skb_cgroup_id), \ - FN(get_current_cgroup_id), \ - FN(get_local_storage), \ - FN(sk_select_reuseport), \ - FN(skb_ancestor_cgroup_id), \ - FN(sk_lookup_tcp), \ - FN(sk_lookup_udp), \ - FN(sk_release), \ - FN(map_push_elem), \ - FN(map_pop_elem), \ - FN(map_peek_elem), \ - FN(msg_push_data), \ - FN(msg_pop_data), \ - FN(rc_pointer_rel), - -/* integer value in 'imm' field of BPF_CALL instruction selects which helper - * function eBPF program intends to call - */ -#define __BPF_ENUM_FN(x) BPF_FUNC_ ## x -enum bpf_func_id { - __BPF_FUNC_MAPPER(__BPF_ENUM_FN) - __BPF_FUNC_MAX_ID, -}; -#undef __BPF_ENUM_FN - -/* All flags used by eBPF helper functions, placed here. */ - -/* BPF_FUNC_skb_store_bytes flags. */ -#define BPF_F_RECOMPUTE_CSUM (1ULL << 0) -#define BPF_F_INVALIDATE_HASH (1ULL << 1) - -/* BPF_FUNC_l3_csum_replace and BPF_FUNC_l4_csum_replace flags. - * First 4 bits are for passing the header field size. - */ -#define BPF_F_HDR_FIELD_MASK 0xfULL - -/* BPF_FUNC_l4_csum_replace flags. */ -#define BPF_F_PSEUDO_HDR (1ULL << 4) -#define BPF_F_MARK_MANGLED_0 (1ULL << 5) -#define BPF_F_MARK_ENFORCE (1ULL << 6) - -/* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */ -#define BPF_F_INGRESS (1ULL << 0) - -/* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ -#define BPF_F_TUNINFO_IPV6 (1ULL << 0) - -/* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */ -#define BPF_F_SKIP_FIELD_MASK 0xffULL -#define BPF_F_USER_STACK (1ULL << 8) -/* flags used by BPF_FUNC_get_stackid only. */ -#define BPF_F_FAST_STACK_CMP (1ULL << 9) -#define BPF_F_REUSE_STACKID (1ULL << 10) -/* flags used by BPF_FUNC_get_stack only. */ -#define BPF_F_USER_BUILD_ID (1ULL << 11) - -/* BPF_FUNC_skb_set_tunnel_key flags. 
*/ -#define BPF_F_ZERO_CSUM_TX (1ULL << 1) -#define BPF_F_DONT_FRAGMENT (1ULL << 2) -#define BPF_F_SEQ_NUMBER (1ULL << 3) - -/* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and - * BPF_FUNC_perf_event_read_value flags. - */ -#define BPF_F_INDEX_MASK 0xffffffffULL -#define BPF_F_CURRENT_CPU BPF_F_INDEX_MASK -/* BPF_FUNC_perf_event_output for sk_buff input context. */ -#define BPF_F_CTXLEN_MASK (0xfffffULL << 32) - -/* Current network namespace */ -#define BPF_F_CURRENT_NETNS (-1L) - -/* Mode for BPF_FUNC_skb_adjust_room helper. */ -enum bpf_adj_room_mode { - BPF_ADJ_ROOM_NET, -}; - -/* Mode for BPF_FUNC_skb_load_bytes_relative helper. */ -enum bpf_hdr_start_off { - BPF_HDR_START_MAC, - BPF_HDR_START_NET, -}; - -/* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */ -enum bpf_lwt_encap_mode { - BPF_LWT_ENCAP_SEG6, - BPF_LWT_ENCAP_SEG6_INLINE -}; - -#define __bpf_md_ptr(type, name) \ -union { \ - type name; \ - __u64 :64; \ -} __attribute__((aligned(8))) - -/* user accessible mirror of in-kernel sk_buff. - * new fields can only be added to the end of this structure - */ -struct __sk_buff { - __u32 len; - __u32 pkt_type; - __u32 mark; - __u32 queue_mapping; - __u32 protocol; - __u32 vlan_present; - __u32 vlan_tci; - __u32 vlan_proto; - __u32 priority; - __u32 ingress_ifindex; - __u32 ifindex; - __u32 tc_index; - __u32 cb[5]; - __u32 hash; - __u32 tc_classid; - __u32 data; - __u32 data_end; - __u32 napi_id; - - /* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... */ - __u32 family; - __u32 remote_ip4; /* Stored in network byte order */ - __u32 local_ip4; /* Stored in network byte order */ - __u32 remote_ip6[4]; /* Stored in network byte order */ - __u32 local_ip6[4]; /* Stored in network byte order */ - __u32 remote_port; /* Stored in network byte order */ - __u32 local_port; /* stored in host byte order */ - /* ... here. 
*/ - - __u32 data_meta; - __bpf_md_ptr(struct bpf_flow_keys *, flow_keys); - __u64 tstamp; - __u32 wire_len; -}; - -struct bpf_tunnel_key { - __u32 tunnel_id; - union { - __u32 remote_ipv4; - __u32 remote_ipv6[4]; - }; - __u8 tunnel_tos; - __u8 tunnel_ttl; - __u16 tunnel_ext; /* Padding, future use. */ - __u32 tunnel_label; -}; - -/* user accessible mirror of in-kernel xfrm_state. - * new fields can only be added to the end of this structure - */ -struct bpf_xfrm_state { - __u32 reqid; - __u32 spi; /* Stored in network byte order */ - __u16 family; - __u16 ext; /* Padding, future use. */ - union { - __u32 remote_ipv4; /* Stored in network byte order */ - __u32 remote_ipv6[4]; /* Stored in network byte order */ - }; -}; - -/* Generic BPF return codes which all BPF program types may support. - * The values are binary compatible with their TC_ACT_* counter-part to - * provide backwards compatibility with existing SCHED_CLS and SCHED_ACT - * programs. - * - * XDP is handled seprately, see XDP_*. - */ -enum bpf_ret_code { - BPF_OK = 0, - /* 1 reserved */ - BPF_DROP = 2, - /* 3-6 reserved */ - BPF_REDIRECT = 7, - /* >127 are reserved for prog type specific return codes */ -}; - -struct bpf_sock { - __u32 bound_dev_if; - __u32 family; - __u32 type; - __u32 protocol; - __u32 mark; - __u32 priority; - __u32 src_ip4; /* Allows 1,2,4-byte read. - * Stored in network byte order. - */ - __u32 src_ip6[4]; /* Allows 1,2,4-byte read. - * Stored in network byte order. - */ - __u32 src_port; /* Allows 4-byte read. - * Stored in host byte order - */ -}; - -struct bpf_sock_tuple { - union { - struct { - __be32 saddr; - __be32 daddr; - __be16 sport; - __be16 dport; - } ipv4; - struct { - __be32 saddr[4]; - __be32 daddr[4]; - __be16 sport; - __be16 dport; - } ipv6; - }; -}; - -#define XDP_PACKET_HEADROOM 256 - -/* User return codes for XDP prog type. - * A valid XDP program must return one of these defined values. All other - * return codes are reserved for future use. 
Unknown return codes will - * result in packet drops and a warning via bpf_warn_invalid_xdp_action(). - */ -enum xdp_action { - XDP_ABORTED = 0, - XDP_DROP, - XDP_PASS, - XDP_TX, - XDP_REDIRECT, -}; - -/* user accessible metadata for XDP packet hook - * new fields must be added to the end of this structure - */ -struct xdp_md { - __u32 data; - __u32 data_end; - __u32 data_meta; - /* Below access go through struct xdp_rxq_info */ - __u32 ingress_ifindex; /* rxq->dev->ifindex */ - __u32 rx_queue_index; /* rxq->queue_index */ -}; - -enum sk_action { - SK_DROP = 0, - SK_PASS, -}; - -/* user accessible metadata for SK_MSG packet hook, new fields must - * be added to the end of this structure - */ -struct sk_msg_md { - __bpf_md_ptr(void *, data); - __bpf_md_ptr(void *, data_end); - - __u32 family; - __u32 remote_ip4; /* Stored in network byte order */ - __u32 local_ip4; /* Stored in network byte order */ - __u32 remote_ip6[4]; /* Stored in network byte order */ - __u32 local_ip6[4]; /* Stored in network byte order */ - __u32 remote_port; /* Stored in network byte order */ - __u32 local_port; /* stored in host byte order */ - __u32 size; /* Total size of sk_msg */ -}; - -struct sk_reuseport_md { - /* - * Start of directly accessible data. It begins from - * the tcp/udp header. - */ - __bpf_md_ptr(void *, data); - /* End of directly accessible data */ - __bpf_md_ptr(void *, data_end); - /* - * Total length of packet (starting from the tcp/udp header). - * Note that the directly accessible bytes (data_end - data) - * could be less than this "len". Those bytes could be - * indirectly read by a helper "bpf_skb_load_bytes()". - */ - __u32 len; - /* - * Eth protocol in the mac header (network byte order). e.g. - * ETH_P_IP(0x0800) and ETH_P_IPV6(0x86DD) - */ - __u32 eth_protocol; - __u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */ - __u32 bind_inany; /* Is sock bound to an INANY address? 
*/ - __u32 hash; /* A hash of the packet 4 tuples */ -}; - -#define BPF_TAG_SIZE 8 - -struct bpf_prog_info { - __u32 type; - __u32 id; - __u8 tag[BPF_TAG_SIZE]; - __u32 jited_prog_len; - __u32 xlated_prog_len; - __aligned_u64 jited_prog_insns; - __aligned_u64 xlated_prog_insns; - __u64 load_time; /* ns since boottime */ - __u32 created_by_uid; - __u32 nr_map_ids; - __aligned_u64 map_ids; - char name[BPF_OBJ_NAME_LEN]; - __u32 ifindex; - __u32 gpl_compatible:1; - __u64 netns_dev; - __u64 netns_ino; - __u32 nr_jited_ksyms; - __u32 nr_jited_func_lens; - __aligned_u64 jited_ksyms; - __aligned_u64 jited_func_lens; - __u32 btf_id; - __u32 func_info_rec_size; - __aligned_u64 func_info; - __u32 nr_func_info; - __u32 nr_line_info; - __aligned_u64 line_info; - __aligned_u64 jited_line_info; - __u32 nr_jited_line_info; - __u32 line_info_rec_size; - __u32 jited_line_info_rec_size; - __u32 nr_prog_tags; - __aligned_u64 prog_tags; -} __attribute__((aligned(8))); - -struct bpf_map_info { - __u32 type; - __u32 id; - __u32 key_size; - __u32 value_size; - __u32 max_entries; - __u32 map_flags; - char name[BPF_OBJ_NAME_LEN]; - __u32 ifindex; - __u32 :32; - __u64 netns_dev; - __u64 netns_ino; - __u32 btf_id; - __u32 btf_key_type_id; - __u32 btf_value_type_id; -} __attribute__((aligned(8))); - -struct bpf_btf_info { - __aligned_u64 btf; - __u32 btf_size; - __u32 id; -} __attribute__((aligned(8))); - -/* User bpf_sock_addr struct to access socket fields and sockaddr struct passed - * by user and intended to be used by socket (e.g. to bind to, depends on - * attach attach type). - */ -struct bpf_sock_addr { - __u32 user_family; /* Allows 4-byte read, but no write. */ - __u32 user_ip4; /* Allows 1,2,4-byte read and 4-byte write. - * Stored in network byte order. - */ - __u32 user_ip6[4]; /* Allows 1,2,4-byte read an 4-byte write. - * Stored in network byte order. - */ - __u32 user_port; /* Allows 4-byte read and write. 
- * Stored in network byte order - */ - __u32 family; /* Allows 4-byte read, but no write */ - __u32 type; /* Allows 4-byte read, but no write */ - __u32 protocol; /* Allows 4-byte read, but no write */ - __u32 msg_src_ip4; /* Allows 1,2,4-byte read an 4-byte write. - * Stored in network byte order. - */ - __u32 msg_src_ip6[4]; /* Allows 1,2,4-byte read an 4-byte write. - * Stored in network byte order. - */ -}; - -/* User bpf_sock_ops struct to access socket values and specify request ops - * and their replies. - * Some of this fields are in network (bigendian) byte order and may need - * to be converted before use (bpf_ntohl() defined in samples/bpf/bpf_endian.h). - * New fields can only be added at the end of this structure - */ -struct bpf_sock_ops { - __u32 op; - union { - __u32 args[4]; /* Optionally passed to bpf program */ - __u32 reply; /* Returned by bpf program */ - __u32 replylong[4]; /* Optionally returned by bpf prog */ - }; - __u32 family; - __u32 remote_ip4; /* Stored in network byte order */ - __u32 local_ip4; /* Stored in network byte order */ - __u32 remote_ip6[4]; /* Stored in network byte order */ - __u32 local_ip6[4]; /* Stored in network byte order */ - __u32 remote_port; /* Stored in network byte order */ - __u32 local_port; /* stored in host byte order */ - __u32 is_fullsock; /* Some TCP fields are only valid if - * there is a full socket. If not, the - * fields read as zero. 
- */ - __u32 snd_cwnd; - __u32 srtt_us; /* Averaged RTT << 3 in usecs */ - __u32 bpf_sock_ops_cb_flags; /* flags defined in uapi/linux/tcp.h */ - __u32 state; - __u32 rtt_min; - __u32 snd_ssthresh; - __u32 rcv_nxt; - __u32 snd_nxt; - __u32 snd_una; - __u32 mss_cache; - __u32 ecn_flags; - __u32 rate_delivered; - __u32 rate_interval_us; - __u32 packets_out; - __u32 retrans_out; - __u32 total_retrans; - __u32 segs_in; - __u32 data_segs_in; - __u32 segs_out; - __u32 data_segs_out; - __u32 lost_out; - __u32 sacked_out; - __u32 sk_txhash; - __u64 bytes_received; - __u64 bytes_acked; -}; - -/* Definitions for bpf_sock_ops_cb_flags */ -#define BPF_SOCK_OPS_RTO_CB_FLAG (1<<0) -#define BPF_SOCK_OPS_RETRANS_CB_FLAG (1<<1) -#define BPF_SOCK_OPS_STATE_CB_FLAG (1<<2) -#define BPF_SOCK_OPS_ALL_CB_FLAGS 0x7 /* Mask of all currently - * supported cb flags - */ - -/* List of known BPF sock_ops operators. - * New entries can only be added at the end - */ -enum { - BPF_SOCK_OPS_VOID, - BPF_SOCK_OPS_TIMEOUT_INIT, /* Should return SYN-RTO value to use or - * -1 if default value should be used - */ - BPF_SOCK_OPS_RWND_INIT, /* Should return initial advertized - * window (in packets) or -1 if default - * value should be used - */ - BPF_SOCK_OPS_TCP_CONNECT_CB, /* Calls BPF program right before an - * active connection is initialized - */ - BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, /* Calls BPF program when an - * active connection is - * established - */ - BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, /* Calls BPF program when a - * passive connection is - * established - */ - BPF_SOCK_OPS_NEEDS_ECN, /* If connection's congestion control - * needs ECN - */ - BPF_SOCK_OPS_BASE_RTT, /* Get base RTT. The correct value is - * based on the path and may be - * dependent on the congestion control - * algorithm. In general it indicates - * a congestion threshold. RTTs above - * this indicate congestion - */ - BPF_SOCK_OPS_RTO_CB, /* Called when an RTO has triggered. 
- * Arg1: value of icsk_retransmits - * Arg2: value of icsk_rto - * Arg3: whether RTO has expired - */ - BPF_SOCK_OPS_RETRANS_CB, /* Called when skb is retransmitted. - * Arg1: sequence number of 1st byte - * Arg2: # segments - * Arg3: return value of - * tcp_transmit_skb (0 => success) - */ - BPF_SOCK_OPS_STATE_CB, /* Called when TCP changes state. - * Arg1: old_state - * Arg2: new_state - */ - BPF_SOCK_OPS_TCP_LISTEN_CB, /* Called on listen(2), right after - * socket transition to LISTEN state. - */ -}; - -/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect - * changes between the TCP and BPF versions. Ideally this should never happen. - * If it does, we need to add code to convert them before calling - * the BPF sock_ops function. - */ -enum { - BPF_TCP_ESTABLISHED = 1, - BPF_TCP_SYN_SENT, - BPF_TCP_SYN_RECV, - BPF_TCP_FIN_WAIT1, - BPF_TCP_FIN_WAIT2, - BPF_TCP_TIME_WAIT, - BPF_TCP_CLOSE, - BPF_TCP_CLOSE_WAIT, - BPF_TCP_LAST_ACK, - BPF_TCP_LISTEN, - BPF_TCP_CLOSING, /* Now a valid state */ - BPF_TCP_NEW_SYN_RECV, - - BPF_TCP_MAX_STATES /* Leave at the end! 
*/ -}; - -#define TCP_BPF_IW 1001 /* Set TCP initial congestion window */ -#define TCP_BPF_SNDCWND_CLAMP 1002 /* Set sndcwnd_clamp */ - -struct bpf_perf_event_value { - __u64 counter; - __u64 enabled; - __u64 running; -}; - -#define BPF_DEVCG_ACC_MKNOD (1ULL << 0) -#define BPF_DEVCG_ACC_READ (1ULL << 1) -#define BPF_DEVCG_ACC_WRITE (1ULL << 2) - -#define BPF_DEVCG_DEV_BLOCK (1ULL << 0) -#define BPF_DEVCG_DEV_CHAR (1ULL << 1) - -struct bpf_cgroup_dev_ctx { - /* access_type encoded as (BPF_DEVCG_ACC_* << 16) | BPF_DEVCG_DEV_* */ - __u32 access_type; - __u32 major; - __u32 minor; -}; - -struct bpf_raw_tracepoint_args { - __u64 args[0]; -}; - -/* DIRECT: Skip the FIB rules and go to FIB table associated with device - * OUTPUT: Do lookup from egress perspective; default is ingress - */ -#define BPF_FIB_LOOKUP_DIRECT BIT(0) -#define BPF_FIB_LOOKUP_OUTPUT BIT(1) - -enum { - BPF_FIB_LKUP_RET_SUCCESS, /* lookup successful */ - BPF_FIB_LKUP_RET_BLACKHOLE, /* dest is blackholed; can be dropped */ - BPF_FIB_LKUP_RET_UNREACHABLE, /* dest is unreachable; can be dropped */ - BPF_FIB_LKUP_RET_PROHIBIT, /* dest not allowed; can be dropped */ - BPF_FIB_LKUP_RET_NOT_FWDED, /* packet is not forwarded */ - BPF_FIB_LKUP_RET_FWD_DISABLED, /* fwding is not enabled on ingress */ - BPF_FIB_LKUP_RET_UNSUPP_LWT, /* fwd requires encapsulation */ - BPF_FIB_LKUP_RET_NO_NEIGH, /* no neighbor entry for nh */ - BPF_FIB_LKUP_RET_FRAG_NEEDED, /* fragmentation required to fwd */ -}; - -struct bpf_fib_lookup { - /* input: network family for lookup (AF_INET, AF_INET6) - * output: network family of egress nexthop - */ - __u8 family; - - /* set if lookup is to consider L4 data - e.g., FIB rules */ - __u8 l4_protocol; - __be16 sport; - __be16 dport; - - /* total length of packet from network header - used for MTU check */ - __u16 tot_len; - - /* input: L3 device index for lookup - * output: device index from FIB lookup - */ - __u32 ifindex; - - union { - /* inputs to lookup */ - __u8 tos; /* AF_INET */ - 
__be32 flowinfo; /* AF_INET6, flow_label + priority */ - - /* output: metric of fib result (IPv4/IPv6 only) */ - __u32 rt_metric; - }; - - union { - __be32 ipv4_src; - __u32 ipv6_src[4]; /* in6_addr; network order */ - }; - - /* input to bpf_fib_lookup, ipv{4,6}_dst is destination address in - * network header. output: bpf_fib_lookup sets to gateway address - * if FIB lookup returns gateway route - */ - union { - __be32 ipv4_dst; - __u32 ipv6_dst[4]; /* in6_addr; network order */ - }; - - /* output */ - __be16 h_vlan_proto; - __be16 h_vlan_TCI; - __u8 smac[6]; /* ETH_ALEN */ - __u8 dmac[6]; /* ETH_ALEN */ -}; - -enum bpf_task_fd_type { - BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ - BPF_FD_TYPE_TRACEPOINT, /* tp name */ - BPF_FD_TYPE_KPROBE, /* (symbol + offset) or addr */ - BPF_FD_TYPE_KRETPROBE, /* (symbol + offset) or addr */ - BPF_FD_TYPE_UPROBE, /* filename + offset */ - BPF_FD_TYPE_URETPROBE, /* filename + offset */ -}; - -struct bpf_flow_keys { - __u16 nhoff; - __u16 thoff; - __u16 addr_proto; /* ETH_P_* of valid addrs */ - __u8 is_frag; - __u8 is_first_frag; - __u8 is_encap; - __u8 ip_proto; - __be16 n_proto; - __be16 sport; - __be16 dport; - union { - struct { - __be32 ipv4_src; - __be32 ipv4_dst; - }; - struct { - __u32 ipv6_src[4]; /* in6_addr; network order */ - __u32 ipv6_dst[4]; /* in6_addr; network order */ - }; - }; -}; - -struct bpf_func_info { - __u32 insn_off; - __u32 type_id; -}; - -#define BPF_LINE_INFO_LINE_NUM(line_col) ((line_col) >> 10) -#define BPF_LINE_INFO_LINE_COL(line_col) ((line_col) & 0x3ff) - -struct bpf_line_info { - __u32 insn_off; - __u32 file_name_off; - __u32 line_off; - __u32 line_col; -}; - -#endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/src/cc/compat/linux/bpf_common.h b/src/cc/compat/linux/bpf_common.h deleted file mode 100644 index ee97668bd..000000000 --- a/src/cc/compat/linux/bpf_common.h +++ /dev/null @@ -1,57 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _UAPI__LINUX_BPF_COMMON_H__ 
-#define _UAPI__LINUX_BPF_COMMON_H__ - -/* Instruction classes */ -#define BPF_CLASS(code) ((code) & 0x07) -#define BPF_LD 0x00 -#define BPF_LDX 0x01 -#define BPF_ST 0x02 -#define BPF_STX 0x03 -#define BPF_ALU 0x04 -#define BPF_JMP 0x05 -#define BPF_RET 0x06 -#define BPF_MISC 0x07 - -/* ld/ldx fields */ -#define BPF_SIZE(code) ((code) & 0x18) -#define BPF_W 0x00 /* 32-bit */ -#define BPF_H 0x08 /* 16-bit */ -#define BPF_B 0x10 /* 8-bit */ -/* eBPF BPF_DW 0x18 64-bit */ -#define BPF_MODE(code) ((code) & 0xe0) -#define BPF_IMM 0x00 -#define BPF_ABS 0x20 -#define BPF_IND 0x40 -#define BPF_MEM 0x60 -#define BPF_LEN 0x80 -#define BPF_MSH 0xa0 - -/* alu/jmp fields */ -#define BPF_OP(code) ((code) & 0xf0) -#define BPF_ADD 0x00 -#define BPF_SUB 0x10 -#define BPF_MUL 0x20 -#define BPF_DIV 0x30 -#define BPF_OR 0x40 -#define BPF_AND 0x50 -#define BPF_LSH 0x60 -#define BPF_RSH 0x70 -#define BPF_NEG 0x80 -#define BPF_MOD 0x90 -#define BPF_XOR 0xa0 - -#define BPF_JA 0x00 -#define BPF_JEQ 0x10 -#define BPF_JGT 0x20 -#define BPF_JGE 0x30 -#define BPF_JSET 0x40 -#define BPF_SRC(code) ((code) & 0x08) -#define BPF_K 0x00 -#define BPF_X 0x08 - -#ifndef BPF_MAXINSNS -#define BPF_MAXINSNS 4096 -#endif - -#endif /* _UAPI__LINUX_BPF_COMMON_H__ */ diff --git a/src/cc/libbpf.h b/src/cc/libbpf.h index 2728b29d8..e59970f50 100644 --- a/src/cc/libbpf.h +++ b/src/cc/libbpf.h @@ -18,7 +18,7 @@ #ifndef LIBBPF_H #define LIBBPF_H -#include "compat/linux/bpf.h" +#include "linux/bpf.h" #include #include diff --git a/src/cc/shared_table.cc b/src/cc/shared_table.cc index 29744a5d8..c76a97f5e 100644 --- a/src/cc/shared_table.cc +++ b/src/cc/shared_table.cc @@ -18,7 +18,7 @@ #include #include "common.h" -#include "compat/linux/bpf.h" +#include "linux/bpf.h" #include "table_storage.h" #include "table_storage_impl.h" diff --git a/tests/cc/CMakeLists.txt b/tests/cc/CMakeLists.txt index d28060b59..a47abe145 100644 --- a/tests/cc/CMakeLists.txt +++ b/tests/cc/CMakeLists.txt @@ -3,6 +3,7 @@ 
include_directories(${CMAKE_SOURCE_DIR}/src/cc) include_directories(${CMAKE_SOURCE_DIR}/src/cc/api) +include_directories(${CMAKE_SOURCE_DIR}/src/cc/libbpf/include/uapi) add_executable(test_static test_static.c) target_link_libraries(test_static bcc-static) From e60438db7d81dfa9443a70a252ee3a0442e6c1a5 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 16 Jan 2019 17:21:21 -0800 Subject: [PATCH 013/135] include libbpf/src/*.c files in the build system The libbpf/src/*.c files are included in the build system, so those functions will be available for bcc internals to use them. There are two name conflicts, bpf_create_map and bpf_prog_load, between src/cc/libbpf.c and src/cc/libbpf/src/{bpf.c,libbpf.c}. To keep src/cc/libbpf intact, the following renaming happened in bcc repo: bpf_create_map => bcc_create_map bpf_prog_load => bcc_prog_load Signed-off-by: Yonghong Song --- cmake/FindCompilerFlag.cmake | 13 +++++ examples/cpp/UseExternalMap.cc | 8 +-- src/cc/CMakeLists.txt | 10 +++- src/cc/api/BPF.cc | 2 +- src/cc/frontends/b/codegen_llvm.cc | 2 +- src/cc/frontends/clang/b_frontend_action.cc | 2 +- src/cc/libbpf.c | 62 ++------------------- src/cc/libbpf.h | 4 +- src/lua/bcc/bpf.lua | 2 +- src/lua/bcc/libbcc.lua | 4 +- src/lua/bpf/bpf.lua | 6 +- src/python/bcc/__init__.py | 2 +- src/python/bcc/libbcc.py | 4 +- 13 files changed, 43 insertions(+), 78 deletions(-) diff --git a/cmake/FindCompilerFlag.cmake b/cmake/FindCompilerFlag.cmake index 31ac82d1d..04256a197 100644 --- a/cmake/FindCompilerFlag.cmake +++ b/cmake/FindCompilerFlag.cmake @@ -15,3 +15,16 @@ else() endif() set(CMAKE_REQUIRED_FLAGS "${_backup_c_flags}") endif() + +# check whether reallocarray availability +# this is used to satisfy reallocarray usage under src/cc/libbpf/ +CHECK_CXX_SOURCE_COMPILES( +" +#define _GNU_SOURCE +#include + +int main(void) +{ + return !!reallocarray(NULL, 1, 1); +} +" HAVE_REALLOCARRAY_SUPPORT) diff --git a/examples/cpp/UseExternalMap.cc b/examples/cpp/UseExternalMap.cc index 
3d4d759dc..d0cf445c6 100644 --- a/examples/cpp/UseExternalMap.cc +++ b/examples/cpp/UseExternalMap.cc @@ -2,7 +2,7 @@ * UseExternalMap shows how to access an external map through * C++ interface. The external map could be a pinned map. * This example simulates the pinned map through a locally - * created map by calling libbpf bpf_create_map. + * created map by calling libbpf bcc_create_map. * * Copyright (c) Facebook, Inc. * Licensed under the Apache License, Version 2.0 (the "License") @@ -79,10 +79,10 @@ int main() { int ctrl_map_fd; uint32_t val; - // create a map through bpf_create_map, bcc knows nothing about this map. - ctrl_map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, "control", sizeof(uint32_t), + // create a map through bcc_create_map, bcc knows nothing about this map. + ctrl_map_fd = bcc_create_map(BPF_MAP_TYPE_ARRAY, "control", sizeof(uint32_t), sizeof(uint32_t), 1, 0); - CHECK(ctrl_map_fd < 0, "bpf_create_map failure"); + CHECK(ctrl_map_fd < 0, "bcc_create_map failure"); // populate control map into TableStorage std::unique_ptr local_ts = diff --git a/src/cc/CMakeLists.txt b/src/cc/CMakeLists.txt index 8ad765b3d..3593e7e17 100644 --- a/src/cc/CMakeLists.txt +++ b/src/cc/CMakeLists.txt @@ -10,6 +10,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/frontends/clang) include_directories(${LLVM_INCLUDE_DIRS}) include_directories(${LIBELF_INCLUDE_DIRS}) # todo: if check for kernel version +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/libbpf/include) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/libbpf/include/uapi) add_definitions(${LLVM_DEFINITIONS}) configure_file(libbcc.pc.in ${CMAKE_CURRENT_BINARY_DIR}/libbcc.pc @ONLY) @@ -18,14 +19,19 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -DBCC_PROG_TAG_DIR='\"${BCC_PROG_T set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-result") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC -Wno-unused-result") +if (NOT HAVE_REALLOCARRAY_SUPPORT) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DCOMPAT_NEED_REALLOCARRAY") 
+endif() + string(REGEX MATCH "^([0-9]+).*" _ ${LLVM_PACKAGE_VERSION}) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DLLVM_MAJOR_VERSION=${CMAKE_MATCH_1}") include(static_libstdc++) -add_library(bpf-static STATIC libbpf.c perf_reader.c) +file(GLOB libbpf_sources "libbpf/src/*.c") +add_library(bpf-static STATIC libbpf.c perf_reader.c ${libbpf_sources}) set_target_properties(bpf-static PROPERTIES OUTPUT_NAME bpf) -add_library(bpf-shared SHARED libbpf.c perf_reader.c) +add_library(bpf-shared SHARED libbpf.c perf_reader.c ${libbpf_sources}) set_target_properties(bpf-shared PROPERTIES VERSION ${REVISION_LAST} SOVERSION 0) set_target_properties(bpf-shared PROPERTIES OUTPUT_NAME bpf) diff --git a/src/cc/api/BPF.cc b/src/cc/api/BPF.cc index e0502fbf3..606a2bd75 100644 --- a/src/cc/api/BPF.cc +++ b/src/cc/api/BPF.cc @@ -550,7 +550,7 @@ StatusTuple BPF::load_func(const std::string& func_name, bpf_prog_type type, else if (flag_ & DEBUG_BPF) log_level = 1; - fd = bpf_prog_load(type, func_name.c_str(), + fd = bcc_prog_load(type, func_name.c_str(), reinterpret_cast(func_start), func_size, bpf_module_->license(), bpf_module_->kern_version(), log_level, nullptr, 0); diff --git a/src/cc/frontends/b/codegen_llvm.cc b/src/cc/frontends/b/codegen_llvm.cc index dc9651bf0..7753be91e 100644 --- a/src/cc/frontends/b/codegen_llvm.cc +++ b/src/cc/frontends/b/codegen_llvm.cc @@ -1108,7 +1108,7 @@ StatusTuple CodegenLLVM::visit_table_decl_stmt_node(TableDeclStmtNode *n) { decl_gvar->setSection("maps"); tables_[n] = decl_gvar; - int map_fd = bpf_create_map(map_type, n->id_->name_.c_str(), + int map_fd = bcc_create_map(map_type, n->id_->name_.c_str(), key->bit_width_ / 8, leaf->bit_width_ / 8, n->size_, 0); if (map_fd >= 0) diff --git a/src/cc/frontends/clang/b_frontend_action.cc b/src/cc/frontends/clang/b_frontend_action.cc index 20af23b47..6ba484459 100644 --- a/src/cc/frontends/clang/b_frontend_action.cc +++ b/src/cc/frontends/clang/b_frontend_action.cc @@ -1230,7 +1230,7 @@ bool 
BTypeVisitor::VisitVarDecl(VarDecl *Decl) { } table.type = map_type; - table.fd = bpf_create_map(map_type, table.name.c_str(), + table.fd = bcc_create_map(map_type, table.name.c_str(), table.key_size, table.leaf_size, table.max_entries, table.flags); } diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c index 5930b617d..420484e75 100644 --- a/src/cc/libbpf.c +++ b/src/cc/libbpf.c @@ -82,7 +82,9 @@ #define AF_ALG 38 #endif +#ifndef min #define min(x, y) ((x) < (y) ? (x) : (y)) +#endif #define UNUSED(expr) do { (void)(expr); } while (0) @@ -191,7 +193,7 @@ static uint64_t ptr_to_u64(void *ptr) return (uint64_t) (unsigned long) ptr; } -int bpf_create_map(enum bpf_map_type map_type, const char *name, +int bcc_create_map(enum bpf_map_type map_type, const char *name, int key_size, int value_size, int max_entries, int map_flags) { @@ -483,7 +485,7 @@ int bpf_prog_get_tag(int fd, unsigned long long *ptag) return 0; } -int bpf_prog_load(enum bpf_prog_type prog_type, const char *name, +int bcc_prog_load(enum bpf_prog_type prog_type, const char *name, const struct bpf_insn *insns, int prog_len, const char *license, unsigned kern_version, int log_level, char *log_buf, unsigned log_buf_size) @@ -1426,59 +1428,3 @@ int bpf_close_perf_event_fd(int fd) { } return error; } - -int bpf_obj_pin(int fd, const char *pathname) -{ - union bpf_attr attr; - - memset(&attr, 0, sizeof(attr)); - attr.pathname = ptr_to_u64((void *)pathname); - attr.bpf_fd = fd; - - return syscall(__NR_bpf, BPF_OBJ_PIN, &attr, sizeof(attr)); -} - -int bpf_obj_get(const char *pathname) -{ - union bpf_attr attr; - - memset(&attr, 0, sizeof(attr)); - attr.pathname = ptr_to_u64((void *)pathname); - - return syscall(__NR_bpf, BPF_OBJ_GET, &attr, sizeof(attr)); -} - -int bpf_prog_get_next_id(uint32_t start_id, uint32_t *next_id) -{ - union bpf_attr attr; - int err; - - memset(&attr, 0, sizeof(attr)); - attr.start_id = start_id; - - err = syscall(__NR_bpf, BPF_PROG_GET_NEXT_ID, &attr, sizeof(attr)); - if (!err) - *next_id = 
attr.next_id; - - return err; -} - -int bpf_prog_get_fd_by_id(uint32_t id) -{ - union bpf_attr attr; - - memset(&attr, 0, sizeof(attr)); - attr.prog_id = id; - - return syscall(__NR_bpf, BPF_PROG_GET_FD_BY_ID, &attr, sizeof(attr)); -} - -int bpf_map_get_fd_by_id(uint32_t id) -{ - union bpf_attr attr; - - memset(&attr, 0, sizeof(attr)); - attr.map_id = id; - - return syscall(__NR_bpf, BPF_MAP_GET_FD_BY_ID, &attr, sizeof(attr)); -} diff --git a/src/cc/libbpf.h b/src/cc/libbpf.h index e59970f50..fdcd65caa 100644 --- a/src/cc/libbpf.h +++ b/src/cc/libbpf.h @@ -31,7 +31,7 @@ enum bpf_probe_attach_type { BPF_PROBE_RETURN }; -int bpf_create_map(enum bpf_map_type map_type, const char *name, +int bcc_create_map(enum bpf_map_type map_type, const char *name, int key_size, int value_size, int max_entries, int map_flags); int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags); @@ -56,7 +56,7 @@ int bpf_get_next_key(int fd, void *key, void *next_key); * printing, and continue to attempt increase that allocated buffer size if * initial attemp was insufficient in size. 
*/ -int bpf_prog_load(enum bpf_prog_type prog_type, const char *name, +int bcc_prog_load(enum bpf_prog_type prog_type, const char *name, const struct bpf_insn *insns, int insn_len, const char *license, unsigned kern_version, int log_level, char *log_buf, unsigned log_buf_size); diff --git a/src/lua/bcc/bpf.lua b/src/lua/bcc/bpf.lua index 44b801cfa..da462ac3e 100644 --- a/src/lua/bcc/bpf.lua +++ b/src/lua/bcc/bpf.lua @@ -162,7 +162,7 @@ function Bpf:load_func(fn_name, prog_type) assert(libbcc.bpf_function_start(self.module, fn_name) ~= nil, "unknown program: "..fn_name) - local fd = libbcc.bpf_prog_load(prog_type, + local fd = libbcc.bcc_prog_load(prog_type, fn_name, libbcc.bpf_function_start(self.module, fn_name), libbcc.bpf_function_size(self.module, fn_name), diff --git a/src/lua/bcc/libbcc.lua b/src/lua/bcc/libbcc.lua index 47a2d3bed..4d3e5e0fa 100644 --- a/src/lua/bcc/libbcc.lua +++ b/src/lua/bcc/libbcc.lua @@ -24,13 +24,13 @@ enum bpf_prog_type { BPF_PROG_TYPE_SCHED_ACT, }; -int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, int max_entries, int map_flags); +int bcc_create_map(enum bpf_map_type map_type, int key_size, int value_size, int max_entries, int map_flags); int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags); int bpf_lookup_elem(int fd, void *key, void *value); int bpf_delete_elem(int fd, void *key); int bpf_get_next_key(int fd, void *key, void *next_key); -int bpf_prog_load(enum bpf_prog_type prog_type, const char *name, +int bcc_prog_load(enum bpf_prog_type prog_type, const char *name, const struct bpf_insn *insns, int insn_len, const char *license, unsigned kern_version, int log_level, char *log_buf, unsigned log_buf_size); diff --git a/src/lua/bpf/bpf.lua b/src/lua/bpf/bpf.lua index 215fb730e..220e68cbb 100644 --- a/src/lua/bpf/bpf.lua +++ b/src/lua/bpf/bpf.lua @@ -1475,7 +1475,7 @@ local tracepoint_mt = { prog = compile(prog, {proto.type(t.type, {source='ptr_to_probe'})}) end -- Load the BPF 
program - local prog_fd, err, log = S.bpf_prog_load(S.c.BPF_PROG.TRACEPOINT, prog.insn, prog.pc) + local prog_fd, err, log = S.bcc_prog_load(S.c.BPF_PROG.TRACEPOINT, prog.insn, prog.pc) assert(prog_fd, tostring(err)..': '..tostring(log)) -- Open tracepoint and attach t.reader:setbpf(prog_fd:getfd()) @@ -1499,7 +1499,7 @@ local function trace_bpf(ptype, pname, pdef, retprobe, prog, pid, cpu, group_fd) if type(prog) ~= 'table' then prog = compile(prog, {proto.pt_regs}) end - local prog_fd, err, log = S.bpf_prog_load(S.c.BPF_PROG.KPROBE, prog.insn, prog.pc) + local prog_fd, err, log = S.bcc_prog_load(S.c.BPF_PROG.KPROBE, prog.insn, prog.pc) assert(prog_fd, tostring(err)..': '..tostring(log)) -- Open tracepoint and attach local tp, err = S.perf_probe(ptype, pname, pdef, retprobe) @@ -1580,7 +1580,7 @@ return setmetatable({ if type(prog) ~= 'table' then prog = compile(prog, {proto.skb}) end - local prog_fd, err, log = S.bpf_prog_load(S.c.BPF_PROG.SOCKET_FILTER, prog.insn, prog.pc) + local prog_fd, err, log = S.bcc_prog_load(S.c.BPF_PROG.SOCKET_FILTER, prog.insn, prog.pc) assert(prog_fd, tostring(err)..': '..tostring(log)) assert(sock:setsockopt('socket', 'attach_bpf', prog_fd:getfd())) return prog_fd, err diff --git a/src/python/bcc/__init__.py b/src/python/bcc/__init__.py index 353fa5e90..c661183ee 100644 --- a/src/python/bcc/__init__.py +++ b/src/python/bcc/__init__.py @@ -365,7 +365,7 @@ def load_func(self, func_name, prog_type): log_level = 2 elif (self.debug & DEBUG_BPF): log_level = 1 - fd = lib.bpf_prog_load(prog_type, func_name, + fd = lib.bcc_prog_load(prog_type, func_name, lib.bpf_function_start(self.module, func_name), lib.bpf_function_size(self.module, func_name), lib.bpf_module_license(self.module), diff --git a/src/python/bcc/libbcc.py b/src/python/bcc/libbcc.py index 341ed5bec..0624382a8 100644 --- a/src/python/bcc/libbcc.py +++ b/src/python/bcc/libbcc.py @@ -82,8 +82,8 @@ lib.bpf_open_raw_sock.argtypes = [ct.c_char_p] lib.bpf_attach_socket.restype = 
ct.c_int lib.bpf_attach_socket.argtypes = [ct.c_int, ct.c_int] -lib.bpf_prog_load.restype = ct.c_int -lib.bpf_prog_load.argtypes = [ct.c_int, ct.c_char_p, ct.c_void_p, +lib.bcc_prog_load.restype = ct.c_int +lib.bcc_prog_load.argtypes = [ct.c_int, ct.c_char_p, ct.c_void_p, ct.c_size_t, ct.c_char_p, ct.c_uint, ct.c_int, ct.c_char_p, ct.c_uint] _RAW_CB_TYPE = ct.CFUNCTYPE(None, ct.py_object, ct.c_void_p, ct.c_int) _LOST_CB_TYPE = ct.CFUNCTYPE(None, ct.py_object, ct.c_ulonglong) From 16de581c7d30ac8bce1eb28d1acf03d67eafa321 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 16 Jan 2019 21:30:58 -0800 Subject: [PATCH 014/135] rewrite bcc_create_map with libbpf primitives This patch demonstrated how to use libbpf function calls. Specially, the bcc_map_create is changed to use libbpf functions bpf_create_map_name and bpf_create_map. Signed-off-by: Yonghong Song --- src/cc/libbpf.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c index 420484e75..af5488aff 100644 --- a/src/cc/libbpf.c +++ b/src/cc/libbpf.c @@ -51,6 +51,8 @@ // TODO: Remove this when CentOS 6 support is not needed anymore #include "setns.h" +#include "libbpf/src/bpf.h" + // TODO: remove these defines when linux-libc-dev exports them properly #ifndef __NR_bpf @@ -198,20 +200,14 @@ int bcc_create_map(enum bpf_map_type map_type, const char *name, int max_entries, int map_flags) { size_t name_len = name ? 
strlen(name) : 0; - union bpf_attr attr; - memset(&attr, 0, sizeof(attr)); - attr.map_type = map_type; - attr.key_size = key_size; - attr.value_size = value_size; - attr.max_entries = max_entries; - attr.map_flags = map_flags; - memcpy(attr.map_name, name, min(name_len, BPF_OBJ_NAME_LEN - 1)); - - int ret = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr)); + char map_name[BPF_OBJ_NAME_LEN]; + memcpy(map_name, name, min(name_len, BPF_OBJ_NAME_LEN - 1)); + int ret = bpf_create_map_name(map_type, map_name, key_size, value_size, + max_entries, map_flags); if (ret < 0 && name_len && (errno == E2BIG || errno == EINVAL)) { - memset(attr.map_name, 0, BPF_OBJ_NAME_LEN); - ret = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr)); + ret = bpf_create_map(map_type, key_size, value_size, + max_entries, map_flags); } if (ret < 0 && errno == EPERM) { @@ -222,7 +218,8 @@ int bcc_create_map(enum bpf_map_type map_type, const char *name, rl.rlim_max = RLIM_INFINITY; rl.rlim_cur = rl.rlim_max; if (setrlimit(RLIMIT_MEMLOCK, &rl) == 0) - ret = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr)); + ret = bpf_create_map(map_type, key_size, value_size, + max_entries, map_flags); } } return ret; From d01f4593c2fa9547c9912ac443ee67b9604b3e88 Mon Sep 17 00:00:00 2001 From: Brendan Gregg Date: Fri, 18 Jan 2019 09:11:03 -0800 Subject: [PATCH 015/135] notes about tools (#2153) highlighting the <80 chars, and also adding a note about tool names. --- CONTRIBUTING-SCRIPTS.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING-SCRIPTS.md b/CONTRIBUTING-SCRIPTS.md index fdc876879..50f535b7a 100644 --- a/CONTRIBUTING-SCRIPTS.md +++ b/CONTRIBUTING-SCRIPTS.md @@ -9,7 +9,7 @@ _(Written by Brendan Gregg.)_ bcc has 2 types of scripts, in different directories: - **/examples**: intended as short examples of bcc & eBPF code. You should focus on keeping it short, neat, and documented (code comments). A submission can just be the example code. 
-- **/tools**: intended as production safe performance and troubleshooting tools. You should focus on it being useful, tested, low overhead, documented (incl. all caveats), and easy to use. A submission should involve 4 changes: the tool, a man page, an example file, and an addition to README.md. Follow [my lead](https://github.com/brendangregg/bcc/commit/9fa156273b395cfc5505f0fff5d6b7b1396f7daa), and see the checklist below. These will be run in mission critical environments as root, so if spending hours testing isn't for you, please submit your idea as an issue instead, or chat with us on irc. +- **/tools**: intended as production safe performance and troubleshooting tools. You should focus on it being useful, tested, low overhead, documented (incl. all caveats), and easy to use. A submission should involve 4 changes: the tool, a man page, an example file, and an addition to README.md. Follow [my lead](https://github.com/brendangregg/bcc/commit/9fa156273b395cfc5505f0fff5d6b7b1396f7daa), and see the checklist below. These are run in mission critical environments as root (tech companies, financial institutions, government agencies), so if spending hours testing isn't for you, please submit your idea as an issue instead, or chat with us on irc. More detail for each below. @@ -31,7 +31,9 @@ A checklist for bcc tool development: 1. **Measure the overhead of the tool**. If you are running a micro-benchmark, how much slower is it with the tool running. Is more CPU consumed? Try to determine the worst case: run the micro-benchmark so that CPU headroom is exhausted, and then run the bcc tool. Can overhead be lowered? 1. **Test again, and stress test**. You want to discover and fix all the bad things before others hit them. 1. **Consider command line options**. Should it have -p for filtering on a PID? -T for timestamps? -i for interval? See other tools for examples, and copy the style: the usage message should list example usage at the end. 
Remember to keep the tool doing one thing and doing it well. Also, if there's one option that seems to be the common case, perhaps it should just be the first argument and not need a switch (no -X). A special case of this is *stat tools, like iostat/vmstat/etc, where the convention is [interval [count]]. -1. **Concise, intuitive, self-explanatory output**. The default output should meet the common need concisely. Leave much less useful fields and data to be shown with options: -v for verbose, etc. Consider including a startup message that's self-explanatory, eg "Tracing block I/O. Output every 1 seconds. Ctrl-C to end.". Also, try hard to keep the output less than 80 characters wide, especially the default output of the tool. That way, the output not only fits on the smallest reasonable terminal, it also fits well in slide decks, blog posts, articles, and printed material, all of which help education and adoption. Publishers of technical books often have templates they require books to conform to: it may not be an option to shrink or narrow the font to fit your output. +1. **Concise, intuitive, self-explanatory output**. The default output should meet the common need concisely. Leave much less useful fields and data to be shown with options: -v for verbose, etc. Consider including a startup message that's self-explanatory, eg "Tracing block I/O. Output every 1 seconds. Ctrl-C to end.". +1. **Default output <80 chars wide**. Try hard to keep the output less than 80 characters wide, especially the default output of the tool. That way, the output not only fits on the smallest reasonable terminal, it also fits well in slide decks, blog posts, articles, and printed material, all of which help education and adoption. Publishers of technical books often have templates they require books to conform to: it may not be an option to shrink or narrow the font to fit your output. +1. **Short tool name**. 
Follow the style of the other tools, which follow the style of other /usr/bin utilities. They are short and easy to type. No underscores. 1. **Use pep8 to check Python style**: pep8 --show-source --ignore=E123,E125,E126,E127,E128,E302 filename . Note that it misses some things, like consistent usage, so you'll still need to double check your script. 1. **Make sure your script is Python3-ready**: Adding `from __future__ import absolute_import, division, print_function, unicode_literals` helps make your script Python3-ready. 1. **Write an _example.txt file**. Copy the style in tools/biolatency_example.txt: start with an intro sentence, then have examples, and finish with the USAGE message. Explain everything: the first example should explain what we are seeing, even if this seems obvious. For some people it won't be obvious. Also explain why we are running the tool: what problems it's solving. It can take a long time (hours) to come up with good examples, but it's worth it. These will get copied around (eg, presentations, articles). From 3f7b59660037c0d5dea785d115df25d9b95f07dc Mon Sep 17 00:00:00 2001 From: Xiaozhou Liu Date: Mon, 21 Jan 2019 11:23:42 +0800 Subject: [PATCH 016/135] print_log2_hist(): check and skip possible paddings (#2155) Address issue 2154. When a struct S is used as key to a BPF_HISTOGRAM, it is assumed that the second member of S holds the slot. But when S is converted to python from bpf C, a padding may be inserted as a second member. This breaks print_log2_hist(). root@debian:~/bcc/tools# ./softirqs.py -d Tracing soft irq event time... Hit Ctrl-C to end. ^C Traceback (most recent call last): File "./softirqs.py", line 144, in dist.print_log2_hist(label, "softirq", section_print_fn=vec_to_name) File "/usr/local/lib/python2.7/dist-packages/bcc/table.py", line 326, in print_log2_hist vals[slot] = v.value TypeError: list indices must be integers, not str Fix it by skipping the possible padding. 
Future work would be fixing/working around in the library where the padding is introduced. --- src/python/bcc/table.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/python/bcc/table.py b/src/python/bcc/table.py index 6f5983532..f6449de77 100644 --- a/src/python/bcc/table.py +++ b/src/python/bcc/table.py @@ -317,6 +317,15 @@ def print_log2_hist(self, val_type="value", section_header="Bucket ptr", tmp = {} f1 = self.Key._fields_[0][0] f2 = self.Key._fields_[1][0] + + # The above code assumes that self.Key._fields_[1][0] holds the + # slot. But a padding member may have been inserted here, which + # breaks the assumption and leads to chaos. + # TODO: this is a quick fix. Fixing/working around in the BCC + # internal library is the right thing to do. + if f2 == '__pad_1' and len(self.Key._fields_) == 3: + f2 = self.Key._fields_[2][0] + for k, v in self.items(): bucket = getattr(k, f1) if bucket_fn: From c77b158fd008b4c2816fffdbafea3370e135eb42 Mon Sep 17 00:00:00 2001 From: Jugurtha BELKALEM <39241012+jugurthab@users.noreply.github.com> Date: Tue, 22 Jan 2019 19:20:55 +0100 Subject: [PATCH 017/135] ddos_detector.py to monitor DDOS attacks (#2140) ddos_detector.py to monitor DDOS attacks --- examples/tracing/dddos.py | 102 +++++++++++++++++++++++++++++ examples/tracing/dddos_example.txt | 39 +++++++++++ 2 files changed, 141 insertions(+) create mode 100644 examples/tracing/dddos.py create mode 100644 examples/tracing/dddos_example.txt diff --git a/examples/tracing/dddos.py b/examples/tracing/dddos.py new file mode 100644 index 000000000..e72ba7073 --- /dev/null +++ b/examples/tracing/dddos.py @@ -0,0 +1,102 @@ +#!/usr/bin/python +# +# dddos.py DDOS detection system. +# +# Written as a basic tracing example of using eBPF +# to detect a potential DDOS attack against a system. +# +# Copyright (c) 2019 Jugurtha BELKALEM. +# Licensed under the Apache License, Version 2.0 (the "License") +# +# 14-Jan-2019 Jugurtha BELKALEM Created this.
+ +from bcc import BPF +import ctypes as ct +import datetime +prog = """ +#include +#include + +#define MAX_NB_PACKETS 1000 +#define LEGAL_DIFF_TIMESTAMP_PACKETS 1000000 + +BPF_HASH(rcv_packets); + +struct detectionPackets { + u64 nb_ddos_packets; +}; + +BPF_PERF_OUTPUT(events); + +int detect_ddos(struct pt_regs *ctx, void *skb){ + struct detectionPackets detectionPacket = {}; + + // Used to count number of received packets + u64 rcv_packets_nb_index = 0, rcv_packets_nb_inter=1, *rcv_packets_nb_ptr; + + // Used to measure elapsed time between 2 successive received packets + u64 rcv_packets_ts_index = 1, rcv_packets_ts_inter=0, *rcv_packets_ts_ptr; + + /* The algorithm analyses packets received by ip_rcv function + * and measures the difference in reception time between each packet. + * DDOS flooders send millions of packets such that difference of + * timestamp between 2 successive packets is so small + * (which is not like regular applications behaviour). + * This script looks for this difference in time and if it sees + * more than MAX_NB_PACKETS successive packets with a difference + * of timestamp between each one of them less than + * LEGAL_DIFF_TIMESTAMP_PACKETS ns, + * ------------------ It Triggers an ALERT ----------------- + * Those settings must be adapted depending on regular network traffic + * ------------------------------------------------------------------- + * Important: this is a rudimentary intrusion detection system, one can + * test a real case attack using hping3. However, if regular network + * traffic increases above predefined detection settings, a false + * positive alert will be triggered (an example would be the + case of large file downloads).
+ */ + rcv_packets_nb_ptr = rcv_packets.lookup(&rcv_packets_nb_index); + rcv_packets_ts_ptr = rcv_packets.lookup(&rcv_packets_ts_index); + if(rcv_packets_nb_ptr != 0 && rcv_packets_ts_ptr != 0){ + rcv_packets_nb_inter = *rcv_packets_nb_ptr; + rcv_packets_ts_inter = bpf_ktime_get_ns() - *rcv_packets_ts_ptr; + if(rcv_packets_ts_inter < LEGAL_DIFF_TIMESTAMP_PACKETS){ + rcv_packets_nb_inter++; + } else { + rcv_packets_nb_inter = 0; + } + if(rcv_packets_nb_inter > MAX_NB_PACKETS){ + detectionPacket.nb_ddos_packets = rcv_packets_nb_inter; + events.perf_submit(ctx, &detectionPacket, sizeof(detectionPacket)); + } + } + rcv_packets_ts_inter = bpf_ktime_get_ns(); + rcv_packets.update(&rcv_packets_nb_index, &rcv_packets_nb_inter); + rcv_packets.update(&rcv_packets_ts_index, &rcv_packets_ts_inter); + return 0; +} +""" + +# Loads eBPF program +b = BPF(text=prog) + +# Attach kprobe to kernel function and sets detect_ddos as kprobe handler +b.attach_kprobe(event="ip_rcv", fn_name="detect_ddos") + +class DetectionTimestamp(ct.Structure): + _fields_ = [("nb_ddos_packets", ct.c_ulonglong)] + +# Show message when eBPF starts +print("DDOS detector started ... Hit Ctrl-C to end!") + +print("%-26s %-10s" % ("TIME(s)", "MESSAGE")) + +def trigger_alert_event(cpu, data, size): + event = ct.cast(data, ct.POINTER(DetectionTimestamp)).contents + print("%-26s %s %ld" % (datetime.datetime.now(), + "DDOS Attack => nb of packets up to now : ", event.nb_ddos_packets)) + +# loop with callback to trigger_alert_event +b["events"].open_perf_buffer(trigger_alert_event) +while 1: + b.perf_buffer_poll() diff --git a/examples/tracing/dddos_example.txt b/examples/tracing/dddos_example.txt new file mode 100644 index 000000000..064c8d78e --- /dev/null +++ b/examples/tracing/dddos_example.txt @@ -0,0 +1,39 @@ +Demonstrations of dddos.py, the Linux eBPF/bcc version. + +This tracks ip_rcv function (using kprobe) and elapsed time +between received packets to detect potential DDOS attacks.
+ +The following steps illustrates the usage of dddos : +1 - Start dddos.py : +# ./dddos.py +DDOS detector started ... Hit Ctrl-C to end! +TIME(s) MESSAGE + +2 - Launch hping3 (or any other flooder) in another terminal as shown below: +# hping3 localhost -S -A -V -p 443 -i u100 + +3 - dddos.py triggers alerts and reports a DDOS attack: +DDOS detector started ... Hit Ctrl-C to end! +TIME(s) MESSAGE +2019-01-16 11:55:12.600734 DDOS Attack => nb of packets up to now : 1001 +2019-01-16 11:55:12.600845 DDOS Attack => nb of packets up to now : 1002 +2019-01-16 11:55:12.600887 DDOS Attack => nb of packets up to now : 1003 +2019-01-16 11:55:12.600971 DDOS Attack => nb of packets up to now : 1004 +2019-01-16 11:55:12.601009 DDOS Attack => nb of packets up to now : 1005 +2019-01-16 11:55:12.601062 DDOS Attack => nb of packets up to now : 1006 +2019-01-16 11:55:12.601096 DDOS Attack => nb of packets up to now : 1007 +2019-01-16 11:55:12.601195 DDOS Attack => nb of packets up to now : 1008 +2019-01-16 11:55:12.601228 DDOS Attack => nb of packets up to now : 1009 +2019-01-16 11:55:12.601331 DDOS Attack => nb of packets up to now : 1010 +2019-01-16 11:55:12.601364 DDOS Attack => nb of packets up to now : 1011 +2019-01-16 11:55:12.601470 DDOS Attack => nb of packets up to now : 1012 +2019-01-16 11:55:12.601505 DDOS Attack => nb of packets up to now : 1013 +2019-01-16 11:55:12.601621 DDOS Attack => nb of packets up to now : 1014 +2019-01-16 11:55:12.601656 DDOS Attack => nb of packets up to now : 1015 +2019-01-16 11:55:12.601757 DDOS Attack => nb of packets up to now : 1016 +2019-01-16 11:55:12.601790 DDOS Attack => nb of packets up to now : 1017 +2019-01-16 11:55:12.601892 DDOS Attack => nb of packets up to now : 1018 +2019-01-16 11:55:12.601925 DDOS Attack => nb of packets up to now : 1019 +2019-01-16 11:55:12.602028 DDOS Attack => nb of packets up to now : 1020 + +Remark : Use Ctrl-C to stop dddos.py From 012551bf2b2796c86996893d4517e4a808b01406 Mon Sep 17 00:00:00 2001 From: 
Prashant Bhole Date: Wed, 23 Jan 2019 03:45:23 +0900 Subject: [PATCH 018/135] use libbpf APIs from libbpf.c (#2156) Recently kernel libbpf was imported in bcc. It makes some of the libbcc code redundant. This patch modifies libbcc functions to use libbpf APIs. --- src/cc/libbpf.c | 66 +++++++------------------------------------------ 1 file changed, 9 insertions(+), 57 deletions(-) diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c index af5488aff..79c73c164 100644 --- a/src/cc/libbpf.c +++ b/src/cc/libbpf.c @@ -227,54 +227,29 @@ int bcc_create_map(enum bpf_map_type map_type, const char *name, int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags) { - union bpf_attr attr; - memset(&attr, 0, sizeof(attr)); - attr.map_fd = fd; - attr.key = ptr_to_u64(key); - attr.value = ptr_to_u64(value); - attr.flags = flags; - - return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)); + return bpf_map_update_elem(fd, key, value, flags); } int bpf_lookup_elem(int fd, void *key, void *value) { - union bpf_attr attr; - memset(&attr, 0, sizeof(attr)); - attr.map_fd = fd; - attr.key = ptr_to_u64(key); - attr.value = ptr_to_u64(value); - - return syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)); + return bpf_map_lookup_elem(fd, key, value); } int bpf_delete_elem(int fd, void *key) { - union bpf_attr attr; - memset(&attr, 0, sizeof(attr)); - attr.map_fd = fd; - attr.key = ptr_to_u64(key); - - return syscall(__NR_bpf, BPF_MAP_DELETE_ELEM, &attr, sizeof(attr)); + return bpf_map_delete_elem(fd, key); } int bpf_get_first_key(int fd, void *key, size_t key_size) { - union bpf_attr attr; int i, res; - memset(&attr, 0, sizeof(attr)); - attr.map_fd = fd; - attr.key = 0; - attr.next_key = ptr_to_u64(key); - // 4.12 and above kernel supports passing NULL to BPF_MAP_GET_NEXT_KEY // to get first key of the map. For older kernels, the call will fail. 
- res = syscall(__NR_bpf, BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr)); + res = bpf_map_get_next_key(fd, 0, key); if (res < 0 && errno == EFAULT) { // Fall back to try to find a non-existing key. static unsigned char try_values[3] = {0, 0xff, 0x55}; - attr.key = ptr_to_u64(key); for (i = 0; i < 3; i++) { memset(key, try_values[i], key_size); // We want to check the existence of the key but we don't know the size @@ -284,11 +259,11 @@ int bpf_get_first_key(int fd, void *key, size_t key_size) // trigger a page fault in kernel and affect performance. Hence we use // ~0 which will fail and return fast. // This should fail since we pass an invalid pointer for value. - if (bpf_lookup_elem(fd, key, (void *)~0) >= 0) + if (bpf_map_lookup_elem(fd, key, (void *)~0) >= 0) return -1; // This means the key doesn't exist. if (errno == ENOENT) - return syscall(__NR_bpf, BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr)); + return bpf_map_get_next_key(fd, (void*)&try_values[i], key); } return -1; } else { @@ -298,13 +273,7 @@ int bpf_get_first_key(int fd, void *key, size_t key_size) int bpf_get_next_key(int fd, void *key, void *next_key) { - union bpf_attr attr; - memset(&attr, 0, sizeof(attr)); - attr.map_fd = fd; - attr.key = ptr_to_u64(key); - attr.next_key = ptr_to_u64(next_key); - - return syscall(__NR_bpf, BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr)); + return bpf_map_get_next_key(fd, key, next_key); } static void bpf_print_hints(int ret, char *log) @@ -373,19 +342,7 @@ static void bpf_print_hints(int ret, char *log) int bpf_obj_get_info(int prog_map_fd, void *info, uint32_t *info_len) { - union bpf_attr attr; - int err; - - memset(&attr, 0, sizeof(attr)); - attr.info.bpf_fd = prog_map_fd; - attr.info.info_len = *info_len; - attr.info.info = ptr_to_u64(info); - - err = syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)); - if (!err) - *info_len = attr.info.info_len; - - return err; + return bpf_obj_get_info_by_fd(prog_map_fd, info, info_len); } int bpf_prog_compute_tag(const 
struct bpf_insn *insns, int prog_len, @@ -1118,14 +1075,9 @@ int bpf_detach_tracepoint(const char *tp_category, const char *tp_name) { int bpf_attach_raw_tracepoint(int progfd, char *tp_name) { - union bpf_attr attr; int ret; - bzero(&attr, sizeof(attr)); - attr.raw_tracepoint.name = ptr_to_u64(tp_name); - attr.raw_tracepoint.prog_fd = progfd; - - ret = syscall(__NR_bpf, BPF_RAW_TRACEPOINT_OPEN, &attr, sizeof(attr)); + ret = bpf_raw_tracepoint_open(tp_name, progfd); if (ret < 0) fprintf(stderr, "bpf_attach_raw_tracepoint (%s): %s\n", tp_name, strerror(errno)); return ret; From 7324ba5b4d2a492f7c8ab89f87e8b9a3e186fb1d Mon Sep 17 00:00:00 2001 From: Brendan Gregg Date: Tue, 22 Jan 2019 15:47:08 -0800 Subject: [PATCH 019/135] profile.py: return kernel annotations for folded stacks --- tools/profile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/profile.py b/tools/profile.py index d1d3d26ac..a803b9e0c 100755 --- a/tools/profile.py +++ b/tools/profile.py @@ -319,7 +319,7 @@ def aksym(addr): if stack_id_err(k.kernel_stack_id): line.append("[Missed Kernel Stack]") else: - line.extend([b.ksym(addr) for addr in reversed(kernel_stack)]) + line.extend([aksym(addr) for addr in reversed(kernel_stack)]) print("%s %d" % (b";".join(line).decode('utf-8', 'replace'), v.value)) else: # print default multi-line stack output From 9924e64e03198dc8d579ddc143c6c3fa3fcdeea7 Mon Sep 17 00:00:00 2001 From: vijunag Date: Wed, 23 Jan 2019 12:35:33 +0530 Subject: [PATCH 020/135] support symbol resolution of short-lived process. (#2144) New command line options have been added to tools/trace.py to support the new BUILD_ID stackmap. 
List of symbol files can be added to the script to resolve symbols from build id as reported by the kernel in the stack trace Updated man page and added an example usage --- man/man8/trace.8 | 12 ++++++++++-- tools/trace.py | 15 +++++++++++++-- tools/trace_example.txt | 18 ++++++++++++++++++ 3 files changed, 41 insertions(+), 4 deletions(-) diff --git a/man/man8/trace.8 b/man/man8/trace.8 index c12dd7944..329df8daf 100644 --- a/man/man8/trace.8 +++ b/man/man8/trace.8 @@ -2,7 +2,7 @@ .SH NAME trace \- Trace a function and print its arguments or return value, optionally evaluating a filter. Uses Linux eBPF/bcc. .SH SYNOPSIS -.B trace [-h] [-b BUFFER_PAGES] [-p PID] [-L TID] [-v] [-Z STRING_SIZE] [-S] +.B trace [-h] [-b BUFFER_PAGES] [-p PID] [-L TID] [-v] [-Z STRING_SIZE] [-S] [-s SYM_FILE_LIST] [-M MAX_EVENTS] [-t] [-T] [-C] [-K] [-U] [-a] [-I header] probe [probe ...] .SH DESCRIPTION @@ -28,9 +28,13 @@ Trace only functions in the thread TID. Display the generated BPF program, for debugging purposes. .TP \-z STRING_SIZE -When collecting string arguments (of type char*), collect up to STRING_SIZE +When collecting string arguments (of type char*), collect up to STRING_SIZE characters. Longer strings will be truncated. .TP +\-s SYM_FILE_LIST +When collecting stack trace in build id format, use the comma separated list for +symbol resolution. +.TP \-S If set, trace messages from trace's own process. By default, this is off to avoid tracing storms -- for example, if you trace the write system call, and @@ -177,6 +181,10 @@ Trace the pthread_create USDT probe from the pthread library and print the addre Trace the nanosleep system call and print the sleep duration in nanoseconds: # .B trace 'p::SyS_nanosleep(struct timespec *ts) "sleep for %lld ns", ts->tv_nsec' +.TP +Trace the inet_pton system call using build id mechanism and print the stack +# +.B trace -s /lib/x86_64-linux-gnu/libc.so.6,/bin/ping 'p:c:inet_pton' -U .SH SOURCE This is from bcc.
.IP diff --git a/tools/trace.py b/tools/trace.py index 8d5493e09..0b8797c9e 100755 --- a/tools/trace.py +++ b/tools/trace.py @@ -4,7 +4,7 @@ # parameters, with an optional filter. # # usage: trace [-h] [-p PID] [-L TID] [-v] [-Z STRING_SIZE] [-S] -# [-M MAX_EVENTS] [-T] [-t] [-K] [-U] [-a] [-I header] +# [-M MAX_EVENTS] [-s SYMBOLFILES] [-T] [-t] [-K] [-U] [-a] [-I header] # probe [probe ...] # # Licensed under the Apache License, Version 2.0 (the "License") @@ -35,6 +35,7 @@ class Probe(object): tgid = -1 pid = -1 page_cnt = None + build_id_enabled = False @classmethod def configure(cls, args): @@ -49,6 +50,7 @@ def configure(cls, args): cls.pid = args.pid or -1 cls.page_cnt = args.buffer_pages cls.bin_cmp = args.bin_cmp + cls.build_id_enabled = args.sym_file_list is not None def __init__(self, probe, string_size, kernel_stack, user_stack): self.usdt = None @@ -346,7 +348,9 @@ def _generate_data_decl(self): self.events_name = "%s_events" % self.probe_name self.struct_name = "%s_data_t" % self.probe_name self.stacks_name = "%s_stacks" % self.probe_name - stack_table = "BPF_STACK_TRACE(%s, 1024);" % self.stacks_name \ + stack_type = "BPF_STACK_TRACE" if self.build_id_enabled is False \ + else "BPF_STACK_TRACE_BUILDID" + stack_table = "%s(%s, 1024);" % (stack_type,self.stacks_name) \ if (self.kernel_stack or self.user_stack) else "" data_fields = "" for i, field_type in enumerate(self.types): @@ -693,6 +697,10 @@ def __init__(self): help="print CPU id") parser.add_argument("-B", "--bin_cmp", action="store_true", help="allow to use STRCMP with binary values") + parser.add_argument('-s', "--sym_file_list", type=str, \ + metavar="SYM_FILE_LIST", dest="sym_file_list", \ + help="coma separated list of symbol files to use \ + for symbol resolution") parser.add_argument("-K", "--kernel-stack", action="store_true", help="output kernel stack trace") parser.add_argument("-U", "--user-stack", @@ -757,6 +765,9 @@ def _attach_probes(self): print(probe.usdt.get_text()) 
usdt_contexts.append(probe.usdt) self.bpf = BPF(text=self.program, usdt_contexts=usdt_contexts) + if self.args.sym_file_list is not None: + print("Note: Kernel bpf will report stack map with ip/build_id") + map(lambda x: self.bpf.add_module(x), self.args.sym_file_list.split(',')) for probe in self.probes: if self.args.verbose: print(probe) diff --git a/tools/trace_example.txt b/tools/trace_example.txt index 0b41d7a59..303be0eed 100644 --- a/tools/trace_example.txt +++ b/tools/trace_example.txt @@ -104,6 +104,19 @@ TIME PID COMM FUNC - 01:23:55 0 swapper/0 block_rq_complete sectors=8 ^C +Suppose that you want to trace a system-call in a short-lived process, you can use +the -s option to trace. The option is followed by list of libraries/executables to +use for symbol resolution. +# trace -s /lib/x86_64-linux-gnu/libc.so.6,/bin/ping 'p:c:inet_pton' -U +Note: Kernel bpf will report stack map with ip/build_id +PID TID COMM FUNC +4175 4175 ping inet_pton + inet_pton+0x136340 [libc.so.6] + getaddrinfo+0xfb510 [libc.so.6] + _init+0x2a08 [ping] + +During the trace, 'ping -c1 google.com' was executed to obtain the above results + To discover the tracepoint structure format (which you can refer to as the "args" pointer variable), use the tplist tool. 
For example: @@ -268,6 +281,8 @@ optional arguments: -v, --verbose print resulting BPF program code before executing -Z STRING_SIZE, --string-size STRING_SIZE maximum size to read from strings + -s SYM_FILE_LIST when collecting stack trace in build id format, + use the coma separated list for symbol resolution -S, --include-self do not filter trace's own pid from the trace -M MAX_EVENTS, --max-events MAX_EVENTS number of events to print before quitting @@ -325,4 +340,7 @@ trace -I 'net/sock.h' \\ to 53 (DNS; 13568 in big endian order) trace -I 'linux/fs_struct.h' 'mntns_install "users = %d", $task->fs->users' Trace the number of users accessing the file system of the current task +trace -s /lib/x86_64-linux-gnu/libc.so.6,/bin/ping 'p:c:inet_pton' -U + Trace inet_pton system call and use the specified libraries/executables for + symbol resolution. " From 0fd3efb0c85d1c4ba5fff7d26b8a3979a55eed28 Mon Sep 17 00:00:00 2001 From: Prashant Bhole Date: Thu, 24 Jan 2019 01:35:07 +0900 Subject: [PATCH 021/135] use libbpf api in bpf_attach_xdp (#2158) Use libbpf api bpf_set_link_xdp_fd() to attach xdp program to an interface. 
--- src/cc/libbpf.c | 129 ++++++------------------------------------------ 1 file changed, 14 insertions(+), 115 deletions(-) diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c index 79c73c164..306ac8178 100644 --- a/src/cc/libbpf.c +++ b/src/cc/libbpf.c @@ -1191,124 +1191,23 @@ int bpf_open_perf_event(uint32_t type, uint64_t config, int pid, int cpu) { } int bpf_attach_xdp(const char *dev_name, int progfd, uint32_t flags) { - struct sockaddr_nl sa; - int sock, seq = 0, len, ret = -1; - char buf[4096]; - struct nlattr *nla, *nla_xdp; - struct { - struct nlmsghdr nh; - struct ifinfomsg ifinfo; - char attrbuf[64]; - } req; - struct nlmsghdr *nh; - struct nlmsgerr *err; - socklen_t addrlen; - - memset(&sa, 0, sizeof(sa)); - sa.nl_family = AF_NETLINK; - - sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); - if (sock < 0) { - fprintf(stderr, "bpf: opening a netlink socket: %s\n", strerror(errno)); - return -1; - } - - if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { - fprintf(stderr, "bpf: bind to netlink: %s\n", strerror(errno)); - goto cleanup; - } - - addrlen = sizeof(sa); - if (getsockname(sock, (struct sockaddr *)&sa, &addrlen) < 0) { - fprintf(stderr, "bpf: get sock name of netlink: %s\n", strerror(errno)); - goto cleanup; - } - - if (addrlen != sizeof(sa)) { - fprintf(stderr, "bpf: wrong netlink address length: %d\n", addrlen); - goto cleanup; - } - - memset(&req, 0, sizeof(req)); - req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)); - req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; - req.nh.nlmsg_type = RTM_SETLINK; - req.nh.nlmsg_pid = 0; - req.nh.nlmsg_seq = ++seq; - req.ifinfo.ifi_family = AF_UNSPEC; - req.ifinfo.ifi_index = if_nametoindex(dev_name); - if (req.ifinfo.ifi_index == 0) { - fprintf(stderr, "bpf: Resolving device name to index: %s\n", strerror(errno)); - goto cleanup; - } - - nla = (struct nlattr *)(((char *)&req) - + NLMSG_ALIGN(req.nh.nlmsg_len)); - nla->nla_type = NLA_F_NESTED | 43/*IFLA_XDP*/; - - nla_xdp = (struct nlattr *)((char 
*)nla + NLA_HDRLEN); - nla->nla_len = NLA_HDRLEN; - - // we specify the FD passed over by the user - nla_xdp->nla_type = 1/*IFLA_XDP_FD*/; - nla_xdp->nla_len = NLA_HDRLEN + sizeof(progfd); - memcpy((char *)nla_xdp + NLA_HDRLEN, &progfd, sizeof(progfd)); - nla->nla_len += nla_xdp->nla_len; - - // parse flags as passed by the user - if (flags) { - nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len); - nla_xdp->nla_type = 3/*IFLA_XDP_FLAGS*/; - nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags); - memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags)); - nla->nla_len += nla_xdp->nla_len; - } - - req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len); + int ifindex = if_nametoindex(dev_name); + char err_buf[256]; + int ret = -1; - if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) { - fprintf(stderr, "bpf: send to netlink: %s\n", strerror(errno)); - goto cleanup; - } - - len = recv(sock, buf, sizeof(buf), 0); - if (len < 0) { - fprintf(stderr, "bpf: recv from netlink: %s\n", strerror(errno)); - goto cleanup; - } - - for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len); - nh = NLMSG_NEXT(nh, len)) { - if (nh->nlmsg_pid != sa.nl_pid) { - fprintf(stderr, "bpf: Wrong pid %u, expected %u\n", - nh->nlmsg_pid, sa.nl_pid); - errno = EBADMSG; - goto cleanup; - } - if (nh->nlmsg_seq != seq) { - fprintf(stderr, "bpf: Wrong seq %d, expected %d\n", - nh->nlmsg_seq, seq); - errno = EBADMSG; - goto cleanup; - } - switch (nh->nlmsg_type) { - case NLMSG_ERROR: - err = (struct nlmsgerr *)NLMSG_DATA(nh); - if (!err->error) - continue; - fprintf(stderr, "bpf: nlmsg error %s\n", strerror(-err->error)); - errno = -err->error; - goto cleanup; - case NLMSG_DONE: - break; - } - } + if (ifindex == 0) { + fprintf(stderr, "bpf: Resolving device name to index: %s\n", strerror(errno)); + return -1; + } - ret = 0; + ret = bpf_set_link_xdp_fd(ifindex, progfd, flags); + if (ret) { + libbpf_strerror(ret, err_buf, sizeof(err_buf)); + fprintf(stderr, "bpf: Attaching prog to %s: %s", dev_name, err_buf); + return -1; 
+ } -cleanup: - close(sock); - return ret; + return 0; } int bpf_attach_perf_event_raw(int progfd, void *perf_event_attr, pid_t pid, From 9e0817a9da21c56888a0e2b9fa2cfce2130e1a05 Mon Sep 17 00:00:00 2001 From: Kenny Yu Date: Wed, 23 Jan 2019 20:17:11 -0800 Subject: [PATCH 022/135] tools: rename "deadlock_detector" to "deadlock" (#2152) (#2160) This renames the `deadlock_detector.py` tool to `deadlock.py` to make the name more diagram-friendly and to be consistent with the naming of the other tools. --- README.md | 2 +- man/man8/deadlock_detector.8 | 26 +++++++-------- snapcraft/snapcraft.yaml | 4 +-- tests/python/test_tools_smoke.py | 4 +-- tools/{deadlock_detector.c => deadlock.c} | 4 +-- tools/{deadlock_detector.py => deadlock.py} | 24 +++++++------- ...ector_example.txt => deadlock_example.txt} | 32 +++++++++---------- 7 files changed, 48 insertions(+), 48 deletions(-) rename tools/{deadlock_detector.c => deadlock.c} (98%) rename tools/{deadlock_detector.py => deadlock.py} (96%) rename tools/{deadlock_detector_example.txt => deadlock_example.txt} (94%) diff --git a/README.md b/README.md index 54733ebb5..77c520bdb 100644 --- a/README.md +++ b/README.md @@ -102,7 +102,7 @@ pair of .c and .py files, and some are directories of files. - tools/[dbstat](tools/dbstat.py): Summarize MySQL/PostgreSQL query latency as a histogram. [Examples](tools/dbstat_example.txt). - tools/[dcsnoop](tools/dcsnoop.py): Trace directory entry cache (dcache) lookups. [Examples](tools/dcsnoop_example.txt). - tools/[dcstat](tools/dcstat.py): Directory entry cache (dcache) stats. [Examples](tools/dcstat_example.txt). -- tools/[deadlock_detector](tools/deadlock_detector.py): Detect potential deadlocks on a running process. [Examples](tools/deadlock_detector_example.txt). +- tools/[deadlock](tools/deadlock.py): Detect potential deadlocks on a running process. [Examples](tools/deadlock_example.txt). - tools/[execsnoop](tools/execsnoop.py): Trace new processes via exec() syscalls. 
[Examples](tools/execsnoop_example.txt). - tools/[ext4dist](tools/ext4dist.py): Summarize ext4 operation latency distribution as a histogram. [Examples](tools/ext4dist_example.txt). - tools/[ext4slower](tools/ext4slower.py): Trace slow ext4 operations. [Examples](tools/ext4slower_example.txt). diff --git a/man/man8/deadlock_detector.8 b/man/man8/deadlock_detector.8 index 0b23e3e66..0be3f4ab3 100644 --- a/man/man8/deadlock_detector.8 +++ b/man/man8/deadlock_detector.8 @@ -1,14 +1,14 @@ -.TH deadlock_detector 8 "2017-02-01" "USER COMMANDS" +.TH deadlock 8 "2017-02-01" "USER COMMANDS" .SH NAME -deadlock_detector \- Find potential deadlocks (lock order inversions) +deadlock \- Find potential deadlocks (lock order inversions) in a running program. .SH SYNOPSIS -.B deadlock_detector [\-h] [\--binary BINARY] [\--dump-graph DUMP_GRAPH] -.B [\--verbose] [\--lock-symbols LOCK_SYMBOLS] -.B [\--unlock-symbols UNLOCK_SYMBOLS] -.B pid +.B deadlock [\-h] [\--binary BINARY] [\--dump-graph DUMP_GRAPH] +.B [\--verbose] [\--lock-symbols LOCK_SYMBOLS] +.B [\--unlock-symbols UNLOCK_SYMBOLS] +.B pid .SH DESCRIPTION -deadlock_detector finds potential deadlocks in a running process. The program +deadlock finds potential deadlocks in a running process. The program attaches uprobes on `pthread_mutex_lock` and `pthread_mutex_unlock` by default to build a mutex wait directed graph, and then looks for a cycle in this graph. This graph has the following properties: @@ -65,13 +65,13 @@ Pid to trace Find potential deadlocks in PID 181. The --binary argument is not needed for \ statically-linked binaries. # -.B deadlock_detector 181 +.B deadlock 181 .TP Find potential deadlocks in PID 181. 
If the process was created from a \ dynamically-linked executable, the --binary argument is required and must be \ the path of the pthread library: # -.B deadlock_detector 181 --binary /lib/x86_64-linux-gnu/libpthread.so.0 +.B deadlock 181 --binary /lib/x86_64-linux-gnu/libpthread.so.0 .TP Find potential deadlocks in PID 181. If the process was created from a \ statically-linked executable, optionally pass the location of the binary. \ @@ -80,19 +80,19 @@ contain `:` in the path cannot be attached with uprobes. As a workaround, we \ can create a symlink to the binary, and provide the symlink name instead with \ the `--binary` option: # -.B deadlock_detector 181 --binary /usr/local/bin/lockinversion +.B deadlock 181 --binary /usr/local/bin/lockinversion .TP Find potential deadlocks in PID 181 and dump the mutex wait graph to a file: # -.B deadlock_detector 181 --dump-graph graph.json +.B deadlock 181 --dump-graph graph.json .TP Find potential deadlocks in PID 181 and print mutex wait graph statistics: # -.B deadlock_detector 181 --verbose +.B deadlock 181 --verbose .TP Find potential deadlocks in PID 181 with custom mutexes: # -.B deadlock_detector 181 +.B deadlock 181 .B --lock-symbols custom_mutex1_lock,custom_mutex2_lock .B --unlock_symbols custom_mutex1_unlock,custom_mutex2_unlock .SH OUTPUT diff --git a/snapcraft/snapcraft.yaml b/snapcraft/snapcraft.yaml index 93a2adcee..4be910732 100644 --- a/snapcraft/snapcraft.yaml +++ b/snapcraft/snapcraft.yaml @@ -101,8 +101,8 @@ apps: command: usr/share/bcc/tools/dcsnoop dcstat: command: usr/share/bcc/tools/dcstat - deadlock-detector: - command: usr/share/bcc/tools/deadlock_detector + deadlock: + command: usr/share/bcc/tools/deadlock execsnoop: command: usr/share/bcc/tools/execsnoop ext4dist: diff --git a/tests/python/test_tools_smoke.py b/tests/python/test_tools_smoke.py index 211dbdbcc..13667d909 100755 --- a/tests/python/test_tools_smoke.py +++ b/tests/python/test_tools_smoke.py @@ -137,11 +137,11 @@ def 
test_dcstat(self): self.run_with_duration("dcstat.py 1 1") @skipUnless(kernel_version_ge(4,6), "requires kernel >= 4.6") - def test_deadlock_detector(self): + def test_deadlock(self): # TODO This tool requires a massive BPF stack traces table allocation, # which might fail the run or even trigger the oomkiller to kill some # other processes. Disabling for now. - # self.run_with_int("deadlock_detector.py $(pgrep -n bash)", timeout=10) + # self.run_with_int("deadlock.py $(pgrep -n bash)", timeout=10) pass @skipUnless(kernel_version_ge(4,8), "requires kernel >= 4.8") diff --git a/tools/deadlock_detector.c b/tools/deadlock.c similarity index 98% rename from tools/deadlock_detector.c rename to tools/deadlock.c index 09899b026..e1f9b823e 100644 --- a/tools/deadlock_detector.c +++ b/tools/deadlock.c @@ -1,6 +1,6 @@ /* - * deadlock_detector.c Detects potential deadlocks in a running process. - * For Linux, uses BCC, eBPF. See .py file. + * deadlock.c Detects potential deadlocks in a running process. + * For Linux, uses BCC, eBPF. See .py file. * * Copyright 2017 Facebook, Inc. * Licensed under the Apache License, Version 2.0 (the "License") diff --git a/tools/deadlock_detector.py b/tools/deadlock.py similarity index 96% rename from tools/deadlock_detector.py rename to tools/deadlock.py index 573f8307c..178487200 100755 --- a/tools/deadlock_detector.py +++ b/tools/deadlock.py @@ -1,12 +1,12 @@ #!/usr/bin/python # -# deadlock_detector Detects potential deadlocks (lock order inversions) -# on a running process. For Linux, uses BCC, eBPF. +# deadlock Detects potential deadlocks (lock order inversions) +# on a running process. For Linux, uses BCC, eBPF. 
# -# USAGE: deadlock_detector.py [-h] [--binary BINARY] [--dump-graph DUMP_GRAPH] -# [--verbose] [--lock-symbols LOCK_SYMBOLS] -# [--unlock-symbols UNLOCK_SYMBOLS] -# pid +# USAGE: deadlock.py [-h] [--binary BINARY] [--dump-graph DUMP_GRAPH] +# [--verbose] [--lock-symbols LOCK_SYMBOLS] +# [--unlock-symbols UNLOCK_SYMBOLS] +# pid # # This traces pthread mutex lock and unlock calls to build a directed graph # representing the mutex wait graph: @@ -388,25 +388,25 @@ def strlist(s): def main(): examples = '''Examples: - deadlock_detector 181 # Analyze PID 181 + deadlock 181 # Analyze PID 181 - deadlock_detector 181 --binary /lib/x86_64-linux-gnu/libpthread.so.0 + deadlock 181 --binary /lib/x86_64-linux-gnu/libpthread.so.0 # Analyze PID 181 and locks from this binary. # If tracing a process that is running from # a dynamically-linked binary, this argument # is required and should be the path to the # pthread library. - deadlock_detector 181 --verbose + deadlock 181 --verbose # Analyze PID 181 and print statistics about # the mutex wait graph. - deadlock_detector 181 --lock-symbols my_mutex_lock1,my_mutex_lock2 \\ + deadlock 181 --lock-symbols my_mutex_lock1,my_mutex_lock2 \\ --unlock-symbols my_mutex_unlock1,my_mutex_unlock2 # Analyze PID 181 and trace custom mutex # symbols instead of pthread mutexes. - deadlock_detector 181 --dump-graph graph.json + deadlock 181 --dump-graph graph.json # Analyze PID 181 and dump the mutex wait # graph to graph.json. ''' @@ -465,7 +465,7 @@ def main(): print('%s. Is the process (pid=%d) running?' 
% (str(e), args.pid)) sys.exit(1) - bpf = BPF(src_file=b'deadlock_detector.c') + bpf = BPF(src_file=b'deadlock.c') # Trace where threads are created bpf.attach_kretprobe(event=bpf.get_syscall_fnname('clone'), fn_name='trace_clone') diff --git a/tools/deadlock_detector_example.txt b/tools/deadlock_example.txt similarity index 94% rename from tools/deadlock_detector_example.txt rename to tools/deadlock_example.txt index 6cd239533..45d812605 100644 --- a/tools/deadlock_detector_example.txt +++ b/tools/deadlock_example.txt @@ -1,4 +1,4 @@ -Demonstrations of deadlock_detector. +Demonstrations of deadlock. This program detects potential deadlocks on a running process. The program attaches uprobes on `pthread_mutex_lock` and `pthread_mutex_unlock` to build @@ -35,7 +35,7 @@ after the mutex has been created. As a result, this tool will not find potential deadlocks that involve only one mutex. -# ./deadlock_detector.py 181 +# ./deadlock.py 181 Tracing... Hit Ctrl-C to end. ---------------- Potential Deadlock Detected! @@ -239,7 +239,7 @@ uses a similar format as ThreadSanitizer (https://github.com/google/sanitizers/wiki/ThreadSanitizerDeadlockDetector). -# ./deadlock_detector.py 181 --binary /usr/local/bin/lockinversion +# ./deadlock.py 181 --binary /usr/local/bin/lockinversion Tracing... Hit Ctrl-C to end. ^C @@ -253,7 +253,7 @@ cannot be attached with uprobes. As a workaround, we can create a symlink to the binary, and provide the symlink name instead to the `--binary` option. -# ./deadlock_detector.py 181 --binary /lib/x86_64-linux-gnu/libpthread.so.0 +# ./deadlock.py 181 --binary /lib/x86_64-linux-gnu/libpthread.so.0 Tracing... Hit Ctrl-C to end. ^C @@ -263,7 +263,7 @@ this argument is required and needs to be the path to the pthread shared library used by the executable. -# ./deadlock_detector.py 181 --dump-graph graph.json --verbose +# ./deadlock.py 181 --dump-graph graph.json --verbose Tracing... Hit Ctrl-C to end. 
Mutexes: 0, Edges: 0 @@ -284,7 +284,7 @@ serialize the graph to analyze it later, you can pass the `--dump-graph FILE` flag, and the program will serialize the graph in json. -# ./deadlock_detector.py 181 --lock-symbols custom_mutex1_lock,custom_mutex2_lock --unlock_symbols custom_mutex1_unlock,custom_mutex2_unlock --verbose +# ./deadlock.py 181 --lock-symbols custom_mutex1_lock,custom_mutex2_lock --unlock_symbols custom_mutex1_unlock,custom_mutex2_unlock --verbose Tracing... Hit Ctrl-C to end. Mutexes: 0, Edges: 0 @@ -307,12 +307,12 @@ in false positives. USAGE message: -# ./deadlock_detector.py -h +# ./deadlock.py -h -usage: deadlock_detector.py [-h] [--binary BINARY] [--dump-graph DUMP_GRAPH] - [--verbose] [--lock-symbols LOCK_SYMBOLS] - [--unlock-symbols UNLOCK_SYMBOLS] - pid +usage: deadlock.py [-h] [--binary BINARY] [--dump-graph DUMP_GRAPH] + [--verbose] [--lock-symbols LOCK_SYMBOLS] + [--unlock-symbols UNLOCK_SYMBOLS] + pid Detect potential deadlocks (lock inversions) in a running binary. Must be run as root. @@ -342,24 +342,24 @@ optional arguments: be inlined in the binary. Examples: - deadlock_detector 181 # Analyze PID 181 + deadlock 181 # Analyze PID 181 - deadlock_detector 181 --binary /lib/x86_64-linux-gnu/libpthread.so.0 + deadlock 181 --binary /lib/x86_64-linux-gnu/libpthread.so.0 # Analyze PID 181 and locks from this binary. # If tracing a process that is running from # a dynamically-linked binary, this argument # is required and should be the path to the # pthread library. - deadlock_detector 181 --verbose + deadlock 181 --verbose # Analyze PID 181 and print statistics about # the mutex wait graph. - deadlock_detector 181 --lock-symbols my_mutex_lock1,my_mutex_lock2 \ + deadlock 181 --lock-symbols my_mutex_lock1,my_mutex_lock2 \ --unlock-symbols my_mutex_unlock1,my_mutex_unlock2 # Analyze PID 181 and trace custom mutex # symbols instead of pthread mutexes. 
- deadlock_detector 181 --dump-graph graph.json + deadlock 181 --dump-graph graph.json # Analyze PID 181 and dump the mutex wait # graph to graph.json. From 77f4f663ad567e1ecf4528d25f00af548ac746b9 Mon Sep 17 00:00:00 2001 From: yonghong-song Date: Thu, 24 Jan 2019 12:48:25 -0800 Subject: [PATCH 023/135] fix cpuunclaimed.py with cfs_rq structure change (#2164) Similar to runqlen.py, make proper adjustment for cfs_rq_partial structure so it can align with what the kernel expects. Signed-off-by: Yonghong Song --- tools/cpuunclaimed.py | 69 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 68 insertions(+), 1 deletion(-) diff --git a/tools/cpuunclaimed.py b/tools/cpuunclaimed.py index b862bad27..75ee9324e 100755 --- a/tools/cpuunclaimed.py +++ b/tools/cpuunclaimed.py @@ -62,8 +62,9 @@ from ctypes import c_int import argparse import multiprocessing -from os import getpid, system +from os import getpid, system, open, close, dup, unlink, O_WRONLY import ctypes as ct +from tempfile import NamedTemporaryFile # arguments examples = """examples: @@ -98,6 +99,66 @@ ncpu = multiprocessing.cpu_count() # assume all are online debug = 0 +# Linux 4.15 introduced a new field runnable_weight +# in linux_src:kernel/sched/sched.h as +# struct cfs_rq { +# struct load_weight load; +# unsigned long runnable_weight; +# unsigned int nr_running, h_nr_running; +# ...... +# } +# and this tool requires to access nr_running to get +# runqueue len information. +# +# The commit which introduces cfs_rq->runnable_weight +# field also introduces the field sched_entity->runnable_weight +# where sched_entity is defined in linux_src:include/linux/sched.h. +# +# To cope with pre-4.15 and 4.15/post-4.15 releases, +# we run a simple BPF program to detect whether +# field sched_entity->runnable_weight exists. The existence of +# this field should infer the existence of cfs_rq->runnable_weight. 
+# +# This will need maintenance as the relationship between these +# two fields may change in the future. +# +def check_runnable_weight_field(): + # Define the bpf program for checking purpose + bpf_check_text = """ +#include +unsigned long dummy(struct sched_entity *entity) +{ + return entity->runnable_weight; +} +""" + + # Get a temporary file name + tmp_file = NamedTemporaryFile(delete=False) + tmp_file.close(); + + # Duplicate and close stderr (fd = 2) + old_stderr = dup(2) + close(2) + + # Open a new file, should get fd number 2 + # This will avoid printing llvm errors on the screen + fd = open(tmp_file.name, O_WRONLY) + try: + t = BPF(text=bpf_check_text) + success_compile = True + except: + success_compile = False + + # Release the fd 2, and next dup should restore old stderr + close(fd) + dup(old_stderr) + close(old_stderr) + + # remove the temporary file and return + unlink(tmp_file.name) + return success_compile + + # process arguments if args.fullcsv: args.csv = True @@ -128,6 +189,7 @@ // header. This will need maintenance. 
It is from kernel/sched/sched.h: struct cfs_rq_partial { struct load_weight load; + RUNNABLE_WEIGHT_FIELD unsigned int nr_running, h_nr_running; }; @@ -156,6 +218,11 @@ } """ +if check_runnable_weight_field(): + bpf_text = bpf_text.replace('RUNNABLE_WEIGHT_FIELD', 'unsigned long runnable_weight;') +else: + bpf_text = bpf_text.replace('RUNNABLE_WEIGHT_FIELD', '') + # code substitutions if debug or args.ebpf: print(bpf_text) From 81baae4f1fb2bc245e1ddcf8b75c9b35efd4abb9 Mon Sep 17 00:00:00 2001 From: Brendan Gregg Date: Sat, 26 Jan 2019 21:53:11 -0800 Subject: [PATCH 024/135] profile: exclude CPU idle stacks by default (#2166) profile: exclude CPU idle stacks by default --- man/man8/profile.8 | 3 ++ tools/profile.py | 14 ++++++ tools/profile_example.txt | 91 +++++++++++++-------------------------- 3 files changed, 47 insertions(+), 61 deletions(-) diff --git a/man/man8/profile.8 b/man/man8/profile.8 index abdd6e3bd..e2b6a8438 100644 --- a/man/man8/profile.8 +++ b/man/man8/profile.8 @@ -50,6 +50,9 @@ Show stacks from user space only (no kernel space stacks). \-K Show stacks from kernel space only (no user space stacks). .TP +\-I +Include CPU idle stacks (by default these are excluded). +.TP \-\-stack-storage-size COUNT The maximum number of unique stack traces that the kernel will count (default 16384). If the sampled count exceeds this, a warning will be printed. diff --git a/tools/profile.py b/tools/profile.py index a803b9e0c..89cd5230d 100755 --- a/tools/profile.py +++ b/tools/profile.py @@ -9,6 +9,8 @@ # counting there. Only the unique stacks and counts are passed to user space # at the end of the profile, greatly reducing the kernel<->user transfer. # +# By default CPU idle stacks are excluded by simply excluding PID 0. +# # REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support). Under tools/old is # a version of this tool that may work on Linux 4.6 - 4.8. # @@ -22,6 +24,7 @@ # # 15-Jul-2016 Brendan Gregg Created this. 
# 20-Oct-2016 " " Switched to use the new 4.9 support. +# 26-Jan-2019 " " Changed to exclude CPU idle by default. from __future__ import print_function from bcc import BPF, PerfType, PerfSWConfig @@ -93,6 +96,8 @@ def stack_id_err(stack_id): help="insert delimiter between kernel/user stacks") parser.add_argument("-a", "--annotations", action="store_true", help="add _[k] annotations to kernel frames") +parser.add_argument("-I", "--include-idle", action="store_true", + help="include CPU idle stacks") parser.add_argument("-f", "--folded", action="store_true", help="output folded format, one line per stack (for flame graphs)") parser.add_argument("--stack-storage-size", default=16384, @@ -141,6 +146,9 @@ def stack_id_err(stack_id): int do_perf_event(struct bpf_perf_event_data *ctx) { u32 pid = bpf_get_current_pid_tgid() >> 32; + if (IDLE_FILTER) + return 0; + if (!(THREAD_FILTER)) return 0; @@ -184,6 +192,12 @@ def stack_id_err(stack_id): } """ +# set idle filter +idle_filter = "pid == 0" +if args.include_idle: + idle_filter = "0" +bpf_text = bpf_text.replace('IDLE_FILTER', idle_filter) + # set thread filter thread_context = "" perf_filter = "-a" diff --git a/tools/profile_example.txt b/tools/profile_example.txt index 6fe6f7407..7b1cc2683 100644 --- a/tools/profile_example.txt +++ b/tools/profile_example.txt @@ -41,6 +41,27 @@ Sampling at 49 Hertz of all threads by user + kernel stack... Hit Ctrl-C to end. - func_ab (13549) 5 +The output was long; I truncated some lines ("[...]"). + +This default output prints stack traces, followed by a line to describe the +process (a dash, the process name, and a PID in parenthesis), and then an +integer count of how many times this stack trace was sampled. + +The func_ab process is running the func_a() function, called by main(), +called by __libc_start_main(), and called by "[unknown]" with what looks +like a bogus address (1st column). That's evidence of a broken stack trace. 
+It's common for user-level software that hasn't been compiled with frame +pointers (in this case, libc). + +The dd process has called read(), and then enters the kernel via +entry_SYSCALL_64_fastpath(), calling sys_read(), and so on. Yes, I'm now +reading it bottom up. That way follows the code flow. + + +By default, CPU idle stacks are excluded. They can be included with -I: + +# ./profile -I + [...] native_safe_halt @@ -64,32 +85,16 @@ Sampling at 49 Hertz of all threads by user + kernel stack... Hit Ctrl-C to end. - swapper/1 (0) 75 -The output was long; I truncated some lines ("[...]"). - -This default output prints stack traces, followed by a line to describe the -process (a dash, the process name, and a PID in parenthesis), and then an -integer count of how many times this stack trace was sampled. - The output above shows the most frequent stack was from the "swapper/1" process (PID 0), running the native_safe_halt() function, which was called by default_idle(), which was called by arch_cpu_idle(), and so on. This is the idle thread. Stacks can be read top-down, to follow ancestry: child, parent, grandparent, etc. -The func_ab process is running the func_a() function, called by main(), -called by __libc_start_main(), and called by "[unknown]" with what looks -like a bogus address (1st column). That's evidence of a broken stack trace. -It's common for user-level software that hasn't been compiled with frame -pointers (in this case, libc). - -The dd process has called read(), and then enters the kernel via -entry_SYSCALL_64_fastpath(), calling sys_read(), and so on. Yes, I'm now -reading it bottom up. That way follows the code flow. - -The dd process is actually "dd if=/dev/zero of=/dev/null": it's a simple -workload to analyze that just moves bytes from /dev/zero to /dev/null. 
-Profiling just that process: +The dd process profiled ealrier is actually "dd if=/dev/zero of=/dev/null": +it's a simple workload to analyze that just moves bytes from /dev/zero to +/dev/null. Profiling just that process: # ./profile -p 25036 Sampling at 49 Hertz of PID 25036 by user + kernel stack... Hit Ctrl-C to end. @@ -539,6 +544,8 @@ You can increase or decrease the sample frequency. Eg, sampling at 9 Hertz: # ./profile -F 9 Sampling at 9 Hertz of all threads by user + kernel stack... Hit Ctrl-C to end. ^C +[...] + func_b main __libc_start_main @@ -548,27 +555,6 @@ Sampling at 9 Hertz of all threads by user + kernel stack... Hit Ctrl-C to end. [...] - native_safe_halt - default_idle - arch_cpu_idle - default_idle_call - cpu_startup_entry - start_secondary - - swapper/3 (0) - 8 - - native_safe_halt - default_idle - arch_cpu_idle - default_idle_call - cpu_startup_entry - rest_init - start_kernel - x86_64_start_reservations - x86_64_start_kernel - - swapper/0 (0) - 8 - You can also restrict profiling to just kernel stacks (-K) or user stacks (-U). For example, just user stacks: @@ -707,24 +693,6 @@ Sampling at 49 Hertz of all threads by user stack... Hit Ctrl-C to end. - dd (2931) 14 - - swapper/7 (0) - 46 - - - swapper/0 (0) - 46 - - - swapper/2 (0) - 46 - - - swapper/1 (0) - 46 - - - swapper/3 (0) - 46 - - - swapper/4 (0) - 46 - If there are too many unique stack traces for the kernel to save, a warning will be printed. Eg: @@ -739,8 +707,8 @@ Run ./profile -h to see the default. 
USAGE message: # ./profile -h -usage: profile [-h] [-p PID] [-U | -K] [-F FREQUENCY | -c COUNT] [-d] [-a] - [-f] [--stack-storage-size STACK_STORAGE_SIZE] +usage: profile.py [-h] [-p PID] [-U | -K] [-F FREQUENCY | -c COUNT] [-d] [-a] + [-I] [-f] [--stack-storage-size STACK_STORAGE_SIZE] [-C CPU] [duration] Profile CPU stack traces at a timed interval @@ -763,11 +731,12 @@ optional arguments: sample period, number of events -d, --delimited insert delimiter between kernel/user stacks -a, --annotations add _[k] annotations to kernel frames + -I, --include-idle include CPU idle stacks -f, --folded output folded format, one line per stack (for flame graphs) --stack-storage-size STACK_STORAGE_SIZE the number of unique stack traces that can be stored - and displayed (default 2048) + and displayed (default 16384) -C CPU, --cpu CPU cpu number to run profile on examples: From aa1b904e2aecdfceb838c416dc095f9a7e9cedf7 Mon Sep 17 00:00:00 2001 From: vijunag Date: Tue, 29 Jan 2019 22:06:13 +0530 Subject: [PATCH 025/135] [iovisor/bcc] trace: Incorrect symbol offsets when using build_id (#2161) (#2162) Fix #2161 - bcc_bsymcache API returned absolute address instead of offset from start of the symbol --- src/cc/bcc_syms.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cc/bcc_syms.cc b/src/cc/bcc_syms.cc index b74003d3b..96d431ed4 100644 --- a/src/cc/bcc_syms.cc +++ b/src/cc/bcc_syms.cc @@ -424,7 +424,7 @@ bool BuildSyms::Module::resolve_addr(uint64_t offset, struct bcc_symbol* sym, sym->name = (*it).name->c_str(); if (demangle) sym->demangle_name = sym->name; - sym->offset = (*it).start; + sym->offset = offset - (*it).start; sym->module = module_name_.c_str(); return true; } From e94833fdf8b437ac1f3056e8214f5a767e2272f8 Mon Sep 17 00:00:00 2001 From: Travis Davies Date: Mon, 28 Jan 2019 00:47:56 +0000 Subject: [PATCH 026/135] Add installation instructions for Amazon Linux 1 AMI Sign-Off-By Travis Davies --- INSTALL.md | 18 ++++++++++++++++++ 1 file changed, 
18 insertions(+) diff --git a/INSTALL.md b/INSTALL.md index 7c1934ba4..ecf02d2d9 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -8,6 +8,7 @@ - [Gentoo](#gentoo---portage) - [openSUSE](#opensuse---binary) - [RHEL](#rhel---binary) + - [Amazon Linux 1](#Amazon-Linux-1---Binary) * [Source](#source) - [Debian](#debian---source) - [Ubuntu](#ubuntu---source) @@ -164,6 +165,23 @@ For RHEL 7.6, bcc is already included in the official yum repository as bcc-tool yum install bcc-tools ``` +## Amazon Linux 1 - Binary +Use case 1. Install BCC for latest kernel available in repo: + Tested on Amazon Linux AMI release 2018.03 (kernel 4.14.88-72.73.amzn1.x86_64) +``` +sudo yum update kernel +sudo yum install bcc +sudo reboot +``` + +Use case 2. Install BCC for your AMI's default kernel (no reboot required): + Tested on Amazon Linux AMI release 2018.03 (kernel 4.14.77-70.59.amzn1.x86_64) +``` +sudo yum install kernel-headers-$(uname -r | cut -d'.' -f1-5) +sudo yum install kernel-devel-$(uname -r | cut -d'.' -f1-5) +sudo yum install bcc +``` + # Source ## Debian - Source From 9d5f97216b587a9befb929930b86e908151f8517 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Fri, 1 Feb 2019 16:02:15 +1100 Subject: [PATCH 027/135] Fix debuginfo search on Ubuntu bcc on Ubuntu picks up the glibc binary instead of the debuginfo file because find_debug_via_debuglink() doesn't handle symbolic or hard links when checking if two paths point to the same file. Add a helper, same_file() which uses the device and inode to check if the two paths do point to the same file. Signed-off-by: Anton Blanchard --- src/cc/bcc_elf.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/cc/bcc_elf.c b/src/cc/bcc_elf.c index 991adb95f..ba8fa1d23 100644 --- a/src/cc/bcc_elf.c +++ b/src/cc/bcc_elf.c @@ -453,6 +453,21 @@ static int verify_checksum(const char *file, unsigned int crc) { return actual == crc; } +// Check if two filenames point to the same file, including hard or soft links. 
+static bool same_file(char *a, const char *b) +{ + struct stat stat_a, stat_b; + + if (stat(a, &stat_a) || stat(b, &stat_b)) + return false; + + if ((stat_a.st_dev == stat_b.st_dev) && + (stat_a.st_ino == stat_b.st_ino)) + return true; + else + return false; +} + static char *find_debug_via_debuglink(Elf *e, const char *binpath, int check_crc) { char fullpath[PATH_MAX]; @@ -473,7 +488,7 @@ static char *find_debug_via_debuglink(Elf *e, const char *binpath, // and it might contain poorer symbols (e.g. stripped or partial symbols) // than the external debuginfo that might be available elsewhere. snprintf(fullpath, sizeof(fullpath),"%s/%s", bindir, name); - if (strcmp(fullpath, binpath) != 0 && access(fullpath, F_OK) != -1) { + if (same_file(fullpath, binpath) != true && access(fullpath, F_OK) != -1) { res = strdup(fullpath); goto DONE; } From 6af7b8441176e151b2655d80335fda99b2568a61 Mon Sep 17 00:00:00 2001 From: Brendan Gregg Date: Sat, 2 Feb 2019 12:45:23 -0800 Subject: [PATCH 028/135] cachestat: bring back HITRATIO column --- man/man8/cachestat.8 | 11 ++- tools/cachestat.py | 73 ++++++++----------- tools/cachestat_example.txt | 141 ++++++++++++++++++++++-------------- 3 files changed, 125 insertions(+), 100 deletions(-) diff --git a/man/man8/cachestat.8 b/man/man8/cachestat.8 index 897d5af07..172194d49 100644 --- a/man/man8/cachestat.8 +++ b/man/man8/cachestat.8 @@ -18,18 +18,14 @@ Since this uses BPF, only the root user can use this tool. CONFIG_BPF and bcc. .SH EXAMPLES .TP -Print summaries every five second: +Print summaries every second: # .B cachestat .TP -Print summaries every five seconds with timestamp: +Print summaries every second with timestamp: # .B cachestat -T .TP -Print summaries each second: -# -.B cachestat 1 -.TP Print output every five seconds, three times: # .B cachestat 5 3 @@ -51,6 +47,9 @@ Number of page cache misses. DIRTIES Number of dirty pages added to the page cache. .TP +HITRATIO +The hit ratio as a percentage. 
+.TP READ_HIT% Read hit percent of page cache usage. .TP diff --git a/tools/cachestat.py b/tools/cachestat.py index b00c80434..119fd9cf6 100755 --- a/tools/cachestat.py +++ b/tools/cachestat.py @@ -15,6 +15,7 @@ # 09-Sep-2015 Brendan Gregg Created this. # 06-Nov-2015 Allan McAleavy # 13-Jan-2016 Allan McAleavy run pep8 against program +# 02-Feb-2019 Brendan Gregg Column shuffle, bring back %ratio from __future__ import print_function from bcc import BPF @@ -55,7 +56,7 @@ def get_meminfo(): formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument("-T", "--timestamp", action="store_true", help="include timestamp on output") -parser.add_argument("interval", nargs="?", default=5, +parser.add_argument("interval", nargs="?", default=1, help="output interval, in seconds") parser.add_argument("count", nargs="?", default=-1, help="number of outputs") @@ -102,7 +103,7 @@ def get_meminfo(): if tstamp: print("%-8s " % "TIME", end="") print("%8s %8s %8s %8s %12s %10s" % - ("TOTAL", "MISSES", "HITS", "DIRTIES", "BUFFERS_MB", "CACHED_MB")) + ("HITS", "MISSES", "DIRTIES", "HITRATIO", "BUFFERS_MB", "CACHED_MB")) loop = 0 exiting = 0 @@ -121,38 +122,36 @@ def get_meminfo(): counts = b["counts"] for k, v in sorted(counts.items(), key=lambda counts: counts[1].value): - - if re.match(b'mark_page_accessed', b.ksym(k.ip)) is not None: + func = b.ksym(k.ip) + # partial string matches in case of .isra (necessary?) 
+ if func.find("mark_page_accessed") == 0: mpa = max(0, v.value) - - if re.match(b'mark_buffer_dirty', b.ksym(k.ip)) is not None: + if func.find("mark_buffer_dirty") == 0: mbd = max(0, v.value) - - if re.match(b'add_to_page_cache_lru', b.ksym(k.ip)) is not None: + if func.find("add_to_page_cache_lru") == 0: apcl = max(0, v.value) - - if re.match(b'account_page_dirtied', b.ksym(k.ip)) is not None: + if func.find("account_page_dirtied") == 0: apd = max(0, v.value) - # total = total cache accesses without counting dirties - # misses = total of add to lru because of read misses - total = (mpa - mbd) - misses = (apcl - apd) - - if total < 0: - total = 0 - - if misses < 0: - misses = 0 - - hits = total - misses - - # If hits are < 0, then its possible misses are overestimated - # due to possibly page cache read ahead adding more pages than - # needed. In this case just assume misses as total and reset hits. - if hits < 0: - misses = total - hits = 0 + # total = total cache accesses without counting dirties + # misses = total of add to lru because of read misses + total = mpa - mbd + misses = apcl - apd + if misses < 0: + misses = 0 + if total < 0: + total = 0 + hits = total - misses + + # If hits are < 0, then its possible misses are overestimated + # due to possibly page cache read ahead adding more pages than + # needed. In this case just assume misses as total and reset hits. 
+ if hits < 0: + misses = total + hits = 0 + ratio = 0 + if total > 0: + ratio = float(hits) / total if debug: print("%d %d %d %d %d %d %d\n" % @@ -167,18 +166,10 @@ def get_meminfo(): if tstamp: print("%-8s " % strftime("%H:%M:%S"), end="") - print("%8d %8d %8d %8d %12.0f %10.0f" % - (total, misses, hits, mbd, buff, cached)) - - mpa = 0 - mbd = 0 - apcl = 0 - apd = 0 - total = 0 - misses = 0 - hits = 0 - cached = 0 - buff = 0 + print("%8d %8d %8d %7.2f%% %12.0f %10.0f" % + (hits, misses, mbd, 100 * ratio, buff, cached)) + + mpa = mbd = apcl = apd = total = misses = hits = cached = buff = 0 if exiting: print("Detaching...") diff --git a/tools/cachestat_example.txt b/tools/cachestat_example.txt index 7ecfec6da..dad523534 100644 --- a/tools/cachestat_example.txt +++ b/tools/cachestat_example.txt @@ -1,56 +1,91 @@ -# ./cachestat -h -USAGE: ./cachestat [-T] [ interval [count] ] +Demonstrations of cachestat, the Linux eBPF/bcc version. -show Linux page cache hit/miss statistics including read and write hit % + +cachestat shows hits and misses to the file system page cache. For example: + +# cachestat + HITS MISSES DIRTIES HITRATIO BUFFERS_MB CACHED_MB + 1132 0 4 100.00% 277 4367 + 161 0 36 100.00% 277 4372 + 16 0 28 100.00% 277 4372 + 17154 13750 15 55.51% 277 4422 + 19 0 1 100.00% 277 4422 + 83 0 83 100.00% 277 4421 + 16 0 1 100.00% 277 4423 +^C 0 -19 360 0.00% 277 4423 +Detaching... + +While tracing, there was a burst of misses in the fourth second, bringing +the hit ration down to 55%. 
+ + +This shows a 1 Gbyte uncached file that is read twice: + +(root) ~ # ./cachestat.py + HITS MISSES DIRTIES HITRATIO BUFFERS_MB CACHED_MB + 1 0 0 100.00% 5 191 + 198 12136 0 1.61% 5 238 + 1 11007 3 0.01% 5 281 + 0 6384 0 0.00% 5 306 + 1 14464 0 0.01% 5 363 + 0 11776 0 0.00% 5 409 + 1 11712 0 0.01% 5 454 + 32 13184 0 0.24% 5 506 + 0 11232 0 0.00% 5 550 + 1 13056 0 0.01% 5 601 + 16 14720 0 0.11% 5 658 + 33 9920 0 0.33% 5 697 + 0 13248 0 0.00% 5 749 + 4 14144 0 0.03% 5 804 + 0 9728 0 0.00% 5 842 + 1 10816 0 0.01% 5 885 + 808 13504 1 5.65% 5 938 + 0 11409 0 0.00% 5 982 + 0 11520 0 0.00% 5 1027 + 0 15616 0 0.00% 5 1088 + 1 9792 0 0.01% 5 1126 + 0 8256 0 0.00% 5 1158 + 1 9600 0 0.01% 5 1196 + 599 4804 0 11.09% 5 1215 + 1 0 0 100.00% 5 1215 + 0 0 0 0.00% 5 1215 + 3 1 0 75.00% 5 1215 + 79536 34 0 99.96% 5 1215 + 87693 274 4 99.69% 6 1214 + 89018 3546 0 96.17% 7 1227 + 33531 201 4 99.40% 7 1228 + 22 44 0 33.33% 8 1228 + 0 0 0 0.00% 8 1228 + 73 21 2 77.66% 8 1228 + +It took 24 seconds to read the 1 Gbyte file the first time, shown in the output +by the high MISSES rate and low HITRATIO. The second time it took 4 seconds, +and the HITRATIO was around 99%. + + +This output shows a 1 Gbyte file being created and added to the page cache: + +(root) ~ # ./cachestat.py + HITS MISSES DIRTIES HITRATIO BUFFERS_MB CACHED_MB + 1 0 0 100.00% 8 209 + 0 0 165584 0.00% 8 856 + 0 0 96505 0.00% 8 1233 + 0 0 0 0.00% 8 1233 + +Note the high rate of DIRTIES, and the CACHED_MD size increases by 1024 Mbytes. 
+ + +USAGE message: + +# cachestat -h +usage: cachestat.py [-h] [-T] [interval] [count] + +Count cache kernel function calls + +positional arguments: + interval output interval, in seconds + count number of outputs optional arguments: - -T include timestamp on output - -examples: - ./cachestat # run with default option of 5 seconds delay - ./cachestat -T # run with default option of 5 seconds delay with timestamps - ./cachestat 1 # print every second hit/miss stats - ./cachestat -T 1 # include timestamps with one second samples - ./cachestat 1 5 # run with interval of one second for five iterations - ./cachestat -T 1 5 # include timestamps with interval of one second for five iterations - - -Following commands show a 2GB file being read into the page cache. - -Command used to generate activity: -# dd if=/root/tmpfile of=/dev/null bs=8192 - -Output from cachestat running simultatenously: -# ./tools/cachestat.py 1 - TOTAL MISSES HITS DIRTIES BUFFERS_MB CACHED_MB - 1 0 1 0 8 283 - 0 0 0 0 8 283 - 0 0 0 2 8 283 - 0 0 0 0 8 283 - 10009 9173 836 2 9 369 - 152032 152032 0 0 9 1028 - 157408 157405 3 0 9 1707 - 150432 150432 0 0 9 2331 - 0 0 0 0 9 2331 - 1 1 0 1 9 2331 - 0 0 0 0 9 2331 - 0 0 0 0 9 2331 - 0 0 0 0 9 2331 - -The misses counter reflects a 2GB file being read and almost everything being -a page cache miss. - -Below shows an example of a new 100MB file added to page cache, by using -the command: dd if=/dev/zero of=/root/tmpfile2 bs=4k count=$((256*100)) - -# ./tools/cachestat.py 1 - TOTAL MISSES HITS DIRTIES BUFFERS_MB CACHED_MB - 0 0 0 0 15 2440 - 0 0 0 0 15 2440 - 0 0 0 0 15 2440 - 1758 0 1758 25603 15 2540 - 0 0 0 0 15 2540 - 0 0 0 0 15 2541 - -~25600 pages are being dirtied (writes) which corresponds to the 100MB file -added to the page cache. 
+ -h, --help show this help message and exit + -T, --timestamp include timestamp on output From 10c8bd33214ef82f6c9a2a9920b92af6be67dab1 Mon Sep 17 00:00:00 2001 From: Xiaozhou Liu Date: Sun, 3 Feb 2019 15:50:21 +0800 Subject: [PATCH 029/135] examples/tracing/bitehist.py: add example of linear histogram (#2177) Add an example of linear histogram, which might be useful to newcomers. And also update the comments. --- examples/tracing/bitehist.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/examples/tracing/bitehist.py b/examples/tracing/bitehist.py index c8c7f7a65..4d7c7958b 100755 --- a/examples/tracing/bitehist.py +++ b/examples/tracing/bitehist.py @@ -3,15 +3,15 @@ # bitehist.py Block I/O size histogram. # For Linux, uses BCC, eBPF. Embedded C. # -# Written as a basic example of using a histogram to show a distribution. +# Written as a basic example of using histograms to show a distribution. # -# The default interval is 5 seconds. A Ctrl-C will print the partially -# gathered histogram then exit. +# A Ctrl-C will print the gathered histogram then exit. # # Copyright (c) 2015 Brendan Gregg. # Licensed under the Apache License, Version 2.0 (the "License") # # 15-Aug-2015 Brendan Gregg Created this. +# 03-Feb-2019 Xiaozhou Liu added linear histogram. 
from __future__ import print_function from bcc import BPF @@ -23,10 +23,12 @@ #include BPF_HISTOGRAM(dist); +BPF_HISTOGRAM(dist_linear); int kprobe__blk_account_io_completion(struct pt_regs *ctx, struct request *req) { dist.increment(bpf_log2l(req->__data_len / 1024)); + dist_linear.increment(req->__data_len / 1024); return 0; } """) @@ -41,4 +43,10 @@ print() # output +print("log2 histogram") +print("~~~~~~~~~~~~~~") b["dist"].print_log2_hist("kbytes") + +print("\nlinear histogram") +print("~~~~~~~~~~~~~~~~") +b["dist_linear"].print_linear_hist("kbytes") From 5f7c82947f4b19abe999b5931404db79c6a079ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20St=C3=BChrk?= Date: Sun, 3 Feb 2019 08:50:49 +0100 Subject: [PATCH 030/135] Fix tools/syscount -l (#2180) In #2063, syscount's syscall mapping was moved into its own module. Unfortunately, that broke the "print list of recognized syscalls and exit" usage of syscount. Fix it by importing the syscall mapping from the new module. --- tools/syscount.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/syscount.py b/tools/syscount.py index 486953cc2..6cbea1162 100755 --- a/tools/syscount.py +++ b/tools/syscount.py @@ -17,7 +17,7 @@ import signal from bcc import BPF from bcc.utils import printb -from bcc.syscall import syscall_name +from bcc.syscall import syscall_name, syscalls if sys.version_info.major < 3: izip_longest = itertools.izip_longest From 922f1ab41626e67308c0186f6a3c26fdf16fa185 Mon Sep 17 00:00:00 2001 From: Xiaozhou Liu Date: Sun, 3 Feb 2019 21:27:59 +0800 Subject: [PATCH 031/135] examples/tracing: some minor fixes - chmod +x dddos.py mallocstacks.py stack_buildid_example.py - Handle Ctrl-C keyboard interrupt for dddos.py, disksnoop.py, hello_perf_output.py, stacksnoop.py and trace_perf_output.py --- examples/tracing/dddos.py | 5 ++++- examples/tracing/disksnoop.py | 23 +++++++++++++---------- examples/tracing/hello_perf_output.py | 5 ++++- examples/tracing/mallocstacks.py | 0 
examples/tracing/stack_buildid_example.py | 0 examples/tracing/stacksnoop.py | 5 ++++- examples/tracing/trace_perf_output.py | 5 ++++- 7 files changed, 29 insertions(+), 14 deletions(-) mode change 100644 => 100755 examples/tracing/dddos.py mode change 100644 => 100755 examples/tracing/mallocstacks.py mode change 100644 => 100755 examples/tracing/stack_buildid_example.py diff --git a/examples/tracing/dddos.py b/examples/tracing/dddos.py old mode 100644 new mode 100755 index e72ba7073..5b544241c --- a/examples/tracing/dddos.py +++ b/examples/tracing/dddos.py @@ -99,4 +99,7 @@ def trigger_alert_event(cpu, data, size): # loop with callback to trigger_alert_event b["events"].open_perf_buffer(trigger_alert_event) while 1: - b.perf_buffer_poll() + try: + b.perf_buffer_poll() + except KeyboardInterrupt: + exit() diff --git a/examples/tracing/disksnoop.py b/examples/tracing/disksnoop.py index ed3dd819d..17d911a12 100755 --- a/examples/tracing/disksnoop.py +++ b/examples/tracing/disksnoop.py @@ -51,15 +51,18 @@ # format output while 1: - (task, pid, cpu, flags, ts, msg) = b.trace_fields() - (bytes_s, bflags_s, us_s) = msg.split() + try: + (task, pid, cpu, flags, ts, msg) = b.trace_fields() + (bytes_s, bflags_s, us_s) = msg.split() - if int(bflags_s, 16) & REQ_WRITE: - type_s = "W" - elif bytes_s == "0": # see blk_fill_rwbs() for logic - type_s = "M" - else: - type_s = "R" - ms = float(int(us_s, 10)) / 1000 + if int(bflags_s, 16) & REQ_WRITE: + type_s = "W" + elif bytes_s == "0": # see blk_fill_rwbs() for logic + type_s = "M" + else: + type_s = "R" + ms = float(int(us_s, 10)) / 1000 - print("%-18.9f %-2s %-7s %8.2f" % (ts, type_s, bytes_s, ms)) + print("%-18.9f %-2s %-7s %8.2f" % (ts, type_s, bytes_s, ms)) + except KeyboardInterrupt: + exit() diff --git a/examples/tracing/hello_perf_output.py b/examples/tracing/hello_perf_output.py index 7decd5808..64cfb63fc 100755 --- a/examples/tracing/hello_perf_output.py +++ b/examples/tracing/hello_perf_output.py @@ -58,4 +58,7 @@ def 
print_event(cpu, data, size): # loop with callback to print_event b["events"].open_perf_buffer(print_event) while 1: - b.perf_buffer_poll() + try: + b.perf_buffer_poll() + except KeyboardInterrupt: + exit() diff --git a/examples/tracing/mallocstacks.py b/examples/tracing/mallocstacks.py old mode 100644 new mode 100755 diff --git a/examples/tracing/stack_buildid_example.py b/examples/tracing/stack_buildid_example.py old mode 100644 new mode 100755 diff --git a/examples/tracing/stacksnoop.py b/examples/tracing/stacksnoop.py index bced93f13..0ade2dbba 100755 --- a/examples/tracing/stacksnoop.py +++ b/examples/tracing/stacksnoop.py @@ -120,4 +120,7 @@ def print_event(cpu, data, size): b["events"].open_perf_buffer(print_event) while 1: - b.perf_buffer_poll() + try: + b.perf_buffer_poll() + except KeyboardInterrupt: + exit() diff --git a/examples/tracing/trace_perf_output.py b/examples/tracing/trace_perf_output.py index 26333c896..35a579573 100755 --- a/examples/tracing/trace_perf_output.py +++ b/examples/tracing/trace_perf_output.py @@ -53,4 +53,7 @@ def print_counter(): print("Tracing " + event_name + ", try `dd if=/dev/zero of=/dev/null`") print("Tracing... Hit Ctrl-C to end.") while 1: - b.perf_buffer_poll() + try: + b.perf_buffer_poll() + except KeyboardInterrupt: + exit() From 41ffdd17c4e5a3a6097cbcedbb43548d91fa0a3d Mon Sep 17 00:00:00 2001 From: Xiaozhou Liu Date: Mon, 4 Feb 2019 00:40:25 +0800 Subject: [PATCH 032/135] docs: fix broken link of bpf_log2l(#2176) docs: fix broken link of bpf_log2l() --- docs/reference_guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/reference_guide.md b/docs/reference_guide.md index 79d510318..e7f82ad89 100644 --- a/docs/reference_guide.md +++ b/docs/reference_guide.md @@ -23,7 +23,7 @@ This guide is incomplete. If something feels missing, check the bcc and kernel s - [5. bpf_get_current_uid_gid()](#5-bpf_get_current_uid_gid) - [6. bpf_get_current_comm()](#6-bpf_get_current_comm) - [7. 
bpf_get_current_task()](#7-bpf_get_current_task) - - [8. bpf_log2l()](#8-bpflog2l) + - [8. bpf_log2l()](#8-bpf_log2l) - [9. bpf_get_prandom_u32()](#9-bpf_get_prandom_u32) - [Debugging](#debugging) - [1. bpf_override_return()](#1-bpf_override_return) From a11fa0c2fda8d69fdb5fb835c1148845bb654ad5 Mon Sep 17 00:00:00 2001 From: yonghong-song Date: Mon, 4 Feb 2019 15:11:19 -0800 Subject: [PATCH 033/135] sync with latest libbpf repo (#2183) the libbpf is just sync'ed with latest bpf-next. Signed-off-by: Yonghong Song --- src/cc/libbpf | 2 +- src/cc/libbpf.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/cc/libbpf b/src/cc/libbpf index d5b146fec..30388a7af 160000 --- a/src/cc/libbpf +++ b/src/cc/libbpf @@ -1 +1 @@ -Subproject commit d5b146fec50d7aa126fe98323aeaee688d4af289 +Subproject commit 30388a7afd8ad8b237ff44f58cad6db3ec03edc6 diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c index 306ac8178..c7f5fa9fd 100644 --- a/src/cc/libbpf.c +++ b/src/cc/libbpf.c @@ -52,6 +52,7 @@ #include "setns.h" #include "libbpf/src/bpf.h" +#include "libbpf/src/libbpf.h" // TODO: remove these defines when linux-libc-dev exports them properly From 16fd2f6059cfb750d2e8683eebac5bfcaa7cfd4d Mon Sep 17 00:00:00 2001 From: yonghong-song Date: Mon, 4 Feb 2019 20:14:48 -0800 Subject: [PATCH 034/135] sync with latest bpf (#2184) The previous sync had a bug in libbpf libbpf_print() function. 
This sync brought in the bug fix Signed-off-by: Yonghong Song --- src/cc/libbpf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cc/libbpf b/src/cc/libbpf index 30388a7af..b19c6dcf6 160000 --- a/src/cc/libbpf +++ b/src/cc/libbpf @@ -1 +1 @@ -Subproject commit 30388a7afd8ad8b237ff44f58cad6db3ec03edc6 +Subproject commit b19c6dcf623a7adc9e538ddbe2964c2f58dd2417 From 929348e99f2ee5dc8e8dcefdd691b3258cf1c08c Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Tue, 5 Feb 2019 23:32:45 +0100 Subject: [PATCH 035/135] Fix misc file permissions (#2185) A few files that are obviously not executable have the exec bit set. Most those files were created with faulty mode to begin with, but src/cc/export/helpers.h and src/cc/frontends/clang/loader.cc have had their exec bit (unintendedly I assume) set by commit bfecc243fc8e ("clang: loader: Allow user to override kernel version (#1895)"). --- examples/tracing/urandomread_example.txt | 0 examples/usdt_sample/CMakeLists.txt | 0 examples/usdt_sample/usdt_sample_app1/CMakeLists.txt | 0 examples/usdt_sample/usdt_sample_lib1/CMakeLists.txt | 0 src/cc/export/helpers.h | 0 src/cc/frontends/clang/loader.cc | 0 6 files changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 examples/tracing/urandomread_example.txt mode change 100755 => 100644 examples/usdt_sample/CMakeLists.txt mode change 100755 => 100644 examples/usdt_sample/usdt_sample_app1/CMakeLists.txt mode change 100755 => 100644 examples/usdt_sample/usdt_sample_lib1/CMakeLists.txt mode change 100755 => 100644 src/cc/export/helpers.h mode change 100755 => 100644 src/cc/frontends/clang/loader.cc diff --git a/examples/tracing/urandomread_example.txt b/examples/tracing/urandomread_example.txt old mode 100755 new mode 100644 diff --git a/examples/usdt_sample/CMakeLists.txt b/examples/usdt_sample/CMakeLists.txt old mode 100755 new mode 100644 diff --git a/examples/usdt_sample/usdt_sample_app1/CMakeLists.txt 
b/examples/usdt_sample/usdt_sample_app1/CMakeLists.txt old mode 100755 new mode 100644 diff --git a/examples/usdt_sample/usdt_sample_lib1/CMakeLists.txt b/examples/usdt_sample/usdt_sample_lib1/CMakeLists.txt old mode 100755 new mode 100644 diff --git a/src/cc/export/helpers.h b/src/cc/export/helpers.h old mode 100755 new mode 100644 diff --git a/src/cc/frontends/clang/loader.cc b/src/cc/frontends/clang/loader.cc old mode 100755 new mode 100644 From 518bd445b3002f6548bd54dd848767b9a4e9c88f Mon Sep 17 00:00:00 2001 From: Xiaozhou Liu Date: Wed, 6 Feb 2019 07:24:22 +0800 Subject: [PATCH 036/135] docs: references_guide.md: add/fix search examples/tools links (#2186) add/fix search examples/tools links in references_guide.md --- docs/reference_guide.md | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/docs/reference_guide.md b/docs/reference_guide.md index e7f82ad89..b0fa8d759 100644 --- a/docs/reference_guide.md +++ b/docs/reference_guide.md @@ -752,7 +752,8 @@ Syntax: ```map.insert(&key, &val)``` Associate the value in the second argument to the key, only if there was no previous value. Examples in situ: -[search /examples](https://github.com/iovisor/bcc/search?q=insert+path%3Aexamples&type=Code) +[search /examples](https://github.com/iovisor/bcc/search?q=insert+path%3Aexamples&type=Code), +[search /tools](https://github.com/iovisor/bcc/search?q=insert+path%3Atools&type=Code) ### 17. map.increment() @@ -1043,7 +1044,9 @@ b.attach_tracepoint("random:urandom_read", "printarg") Notice how the first argument to ```printarg()``` is now our defined struct. 
Examples in situ: -[code](https://github.com/iovisor/bcc/blob/a4159da8c4ea8a05a3c6e402451f530d6e5a8b41/examples/tracing/urandomread-explicit.py#L41) +[code](https://github.com/iovisor/bcc/blob/a4159da8c4ea8a05a3c6e402451f530d6e5a8b41/examples/tracing/urandomread-explicit.py#L41), +[search /examples](https://github.com/iovisor/bcc/search?q=attach_tracepoint+path%3Aexamples+language%3Apython&type=Code), +[search /tools](https://github.com/iovisor/bcc/search?q=attach_tracepoint+path%3Atools+language%3Apython&type=Code) ### 4. attach_uprobe() @@ -1306,8 +1309,8 @@ for k, v in sorted(counts.items(), key=lambda counts: counts[1].value): This example also uses the ```sorted()``` method to sort by value. Examples in situ: -[search /examples](https://github.com/iovisor/bcc/search?q=clear+items%3Aexamples+language%3Apython&type=Code), -[search /tools](https://github.com/iovisor/bcc/search?q=clear+items%3Atools+language%3Apython&type=Code) +[search /examples](https://github.com/iovisor/bcc/search?q=items+path%3Aexamples+language%3Apython&type=Code), +[search /tools](https://github.com/iovisor/bcc/search?q=items+path%3Atools+language%3Apython&type=Code) ### 4. values() From bf900f77add6b50a329e3495e207b4d61da14953 Mon Sep 17 00:00:00 2001 From: yonghong-song Date: Thu, 7 Feb 2019 23:03:40 -0800 Subject: [PATCH 037/135] sync with latest libbpf repo (#2189) updated libbpf repo commit id. Changed helpers bpf_msg_pop_data() and bpf_rc_pointer_rel() kernel version from 4.21 to 5.0. Add two new helpers bpf_spin_lock() and bpf_spin_unlock() to src/cc/libbpf.c, sec/cc/export/helper.h and docs/kernel-versions.md. 
Signed-off-by: Yonghong Song --- docs/kernel-versions.md | 8 +++++--- src/cc/compat/linux/virtual_bpf.h | 10 +++++++++- src/cc/export/helpers.h | 4 ++++ src/cc/libbpf | 2 +- src/cc/libbpf.c | 6 ++++-- 5 files changed, 23 insertions(+), 7 deletions(-) diff --git a/docs/kernel-versions.md b/docs/kernel-versions.md index 48e6d2e08..7842bf4e2 100644 --- a/docs/kernel-versions.md +++ b/docs/kernel-versions.md @@ -187,7 +187,7 @@ Helper | Kernel version | License | Commit | `BPF_FUNC_map_update_elem()` | 3.19 | | [`d0003ec01c66`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=d0003ec01c667b731c139e23de3306a8b328ccf5) `BPF_FUNC_msg_apply_bytes()` | 4.17 | | [`2a100317c9eb`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=2a100317c9ebc204a166f16294884fbf9da074ce) `BPF_FUNC_msg_cork_bytes()` | 4.17 | | [`91843d540a13`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=91843d540a139eb8070bcff8aa10089164436deb) -`BPF_FUNC_msg_pop_data()` | 4.21 | | [`7246d8ed4dcc`](https://github.com/torvalds/linux/commit/7246d8ed4dcce23f7509949a77be15fa9f0e3d28) +`BPF_FUNC_msg_pop_data()` | 5.0 | | [`7246d8ed4dcc`](https://github.com/torvalds/linux/commit/7246d8ed4dcce23f7509949a77be15fa9f0e3d28) `BPF_FUNC_msg_pull_data()` | 4.17 | | [`015632bb30da`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=015632bb30daaaee64e1bcac07570860e0bf3092) `BPF_FUNC_msg_push_data()` | 4.20 | | [`6fff607e2f14`](https://github.com/torvalds/linux/commit/6fff607e2f14bd7c63c06c464a6f93b8efbabe28) `BPF_FUNC_msg_redirect_hash()` | 4.18 | | [`81110384441a`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=81110384441a59cff47430f20f049e69b98c17f4) @@ -200,7 +200,7 @@ Helper | Kernel version | License | Commit | `BPF_FUNC_probe_read_str()` | 4.11 | GPL | [`a5e8c07059d0`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=a5e8c07059d0f0b31737408711d44794928ac218) 
`BPF_FUNC_probe_write_user()` | 4.8 | GPL | [`96ae52279594`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=96ae52279594470622ff0585621a13e96b700600) `BPF_FUNC_rc_keydown()` | 4.18 | GPL | [`f4364dcfc86d`](https://git.kernel.org/cgit/linux/kernel/git/bpf/bpf-next.git/commit/?id=f4364dcfc86df7c1ca47b256eaf6b6d0cdd0d936) -`BPF_FUNC_rc_pointer_rel()` | 4.21 | GPL | [`01d3240a04f4`](https://github.com/torvalds/linux/commit/01d3240a04f4c09392e13c77b54d4423ebce2d72) +`BPF_FUNC_rc_pointer_rel()` | 5.0 | GPL | [`01d3240a04f4`](https://github.com/torvalds/linux/commit/01d3240a04f4c09392e13c77b54d4423ebce2d72) `BPF_FUNC_rc_repeat()` | 4.18 | GPL | [`f4364dcfc86d`](https://git.kernel.org/cgit/linux/kernel/git/bpf/bpf-next.git/commit/?id=f4364dcfc86df7c1ca47b256eaf6b6d0cdd0d936) `BPF_FUNC_redirect()` | 4.4 | | [`27b29f63058d`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=27b29f63058d26c6c1742f1993338280d5a41dc6) `BPF_FUNC_redirect_map()` | 4.14 | | [`97f91a7cf04f`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=97f91a7cf04ff605845c20948b8a80e54cbd3376) @@ -234,6 +234,8 @@ Helper | Kernel version | License | Commit | `BPF_FUNC_skb_vlan_push()` | 4.3 | | [`4e10df9a60d9`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=4e10df9a60d96ced321dd2af71da558c6b750078) `BPF_FUNC_sock_hash_update()` | 4.18 | | [`81110384441a`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=81110384441a59cff47430f20f049e69b98c17f4) `BPF_FUNC_sock_map_update()` | 4.14 | | [`174a79ff9515`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=174a79ff9515f400b9a6115643dafd62a635b7e6) +`BPF_FUNC_spin_lock()` | 5.1 | | [`d83525ca62cf`](https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git/commit/?id=d83525ca62cf8ebe3271d14c36fb900c294274a2) +`BPF_FUNC_spin_unlock()` | 5.1 | | 
[`d83525ca62cf`](https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git/commit/?id=d83525ca62cf8ebe3271d14c36fb900c294274a2) `BPF_FUNC_tail_call()` | 4.2 | | [`04fd61ab36ec`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=04fd61ab36ec065e194ab5e74ae34a5240d992bb) `BPF_FUNC_trace_printk()` | 4.1 | GPL | [`9c959c863f82`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=9c959c863f8217a2ff3d7c296e8223654d240569) `BPF_FUNC_xdp_adjust_head()` | 4.10 | | [`17bedab27231`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=17bedab2723145d17b14084430743549e6943d03) @@ -284,6 +286,6 @@ The list of program types and supported helper functions can be retrieved with: |Function Group| Functions| |------------------|-------| -|`Base functions`| `BPF_FUNC_map_lookup_elem()`
`BPF_FUNC_map_update_elem()`
`BPF_FUNC_map_delete_elem()`
`BPF_FUNC_map_peek_elem()`
`BPF_FUNC_map_pop_elem()`
`BPF_FUNC_map_push_elem()`
`BPF_FUNC_get_prandom_u32()`
`BPF_FUNC_get_smp_processor_id()`
`BPF_FUNC_get_numa_node_id()`
`BPF_FUNC_tail_call()`
`BPF_FUNC_ktime_get_ns()`
`BPF_FUNC_trace_printk()`| +|`Base functions`| `BPF_FUNC_map_lookup_elem()`
`BPF_FUNC_map_update_elem()`
`BPF_FUNC_map_delete_elem()`
`BPF_FUNC_map_peek_elem()`
`BPF_FUNC_map_pop_elem()`
`BPF_FUNC_map_push_elem()`
`BPF_FUNC_get_prandom_u32()`
`BPF_FUNC_get_smp_processor_id()`
`BPF_FUNC_get_numa_node_id()`
`BPF_FUNC_tail_call()`
`BPF_FUNC_ktime_get_ns()`
`BPF_FUNC_trace_printk()`
`BPF_FUNC_spin_lock()`
`BPF_FUNC_spin_unlock()` | |`Tracing functions`|`BPF_FUNC_map_lookup_elem()`
`BPF_FUNC_map_update_elem()`
`BPF_FUNC_map_delete_elem()`
`BPF_FUNC_probe_read()`
`BPF_FUNC_ktime_get_ns()`
`BPF_FUNC_tail_call()`
`BPF_FUNC_get_current_pid_tgid()`
`BPF_FUNC_get_current_task()`
`BPF_FUNC_get_current_uid_gid()`
`BPF_FUNC_get_current_comm()`
`BPF_FUNC_trace_printk()`
`BPF_FUNC_get_smp_processor_id()`
`BPF_FUNC_get_numa_node_id()`
`BPF_FUNC_perf_event_read()`
`BPF_FUNC_probe_write_user()`
`BPF_FUNC_current_task_under_cgroup()`
`BPF_FUNC_get_prandom_u32()`
`BPF_FUNC_probe_read_str()`
`BPF_FUNC_get_current_cgroup_id()` | |`LWT functions`| `BPF_FUNC_skb_load_bytes()`
`BPF_FUNC_skb_pull_data()`
`BPF_FUNC_csum_diff()`
`BPF_FUNC_get_cgroup_classid()`
`BPF_FUNC_get_route_realm()`
`BPF_FUNC_get_hash_recalc()`
`BPF_FUNC_perf_event_output()`
`BPF_FUNC_get_smp_processor_id()`
`BPF_FUNC_skb_under_cgroup()`| diff --git a/src/cc/compat/linux/virtual_bpf.h b/src/cc/compat/linux/virtual_bpf.h index e4be3c460..9061382d2 100644 --- a/src/cc/compat/linux/virtual_bpf.h +++ b/src/cc/compat/linux/virtual_bpf.h @@ -15,6 +15,7 @@ R"********( /* Extended instruction set based on top of classic BPF */ /* instruction classes */ +#define BPF_JMP32 0x06 /* jmp mode in word width */ #define BPF_ALU64 0x07 /* alu mode in double word width */ /* ld/ldx fields */ @@ -267,6 +268,7 @@ enum bpf_attach_type { #define BPF_ANY 0 /* create new element or update existing */ #define BPF_NOEXIST 1 /* create new element if it didn't exist */ #define BPF_EXIST 2 /* update existing element */ +#define BPF_F_LOCK 4 /* spin_lock-ed map_lookup/map_update */ /* flags for BPF_MAP_CREATE command */ #define BPF_F_NO_PREALLOC (1U << 0) @@ -2422,7 +2424,9 @@ union bpf_attr { FN(map_peek_elem), \ FN(msg_push_data), \ FN(msg_pop_data), \ - FN(rc_pointer_rel), + FN(rc_pointer_rel), \ + FN(spin_lock), \ + FN(spin_unlock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2541,6 +2545,7 @@ struct __sk_buff { __bpf_md_ptr(struct bpf_flow_keys *, flow_keys); __u64 tstamp; __u32 wire_len; + __u32 gso_segs; }; struct bpf_tunnel_key { @@ -3055,5 +3060,8 @@ struct bpf_line_info { __u32 line_col; }; +struct bpf_spin_lock { + __u32 val; +}; #endif /* _UAPI__LINUX_BPF_H__ */ )********" diff --git a/src/cc/export/helpers.h b/src/cc/export/helpers.h index 9b2c04caa..c025da5a2 100644 --- a/src/cc/export/helpers.h +++ b/src/cc/export/helpers.h @@ -461,6 +461,10 @@ static int (*bpf_msg_pop_data)(void *msg, u32 start, u32 pop, u64 flags) = (void *) BPF_FUNC_msg_pop_data; static int (*bpf_rc_pointer_rel)(void *ctx, s32 rel_x, s32 rel_y) = (void *) BPF_FUNC_rc_pointer_rel; +static void (*bpf_spin_lock)(struct bpf_spin_lock *lock) = + (void *) BPF_FUNC_spin_lock; +static void (*bpf_spin_unlock)(struct bpf_spin_lock *lock) = + (void 
*) BPF_FUNC_spin_unlock; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions diff --git a/src/cc/libbpf b/src/cc/libbpf index b19c6dcf6..f0bcba631 160000 --- a/src/cc/libbpf +++ b/src/cc/libbpf @@ -1 +1 @@ -Subproject commit b19c6dcf623a7adc9e538ddbe2964c2f58dd2417 +Subproject commit f0bcba631dec4540fc6ab2cd0a0923a111cf4cf2 diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c index c7f5fa9fd..0cf788dd7 100644 --- a/src/cc/libbpf.c +++ b/src/cc/libbpf.c @@ -187,8 +187,10 @@ static struct bpf_helper helpers[] = { {"map_pop_elem", "4.20"}, {"map_peak_elem", "4.20"}, {"msg_push_data", "4.20"}, - {"msg_pop_data", "4.21"}, - {"rc_pointer_rel", "4.21"}, + {"msg_pop_data", "5.0"}, + {"rc_pointer_rel", "5.0"}, + {"spin_lock", "5.1"}, + {"spin_unlock", "5.1"}, }; static uint64_t ptr_to_u64(void *ptr) From c217b25390299cabbe72fb718c09ab56906515a0 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 7 Feb 2019 23:37:04 -0800 Subject: [PATCH 038/135] add bcc_create_map_xattr() and refactor bcc_create_map() Added bcc_create_map_xattr() function which takes a libbpf bpf_create_map_attr pointer as the argument. This api will be later used to create maps with btf fd and key/value btf type ids. bcc_create_map() is refactored to use bcc_create_map_xattr(). Signed-off-by: Yonghong Song --- src/cc/libbpf.c | 36 ++++++++++++++++++++++++------------ src/cc/libbpf.h | 3 +++ 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c index 0cf788dd7..76afc02c6 100644 --- a/src/cc/libbpf.c +++ b/src/cc/libbpf.c @@ -198,19 +198,17 @@ static uint64_t ptr_to_u64(void *ptr) return (uint64_t) (unsigned long) ptr; } -int bcc_create_map(enum bpf_map_type map_type, const char *name, - int key_size, int value_size, - int max_entries, int map_flags) +int bcc_create_map_xattr(struct bpf_create_map_attr *attr) { - size_t name_len = name ? 
strlen(name) : 0; - char map_name[BPF_OBJ_NAME_LEN]; + size_t name_len = attr->name ? strlen(attr->name) : 0; + char map_name[BPF_OBJ_NAME_LEN] = {}; - memcpy(map_name, name, min(name_len, BPF_OBJ_NAME_LEN - 1)); - int ret = bpf_create_map_name(map_type, map_name, key_size, value_size, - max_entries, map_flags); + memcpy(map_name, attr->name, min(name_len, BPF_OBJ_NAME_LEN - 1)); + attr->name = map_name; + int ret = bpf_create_map_xattr(attr); if (ret < 0 && name_len && (errno == E2BIG || errno == EINVAL)) { - ret = bpf_create_map(map_type, key_size, value_size, - max_entries, map_flags); + map_name[0] = '\0'; + ret = bpf_create_map_xattr(attr); } if (ret < 0 && errno == EPERM) { @@ -221,13 +219,27 @@ int bcc_create_map(enum bpf_map_type map_type, const char *name, rl.rlim_max = RLIM_INFINITY; rl.rlim_cur = rl.rlim_max; if (setrlimit(RLIMIT_MEMLOCK, &rl) == 0) - ret = bpf_create_map(map_type, key_size, value_size, - max_entries, map_flags); + ret = bpf_create_map_xattr(attr); } } return ret; } +int bcc_create_map(enum bpf_map_type map_type, const char *name, + int key_size, int value_size, + int max_entries, int map_flags) +{ + struct bpf_create_map_attr attr = {}; + + attr.map_type = map_type; + attr.name = name; + attr.key_size = key_size; + attr.value_size = value_size; + attr.max_entries = max_entries; + attr.map_flags = map_flags; + return bcc_create_map_xattr(&attr); +} + int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags) { return bpf_map_update_elem(fd, key, value, flags); diff --git a/src/cc/libbpf.h b/src/cc/libbpf.h index fdcd65caa..d2c6fa6d4 100644 --- a/src/cc/libbpf.h +++ b/src/cc/libbpf.h @@ -26,6 +26,8 @@ extern "C" { #endif +struct bpf_create_map_attr; + enum bpf_probe_attach_type { BPF_PROBE_ENTRY, BPF_PROBE_RETURN @@ -34,6 +36,7 @@ enum bpf_probe_attach_type { int bcc_create_map(enum bpf_map_type map_type, const char *name, int key_size, int value_size, int max_entries, int map_flags); +int bcc_create_map_xattr(struct 
bpf_create_map_attr *attr); int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags); int bpf_lookup_elem(int fd, void *key, void *value); int bpf_delete_elem(int fd, void *key); From 8c06807ea5746f8de0a8a1ce3b165b209115c50c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 8 Feb 2019 00:12:17 -0800 Subject: [PATCH 039/135] add bcc_prog_load_xattr() and refactor bcc_prog_load() Added bcc_prog_load_xattr() function which takes a libbpf bpf_load_program_attr pointer (among others) as an argument. This api will be later used to load bpf programs with btf fd, func_info and line_info. bcc_prog_load() is refactored to use bcc_prog_load_xattr(). Signed-off-by: Yonghong Song --- src/cc/libbpf.c | 93 +++++++++++++++++++++++++------------------------ src/cc/libbpf.h | 6 +++- 2 files changed, 53 insertions(+), 46 deletions(-) diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c index 76afc02c6..0c9af6960 100644 --- a/src/cc/libbpf.c +++ b/src/cc/libbpf.c @@ -454,40 +454,31 @@ int bpf_prog_get_tag(int fd, unsigned long long *ptag) return 0; } -int bcc_prog_load(enum bpf_prog_type prog_type, const char *name, - const struct bpf_insn *insns, int prog_len, - const char *license, unsigned kern_version, - int log_level, char *log_buf, unsigned log_buf_size) +int bcc_prog_load_xattr(struct bpf_load_program_attr *attr, int prog_len, + char *log_buf, unsigned log_buf_size) { - size_t name_len = name ? strlen(name) : 0; - union bpf_attr attr; - char *tmp_log_buf = NULL; - unsigned tmp_log_buf_size = 0; + size_t name_len = attr->name ? 
strlen(attr->name) : 0; + char *tmp_log_buf = NULL, *attr_log_buf = NULL; + unsigned tmp_log_buf_size = 0, attr_log_buf_size = 0; int ret = 0, name_offset = 0; + char prog_name[BPF_OBJ_NAME_LEN] = {}; - memset(&attr, 0, sizeof(attr)); - - attr.prog_type = prog_type; - attr.kern_version = kern_version; - attr.license = ptr_to_u64((void *)license); - - attr.insns = ptr_to_u64((void *)insns); - attr.insn_cnt = prog_len / sizeof(struct bpf_insn); - if (attr.insn_cnt > BPF_MAXINSNS) { + unsigned insns_cnt = prog_len / sizeof(struct bpf_insn); + if (insns_cnt > BPF_MAXINSNS) { errno = EINVAL; fprintf(stderr, "bpf: %s. Program %s too large (%u insns), at most %d insns\n\n", - strerror(errno), name, attr.insn_cnt, BPF_MAXINSNS); + strerror(errno), attr->name, insns_cnt, BPF_MAXINSNS); return -1; } + attr->insns_cnt = insns_cnt; - attr.log_level = log_level; - if (attr.log_level > 0) { + if (attr->log_level > 0) { if (log_buf_size > 0) { // Use user-provided log buffer if availiable. log_buf[0] = 0; - attr.log_buf = ptr_to_u64(log_buf); - attr.log_size = log_buf_size; + attr_log_buf = log_buf; + attr_log_buf_size = log_buf_size; } else { // Create and use temporary log buffer if user didn't provide one. 
tmp_log_buf_size = LOG_BUF_SIZE; @@ -495,32 +486,33 @@ int bcc_prog_load(enum bpf_prog_type prog_type, const char *name, if (!tmp_log_buf) { fprintf(stderr, "bpf: Failed to allocate temporary log buffer: %s\n\n", strerror(errno)); - attr.log_level = 0; + attr->log_level = 0; } else { tmp_log_buf[0] = 0; - attr.log_buf = ptr_to_u64(tmp_log_buf); - attr.log_size = tmp_log_buf_size; + attr_log_buf = tmp_log_buf; + attr_log_buf_size = tmp_log_buf_size; } } } if (name_len) { - if (strncmp(name, "kprobe__", 8) == 0) + if (strncmp(attr->name, "kprobe__", 8) == 0) name_offset = 8; - else if (strncmp(name, "tracepoint__", 12) == 0) + else if (strncmp(attr->name, "tracepoint__", 12) == 0) name_offset = 12; - else if (strncmp(name, "raw_tracepoint__", 16) == 0) + else if (strncmp(attr->name, "raw_tracepoint__", 16) == 0) name_offset = 16; - memcpy(attr.prog_name, name + name_offset, + memcpy(prog_name, attr->name + name_offset, min(name_len - name_offset, BPF_OBJ_NAME_LEN - 1)); + attr->name = prog_name; } - ret = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)); + ret = bpf_load_program_xattr(attr, attr_log_buf, attr_log_buf_size); // BPF object name is not supported on older Kernels. // If we failed due to this, clear the name and try again. 
if (ret < 0 && name_len && (errno == E2BIG || errno == EINVAL)) { - memset(attr.prog_name, 0, BPF_OBJ_NAME_LEN); - ret = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)); + prog_name[0] = '\0'; + ret = bpf_load_program_xattr(attr, attr_log_buf, attr_log_buf_size); } if (ret < 0 && errno == EPERM) { @@ -536,7 +528,7 @@ int bcc_prog_load(enum bpf_prog_type prog_type, const char *name, rl.rlim_max = RLIM_INFINITY; rl.rlim_cur = rl.rlim_max; if (setrlimit(RLIMIT_MEMLOCK, &rl) == 0) - ret = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)); + ret = bpf_load_program_xattr(attr, attr_log_buf, attr_log_buf_size); } } @@ -545,11 +537,9 @@ int bcc_prog_load(enum bpf_prog_type prog_type, const char *name, // User has provided a log buffer. if (log_buf_size) { // If logging is not already enabled, enable it and do the syscall again. - if (attr.log_level == 0) { - attr.log_level = 1; - attr.log_buf = ptr_to_u64(log_buf); - attr.log_size = log_buf_size; - ret = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)); + if (attr->log_level == 0) { + attr->log_level = 1; + ret = bpf_load_program_xattr(attr, log_buf, log_buf_size); } // Print the log message and return. bpf_print_hints(ret, log_buf); @@ -563,8 +553,8 @@ int bcc_prog_load(enum bpf_prog_type prog_type, const char *name, if (tmp_log_buf) free(tmp_log_buf); tmp_log_buf_size = LOG_BUF_SIZE; - if (attr.log_level == 0) - attr.log_level = 1; + if (attr->log_level == 0) + attr->log_level = 1; for (;;) { tmp_log_buf = malloc(tmp_log_buf_size); if (!tmp_log_buf) { @@ -573,10 +563,7 @@ int bcc_prog_load(enum bpf_prog_type prog_type, const char *name, goto return_result; } tmp_log_buf[0] = 0; - attr.log_buf = ptr_to_u64(tmp_log_buf); - attr.log_size = tmp_log_buf_size; - - ret = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)); + ret = bpf_load_program_xattr(attr, tmp_log_buf, tmp_log_buf_size); if (ret < 0 && errno == ENOSPC) { // Temporary buffer size is not enough. Double it and try again. 
free(tmp_log_buf); @@ -590,7 +577,7 @@ int bcc_prog_load(enum bpf_prog_type prog_type, const char *name, // Check if we should print the log message if log_level is not 0, // either specified by user or set due to error. - if (attr.log_level > 0) { + if (attr->log_level > 0) { // Don't print if user enabled logging and provided log buffer, // but there is no error. if (log_buf && ret < 0) @@ -605,6 +592,22 @@ int bcc_prog_load(enum bpf_prog_type prog_type, const char *name, return ret; } +int bcc_prog_load(enum bpf_prog_type prog_type, const char *name, + const struct bpf_insn *insns, int prog_len, + const char *license, unsigned kern_version, + int log_level, char *log_buf, unsigned log_buf_size) +{ + struct bpf_load_program_attr attr = {}; + + attr.prog_type = prog_type; + attr.name = name; + attr.insns = insns; + attr.license = license; + attr.kern_version = kern_version; + attr.log_level = log_level; + return bcc_prog_load_xattr(&attr, prog_len, log_buf, log_buf_size); +} + int bpf_open_raw_sock(const char *name) { struct sockaddr_ll sll; diff --git a/src/cc/libbpf.h b/src/cc/libbpf.h index d2c6fa6d4..18dcc5482 100644 --- a/src/cc/libbpf.h +++ b/src/cc/libbpf.h @@ -27,6 +27,7 @@ extern "C" { #endif struct bpf_create_map_attr; +struct bpf_load_program_attr; enum bpf_probe_attach_type { BPF_PROBE_ENTRY, @@ -60,9 +61,12 @@ int bpf_get_next_key(int fd, void *key, void *next_key); * initial attemp was insufficient in size. 
*/ int bcc_prog_load(enum bpf_prog_type prog_type, const char *name, - const struct bpf_insn *insns, int insn_len, + const struct bpf_insn *insns, int prog_len, const char *license, unsigned kern_version, int log_level, char *log_buf, unsigned log_buf_size); +int bcc_prog_load_xattr(struct bpf_load_program_attr *attr, + int prog_len, char *log_buf, + unsigned log_buf_size); int bpf_attach_socket(int sockfd, int progfd); From 89bf6f7b1eab3f820876f5dcf1d6a6feb211f982 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 17 Jan 2019 09:12:13 -0800 Subject: [PATCH 040/135] enable -g by default for llvm 9 This patch introduced two changes so that BTF is available to bcc starting from LLVM 9: . -g to enable BTF generation in LLVM . ProcessAllSections for RuntimeDyld so BTF is available to bcc The bcc will then be able to post process and load BTF into the kernel. Signed-off-by: Yonghong Song --- src/cc/bpf_module.cc | 4 ++++ src/cc/frontends/clang/loader.cc | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/src/cc/bpf_module.cc b/src/cc/bpf_module.cc index 502bd5d74..0291157b5 100644 --- a/src/cc/bpf_module.cc +++ b/src/cc/bpf_module.cc @@ -239,8 +239,12 @@ int BPFModule::finalize() { return -1; } +#if LLVM_MAJOR_VERSION >= 9 + engine_->setProcessAllSections(true); +#else if (flags_ & DEBUG_SOURCE) engine_->setProcessAllSections(true); +#endif if (int rc = run_pass_manager(*mod)) return rc; diff --git a/src/cc/frontends/clang/loader.cc b/src/cc/frontends/clang/loader.cc index f461ded0b..8f091708f 100644 --- a/src/cc/frontends/clang/loader.cc +++ b/src/cc/frontends/clang/loader.cc @@ -172,8 +172,12 @@ int ClangLoader::parse(unique_ptr *mod, TableStorage &ts, vector kflags; if (kbuild_helper.get_flags(un.machine, &kflags)) return -1; +#if LLVM_MAJOR_VERSION >= 9 + flags_cstr.push_back("-g"); +#else if (flags_ & DEBUG_SOURCE) flags_cstr.push_back("-g"); +#endif for (auto it = kflags.begin(); it != kflags.end(); ++it) flags_cstr.push_back(it->c_str()); From 
48ca7819c4be5f236b7936d5af9a3cab287cb41b Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 8 Feb 2019 13:58:28 -0800 Subject: [PATCH 041/135] add btf func_info and line_info support The libbpf API functions are used to parse/load .BTF and .BTF.ext ELF sections, and retrieve func_info and line_info for prog loading. The LLVM cannot get remapped file source for line_info. So a postprocessing after compilation fixed the line_off (offset to the string table for the line_info) in .BTF.ext and adjusted string section in .BTF to accommodate new source lines. For users using python and C++ API, no changes to their codes. If .BTF and .BTF.ext are supported in both compiler and kernel, they will automatically get benefit, e.g., verifier log annotated with source code, bpftool showing source annotated jit codes, etc. For example, with latest bpf-next and llvm trunk, running `tcpconnect.py`, bpftool (from kernel) is able to show the following: -bash-4.4$ sudo bpftool p d jited id 165 int trace_connect_v6_return(struct pt_regs * ctx): bpf_prog_8937c3924bd93e28_trace_connect_v6_return: ; int trace_connect_v6_return(struct pt_regs *ctx) 0: push %rbp 1: mov %rsp,%rbp 4: sub $0x88,%rsp b: sub $0x28,%rbp f: mov %rbx,0x0(%rbp) 13: mov %r13,0x8(%rbp) 17: mov %r14,0x10(%rbp) 1b: mov %r15,0x18(%rbp) 1f: xor %eax,%eax 21: mov %rax,0x20(%rbp) 25: mov %rdi,%rbx ; int ret = PT_REGS_RC(ctx); 28: mov 0x50(%rbx),%r13 ; u32 pid = bpf_get_current_pid_tgid(); 2c: callq 0xffffffffe0e4ebfe ; u32 pid = bpf_get_current_pid_tgid(); 31: mov %eax,-0x4(%rbp) ... 
-bash-4.4$ sudo bpftool p d xlated id 165 int trace_connect_v6_return(struct pt_regs * ctx): ; int trace_connect_v6_return(struct pt_regs *ctx) 0: (bf) r6 = r1 ; int ret = PT_REGS_RC(ctx); 1: (79) r7 = *(u64 *)(r6 +80) ; u32 pid = bpf_get_current_pid_tgid(); 2: (85) call bpf_get_current_pid_tgid#76208 ; u32 pid = bpf_get_current_pid_tgid(); 3: (63) *(u32 *)(r10 -4) = r0 ; skpp = bpf_map_lookup_elem((void *)bpf_pseudo_fd(1, 3), &pid); 4: (18) r1 = map[id:298] 6: (bf) r2 = r10 Signed-off-by: Yonghong Song --- src/cc/CMakeLists.txt | 2 +- src/cc/api/BPF.cc | 2 +- src/cc/bcc_btf.cc | 202 ++++++++++++++++++++ src/cc/bcc_btf.h | 70 +++++++ src/cc/bpf_common.cc | 12 ++ src/cc/bpf_common.h | 6 + src/cc/bpf_module.cc | 106 +++++++++- src/cc/bpf_module.h | 7 + src/cc/frontends/clang/b_frontend_action.cc | 6 + src/cc/libbpf.c | 13 ++ src/python/bcc/__init__.py | 2 +- src/python/bcc/libbcc.py | 4 +- 12 files changed, 423 insertions(+), 9 deletions(-) create mode 100644 src/cc/bcc_btf.cc create mode 100644 src/cc/bcc_btf.h diff --git a/src/cc/CMakeLists.txt b/src/cc/CMakeLists.txt index 3593e7e17..32240b4f0 100644 --- a/src/cc/CMakeLists.txt +++ b/src/cc/CMakeLists.txt @@ -35,7 +35,7 @@ add_library(bpf-shared SHARED libbpf.c perf_reader.c ${libbpf_sources}) set_target_properties(bpf-shared PROPERTIES VERSION ${REVISION_LAST} SOVERSION 0) set_target_properties(bpf-shared PROPERTIES OUTPUT_NAME bpf) -set(bcc_common_sources bpf_common.cc bpf_module.cc exported_files.cc) +set(bcc_common_sources bpf_common.cc bpf_module.cc bcc_btf.cc exported_files.cc) if (${LLVM_PACKAGE_VERSION} VERSION_EQUAL 6 OR ${LLVM_PACKAGE_VERSION} VERSION_GREATER 6) set(bcc_common_sources ${bcc_common_sources} bcc_debug.cc) endif() diff --git a/src/cc/api/BPF.cc b/src/cc/api/BPF.cc index 606a2bd75..17ac9b384 100644 --- a/src/cc/api/BPF.cc +++ b/src/cc/api/BPF.cc @@ -550,7 +550,7 @@ StatusTuple BPF::load_func(const std::string& func_name, bpf_prog_type type, else if (flag_ & DEBUG_BPF) log_level = 1; - fd = 
bcc_prog_load(type, func_name.c_str(), + fd = bpf_module_->bcc_func_load(type, func_name.c_str(), reinterpret_cast(func_start), func_size, bpf_module_->license(), bpf_module_->kern_version(), log_level, nullptr, 0); diff --git a/src/cc/bcc_btf.cc b/src/cc/bcc_btf.cc new file mode 100644 index 000000000..233899617 --- /dev/null +++ b/src/cc/bcc_btf.cc @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2019 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "bcc_btf.h" +#include +#include "linux/btf.h" +#include "libbpf.h" +#include "libbpf/src/btf.h" +#include + +namespace ebpf { + +uint32_t BTFStringTable::addString(std::string S) { + // Check whether the string already exists. + for (auto &OffsetM : OffsetToIdMap) { + if (Table[OffsetM.second] == S) + return OffsetM.first; + } + // Not find, add to the string table. + uint32_t Offset = Size; + OffsetToIdMap[Offset] = Table.size(); + Table.push_back(S); + Size += S.size() + 1; + return Offset; +} + +BTF::~BTF() { + btf__free(btf_); + btf_ext__free(btf_ext_); +} + +// The compiler doesn't have source code for remapped files. +// So we modify .BTF and .BTF.ext sections here to add these +// missing line source codes. +// The .BTF and .BTF.ext ELF section specification can be +// found at linux repo: linux/Documentation/bpf/btf.rst. 
+void BTF::adjust(uint8_t *btf_sec, uintptr_t btf_sec_size, + uint8_t *btf_ext_sec, uintptr_t btf_ext_sec_size, + std::map &remapped_sources, + uint8_t **new_btf_sec, uintptr_t *new_btf_sec_size) { + + // Line cache for remapped files + std::map> LineCaches; + for (auto it = remapped_sources.begin(); it != remapped_sources.end(); ++it) { + size_t FileBufSize = it->second.size(); + std::vector LineCache; + + for (uint32_t start = 0, end = start; end < FileBufSize; end++) { + if (it->second[end] == '\n' || end == FileBufSize - 1 || + (it->second[end] == '\r' && it->second[end + 1] == '\n')) { + // Not including the endline + LineCache.push_back(std::string(it->second.substr(start, end - start))); + if (it->second[end] == '\r') + end++; + start = end + 1; + } + } + LineCaches[it->first] = std::move(LineCache); + } + + // Check the LineInfo table and add missing lines + + struct btf_header *hdr = (struct btf_header *)btf_sec; + struct btf_ext_header *ehdr = (struct btf_ext_header *)btf_ext_sec; + + char *strings = (char *)(btf_sec + hdr->hdr_len + hdr->str_off); + unsigned orig_strings_len = hdr->str_len; + unsigned *linfo_s = (unsigned *)(btf_ext_sec + ehdr->hdr_len + ehdr->line_info_off); + unsigned lrec_size = *linfo_s; + linfo_s++; + unsigned linfo_len = ehdr->line_info_len - 4; + + // Go through all line info. For any line number whose line is in the LineCaches, + // Correct the line_off and record the corresponding source line in BTFStringTable, + // which later will be merged into .BTF string section. 
+ BTFStringTable new_strings; + while (linfo_len) { + unsigned num_recs = linfo_s[1]; + linfo_s += 2; + for (unsigned i = 0; i < num_recs; i++) { + struct bpf_line_info *linfo = (struct bpf_line_info *)linfo_s; + if (linfo->line_off == 0) { + for (auto it = LineCaches.begin(); it != LineCaches.end(); ++it) { + if (strcmp(strings + linfo->file_name_off, it->first.c_str()) == 0) { + unsigned line_num = BPF_LINE_INFO_LINE_NUM(linfo->line_col); + if (line_num > 0 && line_num <= it->second.size()) + linfo->line_off = orig_strings_len + new_strings.addString(it->second[line_num - 1]); + } + } + } + linfo_s += lrec_size >> 2; + } + linfo_len -= 8 + num_recs * lrec_size; + } + + // If any new source lines need to be recorded, do not touch the original section, + // allocate a new section. The original section is allocated through llvm infra. + if (new_strings.getSize() > 0) { + // LLVM generated .BTF layout always has type_sec followed by str_sec without holes, + // so we can just append the new strings to the end and adjust str_sec size. + unsigned tmp_sec_size = btf_sec_size + new_strings.getSize(); + uint8_t *tmp_sec = new uint8_t[tmp_sec_size]; + memcpy(tmp_sec, btf_sec, btf_sec_size); + + struct btf_header *nhdr = (struct btf_header *)tmp_sec; + nhdr->str_len += new_strings.getSize(); + + // Populate new strings to the string table. 
+ uint8_t *new_str = tmp_sec + nhdr->hdr_len + nhdr->str_off + orig_strings_len; + std::vector &Table = new_strings.getTable(); + for (unsigned i = 0; i < Table.size(); i++) { + strcpy((char *)new_str, Table[i].c_str()); + new_str += Table[i].size() + 1; + } + + *new_btf_sec = tmp_sec; + *new_btf_sec_size = tmp_sec_size; + } +} + +int BTF::load(uint8_t *btf_sec, uintptr_t btf_sec_size, + uint8_t *btf_ext_sec, uintptr_t btf_ext_sec_size, + std::map &remapped_sources) { + struct btf *btf; + struct btf_ext *btf_ext; + uint8_t *new_btf_sec = NULL; + uintptr_t new_btf_sec_size = 0; + + adjust(btf_sec, btf_sec_size, btf_ext_sec, btf_ext_sec_size, + remapped_sources, &new_btf_sec, &new_btf_sec_size); + + if (new_btf_sec) { + btf = btf__new(new_btf_sec, new_btf_sec_size); + delete new_btf_sec; + } else { + btf = btf__new(btf_sec, btf_sec_size); + } + if (!btf) { + fprintf(stderr, "Processing .BTF section failure\n"); + return -1; + } + + btf_ext = btf_ext__new(btf_ext_sec, btf_ext_sec_size); + if (!btf_ext) { + btf__free(btf); + fprintf(stderr, "Processing .BTF.ext section failure\n"); + return -1; + } + + btf_ = btf; + btf_ext_ = btf_ext; + return 0; +} + +int BTF::get_fd() { + return btf__fd(btf_); +} + +int BTF::get_btf_info(const char *fname, + void **func_info, unsigned *func_info_cnt, + unsigned *finfo_rec_size, + void **line_info, unsigned *line_info_cnt, + unsigned *linfo_rec_size) { + int ret; + + *func_info = *line_info = NULL; + *func_info_cnt = *line_info_cnt = 0; + + *finfo_rec_size = btf_ext__func_info_rec_size(btf_ext_); + *linfo_rec_size = btf_ext__line_info_rec_size(btf_ext_); + + ret = btf_ext__reloc_func_info(btf_, btf_ext_, fname, 0, + func_info, func_info_cnt); + if (ret) { + fprintf(stderr, ".BTF.ext reloc func_info not successful\n"); + return ret; + } + + ret = btf_ext__reloc_line_info(btf_, btf_ext_, fname, 0, + line_info, line_info_cnt); + if (ret) { + fprintf(stderr, ".BTF.ext reloc line_info not successful\n"); + return ret; + } + + return 0; +} 
+ +} // namespace ebpf diff --git a/src/cc/bcc_btf.h b/src/cc/bcc_btf.h new file mode 100644 index 000000000..334e179f7 --- /dev/null +++ b/src/cc/bcc_btf.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2019 Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef BCC_BTF_H +#define BCC_BTF_H + +#include +#include +#include +#include +#include + +struct btf; +struct btf_ext; + +namespace ebpf { + +class BTFStringTable { + private: + uint32_t Size; + std::map OffsetToIdMap; + std::vector Table; + + public: + BTFStringTable(): Size(0) {} + uint32_t getSize() { return Size; } + std::vector &getTable() { return Table; } + uint32_t addString(std::string Str); +}; + +class BTF { + public: + ~BTF(); + int load(uint8_t *btf_sec, uintptr_t btf_sec_size, + uint8_t *btf_ext_sec, uintptr_t btf_ext_sec_size, + std::map &remapped_sources); + int get_fd(); + int get_btf_info(const char *fname, + void **func_info, unsigned *func_info_cnt, + unsigned *finfo_rec_size, + void **line_info, unsigned *line_info_cnt, + unsigned *linfo_rec_size); + + private: + void adjust(uint8_t *btf_sec, uintptr_t btf_sec_size, + uint8_t *btf_ext_sec, uintptr_t btf_ext_sec_size, + std::map &remapped_sources, + uint8_t **new_btf_sec, uintptr_t *new_btf_sec_size); + + private: + struct btf *btf_; + struct btf_ext *btf_ext_; +}; + +} // namespace ebpf + +#endif diff --git a/src/cc/bpf_common.cc b/src/cc/bpf_common.cc index 4a7119789..ac5764e4c 100644 --- a/src/cc/bpf_common.cc +++ 
b/src/cc/bpf_common.cc @@ -234,4 +234,16 @@ int bpf_table_leaf_sscanf(void *program, size_t id, const char *buf, void *leaf) return mod->table_leaf_scanf(id, buf, leaf); } +int bcc_func_load(void *program, int prog_type, const char *name, + const struct bpf_insn *insns, int prog_len, + const char *license, unsigned kern_version, + int log_level, char *log_buf, unsigned log_buf_size) { + auto mod = static_cast(program); + if (!mod) return -1; + return mod->bcc_func_load(prog_type, name, insns, prog_len, + license, kern_version, log_level, + log_buf, log_buf_size); + +} + } diff --git a/src/cc/bpf_common.h b/src/cc/bpf_common.h index 0abdbd487..f83f4e676 100644 --- a/src/cc/bpf_common.h +++ b/src/cc/bpf_common.h @@ -60,6 +60,12 @@ int bpf_table_leaf_snprintf(void *program, size_t id, char *buf, size_t buflen, int bpf_table_key_sscanf(void *program, size_t id, const char *buf, void *key); int bpf_table_leaf_sscanf(void *program, size_t id, const char *buf, void *leaf); +struct bpf_insn; +int bcc_func_load(void *program, int prog_type, const char *name, + const struct bpf_insn *insns, int prog_len, + const char *license, unsigned kern_version, + int log_level, char *log_buf, unsigned log_buf_size); + #ifdef __cplusplus } #endif diff --git a/src/cc/bpf_module.cc b/src/cc/bpf_module.cc index 0291157b5..4d33ee8ec 100644 --- a/src/cc/bpf_module.cc +++ b/src/cc/bpf_module.cc @@ -42,6 +42,8 @@ #include "bpf_module.h" #include "exported_files.h" #include "libbpf.h" +#include "bcc_btf.h" +#include "libbpf/src/bpf.h" namespace ebpf { @@ -78,9 +80,11 @@ class MyMemoryManager : public SectionMemoryManager { uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment, unsigned SectionID, StringRef SectionName, bool isReadOnly) override { - uint8_t *Addr = SectionMemoryManager::allocateDataSection(Size, Alignment, SectionID, SectionName, isReadOnly); - //printf("allocateDataSection: %s Addr %p Size %ld Alignment %d SectionID %d RO %d\n", - // SectionName.str().c_str(), (void 
*)Addr, Size, Alignment, SectionID, isReadOnly); + // The lines in .BTF.ext line_info, if corresponding to remapped files, will have empty source line. + // The line_info will be fixed in place, so not allocate ReadOnly regions. + uint8_t *Addr = SectionMemoryManager::allocateDataSection(Size, Alignment, SectionID, SectionName, false); + //printf("allocateDataSection: %s Addr %p Size %ld Alignment %d SectionID %d\n", + // SectionName.str().c_str(), (void *)Addr, Size, Alignment, SectionID); (*sections_)[SectionName.str()] = make_tuple(Addr, Size); return Addr; } @@ -95,7 +99,7 @@ BPFModule::BPFModule(unsigned flags, TableStorage *ts, bool rw_engine_enabled, ctx_(new LLVMContext), id_(std::to_string((uintptr_t)this)), maps_ns_(maps_ns), - ts_(ts) { + ts_(ts), btf_(nullptr) { initialize_rw_engine(); LLVMInitializeBPFTarget(); LLVMInitializeBPFTargetMC(); @@ -139,6 +143,9 @@ BPFModule::~BPFModule() { ctx_.reset(); func_src_.reset(); + if (btf_) + delete btf_; + ts_->DeletePrefix(Path({id_})); } @@ -214,6 +221,51 @@ int BPFModule::run_pass_manager(Module &mod) { return 0; } +void BPFModule::load_btf(std::map> §ions) { + uint8_t *btf_sec = nullptr, *btf_ext_sec = nullptr; + uintptr_t btf_sec_size = 0, btf_ext_sec_size = 0; + + for (auto section: sections) { + auto sname = section.first; + uint8_t *addr = get<0>(section.second); + uintptr_t size = get<1>(section.second); + + if (strcmp(".BTF", sname.c_str()) == 0) { + btf_sec = addr; + btf_sec_size = size; + } + + if (strcmp(".BTF.ext", sname.c_str()) == 0) { + btf_ext_sec = addr; + btf_ext_sec_size = size; + } + } + + if (btf_sec == nullptr || btf_ext_sec == nullptr) + return; + + // Both .BTF and .BTF.ext ELF sections are present. + // The remapped files (the main file and /virtual/include/bcc/helpers.h) + // will provide missing source codes in the .BTF.ext line_info table. 
+ auto helpers_h = ExportedFiles::headers().find("/virtual/include/bcc/helpers.h"); + if (helpers_h == ExportedFiles::headers().end()) { + fprintf(stderr, "Internal error: missing bcc/helpers.h"); + return; + } + std::map remapped_sources; + remapped_sources["/virtual/main.c"] = mod_src_; + remapped_sources["/virtual/include/bcc/helpers.h"] = helpers_h->second; + + BTF *btf = new BTF(); + int ret = btf->load(btf_sec, btf_sec_size, btf_ext_sec, btf_ext_sec_size, + remapped_sources); + if (ret) { + delete btf; + return; + } + btf_ = btf; +} + int BPFModule::finalize() { Module *mod = &*mod_; std::map> tmp_sections, @@ -257,6 +309,8 @@ int BPFModule::finalize() { src_debugger.dump(); } + load_btf(*sections_p); + if (!rw_engine_enabled_) { // Setup sections_ correctly and then free llvm internal memory for (auto section : tmp_sections) { @@ -649,4 +703,48 @@ int BPFModule::load_string(const string &text, const char *cflags[], int ncflags return 0; } +int BPFModule::bcc_func_load(int prog_type, const char *name, + const struct bpf_insn *insns, int prog_len, + const char *license, unsigned kern_version, + int log_level, char *log_buf, unsigned log_buf_size) { + struct bpf_load_program_attr attr = {}; + unsigned func_info_cnt, line_info_cnt, finfo_rec_size, linfo_rec_size; + void *func_info = NULL, *line_info = NULL; + int ret; + + attr.prog_type = (enum bpf_prog_type)prog_type; + attr.name = name; + attr.insns = insns; + attr.license = license; + attr.kern_version = kern_version; + attr.log_level = log_level; + + if (btf_) { + int btf_fd = btf_->get_fd(); + char secname[256]; + + ::snprintf(secname, sizeof(secname), ".bpf.fn.%s", name); + ret = btf_->get_btf_info(secname, &func_info, &func_info_cnt, + &finfo_rec_size, &line_info, + &line_info_cnt, &linfo_rec_size); + if (!ret) { + attr.prog_btf_fd = btf_fd; + attr.func_info = func_info; + attr.func_info_cnt = func_info_cnt; + attr.func_info_rec_size = finfo_rec_size; + attr.line_info = line_info; + attr.line_info_cnt = 
line_info_cnt; + attr.line_info_rec_size = linfo_rec_size; + } + } + + ret = bcc_prog_load_xattr(&attr, prog_len, log_buf, log_buf_size); + if (btf_) { + free(func_info); + free(line_info); + } + + return ret; +} + } // namespace ebpf diff --git a/src/cc/bpf_module.h b/src/cc/bpf_module.h index f4f56390d..ce31c6ad5 100644 --- a/src/cc/bpf_module.h +++ b/src/cc/bpf_module.h @@ -53,6 +53,7 @@ class TableStorage; class BLoader; class ClangLoader; class FuncSource; +class BTF; bool bpf_module_rw_engine_enabled(void); @@ -78,6 +79,7 @@ class BPFModule { StatusTuple sscanf(std::string fn_name, const char *str, void *val); StatusTuple snprintf(std::string fn_name, char *str, size_t sz, const void *val); + void load_btf(std::map> §ions); public: BPFModule(unsigned flags, TableStorage *ts = nullptr, bool rw_engine_enabled = true, @@ -125,6 +127,10 @@ class BPFModule { char * license() const; unsigned kern_version() const; TableStorage &table_storage() { return *ts_; } + int bcc_func_load(int prog_type, const char *name, + const struct bpf_insn *insns, int prog_len, + const char *license, unsigned kern_version, + int log_level, char *log_buf, unsigned log_buf_size); private: unsigned flags_; // 0x1 for printing @@ -149,6 +155,7 @@ class BPFModule { std::map src_dbg_fmap_; TableStorage *ts_; std::unique_ptr local_ts_; + BTF *btf_; }; } // namespace ebpf diff --git a/src/cc/frontends/clang/b_frontend_action.cc b/src/cc/frontends/clang/b_frontend_action.cc index 6ba484459..01396f7bf 100644 --- a/src/cc/frontends/clang/b_frontend_action.cc +++ b/src/cc/frontends/clang/b_frontend_action.cc @@ -1399,11 +1399,17 @@ void BFrontendAction::EndSourceFileAction() { if (flags_ & DEBUG_PREPROCESSOR) rewriter_->getEditBuffer(rewriter_->getSourceMgr().getMainFileID()).write(llvm::errs()); +#if LLVM_MAJOR_VERSION >= 9 + llvm::raw_string_ostream tmp_os(mod_src_); + rewriter_->getEditBuffer(rewriter_->getSourceMgr().getMainFileID()) + .write(tmp_os); +#else if (flags_ & DEBUG_SOURCE) { 
llvm::raw_string_ostream tmp_os(mod_src_); rewriter_->getEditBuffer(rewriter_->getSourceMgr().getMainFileID()) .write(tmp_os); } +#endif for (auto func : func_range_) { auto f = func.first; diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c index 0c9af6960..c4452317b 100644 --- a/src/cc/libbpf.c +++ b/src/cc/libbpf.c @@ -508,6 +508,19 @@ int bcc_prog_load_xattr(struct bpf_load_program_attr *attr, int prog_len, } ret = bpf_load_program_xattr(attr, attr_log_buf, attr_log_buf_size); + + // func_info/line_info may not be supported in old kernels. + if (ret < 0 && attr->func_info && errno == EINVAL) { + attr->prog_btf_fd = 0; + attr->func_info = NULL; + attr->func_info_cnt = 0; + attr->func_info_rec_size = 0; + attr->line_info = NULL; + attr->line_info_cnt = 0; + attr->line_info_rec_size = 0; + ret = bpf_load_program_xattr(attr, attr_log_buf, attr_log_buf_size); + } + // BPF object name is not supported on older Kernels. // If we failed due to this, clear the name and try again. if (ret < 0 && name_len && (errno == E2BIG || errno == EINVAL)) { diff --git a/src/python/bcc/__init__.py b/src/python/bcc/__init__.py index c661183ee..01b304e7c 100644 --- a/src/python/bcc/__init__.py +++ b/src/python/bcc/__init__.py @@ -365,7 +365,7 @@ def load_func(self, func_name, prog_type): log_level = 2 elif (self.debug & DEBUG_BPF): log_level = 1 - fd = lib.bcc_prog_load(prog_type, func_name, + fd = lib.bcc_func_load(self.module, prog_type, func_name, lib.bpf_function_start(self.module, func_name), lib.bpf_function_size(self.module, func_name), lib.bpf_module_license(self.module), diff --git a/src/python/bcc/libbcc.py b/src/python/bcc/libbcc.py index 0624382a8..4d218a727 100644 --- a/src/python/bcc/libbcc.py +++ b/src/python/bcc/libbcc.py @@ -82,8 +82,8 @@ lib.bpf_open_raw_sock.argtypes = [ct.c_char_p] lib.bpf_attach_socket.restype = ct.c_int lib.bpf_attach_socket.argtypes = [ct.c_int, ct.c_int] -lib.bcc_prog_load.restype = ct.c_int -lib.bcc_prog_load.argtypes = [ct.c_int, ct.c_char_p, 
ct.c_void_p, +lib.bcc_func_load.restype = ct.c_int +lib.bcc_func_load.argtypes = [ct.c_void_p, ct.c_int, ct.c_char_p, ct.c_void_p, ct.c_size_t, ct.c_char_p, ct.c_uint, ct.c_int, ct.c_char_p, ct.c_uint] _RAW_CB_TYPE = ct.CFUNCTYPE(None, ct.py_object, ct.c_void_p, ct.c_int) _LOST_CB_TYPE = ct.CFUNCTYPE(None, ct.py_object, ct.c_ulonglong) From 8300c7bdd7f8599bdeac830cfed05ec8fd51616d Mon Sep 17 00:00:00 2001 From: yonghong-song Date: Sun, 10 Feb 2019 17:48:01 -0800 Subject: [PATCH 042/135] add btf support for maps (#2192) Added bpf support for maps so map key/value types can be retrieved by user space introspection tool to pretty print map key/values. To associate maps with its key/value types, the below macro BPF_ANNOTATE_KV_PAIR(name, type_key, type_val) is used, similar to usage in kernel tools/testing/selftests/bpf and tools/lib/bpf. Currently, some map types (e.g, PERF_EVENT_ARRAY) do not support pretty print. But common [per_cpu] hash/array maps are supported. Currently, bcc create maps before llvm compilation. To support pretty printing of maps, map creation needs have key/value type id's which can only be obtained after compilation. Therefore, during rewriting, fake map fd is used. After compilation, btf is first loaded and maps are created. With latest bpf-next and latest trunk llvm, bpftool is able to pretty print bcc tool map key/values as below: ; running tcptop.py in one window $ tcptop.py ; running bpftool in another window $ bpftool m s ... 343: hash name ipv4_send_bytes flags 0x0 key 16B value 8B max_entries 10240 memlock 1003520B 344: hash name ipv4_recv_bytes flags 0x0 key 16B value 8B max_entries 10240 memlock 1003520B 345: hash name ipv6_send_bytes flags 0x0 key 64B value 8B max_entries 10240 memlock 1495040B 346: hash name ipv6_recv_bytes flags 0x0 key 64B value 8B max_entries 10240 memlock 1495040B $ bpftool m d id 345 ... 
},{ "key": { "pid": 5511, "saddr": 0x100007fffff00000000000000000000, "daddr": 0x100007fffff00000000000000000000, "lport": 2378, "dport": 52602 }, "value": 49 },{ "key": { "pid": 2823, "saddr": 0x4e000000cefa7bb0300000db0124, "daddr": 0x60000000cefa7bb0300000db0124, "lport": 2406, "dport": 49348 }, "value": 36 } ... Signed-off-by: Yonghong Song --- src/cc/bcc_btf.cc | 8 ++ src/cc/bcc_btf.h | 3 + src/cc/bpf_module.cc | 115 +++++++++++++++++++- src/cc/bpf_module.h | 3 + src/cc/export/helpers.h | 13 ++- src/cc/file_desc.h | 8 +- src/cc/frontends/clang/b_frontend_action.cc | 21 ++-- src/cc/frontends/clang/b_frontend_action.h | 10 +- src/cc/frontends/clang/loader.cc | 13 ++- src/cc/frontends/clang/loader.h | 6 +- src/cc/libbpf.c | 10 ++ src/cc/table_desc.h | 5 +- src/cc/table_storage.h | 2 + 13 files changed, 190 insertions(+), 27 deletions(-) diff --git a/src/cc/bcc_btf.cc b/src/cc/bcc_btf.cc index 233899617..03e1ce889 100644 --- a/src/cc/bcc_btf.cc +++ b/src/cc/bcc_btf.cc @@ -199,4 +199,12 @@ int BTF::get_btf_info(const char *fname, return 0; } +int BTF::get_map_tids(std::string map_name, + unsigned expected_ksize, unsigned expected_vsize, + unsigned *key_tid, unsigned *value_tid) { + return btf__get_map_kv_tids(btf_, map_name.c_str(), + expected_ksize, expected_vsize, + key_tid, value_tid); +} + } // namespace ebpf diff --git a/src/cc/bcc_btf.h b/src/cc/bcc_btf.h index 334e179f7..ffa8307e8 100644 --- a/src/cc/bcc_btf.h +++ b/src/cc/bcc_btf.h @@ -53,6 +53,9 @@ class BTF { unsigned *finfo_rec_size, void **line_info, unsigned *line_info_cnt, unsigned *linfo_rec_size); + int get_map_tids(std::string map_name, + unsigned expected_ksize, unsigned expected_vsize, + unsigned *key_tid, unsigned *value_tid); private: void adjust(uint8_t *btf_sec, uintptr_t btf_sec_size, diff --git a/src/cc/bpf_module.cc b/src/cc/bpf_module.cc index 4d33ee8ec..f00245470 100644 --- a/src/cc/bpf_module.cc +++ b/src/cc/bpf_module.cc @@ -71,8 +71,9 @@ class MyMemoryManager : public 
SectionMemoryManager { uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment, unsigned SectionID, StringRef SectionName) override { - uint8_t *Addr = SectionMemoryManager::allocateCodeSection(Size, Alignment, SectionID, SectionName); - //printf("allocateCodeSection: %s Addr %p Size %ld Alignment %d SectionID %d\n", + // The programs need to change from fake fd to real map fd, so not allocate ReadOnly regions. + uint8_t *Addr = SectionMemoryManager::allocateDataSection(Size, Alignment, SectionID, SectionName, false); + //printf("allocateDataSection: %s Addr %p Size %ld Alignment %d SectionID %d\n", // SectionName.str().c_str(), (void *)Addr, Size, Alignment, SectionID); (*sections_)[SectionName.str()] = make_tuple(Addr, Size); return Addr; @@ -157,7 +158,7 @@ int BPFModule::free_bcc_memory() { int BPFModule::load_cfile(const string &file, bool in_memory, const char *cflags[], int ncflags) { ClangLoader clang_loader(&*ctx_, flags_); if (clang_loader.parse(&mod_, *ts_, file, in_memory, cflags, ncflags, id_, - *func_src_, mod_src_, maps_ns_)) + *func_src_, mod_src_, maps_ns_, fake_fd_map_)) return -1; return 0; } @@ -170,7 +171,7 @@ int BPFModule::load_cfile(const string &file, bool in_memory, const char *cflags int BPFModule::load_includes(const string &text) { ClangLoader clang_loader(&*ctx_, flags_); if (clang_loader.parse(&mod_, *ts_, text, true, nullptr, 0, "", *func_src_, - mod_src_, "")) + mod_src_, "", fake_fd_map_)) return -1; return 0; } @@ -266,6 +267,110 @@ void BPFModule::load_btf(std::map> btf_ = btf; } +int BPFModule::load_maps(std::map> §ions) { + // find .maps. 
sections and retrieve all map key/value type id's + std::map> map_tids; + if (btf_) { + for (auto section : sections) { + auto sec_name = section.first; + if (strncmp(".maps.", sec_name.c_str(), 6) == 0) { + std::string map_name = sec_name.substr(6); + unsigned key_tid = 0, value_tid = 0; + unsigned expected_ksize = 0, expected_vsize = 0; + + for (auto map : fake_fd_map_) { + std::string name; + + name = get<1>(map.second); + if (map_name == name) { + expected_ksize = get<2>(map.second); + expected_vsize = get<3>(map.second); + break; + } + } + + int ret = btf_->get_map_tids(map_name, expected_ksize, + expected_vsize, &key_tid, &value_tid); + if (ret) + continue; + + map_tids[map_name] = std::make_pair(key_tid, value_tid); + } + } + } + + // create maps + std::map map_fds; + for (auto map : fake_fd_map_) { + int fd, fake_fd, map_type, key_size, value_size, max_entries, map_flags; + const char *map_name; + + fake_fd = map.first; + map_type = get<0>(map.second); + map_name = get<1>(map.second).c_str(); + key_size = get<2>(map.second); + value_size = get<3>(map.second); + max_entries = get<4>(map.second); + map_flags = get<5>(map.second); + + struct bpf_create_map_attr attr = {}; + attr.map_type = (enum bpf_map_type)map_type; + attr.name = map_name; + attr.key_size = key_size; + attr.value_size = value_size; + attr.max_entries = max_entries; + attr.map_flags = map_flags; + + if (map_tids.find(map_name) != map_tids.end()) { + attr.btf_fd = btf_->get_fd(); + attr.btf_key_type_id = map_tids[map_name].first; + attr.btf_value_type_id = map_tids[map_name].second; + } + + fd = bcc_create_map_xattr(&attr); + if (fd < 0) { + fprintf(stderr, "could not open bpf map: %s\nis map type enabled in your kernel?\n", + map_name); + return -1; + } + + map_fds[fake_fd] = fd; + } + + // update map table fd's + for (auto it = ts_->begin(), up = ts_->end(); it != up; ++it) { + TableDesc &table = it->second; + if (map_fds.find(table.fake_fd) != map_fds.end()) { + table.fd = 
map_fds[table.fake_fd]; + table.fake_fd = 0; + } + } + + // update instructions + for (auto section : sections) { + auto sec_name = section.first; + if (strncmp(".bpf.fn.", sec_name.c_str(), 8) == 0) { + uint8_t *addr = get<0>(section.second); + uintptr_t size = get<1>(section.second); + struct bpf_insn *insns = (struct bpf_insn *)addr; + int i, num_insns; + + num_insns = size/sizeof(struct bpf_insn); + for (i = 0; i < num_insns; i++) { + if (insns[i].code == (BPF_LD | BPF_DW | BPF_IMM)) { + // change map_fd is it is a ld_pseudo */ + if (insns[i].src_reg == BPF_PSEUDO_MAP_FD && + map_fds.find(insns[i].imm) != map_fds.end()) + insns[i].imm = map_fds[insns[i].imm]; + i++; + } + } + } + } + + return 0; +} + int BPFModule::finalize() { Module *mod = &*mod_; std::map> tmp_sections, @@ -310,6 +415,8 @@ int BPFModule::finalize() { } load_btf(*sections_p); + if (load_maps(*sections_p)) + return -1; if (!rw_engine_enabled_) { // Setup sections_ correctly and then free llvm internal memory diff --git a/src/cc/bpf_module.h b/src/cc/bpf_module.h index ce31c6ad5..bd79455ce 100644 --- a/src/cc/bpf_module.h +++ b/src/cc/bpf_module.h @@ -23,6 +23,7 @@ #include #include "bcc_exception.h" +#include "table_storage.h" namespace llvm { class ExecutionEngine; @@ -80,6 +81,7 @@ class BPFModule { StatusTuple snprintf(std::string fn_name, char *str, size_t sz, const void *val); void load_btf(std::map> §ions); + int load_maps(std::map> §ions); public: BPFModule(unsigned flags, TableStorage *ts = nullptr, bool rw_engine_enabled = true, @@ -156,6 +158,7 @@ class BPFModule { TableStorage *ts_; std::unique_ptr local_ts_; BTF *btf_; + fake_fd_map_def fake_fd_map_; }; } // namespace ebpf diff --git a/src/cc/export/helpers.h b/src/cc/export/helpers.h index c025da5a2..9954e0ce6 100644 --- a/src/cc/export/helpers.h +++ b/src/cc/export/helpers.h @@ -53,6 +53,16 @@ R"********( */ #define SEC(NAME) __attribute__((section(NAME), used)) +// Associate map with its key/value types +#define 
BPF_ANNOTATE_KV_PAIR(name, type_key, type_val) \ + struct ____btf_map_##name { \ + type_key key; \ + type_val value; \ + }; \ + struct ____btf_map_##name \ + __attribute__ ((section(".maps." #name), used)) \ + ____btf_map_##name = { } + // Changes to the macro require changes in BFrontendAction classes #define BPF_F_TABLE(_table_type, _key_type, _leaf_type, _name, _max_entries, _flags) \ struct _name##_table_t { \ @@ -70,7 +80,8 @@ struct _name##_table_t { \ int flags; \ }; \ __attribute__((section("maps/" _table_type))) \ -struct _name##_table_t _name = { .flags = (_flags), .max_entries = (_max_entries) } +struct _name##_table_t _name = { .flags = (_flags), .max_entries = (_max_entries) }; \ +BPF_ANNOTATE_KV_PAIR(_name, _key_type, _leaf_type) #define BPF_TABLE(_table_type, _key_type, _leaf_type, _name, _max_entries) \ BPF_F_TABLE(_table_type, _key_type, _leaf_type, _name, _max_entries, 0) diff --git a/src/cc/file_desc.h b/src/cc/file_desc.h index a55ba0b67..d4ab310db 100644 --- a/src/cc/file_desc.h +++ b/src/cc/file_desc.h @@ -49,8 +49,12 @@ class FileDesc { FileDesc &operator=(const FileDesc &that) = delete; FileDesc dup() const { - int dup_fd = ::dup(fd_); - return FileDesc(dup_fd); + if (fd_ >= 0) { + int dup_fd = ::dup(fd_); + return FileDesc(dup_fd); + } else { + return FileDesc(-1); + } } operator int() { return fd_; } diff --git a/src/cc/frontends/clang/b_frontend_action.cc b/src/cc/frontends/clang/b_frontend_action.cc index 01396f7bf..8f734db48 100644 --- a/src/cc/frontends/clang/b_frontend_action.cc +++ b/src/cc/frontends/clang/b_frontend_action.cc @@ -766,7 +766,7 @@ bool BTypeVisitor::VisitCallExpr(CallExpr *Call) { return false; } } - string fd = to_string(desc->second.fd); + string fd = to_string(desc->second.fd >= 0 ? 
desc->second.fd : desc->second.fake_fd); string prefix, suffix; string txt; auto rewrite_start = GET_BEGINLOC(Call); @@ -1230,14 +1230,10 @@ bool BTypeVisitor::VisitVarDecl(VarDecl *Decl) { } table.type = map_type; - table.fd = bcc_create_map(map_type, table.name.c_str(), - table.key_size, table.leaf_size, - table.max_entries, table.flags); - } - if (table.fd < 0) { - error(GET_BEGINLOC(Decl), "could not open bpf map: %0\nis %1 map type enabled in your kernel?") << - strerror(errno) << A->getName(); - return false; + table.fake_fd = fe_.get_next_fake_fd(); + fe_.add_map_def(table.fake_fd, std::make_tuple((int)map_type, std::string(table.name), + (int)table.key_size, (int)table.leaf_size, + (int)table.max_entries, table.flags)); } if (!table.is_extern) @@ -1351,7 +1347,8 @@ BFrontendAction::BFrontendAction(llvm::raw_ostream &os, unsigned flags, TableStorage &ts, const std::string &id, const std::string &main_path, FuncSource &func_src, std::string &mod_src, - const std::string &maps_ns) + const std::string &maps_ns, + fake_fd_map_def &fake_fd_map) : os_(os), flags_(flags), ts_(ts), @@ -1360,7 +1357,9 @@ BFrontendAction::BFrontendAction(llvm::raw_ostream &os, unsigned flags, rewriter_(new Rewriter), main_path_(main_path), func_src_(func_src), - mod_src_(mod_src) {} + mod_src_(mod_src), + next_fake_fd_(-1), + fake_fd_map_(fake_fd_map) {} bool BFrontendAction::is_rewritable_ext_func(FunctionDecl *D) { StringRef file_name = rewriter_->getSourceMgr().getFilename(GET_BEGINLOC(D)); diff --git a/src/cc/frontends/clang/b_frontend_action.h b/src/cc/frontends/clang/b_frontend_action.h index 37aea8205..bc6690cbd 100644 --- a/src/cc/frontends/clang/b_frontend_action.h +++ b/src/cc/frontends/clang/b_frontend_action.h @@ -155,7 +155,8 @@ class BFrontendAction : public clang::ASTFrontendAction { BFrontendAction(llvm::raw_ostream &os, unsigned flags, TableStorage &ts, const std::string &id, const std::string &main_path, FuncSource &func_src, std::string &mod_src, - const std::string 
&maps_ns); + const std::string &maps_ns, + fake_fd_map_def &fake_fd_map); // Called by clang when the AST has been completed, here the output stream // will be flushed. @@ -170,6 +171,11 @@ class BFrontendAction : public clang::ASTFrontendAction { std::string maps_ns() const { return maps_ns_; } bool is_rewritable_ext_func(clang::FunctionDecl *D); void DoMiscWorkAround(); + // negative fake_fd to be different from real fd in bpf_pseudo_fd. + int get_next_fake_fd() { return next_fake_fd_--; } + void add_map_def(int fd, std::tuple map_def) { + fake_fd_map_[fd] = map_def; + } private: llvm::raw_ostream &os_; @@ -184,6 +190,8 @@ class BFrontendAction : public clang::ASTFrontendAction { FuncSource &func_src_; std::string &mod_src_; std::set m_; + int next_fake_fd_; + fake_fd_map_def &fake_fd_map_; }; } // namespace visitor diff --git a/src/cc/frontends/clang/loader.cc b/src/cc/frontends/clang/loader.cc index 8f091708f..3e579ba19 100644 --- a/src/cc/frontends/clang/loader.cc +++ b/src/cc/frontends/clang/loader.cc @@ -108,7 +108,8 @@ int ClangLoader::parse(unique_ptr *mod, TableStorage &ts, const string &file, bool in_memory, const char *cflags[], int ncflags, const std::string &id, FuncSource &func_src, std::string &mod_src, - const std::string &maps_ns) { + const std::string &maps_ns, + fake_fd_map_def &fake_fd_map) { string main_path = "/virtual/main.c"; unique_ptr main_buf; struct utsname un; @@ -205,7 +206,7 @@ int ClangLoader::parse(unique_ptr *mod, TableStorage &ts, #endif if (do_compile(mod, ts, in_memory, flags_cstr, flags_cstr_rem, main_path, - main_buf, id, func_src, mod_src, true, maps_ns)) { + main_buf, id, func_src, mod_src, true, maps_ns, fake_fd_map)) { #if BCC_BACKUP_COMPILE != 1 return -1; #else @@ -215,8 +216,9 @@ int ClangLoader::parse(unique_ptr *mod, TableStorage &ts, ts.DeletePrefix(Path({id})); func_src.clear(); mod_src.clear(); + fake_fd_map.clear(); if (do_compile(mod, ts, in_memory, flags_cstr, flags_cstr_rem, main_path, - main_buf, id, 
func_src, mod_src, false, maps_ns)) + main_buf, id, func_src, mod_src, false, maps_ns, fake_fd_map)) return -1; #endif } @@ -263,7 +265,8 @@ int ClangLoader::do_compile(unique_ptr *mod, TableStorage &ts, const unique_ptr &main_buf, const std::string &id, FuncSource &func_src, std::string &mod_src, bool use_internal_bpfh, - const std::string &maps_ns) { + const std::string &maps_ns, + fake_fd_map_def &fake_fd_map) { using namespace clang; vector flags_cstr = flags_cstr_in; @@ -377,7 +380,7 @@ int ClangLoader::do_compile(unique_ptr *mod, TableStorage &ts, // capture the rewritten c file string out_str1; llvm::raw_string_ostream os1(out_str1); - BFrontendAction bact(os1, flags_, ts, id, main_path, func_src, mod_src, maps_ns); + BFrontendAction bact(os1, flags_, ts, id, main_path, func_src, mod_src, maps_ns, fake_fd_map); if (!compiler1.ExecuteAction(bact)) return -1; unique_ptr out_buf1 = llvm::MemoryBuffer::getMemBuffer(out_str1); diff --git a/src/cc/frontends/clang/loader.h b/src/cc/frontends/clang/loader.h index 1aeb6523a..984ca2fdb 100644 --- a/src/cc/frontends/clang/loader.h +++ b/src/cc/frontends/clang/loader.h @@ -54,7 +54,8 @@ class ClangLoader { int parse(std::unique_ptr *mod, TableStorage &ts, const std::string &file, bool in_memory, const char *cflags[], int ncflags, const std::string &id, FuncSource &func_src, - std::string &mod_src, const std::string &maps_ns); + std::string &mod_src, const std::string &maps_ns, + fake_fd_map_def &fake_fd_map); private: int do_compile(std::unique_ptr *mod, TableStorage &ts, @@ -64,7 +65,8 @@ class ClangLoader { const std::unique_ptr &main_buf, const std::string &id, FuncSource &func_src, std::string &mod_src, bool use_internal_bpfh, - const std::string &maps_ns); + const std::string &maps_ns, + fake_fd_map_def &fake_fd_map); private: std::map> remapped_headers_; diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c index c4452317b..612bdd18c 100644 --- a/src/cc/libbpf.c +++ b/src/cc/libbpf.c @@ -206,6 +206,16 @@ int 
bcc_create_map_xattr(struct bpf_create_map_attr *attr) memcpy(map_name, attr->name, min(name_len, BPF_OBJ_NAME_LEN - 1)); attr->name = map_name; int ret = bpf_create_map_xattr(attr); + + // kernel already supports btf if its loading is successful, + // but this map type may not support pretty print yet. + if (ret < 0 && attr->btf_key_type_id && errno == 524 /* ENOTSUPP */) { + attr->btf_fd = 0; + attr->btf_key_type_id = 0; + attr->btf_value_type_id = 0; + ret = bpf_create_map_xattr(attr); + } + if (ret < 0 && name_len && (errno == E2BIG || errno == EINVAL)) { map_name[0] = '\0'; ret = bpf_create_map_xattr(attr); diff --git a/src/cc/table_desc.h b/src/cc/table_desc.h index da0927f94..3cb9393ec 100644 --- a/src/cc/table_desc.h +++ b/src/cc/table_desc.h @@ -45,6 +45,7 @@ class TableDesc { TableDesc(const TableDesc &that) : name(that.name), fd(that.fd.dup()), + fake_fd(that.fake_fd), type(that.type), key_size(that.key_size), leaf_size(that.leaf_size), @@ -61,7 +62,8 @@ class TableDesc { public: TableDesc() - : type(0), + : fake_fd(0), + type(0), key_size(0), leaf_size(0), max_entries(0), @@ -88,6 +90,7 @@ class TableDesc { std::string name; FileDesc fd; + int fake_fd; int type; size_t key_size; // sizes are in bytes size_t leaf_size; diff --git a/src/cc/table_storage.h b/src/cc/table_storage.h index 87aaa3383..2df99c597 100644 --- a/src/cc/table_storage.h +++ b/src/cc/table_storage.h @@ -27,6 +27,8 @@ namespace ebpf { +typedef std::map> fake_fd_map_def; + class TableStorageImpl; class TableStorageIteratorImpl; From 5a578707bd221c5d434a77aeda34fbbcb264e14e Mon Sep 17 00:00:00 2001 From: yonghong-song Date: Sun, 10 Feb 2019 22:45:42 -0800 Subject: [PATCH 043/135] do not expose bcc_elf.h as uapi header (#2194) Commit 51480d0597cc ("implement free_bcc_memory() API") exposed bcc_elf.h as a uapi header. The original implementation does not provide a BPFModule level api so this header can be used to call free_bcc_memory() for applications using BPFModule level API. 
Later Commit 4c5509fc1664 ("Add free_bcc_memory to BPFModule") added such an interface in BPFModule, so exposing this header becomes unnecessary. So removing bcc_elf.h from uapi headers. Signed-off-by: Yonghong Song --- src/cc/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cc/CMakeLists.txt b/src/cc/CMakeLists.txt index 32240b4f0..dd5d91c41 100644 --- a/src/cc/CMakeLists.txt +++ b/src/cc/CMakeLists.txt @@ -51,7 +51,7 @@ set(bcc_util_sources ns_guard.cc common.cc) set(bcc_sym_sources bcc_syms.cc bcc_elf.c bcc_perf_map.c bcc_proc.c) set(bcc_common_headers libbpf.h perf_reader.h) set(bcc_table_headers file_desc.h table_desc.h table_storage.h) -set(bcc_api_headers bpf_common.h bpf_module.h bcc_exception.h bcc_syms.h bcc_elf.h) +set(bcc_api_headers bpf_common.h bpf_module.h bcc_exception.h bcc_syms.h) if(ENABLE_CLANG_JIT) add_library(bcc-shared SHARED From ecf87da9e010c0a85a971fc10b274d629cef15d4 Mon Sep 17 00:00:00 2001 From: yonghong-song Date: Mon, 11 Feb 2019 11:45:13 -0800 Subject: [PATCH 044/135] allow application to set rlimit for RLIMIT_MEMLOCK instead of bcc (#2195) Currently, if map_create and prog_load failed in bcc, bcc will try to raise MEMLOCK resource limit to INFINITY and try again. The map and program memory is charged against user->locked_vm which is accumulated per user and the MEMLOCK rlimit is set per task. This makes it hard to get a proper estimate what value to set, hence INFINITY is the most popular value. In production systems, there is a reason why INFINITY may not be the best option. . Uncontrolled prog/map memories could eat much more memory before it is noticed. . If there is a bug in user space or kernel leaking prog/map's (not released), they will not get noticed since other bpf program/map will continue to load successfully with INFINITY MEMLOCK rlimit. This patch provides a mechanism for python and C++ APIs to disable bcc setrlimit.
The C++ example RandomRead is intrumented to allow/disallow setrlimit, and the following is an execution instance: -bash-4.4$ sudo ./RandomRead -r could not open bpf map: cgroup, error: Operation not permitted Unable to initialize BPF program -bash-4.4$ sudo ./RandomRead -u Started tracing, hit Ctrl-C to terminate. ^CTerminating... -bash-4.4$ A python test is also added. Signed-off-by: Yonghong Song --- examples/cpp/RandomRead.cc | 33 +++++++++++++++++++++++------ src/cc/api/BPF.h | 7 +++++-- src/cc/bpf_common.cc | 10 +++++---- src/cc/bpf_common.h | 7 +++++-- src/cc/bpf_module.cc | 11 +++++----- src/cc/bpf_module.h | 3 ++- src/cc/libbpf.c | 15 +++++++++----- src/cc/libbpf.h | 5 +++-- src/lua/bcc/bpf.lua | 4 ++-- src/lua/bcc/libbcc.lua | 4 ++-- src/python/bcc/__init__.py | 6 +++--- src/python/bcc/libbcc.py | 4 ++-- tests/cc/test_static.c | 2 +- tests/python/CMakeLists.txt | 2 ++ tests/python/test_rlimit.py | 41 +++++++++++++++++++++++++++++++++++++ 15 files changed, 117 insertions(+), 37 deletions(-) create mode 100755 tests/python/test_rlimit.py diff --git a/examples/cpp/RandomRead.cc b/examples/cpp/RandomRead.cc index 7b4262612..8e38596c3 100644 --- a/examples/cpp/RandomRead.cc +++ b/examples/cpp/RandomRead.cc @@ -11,6 +11,7 @@ */ #include +#include #include #include "BPF.h" @@ -79,25 +80,45 @@ void signal_handler(int s) { exit(0); } +void usage(void) { + std::cerr << "USAGE: RandomRead [{-r|-u} [cgroup2_path]]" << std::endl; +} + int main(int argc, char** argv) { - if (argc != 1 && argc != 2) { - std::cerr << "USAGE: RandomRead [cgroup2_path]" << std::endl; + if (argc > 3) { + usage(); return 1; } + bool allow_rlimit = true; + if (argc >= 2) { + // Set a small rlimit for MEMLOCK + struct rlimit rlim_new = {4096, 4096}; + setrlimit(RLIMIT_MEMLOCK, &rlim_new); + + if (strcmp(argv[1], "-r") == 0) { + allow_rlimit = false; + } else if (strcmp(argv[1], "-u") == 0) { + allow_rlimit = true; + } else { + usage(); + return 1; + } + } + std::vector cflags = {}; - if (argc 
== 2) + if (argc == 3) cflags.emplace_back("-DCGROUP_FILTER=1"); - bpf = new ebpf::BPF(); + bpf = new ebpf::BPF(0, nullptr, true, "", allow_rlimit); auto init_res = bpf->init(BPF_PROGRAM, cflags, {}); if (init_res.code() != 0) { std::cerr << init_res.msg() << std::endl; return 1; } - if (argc == 2) { + if (argc == 3) { auto cgroup_array = bpf->get_cgroup_array("cgroup"); - auto update_res = cgroup_array.update_value(0, argv[1]); + auto update_res = cgroup_array.update_value(0, argv[2]); if (update_res.code() != 0) { std::cerr << update_res.msg() << std::endl; return 1; diff --git a/src/cc/api/BPF.h b/src/cc/api/BPF.h index 700135f45..38d54b299 100644 --- a/src/cc/api/BPF.h +++ b/src/cc/api/BPF.h @@ -47,10 +47,13 @@ class BPF { static const int BPF_MAX_STACK_DEPTH = 127; explicit BPF(unsigned int flag = 0, TableStorage* ts = nullptr, - bool rw_engine_enabled = bpf_module_rw_engine_enabled(), const std::string &maps_ns = "") + bool rw_engine_enabled = bpf_module_rw_engine_enabled(), + const std::string &maps_ns = "", + bool allow_rlimit = true) : flag_(flag), bsymcache_(NULL), - bpf_module_(new BPFModule(flag, ts, rw_engine_enabled, maps_ns)) {} + bpf_module_(new BPFModule(flag, ts, rw_engine_enabled, maps_ns, + allow_rlimit)) {} StatusTuple init(const std::string& bpf_program, const std::vector& cflags = {}, const std::vector& usdt = {}); diff --git a/src/cc/bpf_common.cc b/src/cc/bpf_common.cc index ac5764e4c..fa42d19ed 100644 --- a/src/cc/bpf_common.cc +++ b/src/cc/bpf_common.cc @@ -26,8 +26,9 @@ void * bpf_module_create_b(const char *filename, const char *proto_filename, uns return mod; } -void * bpf_module_create_c(const char *filename, unsigned flags, const char *cflags[], int ncflags) { - auto mod = new ebpf::BPFModule(flags); +void * bpf_module_create_c(const char *filename, unsigned flags, const char *cflags[], + int ncflags, bool allow_rlimit) { + auto mod = new ebpf::BPFModule(flags, nullptr, true, "", allow_rlimit); if (mod->load_c(filename, cflags, 
ncflags) != 0) { delete mod; return nullptr; @@ -35,8 +36,9 @@ void * bpf_module_create_c(const char *filename, unsigned flags, const char *cfl return mod; } -void * bpf_module_create_c_from_string(const char *text, unsigned flags, const char *cflags[], int ncflags) { - auto mod = new ebpf::BPFModule(flags); +void * bpf_module_create_c_from_string(const char *text, unsigned flags, const char *cflags[], + int ncflags, bool allow_rlimit) { + auto mod = new ebpf::BPFModule(flags, nullptr, true, "", allow_rlimit); if (mod->load_string(text, cflags, ncflags) != 0) { delete mod; return nullptr; diff --git a/src/cc/bpf_common.h b/src/cc/bpf_common.h index f83f4e676..56c30f6c8 100644 --- a/src/cc/bpf_common.h +++ b/src/cc/bpf_common.h @@ -17,6 +17,7 @@ #ifndef BPF_COMMON_H #define BPF_COMMON_H +#include #include #include @@ -25,8 +26,10 @@ extern "C" { #endif void * bpf_module_create_b(const char *filename, const char *proto_filename, unsigned flags); -void * bpf_module_create_c(const char *filename, unsigned flags, const char *cflags[], int ncflags); -void * bpf_module_create_c_from_string(const char *text, unsigned flags, const char *cflags[], int ncflags); +void * bpf_module_create_c(const char *filename, unsigned flags, const char *cflags[], int ncflags, + bool allow_rlimit); +void * bpf_module_create_c_from_string(const char *text, unsigned flags, const char *cflags[], + int ncflags, bool allow_rlimit); void bpf_module_destroy(void *program); char * bpf_module_license(void *program); unsigned bpf_module_kern_version(void *program); diff --git a/src/cc/bpf_module.cc b/src/cc/bpf_module.cc index f00245470..c326613e2 100644 --- a/src/cc/bpf_module.cc +++ b/src/cc/bpf_module.cc @@ -93,10 +93,11 @@ class MyMemoryManager : public SectionMemoryManager { }; BPFModule::BPFModule(unsigned flags, TableStorage *ts, bool rw_engine_enabled, - const std::string &maps_ns) + const std::string &maps_ns, bool allow_rlimit) : flags_(flags), rw_engine_enabled_(rw_engine_enabled && 
bpf_module_rw_engine_enabled()), used_b_loader_(false), + allow_rlimit_(allow_rlimit), ctx_(new LLVMContext), id_(std::to_string((uintptr_t)this)), maps_ns_(maps_ns), @@ -327,10 +328,10 @@ int BPFModule::load_maps(std::map> attr.btf_value_type_id = map_tids[map_name].second; } - fd = bcc_create_map_xattr(&attr); + fd = bcc_create_map_xattr(&attr, allow_rlimit_); if (fd < 0) { - fprintf(stderr, "could not open bpf map: %s\nis map type enabled in your kernel?\n", - map_name); + fprintf(stderr, "could not open bpf map: %s, error: %s\n", + map_name, strerror(errno)); return -1; } @@ -845,7 +846,7 @@ int BPFModule::bcc_func_load(int prog_type, const char *name, } } - ret = bcc_prog_load_xattr(&attr, prog_len, log_buf, log_buf_size); + ret = bcc_prog_load_xattr(&attr, prog_len, log_buf, log_buf_size, allow_rlimit_); if (btf_) { free(func_info); free(line_info); diff --git a/src/cc/bpf_module.h b/src/cc/bpf_module.h index bd79455ce..547972e0b 100644 --- a/src/cc/bpf_module.h +++ b/src/cc/bpf_module.h @@ -85,7 +85,7 @@ class BPFModule { public: BPFModule(unsigned flags, TableStorage *ts = nullptr, bool rw_engine_enabled = true, - const std::string &maps_ns = ""); + const std::string &maps_ns = "", bool allow_rlimit = true); ~BPFModule(); int free_bcc_memory(); int load_b(const std::string &filename, const std::string &proto_filename); @@ -138,6 +138,7 @@ class BPFModule { unsigned flags_; // 0x1 for printing bool rw_engine_enabled_; bool used_b_loader_; + bool allow_rlimit_; std::string filename_; std::string proto_filename_; std::unique_ptr ctx_; diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c index 612bdd18c..003ca866e 100644 --- a/src/cc/libbpf.c +++ b/src/cc/libbpf.c @@ -198,7 +198,7 @@ static uint64_t ptr_to_u64(void *ptr) return (uint64_t) (unsigned long) ptr; } -int bcc_create_map_xattr(struct bpf_create_map_attr *attr) +int bcc_create_map_xattr(struct bpf_create_map_attr *attr, bool allow_rlimit) { size_t name_len = attr->name ? 
strlen(attr->name) : 0; char map_name[BPF_OBJ_NAME_LEN] = {}; @@ -222,8 +222,10 @@ int bcc_create_map_xattr(struct bpf_create_map_attr *attr) } if (ret < 0 && errno == EPERM) { - // see note below about the rationale for this retry + if (!allow_rlimit) + return ret; + // see note below about the rationale for this retry struct rlimit rl = {}; if (getrlimit(RLIMIT_MEMLOCK, &rl) == 0) { rl.rlim_max = RLIM_INFINITY; @@ -247,7 +249,7 @@ int bcc_create_map(enum bpf_map_type map_type, const char *name, attr.value_size = value_size; attr.max_entries = max_entries; attr.map_flags = map_flags; - return bcc_create_map_xattr(&attr); + return bcc_create_map_xattr(&attr, true); } int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags) @@ -465,7 +467,7 @@ int bpf_prog_get_tag(int fd, unsigned long long *ptag) } int bcc_prog_load_xattr(struct bpf_load_program_attr *attr, int prog_len, - char *log_buf, unsigned log_buf_size) + char *log_buf, unsigned log_buf_size, bool allow_rlimit) { size_t name_len = attr->name ? strlen(attr->name) : 0; char *tmp_log_buf = NULL, *attr_log_buf = NULL; @@ -539,6 +541,9 @@ int bcc_prog_load_xattr(struct bpf_load_program_attr *attr, int prog_len, } if (ret < 0 && errno == EPERM) { + if (!allow_rlimit) + return ret; + // When EPERM is returned, two reasons are possible: // 1. user has no permissions for bpf() // 2. 
user has insufficent rlimit for locked memory @@ -628,7 +633,7 @@ int bcc_prog_load(enum bpf_prog_type prog_type, const char *name, attr.license = license; attr.kern_version = kern_version; attr.log_level = log_level; - return bcc_prog_load_xattr(&attr, prog_len, log_buf, log_buf_size); + return bcc_prog_load_xattr(&attr, prog_len, log_buf, log_buf_size, true); } int bpf_open_raw_sock(const char *name) diff --git a/src/cc/libbpf.h b/src/cc/libbpf.h index 18dcc5482..e4c1b77ec 100644 --- a/src/cc/libbpf.h +++ b/src/cc/libbpf.h @@ -19,6 +19,7 @@ #define LIBBPF_H #include "linux/bpf.h" +#include #include #include @@ -37,7 +38,7 @@ enum bpf_probe_attach_type { int bcc_create_map(enum bpf_map_type map_type, const char *name, int key_size, int value_size, int max_entries, int map_flags); -int bcc_create_map_xattr(struct bpf_create_map_attr *attr); +int bcc_create_map_xattr(struct bpf_create_map_attr *attr, bool allow_rlimit); int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags); int bpf_lookup_elem(int fd, void *key, void *value); int bpf_delete_elem(int fd, void *key); @@ -66,7 +67,7 @@ int bcc_prog_load(enum bpf_prog_type prog_type, const char *name, int log_level, char *log_buf, unsigned log_buf_size); int bcc_prog_load_xattr(struct bpf_load_program_attr *attr, int prog_len, char *log_buf, - unsigned log_buf_size); + unsigned log_buf_size, bool allow_rlimit); int bpf_attach_socket(int sockfd, int progfd); diff --git a/src/lua/bcc/bpf.lua b/src/lua/bcc/bpf.lua index da462ac3e..1fe862a20 100644 --- a/src/lua/bcc/bpf.lua +++ b/src/lua/bcc/bpf.lua @@ -121,7 +121,7 @@ function Bpf:initialize(args) if args.text then log.info("\n%s\n", args.text) - self.module = libbcc.bpf_module_create_c_from_string(args.text, llvm_debug, cflags_ary, #cflags) + self.module = libbcc.bpf_module_create_c_from_string(args.text, llvm_debug, cflags_ary, #cflags, true) elseif args.src_file then local src = _find_file(Bpf.SCRIPT_ROOT, args.src_file) @@ -129,7 +129,7 @@ 
function Bpf:initialize(args) local hdr = _find_file(Bpf.SCRIPT_ROOT, args.hdr_file) self.module = libbcc.bpf_module_create_b(src, hdr, llvm_debug) else - self.module = libbcc.bpf_module_create_c(src, llvm_debug, cflags_ary, #cflags) + self.module = libbcc.bpf_module_create_c(src, llvm_debug, cflags_ary, #cflags, true) end end diff --git a/src/lua/bcc/libbcc.lua b/src/lua/bcc/libbcc.lua index 4d3e5e0fa..f7a1b7aef 100644 --- a/src/lua/bcc/libbcc.lua +++ b/src/lua/bcc/libbcc.lua @@ -59,8 +59,8 @@ int bpf_close_perf_event_fd(int fd); ffi.cdef[[ void * bpf_module_create_b(const char *filename, const char *proto_filename, unsigned flags); -void * bpf_module_create_c(const char *filename, unsigned flags, const char *cflags[], int ncflags); -void * bpf_module_create_c_from_string(const char *text, unsigned flags, const char *cflags[], int ncflags); +void * bpf_module_create_c(const char *filename, unsigned flags, const char *cflags[], int ncflags, bool allow_rlimit); +void * bpf_module_create_c_from_string(const char *text, unsigned flags, const char *cflags[], int ncflags, bool allow_rlimit); void bpf_module_destroy(void *program); char * bpf_module_license(void *program); unsigned bpf_module_kern_version(void *program); diff --git a/src/python/bcc/__init__.py b/src/python/bcc/__init__.py index 01b304e7c..a0f7002da 100644 --- a/src/python/bcc/__init__.py +++ b/src/python/bcc/__init__.py @@ -270,7 +270,7 @@ def is_exe(fpath): return None def __init__(self, src_file=b"", hdr_file=b"", text=None, debug=0, - cflags=[], usdt_contexts=[]): + cflags=[], usdt_contexts=[], allow_rlimit=True): """Create a new BPF module with the given source code. 
Note: @@ -318,7 +318,7 @@ def __init__(self, src_file=b"", hdr_file=b"", text=None, debug=0, if text: self.module = lib.bpf_module_create_c_from_string(text, - self.debug, cflags_array, len(cflags_array)) + self.debug, cflags_array, len(cflags_array), allow_rlimit) if not self.module: raise Exception("Failed to compile BPF text") else: @@ -329,7 +329,7 @@ def __init__(self, src_file=b"", hdr_file=b"", text=None, debug=0, self.debug) else: self.module = lib.bpf_module_create_c(src_file, self.debug, - cflags_array, len(cflags_array)) + cflags_array, len(cflags_array), allow_rlimit) if not self.module: raise Exception("Failed to compile BPF module %s" % src_file) diff --git a/src/python/bcc/libbcc.py b/src/python/bcc/libbcc.py index 4d218a727..2aa35b276 100644 --- a/src/python/bcc/libbcc.py +++ b/src/python/bcc/libbcc.py @@ -21,10 +21,10 @@ lib.bpf_module_create_b.argtypes = [ct.c_char_p, ct.c_char_p, ct.c_uint] lib.bpf_module_create_c.restype = ct.c_void_p lib.bpf_module_create_c.argtypes = [ct.c_char_p, ct.c_uint, - ct.POINTER(ct.c_char_p), ct.c_int] + ct.POINTER(ct.c_char_p), ct.c_int, ct.c_bool] lib.bpf_module_create_c_from_string.restype = ct.c_void_p lib.bpf_module_create_c_from_string.argtypes = [ct.c_char_p, ct.c_uint, - ct.POINTER(ct.c_char_p), ct.c_int] + ct.POINTER(ct.c_char_p), ct.c_int, ct.c_bool] lib.bpf_module_destroy.restype = None lib.bpf_module_destroy.argtypes = [ct.c_void_p] lib.bpf_module_license.restype = ct.c_char_p diff --git a/tests/cc/test_static.c b/tests/cc/test_static.c index 4af8b9308..ff675fb92 100644 --- a/tests/cc/test_static.c +++ b/tests/cc/test_static.c @@ -1,6 +1,6 @@ #include "bpf_common.h" int main(int argc, char **argv) { - void *mod = bpf_module_create_c_from_string("BPF_TABLE(\"array\", int, int, stats, 10);\n", 4, NULL, 0); + void *mod = bpf_module_create_c_from_string("BPF_TABLE(\"array\", int, int, stats, 10);\n", 4, NULL, 0, true); return !(mod != NULL); } diff --git a/tests/python/CMakeLists.txt 
b/tests/python/CMakeLists.txt index 4a233bb8a..a16f76d32 100644 --- a/tests/python/CMakeLists.txt +++ b/tests/python/CMakeLists.txt @@ -79,3 +79,5 @@ add_test(NAME py_test_license WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND ${TEST_WRAPPER} py_test_license sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_license.py) add_test(NAME py_test_free_bcc_memory WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND ${TEST_WRAPPER} py_test_free_bcc_memory sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_free_bcc_memory.py) +add_test(NAME py_test_rlimit WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + COMMAND ${TEST_WRAPPER} py_test_rlimit sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_rlimit.py) diff --git a/tests/python/test_rlimit.py b/tests/python/test_rlimit.py new file mode 100755 index 000000000..d3152d223 --- /dev/null +++ b/tests/python/test_rlimit.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python +# +# USAGE: test_usdt.py +# +# Copyright 2018 Facebook, Inc +# Licensed under the Apache License, Version 2.0 (the "License") + +from __future__ import print_function +from bcc import BPF +from unittest import main, skipUnless, TestCase +import distutils.version +import os, resource + +class TestRlimitMemlock(TestCase): + def testRlimitMemlock(self): + text = """ +BPF_HASH(unused, u64, u64, 65536); +int test() { return 0; } +""" + # save the original memlock limits + memlock_limit = resource.getrlimit(resource.RLIMIT_MEMLOCK) + + # set a small RLIMIT_MEMLOCK limit + resource.setrlimit(resource.RLIMIT_MEMLOCK, (4096, 4096)) + + # below will fail + failed = 0 + try: + b = BPF(text=text, allow_rlimit=False) + except: + failed = 1 + self.assertEqual(failed, 1) + + # below should succeed + b = BPF(text=text, allow_rlimit=True) + + # reset to the original memlock limits + resource.setrlimit(resource.RLIMIT_MEMLOCK, memlock_limit) + +if __name__ == "__main__": + main() From ac2229c1a7f2ea314252e34b3aa3b06ffac62098 Mon Sep 17 00:00:00 2001 From: yonghong-song Date: Mon, 11 Feb 2019 16:05:24 -0800 Subject: 
[PATCH 045/135] ignore external maps during retrieving type id's (#2196) When running example UseExternalMap or test test_libbcc, the following warning showed up: -bash-4.4$ sudo ./UseExternalMap libbpf: map:control btf_key_type_size:4 != map_def_key_size:0 ..... -bash-4.4$ sudo ./test_libbcc libbpf: map:mysharedtable btf_key_type_size:4 != map_def_key_size:0 ..... This is related to external maps. When the application defines an external map, the extra .maps. section gets generated. But the map is not in fake_fd_map_ as the external map does not require an explicit bpf_create_map syscall. It merely duplicates fd from an existing map. The warning showed up because the external map was not in fake_fd_map_ and expected_ksize and expected_vsize are based on fake_fd_map_ hence are 0, which does not match the real map. There is really no reason to find type ids for external maps. So just skip them and the warnings are gone as well. Signed-off-by: Yonghong Song --- src/cc/bpf_module.cc | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/cc/bpf_module.cc b/src/cc/bpf_module.cc index c326613e2..cfb6ff654 100644 --- a/src/cc/bpf_module.cc +++ b/src/cc/bpf_module.cc @@ -279,6 +279,18 @@ int BPFModule::load_maps(std::map> unsigned key_tid = 0, value_tid = 0; unsigned expected_ksize = 0, expected_vsize = 0; + // skip extern maps, which won't be in fake_fd_map_ as they do not + // require explicit bpf_create_map. + bool is_extern = false; + for (auto &t : tables_) { + if (t->name == map_name) { + is_extern = t->is_extern; + break; + } + } + if (is_extern) + continue; + for (auto map : fake_fd_map_) { std::string name; From 626c41fc246700202f54a273fa4dc99f75e72df8 Mon Sep 17 00:00:00 2001 From: yonghong-song Date: Mon, 11 Feb 2019 20:44:09 -0800 Subject: [PATCH 046/135] handle return value properly for btf__new and btf_ext__new (#2197) Currently, in case of error, btf__new or btf_ext__new does not return NULL pointer.
Instead, the return value looks like ERR_PTR(-errno), and the caller needs to check the range of return value to decide whether it is a valid pointer. Let us define similar macros and handle return values properly. We can restore the change if later kernel libbpf changes the return value such that NULL pointer stands for failure. Signed-off-by: Yonghong Song --- src/cc/bcc_btf.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/cc/bcc_btf.cc b/src/cc/bcc_btf.cc index 03e1ce889..eaf0ee9c4 100644 --- a/src/cc/bcc_btf.cc +++ b/src/cc/bcc_btf.cc @@ -21,6 +21,10 @@ #include "libbpf/src/btf.h" #include +#define BCC_MAX_ERRNO 4095 +#define BCC_IS_ERR_VALUE(x) ((x) >= (unsigned long)-BCC_MAX_ERRNO) +#define BCC_IS_ERR(ptr) BCC_IS_ERR_VALUE((unsigned long)ptr) + namespace ebpf { uint32_t BTFStringTable::addString(std::string S) { @@ -148,13 +152,13 @@ int BTF::load(uint8_t *btf_sec, uintptr_t btf_sec_size, } else { btf = btf__new(btf_sec, btf_sec_size); } - if (!btf) { + if (BCC_IS_ERR(btf)) { fprintf(stderr, "Processing .BTF section failure\n"); return -1; } btf_ext = btf_ext__new(btf_ext_sec, btf_ext_sec_size); - if (!btf_ext) { + if (BCC_IS_ERR(btf_ext)) { btf__free(btf); fprintf(stderr, "Processing .BTF.ext section failure\n"); return -1; From 9dcde2792f26cb273114026eefc22c05c24e5766 Mon Sep 17 00:00:00 2001 From: Timothy J Fontaine Date: Mon, 11 Feb 2019 20:50:15 -0800 Subject: [PATCH 047/135] usdt: use ProcMountNSGuard for pid and path attach (#2064) Since the probe may be in a descendent namespace, ensure that when `stat()`ing for HINTs and resolving linked elf locations, be sure to always perform them relative to any observed mount namespace. 
However, usage of the mount namespace must be surgical, since we're observing the pid with its outter identity, make sure we're not in the mount namespace when attempting to `readlink` the `/proc//exe` --- src/cc/usdt/usdt.cc | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/src/cc/usdt/usdt.cc b/src/cc/usdt/usdt.cc index 7408d2f07..0914fe3a5 100644 --- a/src/cc/usdt/usdt.cc +++ b/src/cc/usdt/usdt.cc @@ -373,7 +373,12 @@ Context::Context(int pid, const std::string &bin_path) mount_ns_instance_(new ProcMountNS(pid)), loaded_(false) { std::string full_path = resolve_bin_path(bin_path); if (!full_path.empty()) { - if (bcc_elf_foreach_usdt(full_path.c_str(), _each_probe, this) == 0) { + int res; + { + ProcMountNSGuard g(mount_ns_instance_.get()); + res = bcc_elf_foreach_usdt(full_path.c_str(), _each_probe, this); + } + if (res == 0) { cmd_bin_path_ = ebpf::get_pid_exe(pid); if (cmd_bin_path_.empty()) return; @@ -399,13 +404,16 @@ void *bcc_usdt_new_frompid(int pid, const char *path) { if (!path) { ctx = new USDT::Context(pid); } else { - struct stat buffer; - if (strlen(path) >= 1 && path[0] != '/') { - fprintf(stderr, "HINT: Binary path should be absolute.\n\n"); - return nullptr; - } else if (stat(path, &buffer) == -1) { - fprintf(stderr, "HINT: Specified binary doesn't exist.\n\n"); - return nullptr; + { + ProcMountNSGuard g(new ProcMountNS(pid)); + struct stat buffer; + if (strlen(path) >= 1 && path[0] != '/') { + fprintf(stderr, "HINT: Binary path should be absolute.\n\n"); + return nullptr; + } else if (stat(path, &buffer) == -1) { + fprintf(stderr, "HINT: Specified binary doesn't exist.\n\n"); + return nullptr; + } } ctx = new USDT::Context(pid, path); } From 3156303439960602667b2e1dc05aebfe43ad0066 Mon Sep 17 00:00:00 2001 From: Xiaozhou Liu Date: Thu, 14 Feb 2019 14:33:58 +0800 Subject: [PATCH 048/135] generate perf event data structure in Python automatically (#2198) * generate perf event data structure in Python 
automatically When ring buffers are opened to receive custom perf event, we have to define the event data structure twice: once in BPF C and once in Python. It is tedious and error-prone. This patch implements the automatic generation of Python data structure from the C declaration, thus making the redundant definition in Python unnecessary. Example: // define output data structure in C struct data_t { u32 pid; u64 ts; char comm[TASK_COMM_LEN]; }; BPF_PERF_OUTPUT(events); ... Old way: # define output data structure in Python TASK_COMM_LEN = 16 # linux/sched.h class Data(ct.Structure): _fields_ = [("pid", ct.c_ulonglong), ("ts", ct.c_ulonglong), ("comm", ct.c_char * TASK_COMM_LEN)] def print_event(cpu, data, size): event = ct.cast(data, ct.POINTER(Data)).contents ... New way: def print_event(cpu, data, size): event = b["events"].event(data) ... * tools/tcpconnect.py: deduce perf event data structure automatically Take tcpconnect.py as an example to show the new, simpler way of outputing perf event. Other tools/examples can be simplified in a similar way. 
--- src/cc/bpf_common.cc | 15 +++++ src/cc/bpf_common.h | 2 + src/cc/bpf_module.cc | 18 +++++- src/cc/bpf_module.h | 5 ++ src/cc/frontends/clang/b_frontend_action.cc | 23 ++++++- src/cc/frontends/clang/b_frontend_action.h | 4 +- src/cc/frontends/clang/loader.cc | 13 ++-- src/cc/frontends/clang/loader.h | 6 +- src/python/bcc/__init__.py | 2 +- src/python/bcc/libbcc.py | 4 ++ src/python/bcc/table.py | 71 ++++++++++++++++++++- tools/tcpconnect.py | 32 +--------- 12 files changed, 149 insertions(+), 46 deletions(-) diff --git a/src/cc/bpf_common.cc b/src/cc/bpf_common.cc index fa42d19ed..e65ef9d07 100644 --- a/src/cc/bpf_common.cc +++ b/src/cc/bpf_common.cc @@ -230,6 +230,7 @@ int bpf_table_key_sscanf(void *program, size_t id, const char *buf, void *key) { if (!mod) return -1; return mod->table_key_scanf(id, buf, key); } + int bpf_table_leaf_sscanf(void *program, size_t id, const char *buf, void *leaf) { auto mod = static_cast(program); if (!mod) return -1; @@ -248,4 +249,18 @@ int bcc_func_load(void *program, int prog_type, const char *name, } +size_t bpf_perf_event_fields(void *program, const char *event) { + auto mod = static_cast(program); + if (!mod) + return 0; + return mod->perf_event_fields(event); +} + +const char * bpf_perf_event_field(void *program, const char *event, size_t i) { + auto mod = static_cast(program); + if (!mod) + return nullptr; + return mod->perf_event_field(event, i); +} + } diff --git a/src/cc/bpf_common.h b/src/cc/bpf_common.h index 56c30f6c8..9ad414288 100644 --- a/src/cc/bpf_common.h +++ b/src/cc/bpf_common.h @@ -62,6 +62,8 @@ int bpf_table_key_snprintf(void *program, size_t id, char *buf, size_t buflen, c int bpf_table_leaf_snprintf(void *program, size_t id, char *buf, size_t buflen, const void *leaf); int bpf_table_key_sscanf(void *program, size_t id, const char *buf, void *key); int bpf_table_leaf_sscanf(void *program, size_t id, const char *buf, void *leaf); +size_t bpf_perf_event_fields(void *program, const char *event); +const char 
* bpf_perf_event_field(void *program, const char *event, size_t i); struct bpf_insn; int bcc_func_load(void *program, int prog_type, const char *name, diff --git a/src/cc/bpf_module.cc b/src/cc/bpf_module.cc index cfb6ff654..de7e9a54c 100644 --- a/src/cc/bpf_module.cc +++ b/src/cc/bpf_module.cc @@ -159,7 +159,7 @@ int BPFModule::free_bcc_memory() { int BPFModule::load_cfile(const string &file, bool in_memory, const char *cflags[], int ncflags) { ClangLoader clang_loader(&*ctx_, flags_); if (clang_loader.parse(&mod_, *ts_, file, in_memory, cflags, ncflags, id_, - *func_src_, mod_src_, maps_ns_, fake_fd_map_)) + *func_src_, mod_src_, maps_ns_, fake_fd_map_, perf_events_)) return -1; return 0; } @@ -172,7 +172,7 @@ int BPFModule::load_cfile(const string &file, bool in_memory, const char *cflags int BPFModule::load_includes(const string &text) { ClangLoader clang_loader(&*ctx_, flags_); if (clang_loader.parse(&mod_, *ts_, text, true, nullptr, 0, "", *func_src_, - mod_src_, "", fake_fd_map_)) + mod_src_, "", fake_fd_map_, perf_events_)) return -1; return 0; } @@ -595,6 +595,20 @@ unsigned BPFModule::kern_version() const { size_t BPFModule::num_tables() const { return tables_.size(); } +size_t BPFModule::perf_event_fields(const char *event) const { + auto it = perf_events_.find(event); + if (it == perf_events_.end()) + return 0; + return it->second.size(); +} + +const char * BPFModule::perf_event_field(const char *event, size_t i) const { + auto it = perf_events_.find(event); + if (it == perf_events_.end() || i >= it->second.size()) + return nullptr; + return it->second[i].c_str(); +} + size_t BPFModule::table_id(const string &name) const { auto it = table_names_.find(name); if (it == table_names_.end()) return ~0ull; diff --git a/src/cc/bpf_module.h b/src/cc/bpf_module.h index 547972e0b..b372a954e 100644 --- a/src/cc/bpf_module.h +++ b/src/cc/bpf_module.h @@ -133,6 +133,8 @@ class BPFModule { const struct bpf_insn *insns, int prog_len, const char *license, unsigned 
kern_version, int log_level, char *log_buf, unsigned log_buf_size); + size_t perf_event_fields(const char *) const; + const char * perf_event_field(const char *, size_t i) const; private: unsigned flags_; // 0x1 for printing @@ -160,6 +162,9 @@ class BPFModule { std::unique_ptr local_ts_; BTF *btf_; fake_fd_map_def fake_fd_map_; + + // map of events -- key: event name, value: event fields + std::map> perf_events_; }; } // namespace ebpf diff --git a/src/cc/frontends/clang/b_frontend_action.cc b/src/cc/frontends/clang/b_frontend_action.cc index 8f734db48..4fc643ce5 100644 --- a/src/cc/frontends/clang/b_frontend_action.cc +++ b/src/cc/frontends/clang/b_frontend_action.cc @@ -813,6 +813,23 @@ bool BTypeVisitor::VisitCallExpr(CallExpr *Call) { GET_ENDLOC(Call->getArg(2))))); txt = "bpf_perf_event_output(" + arg0 + ", bpf_pseudo_fd(1, " + fd + ")"; txt += ", CUR_CPU_IDENTIFIER, " + args_other + ")"; + + // e.g. + // struct data_t { u32 pid; }; data_t data; + // events.perf_submit(ctx, &data, sizeof(data)); + // ... 
+ // &data -> data -> typeof(data) -> data_t + auto type_arg1 = Call->getArg(1)->IgnoreCasts()->getType().getTypePtr()->getPointeeType().getTypePtr(); + if (type_arg1->isStructureType()) { + auto event_type = type_arg1->getAsTagDecl(); + const auto *r = dyn_cast(event_type); + std::vector perf_event; + + for (auto it = r->field_begin(); it != r->field_end(); ++it) { + perf_event.push_back(it->getNameAsString() + "#" + it->getType().getAsString()); //"pid#u32" + } + fe_.perf_events_[name] = perf_event; + } } else if (memb_name == "perf_submit_skb") { string skb = rewriter_.getRewrittenText(expansionRange(Call->getArg(0)->getSourceRange())); string skb_len = rewriter_.getRewrittenText(expansionRange(Call->getArg(1)->getSourceRange())); @@ -1348,7 +1365,8 @@ BFrontendAction::BFrontendAction(llvm::raw_ostream &os, unsigned flags, const std::string &main_path, FuncSource &func_src, std::string &mod_src, const std::string &maps_ns, - fake_fd_map_def &fake_fd_map) + fake_fd_map_def &fake_fd_map, + std::map> &perf_events) : os_(os), flags_(flags), ts_(ts), @@ -1359,7 +1377,8 @@ BFrontendAction::BFrontendAction(llvm::raw_ostream &os, unsigned flags, func_src_(func_src), mod_src_(mod_src), next_fake_fd_(-1), - fake_fd_map_(fake_fd_map) {} + fake_fd_map_(fake_fd_map), + perf_events_(perf_events) {} bool BFrontendAction::is_rewritable_ext_func(FunctionDecl *D) { StringRef file_name = rewriter_->getSourceMgr().getFilename(GET_BEGINLOC(D)); diff --git a/src/cc/frontends/clang/b_frontend_action.h b/src/cc/frontends/clang/b_frontend_action.h index bc6690cbd..a8ac858f1 100644 --- a/src/cc/frontends/clang/b_frontend_action.h +++ b/src/cc/frontends/clang/b_frontend_action.h @@ -156,7 +156,8 @@ class BFrontendAction : public clang::ASTFrontendAction { const std::string &id, const std::string &main_path, FuncSource &func_src, std::string &mod_src, const std::string &maps_ns, - fake_fd_map_def &fake_fd_map); + fake_fd_map_def &fake_fd_map, + std::map> &perf_events); // Called by clang 
when the AST has been completed, here the output stream // will be flushed. @@ -192,6 +193,7 @@ class BFrontendAction : public clang::ASTFrontendAction { std::set m_; int next_fake_fd_; fake_fd_map_def &fake_fd_map_; + std::map> &perf_events_; }; } // namespace visitor diff --git a/src/cc/frontends/clang/loader.cc b/src/cc/frontends/clang/loader.cc index 3e579ba19..a3e09e6dc 100644 --- a/src/cc/frontends/clang/loader.cc +++ b/src/cc/frontends/clang/loader.cc @@ -109,7 +109,8 @@ int ClangLoader::parse(unique_ptr *mod, TableStorage &ts, int ncflags, const std::string &id, FuncSource &func_src, std::string &mod_src, const std::string &maps_ns, - fake_fd_map_def &fake_fd_map) { + fake_fd_map_def &fake_fd_map, + std::map> &perf_events) { string main_path = "/virtual/main.c"; unique_ptr main_buf; struct utsname un; @@ -206,7 +207,7 @@ int ClangLoader::parse(unique_ptr *mod, TableStorage &ts, #endif if (do_compile(mod, ts, in_memory, flags_cstr, flags_cstr_rem, main_path, - main_buf, id, func_src, mod_src, true, maps_ns, fake_fd_map)) { + main_buf, id, func_src, mod_src, true, maps_ns, fake_fd_map, perf_events)) { #if BCC_BACKUP_COMPILE != 1 return -1; #else @@ -218,7 +219,7 @@ int ClangLoader::parse(unique_ptr *mod, TableStorage &ts, mod_src.clear(); fake_fd_map.clear(); if (do_compile(mod, ts, in_memory, flags_cstr, flags_cstr_rem, main_path, - main_buf, id, func_src, mod_src, false, maps_ns, fake_fd_map)) + main_buf, id, func_src, mod_src, false, maps_ns, fake_fd_map, perf_events)) return -1; #endif } @@ -266,7 +267,8 @@ int ClangLoader::do_compile(unique_ptr *mod, TableStorage &ts, const std::string &id, FuncSource &func_src, std::string &mod_src, bool use_internal_bpfh, const std::string &maps_ns, - fake_fd_map_def &fake_fd_map) { + fake_fd_map_def &fake_fd_map, + std::map> &perf_events) { using namespace clang; vector flags_cstr = flags_cstr_in; @@ -380,7 +382,8 @@ int ClangLoader::do_compile(unique_ptr *mod, TableStorage &ts, // capture the rewritten c file string 
out_str1; llvm::raw_string_ostream os1(out_str1); - BFrontendAction bact(os1, flags_, ts, id, main_path, func_src, mod_src, maps_ns, fake_fd_map); + BFrontendAction bact(os1, flags_, ts, id, main_path, func_src, mod_src, + maps_ns, fake_fd_map, perf_events); if (!compiler1.ExecuteAction(bact)) return -1; unique_ptr out_buf1 = llvm::MemoryBuffer::getMemBuffer(out_str1); diff --git a/src/cc/frontends/clang/loader.h b/src/cc/frontends/clang/loader.h index 984ca2fdb..176fc7efc 100644 --- a/src/cc/frontends/clang/loader.h +++ b/src/cc/frontends/clang/loader.h @@ -55,7 +55,8 @@ class ClangLoader { const std::string &file, bool in_memory, const char *cflags[], int ncflags, const std::string &id, FuncSource &func_src, std::string &mod_src, const std::string &maps_ns, - fake_fd_map_def &fake_fd_map); + fake_fd_map_def &fake_fd_map, + std::map> &perf_events); private: int do_compile(std::unique_ptr *mod, TableStorage &ts, @@ -66,7 +67,8 @@ class ClangLoader { const std::string &id, FuncSource &func_src, std::string &mod_src, bool use_internal_bpfh, const std::string &maps_ns, - fake_fd_map_def &fake_fd_map); + fake_fd_map_def &fake_fd_map, + std::map> &perf_events); private: std::map> remapped_headers_; diff --git a/src/python/bcc/__init__.py b/src/python/bcc/__init__.py index a0f7002da..534bc6677 100644 --- a/src/python/bcc/__init__.py +++ b/src/python/bcc/__init__.py @@ -477,7 +477,7 @@ def get_table(self, name, keytype=None, leaftype=None, reducer=None): if not leaf_desc: raise Exception("Failed to load BPF Table %s leaf desc" % name) leaftype = BPF._decode_table_type(json.loads(leaf_desc)) - return Table(self, map_id, map_fd, keytype, leaftype, reducer=reducer) + return Table(self, map_id, map_fd, keytype, leaftype, name, reducer=reducer) def __getitem__(self, key): if key not in self.tables: diff --git a/src/python/bcc/libbcc.py b/src/python/bcc/libbcc.py index 2aa35b276..23d0b11ed 100644 --- a/src/python/bcc/libbcc.py +++ b/src/python/bcc/libbcc.py @@ -65,6 +65,10 @@ 
lib.bpf_table_leaf_sscanf.restype = ct.c_int lib.bpf_table_leaf_sscanf.argtypes = [ct.c_void_p, ct.c_ulonglong, ct.c_char_p, ct.c_void_p] +lib.bpf_perf_event_fields.restype = ct.c_ulonglong +lib.bpf_perf_event_fields.argtypes = [ct.c_void_p, ct.c_char_p] +lib.bpf_perf_event_field.restype = ct.c_char_p +lib.bpf_perf_event_field.argtypes = [ct.c_void_p, ct.c_char_p, ct.c_ulonglong] # keep in sync with libbpf.h lib.bpf_get_next_key.restype = ct.c_int diff --git a/src/python/bcc/table.py b/src/python/bcc/table.py index f6449de77..78eddf3a6 100644 --- a/src/python/bcc/table.py +++ b/src/python/bcc/table.py @@ -18,6 +18,7 @@ import multiprocessing import os import errno +import re from .libbcc import lib, _RAW_CB_TYPE, _LOST_CB_TYPE from .perf import Perf @@ -122,7 +123,7 @@ def _print_linear_hist(vals, val_type): _stars(val, val_max, stars))) -def Table(bpf, map_id, map_fd, keytype, leaftype, **kwargs): +def Table(bpf, map_id, map_fd, keytype, leaftype, name, **kwargs): """Table(bpf, map_id, map_fd, keytype, leaftype, **kwargs) Create a python object out of a reference to a bpf table handle""" @@ -136,7 +137,7 @@ def Table(bpf, map_id, map_fd, keytype, leaftype, **kwargs): elif ttype == BPF_MAP_TYPE_PROG_ARRAY: t = ProgArray(bpf, map_id, map_fd, keytype, leaftype) elif ttype == BPF_MAP_TYPE_PERF_EVENT_ARRAY: - t = PerfEventArray(bpf, map_id, map_fd, keytype, leaftype) + t = PerfEventArray(bpf, map_id, map_fd, keytype, leaftype, name) elif ttype == BPF_MAP_TYPE_PERCPU_HASH: t = PerCpuHash(bpf, map_id, map_fd, keytype, leaftype, **kwargs) elif ttype == BPF_MAP_TYPE_PERCPU_ARRAY: @@ -162,7 +163,7 @@ def Table(bpf, map_id, map_fd, keytype, leaftype, **kwargs): class TableBase(MutableMapping): - def __init__(self, bpf, map_id, map_fd, keytype, leaftype): + def __init__(self, bpf, map_id, map_fd, keytype, leaftype, name=None): self.bpf = bpf self.map_id = map_id self.map_fd = map_fd @@ -171,6 +172,7 @@ def __init__(self, bpf, map_id, map_fd, keytype, leaftype): self.ttype = 
lib.bpf_table_type_id(self.bpf.module, self.map_id) self.flags = lib.bpf_table_flags_id(self.bpf.module, self.map_id) self._cbs = {} + self._name = name def key_sprintf(self, key): buf = ct.create_string_buffer(ct.sizeof(self.Key) * 8) @@ -537,6 +539,7 @@ class PerfEventArray(ArrayBase): def __init__(self, *args, **kwargs): super(PerfEventArray, self).__init__(*args, **kwargs) self._open_key_fds = {} + self._event_class = None def __del__(self): keys = list(self._open_key_fds.keys()) @@ -559,6 +562,68 @@ def __delitem__(self, key): lib.bpf_close_perf_event_fd(self._open_key_fds[key]) del self._open_key_fds[key] + def _get_event_class(self): + ct_mapping = { 'char' : ct.c_char, + 's8' : ct.c_char, + 'unsigned char' : ct.c_ubyte, + 'u8' : ct.c_ubyte, + 'u8 *' : ct.c_char_p, + 'char *' : ct.c_char_p, + 'short' : ct.c_short, + 's16' : ct.c_short, + 'unsigned short' : ct.c_ushort, + 'u16' : ct.c_ushort, + 'int' : ct.c_int, + 's32' : ct.c_int, + 'unsigned int' : ct.c_uint, + 'u32' : ct.c_uint, + 'long' : ct.c_long, + 'unsigned long' : ct.c_ulong, + 'long long' : ct.c_longlong, + 's64' : ct.c_longlong, + 'unsigned long long': ct.c_ulonglong, + 'u64' : ct.c_ulonglong, + '__int128' : (ct.c_longlong * 2), + 'unsigned __int128' : (ct.c_ulonglong * 2), + 'void *' : ct.c_void_p } + + # handle array types e.g. "int [16] foo" + array_type = re.compile(r"(.+) \[([0-9]+)\]$") + + fields = [] + num_fields = lib.bpf_perf_event_fields(self.bpf.module, self._name) + i = 0 + while i < num_fields: + field = lib.bpf_perf_event_field(self.bpf.module, self._name, i) + m = re.match(r"(.*)#(.*)", field) + field_name = m.group(1) + field_type = m.group(2) + + m = array_type.match(field_type) + try: + if m: + fields.append((field_name, ct_mapping[m.group(1)] * int(m.group(2)))) + else: + fields.append((field_name, ct_mapping[field_type])) + except KeyError: + print("Type: '%s' not recognized. Please define the data with ctypes manually." 
+ % field_type) + exit() + i += 1 + return type('', (ct.Structure,), {'_fields_': fields}) + + def event(self, data): + """event(data) + + When ring buffers are opened to receive custom perf event, + the underlying event data struct which is defined in C in + the BPF program can be deduced via this function. This avoids + redundant definitions in Python. + """ + if self._event_class == None: + self._event_class = self._get_event_class() + return ct.cast(data, ct.POINTER(self._event_class)).contents + def open_perf_buffer(self, callback, page_cnt=8, lost_cb=None): """open_perf_buffers(callback) diff --git a/tools/tcpconnect.py b/tools/tcpconnect.py index 54364c939..e230f6551 100755 --- a/tools/tcpconnect.py +++ b/tools/tcpconnect.py @@ -24,7 +24,6 @@ import argparse from socket import inet_ntop, ntohs, AF_INET, AF_INET6 from struct import pack -import ctypes as ct # arguments examples = """examples: @@ -187,36 +186,9 @@ if args.ebpf: exit() -# event data -TASK_COMM_LEN = 16 # linux/sched.h - -class Data_ipv4(ct.Structure): - _fields_ = [ - ("ts_us", ct.c_ulonglong), - ("pid", ct.c_uint), - ("uid", ct.c_uint), - ("saddr", ct.c_uint), - ("daddr", ct.c_uint), - ("ip", ct.c_ulonglong), - ("dport", ct.c_ushort), - ("task", ct.c_char * TASK_COMM_LEN) - ] - -class Data_ipv6(ct.Structure): - _fields_ = [ - ("ts_us", ct.c_ulonglong), - ("pid", ct.c_uint), - ("uid", ct.c_uint), - ("saddr", (ct.c_ulonglong * 2)), - ("daddr", (ct.c_ulonglong * 2)), - ("ip", ct.c_ulonglong), - ("dport", ct.c_ushort), - ("task", ct.c_char * TASK_COMM_LEN) - ] - # process event def print_ipv4_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data_ipv4)).contents + event = b["ipv4_events"].event(data) global start_ts if args.timestamp: if start_ts == 0: @@ -230,7 +202,7 @@ def print_ipv4_event(cpu, data, size): inet_ntop(AF_INET, pack("I", event.daddr)).encode(), event.dport)) def print_ipv6_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data_ipv6)).contents + event = 
b["ipv6_events"].event(data) global start_ts if args.timestamp: if start_ts == 0: From 790425edad107a421d12757b8295f91226fa86ed Mon Sep 17 00:00:00 2001 From: Xiaozhou Liu Date: Fri, 15 Feb 2019 03:04:47 +0800 Subject: [PATCH 049/135] README.md: add missing pointer to tools/mountsnoop.py (#2200) add missing pointer to tools/mountsnoop.py --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 77c520bdb..b09c380da 100644 --- a/README.md +++ b/README.md @@ -118,8 +118,9 @@ pair of .c and .py files, and some are directories of files. - tools/[killsnoop](tools/killsnoop.py): Trace signals issued by the kill() syscall. [Examples](tools/killsnoop_example.txt). - tools/[llcstat](tools/llcstat.py): Summarize CPU cache references and misses by process. [Examples](tools/llcstat_example.txt). - tools/[mdflush](tools/mdflush.py): Trace md flush events. [Examples](tools/mdflush_example.txt). -- tools/[mysqld_qslower](tools/mysqld_qslower.py): Trace MySQL server queries slower than a threshold. [Examples](tools/mysqld_qslower_example.txt). - tools/[memleak](tools/memleak.py): Display outstanding memory allocations to find memory leaks. [Examples](tools/memleak_example.txt). +- tools/[mountsnoop](tools/mountsnoop.py): Trace mount and umount syscalls system-wide. [Examples](tools/mountsnoop_example.txt). +- tools/[mysqld_qslower](tools/mysqld_qslower.py): Trace MySQL server queries slower than a threshold. [Examples](tools/mysqld_qslower_example.txt). - tools/[nfsslower](tools/nfsslower.py): Trace slow NFS operations. [Examples](tools/nfsslower_example.txt). - tools/[nfsdist](tools/nfsdist.py): Summarize NFS operation latency distribution as a histogram. [Examples](tools/nfsdist_example.txt). - tools/[offcputime](tools/offcputime.py): Summarize off-CPU time by kernel stack trace. [Examples](tools/offcputime_example.txt). 
From 6c79c68ba2a0d0588c908d639bbf3e186087cd56 Mon Sep 17 00:00:00 2001 From: Xiaozhou Liu Date: Fri, 15 Feb 2019 03:41:22 +0800 Subject: [PATCH 050/135] add `enum` support for generating perf event data struct (#2199) Add enum type support so that scripts like tools/dcsnoop.py could be simplied. --- src/python/bcc/table.py | 4 ++++ tools/dcsnoop.py | 14 +------------- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/src/python/bcc/table.py b/src/python/bcc/table.py index 78eddf3a6..f2462570a 100644 --- a/src/python/bcc/table.py +++ b/src/python/bcc/table.py @@ -575,6 +575,7 @@ def _get_event_class(self): 'u16' : ct.c_ushort, 'int' : ct.c_int, 's32' : ct.c_int, + 'enum' : ct.c_int, 'unsigned int' : ct.c_uint, 'u32' : ct.c_uint, 'long' : ct.c_long, @@ -599,6 +600,9 @@ def _get_event_class(self): field_name = m.group(1) field_type = m.group(2) + if re.match(r"enum .*", field_type): + field_type = "enum" + m = array_type.match(field_type) try: if m: diff --git a/tools/dcsnoop.py b/tools/dcsnoop.py index 4c3757188..331ee30e1 100755 --- a/tools/dcsnoop.py +++ b/tools/dcsnoop.py @@ -23,7 +23,6 @@ from __future__ import print_function from bcc import BPF import argparse -import ctypes as ct import re import time @@ -123,17 +122,6 @@ } """ -TASK_COMM_LEN = 16 # linux/sched.h -MAX_FILE_LEN = 64 # see inline C - -class Data(ct.Structure): - _fields_ = [ - ("pid", ct.c_uint), - ("type", ct.c_int), - ("comm", ct.c_char * TASK_COMM_LEN), - ("filename", ct.c_char * MAX_FILE_LEN), - ] - if args.ebpf: print(bpf_text) exit() @@ -151,7 +139,7 @@ class Data(ct.Structure): start_ts = time.time() def print_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data)).contents + event = b["events"].event(data) print("%-11.6f %-6d %-16s %1s %s" % ( time.time() - start_ts, event.pid, event.comm.decode('utf-8', 'replace'), mode_s[event.type], From 75a1f3d051138dbc00a67d17ddad938c83f13f00 Mon Sep 17 00:00:00 2001 From: yonghong-song Date: Thu, 14 Feb 2019 14:07:58 -0800 
Subject: [PATCH 051/135] implement DEBUG_BTF flag (#2202) silence BTF related warning messages by default. The DEBUG_BTF is added to enable such warning messages. Signed-off-by: Yonghong Song --- src/cc/bcc_btf.cc | 26 ++++++++++++++++++++++---- src/cc/bcc_btf.h | 3 +++ src/cc/bpf_module.cc | 2 +- src/cc/bpf_module.h | 2 ++ src/python/bcc/__init__.py | 4 +++- 5 files changed, 31 insertions(+), 6 deletions(-) diff --git a/src/cc/bcc_btf.cc b/src/cc/bcc_btf.cc index eaf0ee9c4..dee2d114c 100644 --- a/src/cc/bcc_btf.cc +++ b/src/cc/bcc_btf.cc @@ -15,9 +15,11 @@ */ #include "bcc_btf.h" +#include #include #include "linux/btf.h" #include "libbpf.h" +#include "libbpf/src/libbpf.h" #include "libbpf/src/btf.h" #include @@ -41,11 +43,27 @@ uint32_t BTFStringTable::addString(std::string S) { return Offset; } +BTF::BTF(bool debug) : debug_(debug), btf_(nullptr), btf_ext_(nullptr) { + if (!debug) + libbpf_set_print(NULL); +} + BTF::~BTF() { btf__free(btf_); btf_ext__free(btf_ext_); } +void BTF::warning(const char *format, ...) { + va_list args; + + if (!debug_) + return; + + va_start(args, format); + vfprintf(stderr, format, args); + va_end(args); +} + // The compiler doesn't have source code for remapped files. // So we modify .BTF and .BTF.ext sections here to add these // missing line source codes. 
@@ -153,14 +171,14 @@ int BTF::load(uint8_t *btf_sec, uintptr_t btf_sec_size, btf = btf__new(btf_sec, btf_sec_size); } if (BCC_IS_ERR(btf)) { - fprintf(stderr, "Processing .BTF section failure\n"); + warning("Processing .BTF section failed\n"); return -1; } btf_ext = btf_ext__new(btf_ext_sec, btf_ext_sec_size); if (BCC_IS_ERR(btf_ext)) { btf__free(btf); - fprintf(stderr, "Processing .BTF.ext section failure\n"); + warning("Processing .BTF.ext section failed\n"); return -1; } @@ -189,14 +207,14 @@ int BTF::get_btf_info(const char *fname, ret = btf_ext__reloc_func_info(btf_, btf_ext_, fname, 0, func_info, func_info_cnt); if (ret) { - fprintf(stderr, ".BTF.ext reloc func_info not successful\n"); + warning(".BTF.ext reloc func_info failed\n"); return ret; } ret = btf_ext__reloc_line_info(btf_, btf_ext_, fname, 0, line_info, line_info_cnt); if (ret) { - fprintf(stderr, ".BTF.ext reloc line_info not successful\n"); + warning(".BTF.ext reloc line_info failed\n"); return ret; } diff --git a/src/cc/bcc_btf.h b/src/cc/bcc_btf.h index ffa8307e8..5204b016c 100644 --- a/src/cc/bcc_btf.h +++ b/src/cc/bcc_btf.h @@ -43,6 +43,7 @@ class BTFStringTable { class BTF { public: + BTF(bool debug); ~BTF(); int load(uint8_t *btf_sec, uintptr_t btf_sec_size, uint8_t *btf_ext_sec, uintptr_t btf_ext_sec_size, @@ -62,8 +63,10 @@ class BTF { uint8_t *btf_ext_sec, uintptr_t btf_ext_sec_size, std::map &remapped_sources, uint8_t **new_btf_sec, uintptr_t *new_btf_sec_size); + void warning(const char *format, ...); private: + bool debug_; struct btf *btf_; struct btf_ext *btf_ext_; }; diff --git a/src/cc/bpf_module.cc b/src/cc/bpf_module.cc index de7e9a54c..f0d399784 100644 --- a/src/cc/bpf_module.cc +++ b/src/cc/bpf_module.cc @@ -258,7 +258,7 @@ void BPFModule::load_btf(std::map> remapped_sources["/virtual/main.c"] = mod_src_; remapped_sources["/virtual/include/bcc/helpers.h"] = helpers_h->second; - BTF *btf = new BTF(); + BTF *btf = new BTF(flags_ & DEBUG_BTF); int ret = btf->load(btf_sec, 
btf_sec_size, btf_ext_sec, btf_ext_sec_size, remapped_sources); if (ret) { diff --git a/src/cc/bpf_module.h b/src/cc/bpf_module.h index b372a954e..18c71d3d5 100644 --- a/src/cc/bpf_module.h +++ b/src/cc/bpf_module.h @@ -47,6 +47,8 @@ enum { DEBUG_SOURCE = 0x8, // Debug output register state on all instructions in addition to DEBUG_BPF. DEBUG_BPF_REGISTER_STATE = 0x10, + // Debug BTF. + DEBUG_BTF = 0x20, }; class TableDesc; diff --git a/src/python/bcc/__init__.py b/src/python/bcc/__init__.py index 534bc6677..2c45d8c6d 100644 --- a/src/python/bcc/__init__.py +++ b/src/python/bcc/__init__.py @@ -51,8 +51,10 @@ def _get_num_open_probes(): DEBUG_PREPROCESSOR = 0x4 # Debug output ASM instructions embedded with source. DEBUG_SOURCE = 0x8 -#Debug output register state on all instructions in addition to DEBUG_BPF. +# Debug output register state on all instructions in addition to DEBUG_BPF. DEBUG_BPF_REGISTER_STATE = 0x10 +# Debug BTF. +DEBUG_BTF = 0x20 class SymbolCache(object): def __init__(self, pid): From 1b36e9e3843cd7574bcd34f5d4cf28312d238e4e Mon Sep 17 00:00:00 2001 From: Brian Moyles Date: Thu, 14 Feb 2019 19:31:31 -0800 Subject: [PATCH 052/135] Have packages declare they provide and conflict with Ubuntu- and Debian- provided packages so they can satisfy dependencies but not be simultaneously installed or mixed --- INSTALL.md | 41 ++++++++++++++++++++++++++++------------- debian/control | 10 ++++++++++ 2 files changed, 38 insertions(+), 13 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index ecf02d2d9..10445b4a1 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -55,7 +55,32 @@ Kernel compile flags can usually be checked by looking at `/proc/config.gz` or The stable and the nightly packages are built for Ubuntu Xenial (16.04), Ubuntu Artful (17.10) and Ubuntu Bionic (18.04). The steps are very straightforward, no need to upgrade the kernel or compile from source! 
-**Stable and Signed Packages** +**Ubuntu Packages** + +As of Ubuntu Bionic (18.04), versions of bcc are available in the standard Ubuntu +multiverse repository. The Ubuntu packages have slightly different names: where iovisor +packages use `bcc` in the name (e.g. `bcc-tools`), Ubuntu packages use `bpfcc` (e.g. +`bpfcc-tools`). Source packages and the binary packages produced from them can be +found at [packages.ubuntu.com](https://packages.ubuntu.com/search?suite=default§ion=all&arch=any&keywords=bpfcc&searchon=sourcenames). + +```bash +sudo apt-get install bpfcc-tools linux-headers-$(uname -r) +``` + +The tools are installed in `/sbin` with a `-bpfcc` extension. Try running `sudo opensnoop-bpfcc`. + +**_Note_**: the Ubuntu packages have different names but the package contents, in most cases, conflict +and as such _cannot_ be installed alongside upstream packages. Should one choose to use +Ubuntu's packages instead of the upstream iovisor packages (or vice-versa), the +conflicting packages will need to be removed. + +The iovisor packages _do_ declare they provide the Ubuntu packages and as such may be +used to satisfy dependencies. For example, should one attempt to install package `foo` +which declares a dependency on `libbpfcc` while the upstream `libbcc` package is installed, +`foo` should install without trouble as `libbcc` declares that it provides `libbpfcc`. +That said, one should always test such a configuration in case of version incompatibilities. + +**Upstream Stable and Signed Packages** ```bash sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 4052245BD4284CDD @@ -65,7 +90,7 @@ sudo apt-get install bcc-tools libbcc-examples linux-headers-$(uname -r) ``` (replace `xenial` with `artful` or `bionic` as appropriate). Tools will be installed under /usr/share/bcc/tools. 
-**Nightly Packages** +**Upstream Nightly Packages** ```bash echo "deb [trusted=yes] https://repo.iovisor.org/apt/xenial xenial-nightly main" | sudo tee /etc/apt/sources.list.d/iovisor.list @@ -74,16 +99,6 @@ sudo apt-get install bcc-tools libbcc-examples linux-headers-$(uname -r) ``` (replace `xenial` with `artful` or `bionic` as appropriate) -**Ubuntu Packages** - -The previous commands will install the latest bcc from the iovisor repositories. It is also available from the standard Ubuntu multiverse repository, under the package name `bpfcc-tools`. - -```bash -sudo apt-get install bpfcc-tools linux-headers-$(uname -r) -``` - -The tools are installed in /sbin with a -bpfcc extension. Try running `sudo opensnoop-bpfcc`. - ## Fedora - Binary Ensure that you are running a 4.2+ kernel with `uname -r`. If not, install a 4.2+ kernel from @@ -379,7 +394,7 @@ Tested on Amazon Linux AMI release 2018.03 (kernel 4.14.47-56.37.amzn1.x86_64) ### Install packages required for building ``` -# enable epel to get iperf, luajit, luajit-devel, cmake3 (cmake3 is required to support c++11) +# enable epel to get iperf, luajit, luajit-devel, cmake3 (cmake3 is required to support c++11) sudo yum-config-manager --enable epel sudo yum install -y bison cmake3 ethtool flex git iperf libstdc++-static python-netaddr gcc gcc-c++ make zlib-devel elfutils-libelf-devel diff --git a/debian/control b/debian/control index 5143a42b8..4b5a22e75 100644 --- a/debian/control +++ b/debian/control @@ -16,6 +16,8 @@ Homepage: https://github.com/iovisor/bcc Package: libbcc Architecture: all +Provides: libbpfcc, libbpfcc-dev +Conflicts: libbpfcc, libbpfcc-dev Depends: libc6, libstdc++6, libelf1 Description: Shared Library for BPF Compiler Collection (BCC) Shared Library for BPF Compiler Collection to control BPF programs @@ -28,20 +30,28 @@ Description: Examples for BPF Compiler Collection (BCC) Package: python-bcc Architecture: all +Provides: python-bpfcc +Conflicts: python-bpfcc Depends: libbcc (= 
${binary:Version}), python, binutils Description: Python wrappers for BPF Compiler Collection (BCC) Package: python3-bcc Architecture: all +Provides: python3-bpfcc +Conflicts: python3-bpfcc Depends: libbcc (= ${binary:Version}), python3, binutils Description: Python3 wrappers for BPF Compiler Collection (BCC) Package: bcc-tools Architecture: all +Provides: bpfcc-tools +Conflicts: bpfcc-tools Depends: python-bcc (= ${binary:Version}) Description: Command line tools for BPF Compiler Collection (BCC) Package: bcc-lua Architecture: all +Provides: bpfcc-lua +Conflicts: bpfcc-lua Depends: libbcc (= ${binary:Version}) Description: Standalone tool to run BCC tracers written in Lua From 51d62d36bd072530a238ac147a61b631fdc44659 Mon Sep 17 00:00:00 2001 From: Xiaozhou Liu Date: Fri, 15 Feb 2019 13:03:05 +0800 Subject: [PATCH 053/135] tools: remove redundant Python event data structure definitions (#2204) Simplify code following #2198 (https://github.com/iovisor/bcc/pull/2198). Some tools are not touched: mountsnoop.py, trace.py, lib/*.py, old/*.py. 
--- tools/bashreadline.py | 10 +--------- tools/biosnoop.py | 18 +----------------- tools/btrfsslower.py | 18 +----------------- tools/capable.py | 16 +--------------- tools/cpuunclaimed.py | 10 +--------- tools/criticalstat.py | 15 +-------------- tools/dbslower.py | 11 +---------- tools/execsnoop.py | 17 +---------------- tools/ext4slower.py | 18 +----------------- tools/filelife.py | 14 +------------- tools/fileslower.py | 17 +---------------- tools/funcslower.py | 17 +---------------- tools/gethostlatency.py | 13 +------------ tools/killsnoop.py | 15 +-------------- tools/mdflush.py | 13 +------------ tools/mysqld_qslower.py | 11 +---------- tools/nfsslower.py | 21 +-------------------- tools/oomkill.py | 14 +------------- tools/opensnoop.py | 17 +---------------- tools/profile.py | 1 - tools/runqslower.py | 13 +------------ tools/shmsnoop.py | 19 +------------------ tools/sofdsnoop.py | 16 +--------------- tools/solisten.py | 17 +---------------- tools/sslsniff.py | 20 ++++---------------- tools/statsnoop.py | 15 +-------------- tools/syncsnoop.py | 8 +------- tools/tcpaccept.py | 30 ++---------------------------- tools/tcpconnlat.py | 32 ++------------------------------ tools/tcpdrop.py | 32 ++------------------------------ tools/tcplife.py | 34 ++-------------------------------- tools/tcpretrans.py | 30 ++---------------------------- tools/tcpstates.py | 36 ++---------------------------------- tools/tcptop.py | 1 - tools/tcptracer.py | 38 ++------------------------------------ tools/ttysnoop.py | 11 +---------- tools/xfsslower.py | 18 +----------------- tools/zfsslower.py | 18 +----------------- 38 files changed, 46 insertions(+), 628 deletions(-) diff --git a/tools/bashreadline.py b/tools/bashreadline.py index da9c1b7c1..af4f18ec8 100755 --- a/tools/bashreadline.py +++ b/tools/bashreadline.py @@ -14,7 +14,6 @@ from __future__ import print_function from bcc import BPF from time import strftime -import ctypes as ct # load BPF program bpf_text = """ @@ -40,13 
+39,6 @@ return 0; }; """ -STR_DATA = 80 - -class Data(ct.Structure): - _fields_ = [ - ("pid", ct.c_ulonglong), - ("str", ct.c_char * STR_DATA) - ] b = BPF(text=bpf_text) b.attach_uretprobe(name="/bin/bash", sym="readline", fn_name="printret") @@ -55,7 +47,7 @@ class Data(ct.Structure): print("%-9s %-6s %s" % ("TIME", "PID", "COMMAND")) def print_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data)).contents + event = b["events"].event(data) print("%-9s %-6d %s" % (strftime("%H:%M:%S"), event.pid, event.str.decode('utf-8', 'replace'))) diff --git a/tools/biosnoop.py b/tools/biosnoop.py index 259a81b32..97f478587 100755 --- a/tools/biosnoop.py +++ b/tools/biosnoop.py @@ -15,7 +15,6 @@ from __future__ import print_function from bcc import BPF -import ctypes as ct import re # load BPF program @@ -128,21 +127,6 @@ b.attach_kprobe(event="blk_account_io_completion", fn_name="trace_req_completion") -TASK_COMM_LEN = 16 # linux/sched.h -DISK_NAME_LEN = 32 # linux/genhd.h - -class Data(ct.Structure): - _fields_ = [ - ("pid", ct.c_ulonglong), - ("rwflag", ct.c_ulonglong), - ("delta", ct.c_ulonglong), - ("sector", ct.c_ulonglong), - ("len", ct.c_ulonglong), - ("ts", ct.c_ulonglong), - ("disk_name", ct.c_char * DISK_NAME_LEN), - ("name", ct.c_char * TASK_COMM_LEN) - ] - # header print("%-14s %-14s %-6s %-7s %-2s %-9s %-7s %7s" % ("TIME(s)", "COMM", "PID", "DISK", "T", "SECTOR", "BYTES", "LAT(ms)")) @@ -154,7 +138,7 @@ class Data(ct.Structure): # process event def print_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data)).contents + event = b["events"].event(data) val = -1 global start_ts diff --git a/tools/btrfsslower.py b/tools/btrfsslower.py index 0a59820f9..bacbc06ad 100755 --- a/tools/btrfsslower.py +++ b/tools/btrfsslower.py @@ -28,7 +28,6 @@ from bcc import BPF import argparse from time import strftime -import ctypes as ct # symbols kallsyms = "/proc/kallsyms" @@ -287,24 +286,9 @@ if args.ebpf: exit() -# kernel->user event data: struct data_t 
-DNAME_INLINE_LEN = 32 # linux/dcache.h -TASK_COMM_LEN = 16 # linux/sched.h -class Data(ct.Structure): - _fields_ = [ - ("ts_us", ct.c_ulonglong), - ("type", ct.c_ulonglong), - ("size", ct.c_ulonglong), - ("offset", ct.c_ulonglong), - ("delta_us", ct.c_ulonglong), - ("pid", ct.c_ulonglong), - ("task", ct.c_char * TASK_COMM_LEN), - ("file", ct.c_char * DNAME_INLINE_LEN) - ] - # process event def print_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data)).contents + event = b["events"].event(data) type = 'R' if event.type == 1: diff --git a/tools/capable.py b/tools/capable.py index 65ffa7729..a4a332c91 100755 --- a/tools/capable.py +++ b/tools/capable.py @@ -18,7 +18,6 @@ import errno import argparse from time import strftime -import ctypes as ct # arguments examples = """examples: @@ -165,19 +164,6 @@ def __getattr__(self, name): # initialize BPF b = BPF(text=bpf_text) -TASK_COMM_LEN = 16 # linux/sched.h - -class Data(ct.Structure): - _fields_ = [ - ("tgid", ct.c_uint32), - ("pid", ct.c_uint32), - ("uid", ct.c_uint32), - ("cap", ct.c_int), - ("audit", ct.c_int), - ("comm", ct.c_char * TASK_COMM_LEN), - ] + ([("kernel_stack_id", ct.c_int)] if args.kernel_stack else []) \ - + ([("user_stack_id", ct.c_int)] if args.user_stack else []) - # header print("%-9s %-6s %-6s %-6s %-16s %-4s %-20s %s" % ( "TIME", "UID", "PID", "TID", "COMM", "CAP", "NAME", "AUDIT")) @@ -198,7 +184,7 @@ def print_stack(bpf, stack_id, stack_type, tgid): # process event def print_event(bpf, cpu, data, size): - event = ct.cast(data, ct.POINTER(Data)).contents + event = b["events"].event(data) if event.cap in capabilities: name = capabilities[event.cap] diff --git a/tools/cpuunclaimed.py b/tools/cpuunclaimed.py index 75ee9324e..dc0f32523 100755 --- a/tools/cpuunclaimed.py +++ b/tools/cpuunclaimed.py @@ -59,11 +59,9 @@ from __future__ import print_function from bcc import BPF, PerfType, PerfSWConfig from time import sleep, strftime -from ctypes import c_int import argparse import 
multiprocessing from os import getpid, system, open, close, dup, unlink, O_WRONLY -import ctypes as ct from tempfile import NamedTemporaryFile # arguments @@ -248,12 +246,6 @@ def check_runnable_weight_field(): else: print(("Sampling run queues... Output every %s seconds. " + "Hit Ctrl-C to end.") % args.interval) -class Data(ct.Structure): - _fields_ = [ - ("ts", ct.c_ulonglong), - ("cpu", ct.c_ulonglong), - ("len", ct.c_ulonglong) - ] samples = {} group = {} @@ -261,7 +253,7 @@ class Data(ct.Structure): # process event def print_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data)).contents + event = b["events"].event(data) samples[event.ts] = {} samples[event.ts]['cpu'] = event.cpu samples[event.ts]['len'] = event.len diff --git a/tools/criticalstat.py b/tools/criticalstat.py index da2859412..250cfc4dd 100755 --- a/tools/criticalstat.py +++ b/tools/criticalstat.py @@ -15,7 +15,6 @@ from __future__ import print_function from bcc import BPF import argparse -import ctypes as ct import sys import subprocess import os.path @@ -271,18 +270,6 @@ b = BPF(text=bpf_text) -TASK_COMM_LEN = 16 # linux/sched.h - -class Data(ct.Structure): - _fields_ = [ - ("time", ct.c_ulonglong), - ("stack_id", ct.c_longlong), - ("cpu", ct.c_int), - ("id", ct.c_ulonglong), - ("addrs", ct.c_int * 4), - ("comm", ct.c_char * TASK_COMM_LEN), - ] - def get_syms(kstack): syms = [] @@ -296,7 +283,7 @@ def get_syms(kstack): def print_event(cpu, data, size): try: global b - event = ct.cast(data, ct.POINTER(Data)).contents + event = b["events"].event(data) stack_traces = b['stack_traces'] stext = b.ksymname('_stext') diff --git a/tools/dbslower.py b/tools/dbslower.py index 24e63948c..da2180f84 100755 --- a/tools/dbslower.py +++ b/tools/dbslower.py @@ -27,7 +27,6 @@ from bcc import BPF, USDT import argparse import re -import ctypes as ct import subprocess examples = """examples: @@ -203,18 +202,10 @@ if args.ebpf: exit() -class Data(ct.Structure): - _fields_ = [ - ("pid", ct.c_ulonglong), - 
("timestamp", ct.c_ulonglong), - ("delta", ct.c_ulonglong), - ("query", ct.c_char * 256) - ] - start = BPF.monotonic_time() def print_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data)).contents + event = bpf["events"].event(data) print("%-14.6f %-6d %8.3f %s" % ( float(event.timestamp - start) / 1000000000, event.pid, float(event.delta) / 1000000, event.query)) diff --git a/tools/execsnoop.py b/tools/execsnoop.py index 0c2c0655a..c4021165f 100755 --- a/tools/execsnoop.py +++ b/tools/execsnoop.py @@ -21,7 +21,6 @@ from bcc.utils import ArgString, printb import bcc.utils as utils import argparse -import ctypes as ct import re import time from collections import defaultdict @@ -173,19 +172,6 @@ print("%-8s" % ("TIME(s)"), end="") print("%-16s %-6s %-6s %3s %s" % ("PCOMM", "PID", "PPID", "RET", "ARGS")) -TASK_COMM_LEN = 16 # linux/sched.h -ARGSIZE = 128 # should match #define in C above - -class Data(ct.Structure): - _fields_ = [ - ("pid", ct.c_uint), - ("ppid", ct.c_uint), - ("comm", ct.c_char * TASK_COMM_LEN), - ("type", ct.c_int), - ("argv", ct.c_char * ARGSIZE), - ("retval", ct.c_int), - ] - class EventType(object): EVENT_ARG = 0 EVENT_RET = 1 @@ -209,8 +195,7 @@ def get_ppid(pid): # process event def print_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data)).contents - + event = b["events"].event(data) skip = False if event.type == EventType.EVENT_ARG: diff --git a/tools/ext4slower.py b/tools/ext4slower.py index 16b56ec4c..0524f22e7 100755 --- a/tools/ext4slower.py +++ b/tools/ext4slower.py @@ -29,7 +29,6 @@ from bcc import BPF import argparse from time import strftime -import ctypes as ct # symbols kallsyms = "/proc/kallsyms" @@ -285,24 +284,9 @@ if args.ebpf: exit() -# kernel->user event data: struct data_t -DNAME_INLINE_LEN = 32 # linux/dcache.h -TASK_COMM_LEN = 16 # linux/sched.h -class Data(ct.Structure): - _fields_ = [ - ("ts_us", ct.c_ulonglong), - ("type", ct.c_ulonglong), - ("size", ct.c_ulonglong), - ("offset", 
ct.c_ulonglong), - ("delta_us", ct.c_ulonglong), - ("pid", ct.c_ulonglong), - ("task", ct.c_char * TASK_COMM_LEN), - ("file", ct.c_char * DNAME_INLINE_LEN) - ] - # process event def print_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data)).contents + event = b["events"].event(data) type = 'R' if event.type == 1: diff --git a/tools/filelife.py b/tools/filelife.py index f66f00bb2..2eb4244b1 100755 --- a/tools/filelife.py +++ b/tools/filelife.py @@ -21,7 +21,6 @@ from bcc import BPF import argparse from time import strftime -import ctypes as ct # arguments examples = """examples: @@ -100,17 +99,6 @@ } """ -TASK_COMM_LEN = 16 # linux/sched.h -DNAME_INLINE_LEN = 255 # linux/dcache.h - -class Data(ct.Structure): - _fields_ = [ - ("pid", ct.c_uint), - ("delta", ct.c_ulonglong), - ("comm", ct.c_char * TASK_COMM_LEN), - ("fname", ct.c_char * DNAME_INLINE_LEN) - ] - if args.pid: bpf_text = bpf_text.replace('FILTER', 'if (pid != %s) { return 0; }' % args.pid) @@ -134,7 +122,7 @@ class Data(ct.Structure): # process event def print_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data)).contents + event = b["events"].event(data) print("%-8s %-6d %-16s %-7.2f %s" % (strftime("%H:%M:%S"), event.pid, event.comm.decode('utf-8', 'replace'), float(event.delta) / 1000, event.fname.decode('utf-8', 'replace'))) diff --git a/tools/fileslower.py b/tools/fileslower.py index e2830e98f..6fa0c26cc 100755 --- a/tools/fileslower.py +++ b/tools/fileslower.py @@ -31,7 +31,6 @@ from __future__ import print_function from bcc import BPF import argparse -import ctypes as ct import time # arguments @@ -210,20 +209,6 @@ b.attach_kprobe(event="vfs_write", fn_name="trace_write_entry") b.attach_kretprobe(event="vfs_write", fn_name="trace_write_return") -TASK_COMM_LEN = 16 # linux/sched.h -DNAME_INLINE_LEN = 32 # linux/dcache.h - -class Data(ct.Structure): - _fields_ = [ - ("mode", ct.c_int), - ("pid", ct.c_uint), - ("sz", ct.c_uint), - ("delta_us", ct.c_ulonglong), - ("name_len", 
ct.c_uint), - ("name", ct.c_char * DNAME_INLINE_LEN), - ("comm", ct.c_char * TASK_COMM_LEN), - ] - mode_s = { 0: 'R', 1: 'W', @@ -237,7 +222,7 @@ class Data(ct.Structure): start_ts = time.time() def print_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data)).contents + event = b["events"].event(data) ms = float(event.delta_us) / 1000 name = event.name.decode('utf-8', 'replace') diff --git a/tools/funcslower.py b/tools/funcslower.py index 283c80182..bda6a844c 100755 --- a/tools/funcslower.py +++ b/tools/funcslower.py @@ -24,7 +24,6 @@ from __future__ import print_function from bcc import BPF import argparse -import ctypes as ct import time examples = """examples: @@ -241,20 +240,6 @@ b.attach_kprobe(event=function, fn_name="trace_%d" % i) b.attach_kretprobe(event=function, fn_name="trace_return") -TASK_COMM_LEN = 16 # linux/sched.h - -class Data(ct.Structure): - _fields_ = [ - ("id", ct.c_ulonglong), - ("tgid_pid", ct.c_ulonglong), - ("start_ns", ct.c_ulonglong), - ("duration_ns", ct.c_ulonglong), - ("retval", ct.c_ulonglong), - ("comm", ct.c_char * TASK_COMM_LEN) - ] + ([("args", ct.c_ulonglong * 6)] if args.arguments else []) + \ - ([("user_stack_id", ct.c_int)] if args.user_stack else []) + \ - ([("kernel_stack_id", ct.c_int),("kernel_ip", ct.c_ulonglong)] if args.kernel_stack else []) - time_designator = "us" if args.min_us else "ms" time_value = args.min_us or args.min_ms or 1 time_multiplier = 1000 if args.min_us else 1000000 @@ -319,7 +304,7 @@ def print_stack(event): print(" %s" % b.sym(addr, event.tgid_pid)) def print_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data)).contents + event = b["events"].event(data) ts = float(event.duration_ns) / time_multiplier if not args.folded: print((time_str(event) + "%-14.14s %-6s %7.2f %16x %s %s") % diff --git a/tools/gethostlatency.py b/tools/gethostlatency.py index 8d07e23ab..965c0db94 100755 --- a/tools/gethostlatency.py +++ b/tools/gethostlatency.py @@ -19,7 +19,6 @@ from bcc import BPF 
from time import strftime import argparse -import ctypes as ct examples = """examples: ./gethostlatency # trace all TCP accept()s @@ -113,21 +112,11 @@ b.attach_uretprobe(name="c", sym="gethostbyname2", fn_name="do_return", pid=args.pid) -TASK_COMM_LEN = 16 # linux/sched.h - -class Data(ct.Structure): - _fields_ = [ - ("pid", ct.c_ulonglong), - ("delta", ct.c_ulonglong), - ("comm", ct.c_char * TASK_COMM_LEN), - ("host", ct.c_char * 80) - ] - # header print("%-9s %-6s %-16s %10s %s" % ("TIME", "PID", "COMM", "LATms", "HOST")) def print_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data)).contents + event = b["events"].event(data) print("%-9s %-6d %-16s %10.2f %s" % (strftime("%H:%M:%S"), event.pid, event.comm.decode('utf-8', 'replace'), (float(event.delta) / 1000000), event.host.decode('utf-8', 'replace'))) diff --git a/tools/killsnoop.py b/tools/killsnoop.py index 16221a2a2..2fb1dcb5d 100755 --- a/tools/killsnoop.py +++ b/tools/killsnoop.py @@ -17,7 +17,6 @@ from bcc.utils import ArgString, printb import argparse from time import strftime -import ctypes as ct # arguments examples = """examples: @@ -116,25 +115,13 @@ b.attach_kprobe(event=kill_fnname, fn_name="syscall__kill") b.attach_kretprobe(event=kill_fnname, fn_name="do_ret_sys_kill") - -TASK_COMM_LEN = 16 # linux/sched.h - -class Data(ct.Structure): - _fields_ = [ - ("pid", ct.c_ulonglong), - ("tpid", ct.c_int), - ("sig", ct.c_int), - ("ret", ct.c_int), - ("comm", ct.c_char * TASK_COMM_LEN) - ] - # header print("%-9s %-6s %-16s %-4s %-6s %s" % ( "TIME", "PID", "COMM", "SIG", "TPID", "RESULT")) # process event def print_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data)).contents + event = b["events"].event(data) if (args.failed and (event.ret >= 0)): return diff --git a/tools/mdflush.py b/tools/mdflush.py index 485635d70..f1c68aee9 100755 --- a/tools/mdflush.py +++ b/tools/mdflush.py @@ -14,7 +14,6 @@ from __future__ import print_function from bcc import BPF from time import strftime 
-import ctypes as ct # load BPF program b = BPF(text=""" @@ -54,23 +53,13 @@ } """) -# event data -TASK_COMM_LEN = 16 # linux/sched.h -DISK_NAME_LEN = 32 # linux/genhd.h -class Data(ct.Structure): - _fields_ = [ - ("pid", ct.c_ulonglong), - ("comm", ct.c_char * TASK_COMM_LEN), - ("disk", ct.c_char * DISK_NAME_LEN) - ] - # header print("Tracing md flush requests... Hit Ctrl-C to end.") print("%-8s %-6s %-16s %s" % ("TIME", "PID", "COMM", "DEVICE")) # process event def print_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data)).contents + event = b["events"].event(data) print("%-8s %-6d %-16s %s" % (strftime("%H:%M:%S"), event.pid, event.comm.decode('utf-8', 'replace'), event.disk.decode('utf-8', 'replace'))) diff --git a/tools/mysqld_qslower.py b/tools/mysqld_qslower.py index ab23b5b1a..1518974a9 100755 --- a/tools/mysqld_qslower.py +++ b/tools/mysqld_qslower.py @@ -18,7 +18,6 @@ from __future__ import print_function from bcc import BPF, USDT import sys -import ctypes as ct # arguments def usage(): @@ -109,19 +108,11 @@ def usage(): min_ms_text)) print("%-14s %-6s %8s %s" % ("TIME(s)", "PID", "MS", "QUERY")) -class Data(ct.Structure): - _fields_ = [ - ("pid", ct.c_ulonglong), - ("ts", ct.c_ulonglong), - ("delta", ct.c_ulonglong), - ("query", ct.c_char * QUERY_MAX) - ] - # process event start = 0 def print_event(cpu, data, size): global start - event = ct.cast(data, ct.POINTER(Data)).contents + event = b["events"].event(data) if start == 0: start = event.ts print("%-14.6f %-6d %8.3f %s" % (float(event.ts - start) / 1000000000, diff --git a/tools/nfsslower.py b/tools/nfsslower.py index 32e91c7b6..36918ca00 100755 --- a/tools/nfsslower.py +++ b/tools/nfsslower.py @@ -32,7 +32,6 @@ from bcc import BPF import argparse from time import strftime -import ctypes as ct examples = """ ./nfsslower # trace operations slower than 10ms @@ -243,27 +242,9 @@ if args.ebpf: exit() -# kernel->user event data: struct data_t -DNAME_INLINE_LEN = 32 # linux/dcache.h 
-TASK_COMM_LEN = 16 # linux/sched.h - - -class Data(ct.Structure): - _fields_ = [ - ("ts_us", ct.c_ulonglong), - ("type", ct.c_ulonglong), - ("size", ct.c_ulonglong), - ("offset", ct.c_ulonglong), - ("delta_us", ct.c_ulonglong), - ("pid", ct.c_ulonglong), - ("task", ct.c_char * TASK_COMM_LEN), - ("file", ct.c_char * DNAME_INLINE_LEN) - ] - - # process event def print_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data)).contents + event = b["events"].event(data) type = 'R' if event.type == 1: diff --git a/tools/oomkill.py b/tools/oomkill.py index 16defe090..4f3b6ce75 100755 --- a/tools/oomkill.py +++ b/tools/oomkill.py @@ -16,7 +16,6 @@ from bcc import BPF from time import strftime -import ctypes as ct # linux stats loadavg = "/proc/loadavg" @@ -51,20 +50,9 @@ } """ -# kernel->user event data: struct data_t -TASK_COMM_LEN = 16 # linux/sched.h -class Data(ct.Structure): - _fields_ = [ - ("fpid", ct.c_ulonglong), - ("tpid", ct.c_ulonglong), - ("pages", ct.c_ulonglong), - ("fcomm", ct.c_char * TASK_COMM_LEN), - ("tcomm", ct.c_char * TASK_COMM_LEN) - ] - # process event def print_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data)).contents + event = b["events"].event(data) with open(loadavg) as stats: avgline = stats.read().rstrip() print(("%s Triggered by PID %d (\"%s\"), OOM kill of PID %d (\"%s\")" diff --git a/tools/opensnoop.py b/tools/opensnoop.py index 55db352d8..4ffedfa93 100755 --- a/tools/opensnoop.py +++ b/tools/opensnoop.py @@ -19,7 +19,6 @@ from bcc import ArgString, BPF from bcc.utils import printb import argparse -import ctypes as ct from datetime import datetime, timedelta import os @@ -182,20 +181,6 @@ b.attach_kprobe(event="do_sys_open", fn_name="trace_entry") b.attach_kretprobe(event="do_sys_open", fn_name="trace_return") -TASK_COMM_LEN = 16 # linux/sched.h -NAME_MAX = 255 # linux/limits.h - -class Data(ct.Structure): - _fields_ = [ - ("id", ct.c_ulonglong), - ("ts", ct.c_ulonglong), - ("uid", ct.c_uint32), - ("ret", 
ct.c_int), - ("comm", ct.c_char * TASK_COMM_LEN), - ("fname", ct.c_char * NAME_MAX), - ("flags", ct.c_int), - ] - initial_ts = 0 # header @@ -211,7 +196,7 @@ class Data(ct.Structure): # process event def print_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data)).contents + event = b["events"].event(data) global initial_ts # split return value into FD and errno columns diff --git a/tools/profile.py b/tools/profile.py index 89cd5230d..958b6323e 100755 --- a/tools/profile.py +++ b/tools/profile.py @@ -35,7 +35,6 @@ import os import errno import multiprocessing -import ctypes as ct # # Process Arguments diff --git a/tools/runqslower.py b/tools/runqslower.py index bd1138e03..1d48be8a6 100755 --- a/tools/runqslower.py +++ b/tools/runqslower.py @@ -33,7 +33,6 @@ from bcc import BPF import argparse from time import strftime -import ctypes as ct # arguments examples = """examples: @@ -226,19 +225,9 @@ if args.ebpf: exit() -# kernel->user event data: struct data_t -DNAME_INLINE_LEN = 32 # linux/dcache.h -TASK_COMM_LEN = 16 # linux/sched.h -class Data(ct.Structure): - _fields_ = [ - ("pid", ct.c_uint), - ("task", ct.c_char * TASK_COMM_LEN), - ("delta_us", ct.c_ulonglong), - ] - # process event def print_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data)).contents + event = b["events"].event(data) print("%-8s %-16s %-6s %14s" % (strftime("%H:%M:%S"), event.task, event.pid, event.delta_us)) # load BPF program diff --git a/tools/shmsnoop.py b/tools/shmsnoop.py index bb5053547..11b4b6f60 100755 --- a/tools/shmsnoop.py +++ b/tools/shmsnoop.py @@ -14,7 +14,6 @@ from __future__ import print_function from bcc import ArgString, BPF import argparse -import ctypes as ct from datetime import datetime, timedelta # arguments @@ -207,22 +206,6 @@ initial_ts = 0 -class Data(ct.Structure): - _fields_ = [ - ("id", ct.c_ulonglong), - ("ts", ct.c_ulonglong), - ("sys", ct.c_int), - ("key", ct.c_ulong), - ("size", ct.c_ulong), - ("shmflg", ct.c_ulong), - ("shmid", 
ct.c_ulong), - ("cmd", ct.c_ulong), - ("buf", ct.c_ulong), - ("shmaddr", ct.c_ulong), - ("ret", ct.c_ulong), - ("comm", ct.c_char * TASK_COMM_LEN), - ] - # header if args.timestamp: print("%-14s" % ("TIME(s)"), end="") @@ -281,7 +264,7 @@ def shmflg_str(val, flags): # process event def print_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data)).contents + event = b["events"].event(data) global initial_ts if not initial_ts: diff --git a/tools/sofdsnoop.py b/tools/sofdsnoop.py index 77f8f14cd..e0c1310bb 100755 --- a/tools/sofdsnoop.py +++ b/tools/sofdsnoop.py @@ -15,7 +15,6 @@ from bcc import ArgString, BPF import os import argparse -import ctypes as ct from datetime import datetime, timedelta # arguments @@ -279,21 +278,8 @@ b.attach_kprobe(event="scm_detach_fds", fn_name="trace_scm_detach_fds_entry") b.attach_kretprobe(event="scm_detach_fds", fn_name="trace_scm_detach_fds_return") -TASK_COMM_LEN = 16 # linux/sched.h - initial_ts = 0 -class Data(ct.Structure): - _fields_ = [ - ("id", ct.c_ulonglong), - ("ts", ct.c_ulonglong), - ("action", ct.c_int), - ("sock_fd", ct.c_int), - ("fd_cnt", ct.c_int), - ("fd", ct.c_int * MAX_FD), - ("comm", ct.c_char * TASK_COMM_LEN), - ] - # header if args.timestamp: print("%-14s" % ("TIME(s)"), end="") @@ -309,7 +295,7 @@ def get_file(pid, fd): # process event def print_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data)).contents + event = b["events"].event(data) tid = event.id & 0xffffffff; cnt = min(MAX_FD, event.fd_cnt); diff --git a/tools/solisten.py b/tools/solisten.py index bced0a2ab..f2a0a342a 100755 --- a/tools/solisten.py +++ b/tools/solisten.py @@ -22,7 +22,6 @@ from struct import pack import argparse from bcc import BPF -import ctypes as ct from bcc.utils import printb # Arguments @@ -123,27 +122,13 @@ }; """ -# event data -TASK_COMM_LEN = 16 # linux/sched.h -class ListenEvt(ct.Structure): - _fields_ = [ - ("ts_us", ct.c_ulonglong), - ("pid_tgid", ct.c_ulonglong), - ("backlog", ct.c_ulonglong), - 
("netns", ct.c_ulonglong), - ("proto", ct.c_ulonglong), - ("lport", ct.c_ulonglong), - ("laddr", ct.c_ulonglong * 2), - ("task", ct.c_char * TASK_COMM_LEN) - ] - # TODO: properties to unpack protocol / ip / pid / tgid ... # Format output def event_printer(show_netns): def print_event(cpu, data, size): # Decode event - event = ct.cast(data, ct.POINTER(ListenEvt)).contents + event = b["listen_evt"].event(data) pid = event.pid_tgid & 0xffffffff proto_family = event.proto & 0xff diff --git a/tools/sslsniff.py b/tools/sslsniff.py index 265e87f40..e48fbb470 100755 --- a/tools/sslsniff.py +++ b/tools/sslsniff.py @@ -14,7 +14,6 @@ # from __future__ import print_function -import ctypes as ct from bcc import BPF import argparse @@ -171,17 +170,6 @@ MAX_BUF_SIZE = 464 # Limited by the BPF stack -# Max size of the whole struct: 512 bytes -class Data(ct.Structure): - _fields_ = [ - ("timestamp_ns", ct.c_ulonglong), - ("pid", ct.c_uint), - ("comm", ct.c_char * TASK_COMM_LEN), - ("v0", ct.c_char * MAX_BUF_SIZE), - ("len", ct.c_uint) - ] - - # header print("%-12s %-18s %-16s %-6s %-6s" % ("FUNC", "TIME(s)", "COMM", "PID", "LEN")) @@ -191,16 +179,16 @@ class Data(ct.Structure): def print_event_write(cpu, data, size): - print_event(cpu, data, size, "WRITE/SEND") + print_event(cpu, data, size, "WRITE/SEND", "perf_SSL_write") def print_event_read(cpu, data, size): - print_event(cpu, data, size, "READ/RECV") + print_event(cpu, data, size, "READ/RECV", "perf_SSL_read") -def print_event(cpu, data, size, rw): +def print_event(cpu, data, size, rw, evt): global start - event = ct.cast(data, ct.POINTER(Data)).contents + event = b[evt].event(data) # Filter events by command if args.comm: diff --git a/tools/statsnoop.py b/tools/statsnoop.py index 516eda2d8..6cdff9459 100755 --- a/tools/statsnoop.py +++ b/tools/statsnoop.py @@ -15,7 +15,6 @@ from __future__ import print_function from bcc import BPF import argparse -import ctypes as ct # arguments examples = """examples: @@ -129,18 +128,6 @@ 
b.attach_kprobe(event=syscall_fnname, fn_name="syscall__entry") b.attach_kretprobe(event=syscall_fnname, fn_name="trace_return") -TASK_COMM_LEN = 16 # linux/sched.h -NAME_MAX = 255 # linux/limits.h - -class Data(ct.Structure): - _fields_ = [ - ("pid", ct.c_ulonglong), - ("ts_ns", ct.c_ulonglong), - ("ret", ct.c_int), - ("comm", ct.c_char * TASK_COMM_LEN), - ("fname", ct.c_char * NAME_MAX) - ] - start_ts = 0 prev_ts = 0 delta = 0 @@ -152,7 +139,7 @@ class Data(ct.Structure): # process event def print_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data)).contents + event = b["events"].event(data) global start_ts global prev_ts global delta diff --git a/tools/syncsnoop.py b/tools/syncsnoop.py index 708fbc4a0..e5fa78e3e 100755 --- a/tools/syncsnoop.py +++ b/tools/syncsnoop.py @@ -15,7 +15,6 @@ from __future__ import print_function from bcc import BPF -import ctypes as ct # load BPF program b = BPF(text=""" @@ -34,17 +33,12 @@ b.attach_kprobe(event=b.get_syscall_fnname("sync"), fn_name="syscall__sync") -class Data(ct.Structure): - _fields_ = [ - ("ts", ct.c_ulonglong) - ] - # header print("%-18s %s" % ("TIME(s)", "CALL")) # process event def print_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data)).contents + event = b["events"].event(data) print("%-18.9f sync()" % (float(event.ts) / 1000000)) # loop with callback to print_event diff --git a/tools/tcpaccept.py b/tools/tcpaccept.py index b12808630..5f7b48c33 100755 --- a/tools/tcpaccept.py +++ b/tools/tcpaccept.py @@ -20,7 +20,6 @@ from socket import inet_ntop, AF_INET, AF_INET6 from struct import pack import argparse -import ctypes as ct from bcc.utils import printb # arguments @@ -211,34 +210,9 @@ if args.ebpf: exit() -# event data -TASK_COMM_LEN = 16 # linux/sched.h - -class Data_ipv4(ct.Structure): - _fields_ = [ - ("ts_us", ct.c_ulonglong), - ("pid", ct.c_uint), - ("saddr", ct.c_uint), - ("daddr", ct.c_uint), - ("ip", ct.c_ulonglong), - ("lport", ct.c_ushort), - ("task", ct.c_char * 
TASK_COMM_LEN) - ] - -class Data_ipv6(ct.Structure): - _fields_ = [ - ("ts_us", ct.c_ulonglong), - ("pid", ct.c_uint), - ("saddr", (ct.c_ulonglong * 2)), - ("daddr", (ct.c_ulonglong * 2)), - ("ip", ct.c_ulonglong), - ("lport", ct.c_ushort), - ("task", ct.c_char * TASK_COMM_LEN) - ] - # process event def print_ipv4_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data_ipv4)).contents + event = b["ipv4_events"].event(data) global start_ts if args.timestamp: if start_ts == 0: @@ -251,7 +225,7 @@ def print_ipv4_event(cpu, data, size): event.lport)) def print_ipv6_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data_ipv6)).contents + event = b["ipv6_events"].event(data) global start_ts if args.timestamp: if start_ts == 0: diff --git a/tools/tcpconnlat.py b/tools/tcpconnlat.py index 9f25f0f41..8f686211d 100755 --- a/tools/tcpconnlat.py +++ b/tools/tcpconnlat.py @@ -19,7 +19,6 @@ from socket import inet_ntop, AF_INET, AF_INET6 from struct import pack import argparse -import ctypes as ct # arg validation def positive_float(val): @@ -199,38 +198,11 @@ def positive_float(val): b.attach_kprobe(event="tcp_rcv_state_process", fn_name="trace_tcp_rcv_state_process") -# event data -TASK_COMM_LEN = 16 # linux/sched.h - -class Data_ipv4(ct.Structure): - _fields_ = [ - ("ts_us", ct.c_ulonglong), - ("pid", ct.c_uint), - ("saddr", ct.c_uint), - ("daddr", ct.c_uint), - ("ip", ct.c_ulonglong), - ("dport", ct.c_ushort), - ("delta_us", ct.c_ulonglong), - ("task", ct.c_char * TASK_COMM_LEN) - ] - -class Data_ipv6(ct.Structure): - _fields_ = [ - ("ts_us", ct.c_ulonglong), - ("pid", ct.c_uint), - ("saddr", (ct.c_ulonglong * 2)), - ("daddr", (ct.c_ulonglong * 2)), - ("ip", ct.c_ulonglong), - ("dport", ct.c_ushort), - ("delta_us", ct.c_ulonglong), - ("task", ct.c_char * TASK_COMM_LEN) - ] - # process event start_ts = 0 def print_ipv4_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data_ipv4)).contents + event = b["ipv4_events"].event(data) global start_ts if 
args.timestamp: if start_ts == 0: @@ -243,7 +215,7 @@ def print_ipv4_event(cpu, data, size): float(event.delta_us) / 1000)) def print_ipv6_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data_ipv6)).contents + event = b["ipv6_events"].event(data) global start_ts if args.timestamp: if start_ts == 0: diff --git a/tools/tcpdrop.py b/tools/tcpdrop.py index ca89be60c..bf8634df6 100755 --- a/tools/tcpdrop.py +++ b/tools/tcpdrop.py @@ -23,7 +23,6 @@ from time import strftime from socket import inet_ntop, AF_INET, AF_INET6 from struct import pack -import ctypes as ct from time import sleep from bcc import tcp @@ -151,36 +150,9 @@ if args.ebpf: exit() -# event data -class Data_ipv4(ct.Structure): - _fields_ = [ - ("pid", ct.c_uint), - ("ip", ct.c_ulonglong), - ("saddr", ct.c_uint), - ("daddr", ct.c_uint), - ("sport", ct.c_ushort), - ("dport", ct.c_ushort), - ("state", ct.c_ubyte), - ("tcpflags", ct.c_ubyte), - ("stack_id", ct.c_ulong) - ] - -class Data_ipv6(ct.Structure): - _fields_ = [ - ("pid", ct.c_uint), - ("ip", ct.c_ulonglong), - ("saddr", (ct.c_ulonglong * 2)), - ("daddr", (ct.c_ulonglong * 2)), - ("sport", ct.c_ushort), - ("dport", ct.c_ushort), - ("state", ct.c_ubyte), - ("tcpflags", ct.c_ubyte), - ("stack_id", ct.c_ulong) - ] - # process event def print_ipv4_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data_ipv4)).contents + event = b["ipv4_events"].event(data) print("%-8s %-6d %-2d %-20s > %-20s %s (%s)" % ( strftime("%H:%M:%S"), event.pid, event.ip, "%s:%d" % (inet_ntop(AF_INET, pack('I', event.saddr)), event.sport), @@ -192,7 +164,7 @@ def print_ipv4_event(cpu, data, size): print("") def print_ipv6_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data_ipv6)).contents + event = b["ipv6_events"].event(data) print("%-8s %-6d %-2d %-20s > %-20s %s (%s)" % ( strftime("%H:%M:%S"), event.pid, event.ip, "%s:%d" % (inet_ntop(AF_INET6, event.saddr), event.sport), diff --git a/tools/tcplife.py b/tools/tcplife.py index 
46395822b..ed2515539 100755 --- a/tools/tcplife.py +++ b/tools/tcplife.py @@ -27,7 +27,6 @@ import argparse from socket import inet_ntop, ntohs, AF_INET, AF_INET6 from struct import pack -import ctypes as ct from time import strftime # arguments @@ -391,35 +390,6 @@ if args.ebpf: exit() -# event data -TASK_COMM_LEN = 16 # linux/sched.h - -class Data_ipv4(ct.Structure): - _fields_ = [ - ("ts_us", ct.c_ulonglong), - ("pid", ct.c_uint), - ("saddr", ct.c_uint), - ("daddr", ct.c_uint), - ("ports", ct.c_ulonglong), - ("rx_b", ct.c_ulonglong), - ("tx_b", ct.c_ulonglong), - ("span_us", ct.c_ulonglong), - ("task", ct.c_char * TASK_COMM_LEN) - ] - -class Data_ipv6(ct.Structure): - _fields_ = [ - ("ts_us", ct.c_ulonglong), - ("pid", ct.c_uint), - ("saddr", (ct.c_ulonglong * 2)), - ("daddr", (ct.c_ulonglong * 2)), - ("ports", ct.c_ulonglong), - ("rx_b", ct.c_ulonglong), - ("tx_b", ct.c_ulonglong), - ("span_us", ct.c_ulonglong), - ("task", ct.c_char * TASK_COMM_LEN) - ] - # # Setup output formats # @@ -439,7 +409,7 @@ class Data_ipv6(ct.Structure): # process event def print_ipv4_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data_ipv4)).contents + event = b["ipv4_events"].event(data) global start_ts if args.time: if args.csv: @@ -461,7 +431,7 @@ def print_ipv4_event(cpu, data, size): event.tx_b / 1024, event.rx_b / 1024, float(event.span_us) / 1000)) def print_ipv6_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data_ipv6)).contents + event = b["ipv6_events"].event(data) global start_ts if args.time: if args.csv: diff --git a/tools/tcpretrans.py b/tools/tcpretrans.py index 47ac8c105..4301b8eb4 100755 --- a/tools/tcpretrans.py +++ b/tools/tcpretrans.py @@ -21,7 +21,6 @@ from time import strftime from socket import inet_ntop, AF_INET, AF_INET6 from struct import pack -import ctypes as ct from time import sleep # arguments @@ -199,31 +198,6 @@ if args.ebpf: exit() -# event data -class Data_ipv4(ct.Structure): - _fields_ = [ - ("pid", ct.c_uint), - ("ip", 
ct.c_ulonglong), - ("saddr", ct.c_uint), - ("daddr", ct.c_uint), - ("lport", ct.c_ushort), - ("dport", ct.c_ushort), - ("state", ct.c_ulonglong), - ("type", ct.c_ulonglong) - ] - -class Data_ipv6(ct.Structure): - _fields_ = [ - ("pid", ct.c_uint), - ("ip", ct.c_ulonglong), - ("saddr", (ct.c_ulonglong * 2)), - ("daddr", (ct.c_ulonglong * 2)), - ("lport", ct.c_ushort), - ("dport", ct.c_ushort), - ("state", ct.c_ulonglong), - ("type", ct.c_ulonglong) - ] - # from bpf_text: type = {} type[1] = 'R' @@ -246,7 +220,7 @@ class Data_ipv6(ct.Structure): # process event def print_ipv4_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data_ipv4)).contents + event = b["ipv4_events"].event(data) print("%-8s %-6d %-2d %-20s %1s> %-20s %s" % ( strftime("%H:%M:%S"), event.pid, event.ip, "%s:%d" % (inet_ntop(AF_INET, pack('I', event.saddr)), event.lport), @@ -255,7 +229,7 @@ def print_ipv4_event(cpu, data, size): tcpstate[event.state])) def print_ipv6_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data_ipv6)).contents + event = b["ipv6_events"].event(data) print("%-8s %-6d %-2d %-20s %1s> %-20s %s" % ( strftime("%H:%M:%S"), event.pid, event.ip, "%s:%d" % (inet_ntop(AF_INET6, event.saddr), event.lport), diff --git a/tools/tcpstates.py b/tools/tcpstates.py index 4a21f0205..9b25b094a 100755 --- a/tools/tcpstates.py +++ b/tools/tcpstates.py @@ -20,7 +20,6 @@ import argparse from socket import inet_ntop, AF_INET, AF_INET6 from struct import pack -import ctypes as ct from time import strftime, time from os import getuid @@ -191,37 +190,6 @@ if args.ebpf: exit() -# event data -TASK_COMM_LEN = 16 # linux/sched.h - -class Data_ipv4(ct.Structure): - _fields_ = [ - ("ts_us", ct.c_ulonglong), - ("skaddr", ct.c_ulonglong), - ("saddr", ct.c_uint), - ("daddr", ct.c_uint), - ("span_us", ct.c_ulonglong), - ("pid", ct.c_uint), - ("ports", ct.c_uint), - ("oldstate", ct.c_uint), - ("newstate", ct.c_uint), - ("task", ct.c_char * TASK_COMM_LEN) - ] - -class Data_ipv6(ct.Structure): - 
_fields_ = [ - ("ts_us", ct.c_ulonglong), - ("skaddr", ct.c_ulonglong), - ("saddr", (ct.c_ulonglong * 2)), - ("daddr", (ct.c_ulonglong * 2)), - ("span_us", ct.c_ulonglong), - ("pid", ct.c_uint), - ("ports", ct.c_uint), - ("oldstate", ct.c_uint), - ("newstate", ct.c_uint), - ("task", ct.c_char * TASK_COMM_LEN) - ] - # # Setup output formats # @@ -312,7 +280,7 @@ def journal_fields(event, addr_family): # process event def print_ipv4_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data_ipv4)).contents + event = b["ipv4_events"].event(data) global start_ts if args.time: if args.csv: @@ -337,7 +305,7 @@ def print_ipv4_event(cpu, data, size): journal.send(**journal_fields(event, AF_INET)) def print_ipv6_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data_ipv6)).contents + event = b["ipv6_events"].event(data) global start_ts if args.time: if args.csv: diff --git a/tools/tcptop.py b/tools/tcptop.py index e1eb24111..b6e26e186 100755 --- a/tools/tcptop.py +++ b/tools/tcptop.py @@ -31,7 +31,6 @@ from struct import pack from time import sleep, strftime from subprocess import call -import ctypes as ct from collections import namedtuple, defaultdict # arguments diff --git a/tools/tcptracer.py b/tools/tcptracer.py index cc92c3fc9..e61fe9ba7 100755 --- a/tools/tcptracer.py +++ b/tools/tcptracer.py @@ -18,7 +18,6 @@ from bcc import BPF import argparse as ap -import ctypes from socket import inet_ntop, AF_INET, AF_INET6 from struct import pack @@ -493,45 +492,12 @@ } """ -TASK_COMM_LEN = 16 # linux/sched.h - - -class TCPIPV4Evt(ctypes.Structure): - _fields_ = [ - ("ts_ns", ctypes.c_ulonglong), - ("type", ctypes.c_uint), - ("pid", ctypes.c_uint), - ("comm", ctypes.c_char * TASK_COMM_LEN), - ("ip", ctypes.c_ubyte), - ("saddr", ctypes.c_uint), - ("daddr", ctypes.c_uint), - ("sport", ctypes.c_ushort), - ("dport", ctypes.c_ushort), - ("netns", ctypes.c_uint) - ] - - -class TCPIPV6Evt(ctypes.Structure): - _fields_ = [ - ("ts_ns", ctypes.c_ulonglong), - ("type", 
ctypes.c_uint), - ("pid", ctypes.c_uint), - ("comm", ctypes.c_char * TASK_COMM_LEN), - ("ip", ctypes.c_ubyte), - ("saddr", (ctypes.c_ulong * 2)), - ("daddr", (ctypes.c_ulong * 2)), - ("sport", ctypes.c_ushort), - ("dport", ctypes.c_ushort), - ("netns", ctypes.c_uint) - ] - - verbose_types = {"C": "connect", "A": "accept", "X": "close", "U": "unknown"} def print_ipv4_event(cpu, data, size): - event = ctypes.cast(data, ctypes.POINTER(TCPIPV4Evt)).contents + event = b["tcp_ipv4_event"].event(data) global start_ts if args.timestamp: @@ -569,7 +535,7 @@ def print_ipv4_event(cpu, data, size): def print_ipv6_event(cpu, data, size): - event = ctypes.cast(data, ctypes.POINTER(TCPIPV6Evt)).contents + event = b["tcp_ipv6_event"].event(data) global start_ts if args.timestamp: if start_ts == 0: diff --git a/tools/ttysnoop.py b/tools/ttysnoop.py index 07f272fa9..058dc7e34 100755 --- a/tools/ttysnoop.py +++ b/tools/ttysnoop.py @@ -16,7 +16,6 @@ from __future__ import print_function from bcc import BPF -import ctypes as ct from subprocess import call import argparse from sys import argv @@ -101,20 +100,12 @@ def usage(): # initialize BPF b = BPF(text=bpf_text) -BUFSIZE = 256 - -class Data(ct.Structure): - _fields_ = [ - ("count", ct.c_int), - ("buf", ct.c_char * BUFSIZE) - ] - if not args.noclear: call("clear") # process event def print_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data)).contents + event = b["events"].event(data) print("%s" % event.buf[0:event.count].decode('utf-8', 'replace'), end="") sys.stdout.flush() diff --git a/tools/xfsslower.py b/tools/xfsslower.py index b79527b11..9fa125662 100755 --- a/tools/xfsslower.py +++ b/tools/xfsslower.py @@ -28,7 +28,6 @@ from bcc import BPF import argparse from time import strftime -import ctypes as ct # arguments examples = """examples: @@ -240,24 +239,9 @@ if args.ebpf: exit() -# kernel->user event data: struct data_t -DNAME_INLINE_LEN = 32 # linux/dcache.h -TASK_COMM_LEN = 16 # linux/sched.h -class 
Data(ct.Structure): - _fields_ = [ - ("ts_us", ct.c_ulonglong), - ("type", ct.c_ulonglong), - ("size", ct.c_ulonglong), - ("offset", ct.c_ulonglong), - ("delta_us", ct.c_ulonglong), - ("pid", ct.c_ulonglong), - ("task", ct.c_char * TASK_COMM_LEN), - ("file", ct.c_char * DNAME_INLINE_LEN) - ] - # process event def print_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data)).contents + event = b["events"].event(data) type = 'R' if event.type == 1: diff --git a/tools/zfsslower.py b/tools/zfsslower.py index 7bf160b7f..2f05b561e 100755 --- a/tools/zfsslower.py +++ b/tools/zfsslower.py @@ -31,7 +31,6 @@ from bcc import BPF import argparse from time import strftime -import ctypes as ct # arguments examples = """examples: @@ -236,24 +235,9 @@ if args.ebpf: exit() -# kernel->user event data: struct data_t -DNAME_INLINE_LEN = 32 # linux/dcache.h -TASK_COMM_LEN = 16 # linux/sched.h -class Data(ct.Structure): - _fields_ = [ - ("ts_us", ct.c_ulonglong), - ("type", ct.c_ulonglong), - ("size", ct.c_ulonglong), - ("offset", ct.c_ulonglong), - ("delta_us", ct.c_ulonglong), - ("pid", ct.c_ulonglong), - ("task", ct.c_char * TASK_COMM_LEN), - ("file", ct.c_char * DNAME_INLINE_LEN) - ] - # process event def print_event(cpu, data, size): - event = ct.cast(data, ct.POINTER(Data)).contents + event = b["events"].event(data) type = 'R' if event.type == 1: From f03beca4d6e6bc3fa7089416d752387bd26904dc Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Fri, 15 Feb 2019 17:35:37 +0100 Subject: [PATCH 054/135] tools: fix some python3 bytes vs strings issues (#2205) It fixes the following errors: $ execsnoop.py -q PCOMM PID PPID RET ARGS Traceback (most recent call last): File "_ctypes/callbacks.c", line 234, in 'calling callback function' File "/usr/lib/python3.6/site-packages/bcc/table.py", line 572, in raw_cb_ callback(cpu, data, size) File "tools/execsnoop.py", line 229, in print_event for arg in argv[event.pid] File "tools/execsnoop.py", line 229, in for arg in argv[event.pid] 
TypeError: a bytes-like object is required, not 'str' $ offcputime.py -K -f 5 Traceback (most recent call last): File "./tools/offcputime.py", line 298, in print("%s %d" % (";".join(line), v.value)) TypeError: sequence item 1: expected str instance, bytes found $ offwaketime.py -f 5 Traceback (most recent call last): File "./tools/offwaketime.py", line 350, in print("%s %d" % (";".join(line), v.value)) TypeError: sequence item 1: expected str instance, bytes found --- tools/execsnoop.py | 2 +- tools/offcputime.py | 6 ++++-- tools/offwaketime.py | 8 ++++---- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/tools/execsnoop.py b/tools/execsnoop.py index c4021165f..1ce83e07d 100755 --- a/tools/execsnoop.py +++ b/tools/execsnoop.py @@ -210,7 +210,7 @@ def print_event(cpu, data, size): skip = True if args.quote: argv[event.pid] = [ - "\"" + arg.replace("\"", "\\\"") + "\"" + b"\"" + arg.replace(b"\"", b"\\\"") + b"\"" for arg in argv[event.pid] ] diff --git a/tools/offcputime.py b/tools/offcputime.py index d84ae529f..ac3b7281f 100755 --- a/tools/offcputime.py +++ b/tools/offcputime.py @@ -288,13 +288,15 @@ def signal_ignore(signal, frame): if stack_id_err(k.user_stack_id): line.append("[Missed User Stack]") else: - line.extend([b.sym(addr, k.tgid) for addr in reversed(user_stack)]) + line.extend([b.sym(addr, k.tgid).decode('utf-8', 'replace') + for addr in reversed(user_stack)]) if not args.user_stacks_only: line.extend(["-"] if (need_delimiter and k.kernel_stack_id >= 0 and k.user_stack_id >= 0) else []) if stack_id_err(k.kernel_stack_id): line.append("[Missed Kernel Stack]") else: - line.extend([b.ksym(addr) for addr in reversed(kernel_stack)]) + line.extend([b.ksym(addr).decode('utf-8', 'replace') + for addr in reversed(kernel_stack)]) print("%s %d" % (";".join(line), v.value)) else: # print default multi-line stack output diff --git a/tools/offwaketime.py b/tools/offwaketime.py index 38a9ff252..4a1cebabd 100755 --- a/tools/offwaketime.py +++ 
b/tools/offwaketime.py @@ -323,28 +323,28 @@ def signal_ignore(signal, frame): if stack_id_err(k.t_u_stack_id): line.append("[Missed User Stack]") else: - line.extend([b.sym(addr, k.t_tgid) + line.extend([b.sym(addr, k.t_tgid).decode('utf-8', 'replace') for addr in reversed(list(target_user_stack)[1:])]) if not args.user_stacks_only: line.extend(["-"] if (need_delimiter and k.t_k_stack_id > 0 and k.t_u_stack_id > 0) else []) if stack_id_err(k.t_k_stack_id): line.append("[Missed Kernel Stack]") else: - line.extend([b.ksym(addr) + line.extend([b.ksym(addr).decode('utf-8', 'replace') for addr in reversed(list(target_kernel_stack)[1:])]) line.append("--") if not args.user_stacks_only: if stack_id_err(k.w_k_stack_id): line.append("[Missed Kernel Stack]") else: - line.extend([b.ksym(addr) + line.extend([b.ksym(addr).decode('utf-8', 'replace') for addr in reversed(list(waker_kernel_stack))]) if not args.kernel_stacks_only: line.extend(["-"] if (need_delimiter and k.w_u_stack_id > 0 and k.w_k_stack_id > 0) else []) if stack_id_err(k.w_u_stack_id): line.append("[Missed User Stack]") else: - line.extend([b.sym(addr, k.w_tgid) + line.extend([b.sym(addr, k.w_tgid).decode('utf-8', 'replace') for addr in reversed(list(waker_user_stack))]) line.append(k.waker.decode('utf-8', 'replace')) print("%s %d" % (";".join(line), v.value)) From 960a9e0f902f0d6e82433581465bb8cfd9b65142 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 15 Feb 2019 16:52:19 -0800 Subject: [PATCH 055/135] enable debug output for test_usdt3.py test_usdt3.py has been flaky for a while. When the test failed, it looks like it did not catch a single event. The patch enabled debug output to print out all the trace points before attachments so we will have more context in case of failure. Also this debug output seems significantly reduced flakiness and in my fc28 I cannot reproduce the test failure with it. 
Signed-off-by: Yonghong Song --- tests/python/test_usdt3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/test_usdt3.py b/tests/python/test_usdt3.py index f7881118e..9a40a5ae5 100755 --- a/tests/python/test_usdt3.py +++ b/tests/python/test_usdt3.py @@ -105,7 +105,7 @@ def _create_file(name, text): # Run the application self.app = Popen([m_bin], env=dict(os.environ, LD_LIBRARY_PATH=self.tmp_dir)) - # os.system("tplist.py -vvv -p " + str(self.app.pid)) + os.system("../../tools/tplist.py -vvv -p " + str(self.app.pid)) def test_attach1(self): # enable USDT probe from given PID and verifier generated BPF programs From e16aa5b760704771b7695465c35e32f9995596f8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 15 Feb 2019 09:49:33 -0800 Subject: [PATCH 056/135] sync with latest bpf-next Original btf__new() is separated into btf__new() for ELF section processing and btf__load() to load the blob to the kernel. Make corresponding adjustment in bcc as well. Signed-off-by: Yonghong Song --- docs/kernel-versions.md | 2 + src/cc/bcc_btf.cc | 6 ++ src/cc/compat/linux/virtual_bpf.h | 98 ++++++++++++++++++++++++++++--- src/cc/export/helpers.h | 4 ++ src/cc/libbpf | 2 +- src/cc/libbpf.c | 2 + 6 files changed, 104 insertions(+), 10 deletions(-) diff --git a/docs/kernel-versions.md b/docs/kernel-versions.md index 7842bf4e2..c2ab17b64 100644 --- a/docs/kernel-versions.md +++ b/docs/kernel-versions.md @@ -207,6 +207,7 @@ Helper | Kernel version | License | Commit | `BPF_FUNC_set_hash()` | 4.13 | | [`ded092cd73c2`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=ded092cd73c2c56a394b936f86897f29b2e131c0) `BPF_FUNC_set_hash_invalid()` | 4.9 | | [`7a4b28c6cc9f`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=7a4b28c6cc9ffac50f791b99cc7e46106436e5d8) `BPF_FUNC_setsockopt()` | 4.13 | | 
[`8c4b4c7e9ff0`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=8c4b4c7e9ff0447995750d9329949fa082520269) +`BPF_FUNC_sk_fullsock()` | 5.1 | | [`46f8bc92758c`](https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit/?id=46f8bc92758c6259bcf945e9216098661c1587cd) `BPF_FUNC_sk_lookup_tcp()` | 4.20 | | [`6acc9b432e67`](https://github.com/torvalds/linux/commit/6acc9b432e6714d72d7d77ec7c27f6f8358d0c71) `BPF_FUNC_sk_lookup_udp()` | 4.20 | | [`6acc9b432e67`](https://github.com/torvalds/linux/commit/6acc9b432e6714d72d7d77ec7c27f6f8358d0c71) `BPF_FUNC_sk_redirect_hash()` | 4.18 | | [`81110384441a`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=81110384441a59cff47430f20f049e69b98c17f4) @@ -237,6 +238,7 @@ Helper | Kernel version | License | Commit | `BPF_FUNC_spin_lock()` | 5.1 | | [`d83525ca62cf`](https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git/commit/?id=d83525ca62cf8ebe3271d14c36fb900c294274a2) `BPF_FUNC_spin_unlock()` | 5.1 | | [`d83525ca62cf`](https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git/commit/?id=d83525ca62cf8ebe3271d14c36fb900c294274a2) `BPF_FUNC_tail_call()` | 4.2 | | [`04fd61ab36ec`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=04fd61ab36ec065e194ab5e74ae34a5240d992bb) +`BPF_FUNC_tcp_sock()` | 5.1 | | [`655a51e536c0`](https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit/?id=655a51e536c09d15ffa3603b1b6fce2b45b85a1f) `BPF_FUNC_trace_printk()` | 4.1 | GPL | [`9c959c863f82`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=9c959c863f8217a2ff3d7c296e8223654d240569) `BPF_FUNC_xdp_adjust_head()` | 4.10 | | [`17bedab27231`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=17bedab2723145d17b14084430743549e6943d03) `BPF_FUNC_xdp_adjust_meta()` | 4.15 | | 
[`de8f3a83b0a0`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=de8f3a83b0a0fddb2cf56e7a718127e9619ea3da) diff --git a/src/cc/bcc_btf.cc b/src/cc/bcc_btf.cc index dee2d114c..881959afa 100644 --- a/src/cc/bcc_btf.cc +++ b/src/cc/bcc_btf.cc @@ -175,6 +175,12 @@ int BTF::load(uint8_t *btf_sec, uintptr_t btf_sec_size, return -1; } + if (btf__load(btf)) { + btf__free(btf); + warning("Loading .BTF section failed\n"); + return -1; + } + btf_ext = btf_ext__new(btf_ext_sec, btf_ext_sec_size); if (BCC_IS_ERR(btf_ext)) { btf__free(btf); diff --git a/src/cc/compat/linux/virtual_bpf.h b/src/cc/compat/linux/virtual_bpf.h index 9061382d2..1b53fbd68 100644 --- a/src/cc/compat/linux/virtual_bpf.h +++ b/src/cc/compat/linux/virtual_bpf.h @@ -2017,6 +2017,19 @@ union bpf_attr { * Only works if *skb* contains an IPv6 packet. Insert a * Segment Routing Header (**struct ipv6_sr_hdr**) inside * the IPv6 header. + * **BPF_LWT_ENCAP_IP** + * IP encapsulation (GRE/GUE/IPIP/etc). The outer header + * must be IPv4 or IPv6, followed by zero or more + * additional headers, up to LWT_BPF_MAX_HEADROOM total + * bytes in all prepended headers. Please note that + * if skb_is_gso(skb) is true, no more than two headers + * can be prepended, and the inner header, if present, + * should be either GRE or UDP/GUE. + * + * BPF_LWT_ENCAP_SEG6*** types can be called by bpf programs of + * type BPF_PROG_TYPE_LWT_IN; BPF_LWT_ENCAP_IP type can be called + * by bpf programs of types BPF_PROG_TYPE_LWT_IN and + * BPF_PROG_TYPE_LWT_XMIT. * * A call to this helper is susceptible to change the underlaying * packet buffer. Therefore, at load time, all checks on pointers @@ -2330,6 +2343,23 @@ union bpf_attr { * "**y**". * Return * 0 + * + * struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk) + * Description + * This helper gets a **struct bpf_sock** pointer such + * that all the fields in bpf_sock can be accessed. 
+ * Return + * A **struct bpf_sock** pointer on success, or NULL in + * case of failure. + * + * struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk) + * Description + * This helper gets a **struct bpf_tcp_sock** pointer from a + * **struct bpf_sock** pointer. + * + * Return + * A **struct bpf_tcp_sock** pointer on success, or NULL in + * case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2426,7 +2456,9 @@ union bpf_attr { FN(msg_pop_data), \ FN(rc_pointer_rel), \ FN(spin_lock), \ - FN(spin_unlock), + FN(spin_unlock), \ + FN(sk_fullsock), \ + FN(tcp_sock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2499,7 +2531,8 @@ enum bpf_hdr_start_off { /* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */ enum bpf_lwt_encap_mode { BPF_LWT_ENCAP_SEG6, - BPF_LWT_ENCAP_SEG6_INLINE + BPF_LWT_ENCAP_SEG6_INLINE, + BPF_LWT_ENCAP_IP, }; #define __bpf_md_ptr(type, name) \ @@ -2546,6 +2579,7 @@ struct __sk_buff { __u64 tstamp; __u32 wire_len; __u32 gso_segs; + __bpf_md_ptr(struct bpf_sock *, sk); }; struct bpf_tunnel_key { @@ -2587,7 +2621,15 @@ enum bpf_ret_code { BPF_DROP = 2, /* 3-6 reserved */ BPF_REDIRECT = 7, - /* >127 are reserved for prog type specific return codes */ + /* >127 are reserved for prog type specific return codes. + * + * BPF_LWT_REROUTE: used by BPF_PROG_TYPE_LWT_IN and + * BPF_PROG_TYPE_LWT_XMIT to indicate that skb had been + * changed and should be routed based on its new L3 header. + * (This is an L3 redirect, as opposed to L2 redirect + * represented by BPF_REDIRECT above). + */ + BPF_LWT_REROUTE = 128, }; struct bpf_sock { @@ -2597,14 +2639,52 @@ struct bpf_sock { __u32 protocol; __u32 mark; __u32 priority; - __u32 src_ip4; /* Allows 1,2,4-byte read. - * Stored in network byte order. 
+ /* IP address also allows 1 and 2 bytes access */ + __u32 src_ip4; + __u32 src_ip6[4]; + __u32 src_port; /* host byte order */ + __u32 dst_port; /* network byte order */ + __u32 dst_ip4; + __u32 dst_ip6[4]; + __u32 state; +}; + +struct bpf_tcp_sock { + __u32 snd_cwnd; /* Sending congestion window */ + __u32 srtt_us; /* smoothed round trip time << 3 in usecs */ + __u32 rtt_min; + __u32 snd_ssthresh; /* Slow start size threshold */ + __u32 rcv_nxt; /* What we want to receive next */ + __u32 snd_nxt; /* Next sequence we send */ + __u32 snd_una; /* First byte we want an ack for */ + __u32 mss_cache; /* Cached effective mss, not including SACKS */ + __u32 ecn_flags; /* ECN status bits. */ + __u32 rate_delivered; /* saved rate sample: packets delivered */ + __u32 rate_interval_us; /* saved rate sample: time elapsed */ + __u32 packets_out; /* Packets which are "in flight" */ + __u32 retrans_out; /* Retransmitted packets out */ + __u32 total_retrans; /* Total retransmits for entire connection */ + __u32 segs_in; /* RFC4898 tcpEStatsPerfSegsIn + * total number of segments in. */ - __u32 src_ip6[4]; /* Allows 1,2,4-byte read. - * Stored in network byte order. + __u32 data_segs_in; /* RFC4898 tcpEStatsPerfDataSegsIn + * total number of data segments in. + */ + __u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut + * The total number of segments sent. + */ + __u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut + * total number of data segments sent. + */ + __u32 lost_out; /* Lost packets */ + __u32 sacked_out; /* SACK'd packets */ + __u64 bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived + * sum(delta(rcv_nxt)), or how many bytes + * were acked. */ - __u32 src_port; /* Allows 4-byte read. - * Stored in host byte order + __u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked + * sum(delta(snd_una)), or how many bytes + * were acked. 
*/ }; diff --git a/src/cc/export/helpers.h b/src/cc/export/helpers.h index 9954e0ce6..527700f9e 100644 --- a/src/cc/export/helpers.h +++ b/src/cc/export/helpers.h @@ -476,6 +476,10 @@ static void (*bpf_spin_lock)(struct bpf_spin_lock *lock) = (void *) BPF_FUNC_spin_lock; static void (*bpf_spin_unlock)(struct bpf_spin_lock *lock) = (void *) BPF_FUNC_spin_unlock; +static struct bpf_sock *(*bpf_sk_fullsock)(struct bpf_sock *sk) = + (void *) BPF_FUNC_sk_fullsock; +static struct bpf_tcp_sock *(*bpf_tcp_sock)(struct bpf_sock *sk) = + (void *) BPF_FUNC_tcp_sock; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions diff --git a/src/cc/libbpf b/src/cc/libbpf index f0bcba631..5beb8a2eb 160000 --- a/src/cc/libbpf +++ b/src/cc/libbpf @@ -1 +1 @@ -Subproject commit f0bcba631dec4540fc6ab2cd0a0923a111cf4cf2 +Subproject commit 5beb8a2ebffd1045e3edb9b522d6ff5bb477c541 diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c index 003ca866e..8f9f6b871 100644 --- a/src/cc/libbpf.c +++ b/src/cc/libbpf.c @@ -191,6 +191,8 @@ static struct bpf_helper helpers[] = { {"rc_pointer_rel", "5.0"}, {"spin_lock", "5.1"}, {"spin_unlock", "5.1"}, + {"sk_fullsock", "5.1"}, + {"tcp_sock", "5.1"}, }; static uint64_t ptr_to_u64(void *ptr) From 7d6be4fe023ad50c168afd9c9b22703e8b008cc5 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 15 Feb 2019 11:33:42 -0800 Subject: [PATCH 057/135] check permission before nullifying type_id's in bcc_map_create_xattr In the latest kernel, before trying to copy map_name and check map key/value type id, the kernel first tried to allocate map which may hit permission deny error due to unsufficiently charged memory. Let us amend the retry sequence in bcc_map_create_xattr to reflect what kernel does. Note that retry logic can be made simpler if libbpf provides probe logic and stores the result somewhere and bcc can examine these results to tailor parameters without excessive retries. 
Signed-off-by: Yonghong Song --- src/cc/libbpf.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c index 8f9f6b871..18506d311 100644 --- a/src/cc/libbpf.c +++ b/src/cc/libbpf.c @@ -209,6 +209,20 @@ int bcc_create_map_xattr(struct bpf_create_map_attr *attr, bool allow_rlimit) attr->name = map_name; int ret = bpf_create_map_xattr(attr); + if (ret < 0 && errno == EPERM) { + if (!allow_rlimit) + return ret; + + // see note below about the rationale for this retry + struct rlimit rl = {}; + if (getrlimit(RLIMIT_MEMLOCK, &rl) == 0) { + rl.rlim_max = RLIM_INFINITY; + rl.rlim_cur = rl.rlim_max; + if (setrlimit(RLIMIT_MEMLOCK, &rl) == 0) + ret = bpf_create_map_xattr(attr); + } + } + // kernel already supports btf if its loading is successful, // but this map type may not support pretty print yet. if (ret < 0 && attr->btf_key_type_id && errno == 524 /* ENOTSUPP */) { From a9fdcf474db69a754ae460021f8428c1dac4191f Mon Sep 17 00:00:00 2001 From: Teng Qin Date: Fri, 15 Feb 2019 18:43:34 -0800 Subject: [PATCH 058/135] Rename bpf_common.h to bcc_common.h (#2208) Rename bpf_common.h to bcc_common.h --- src/cc/CMakeLists.txt | 4 ++-- src/cc/{bpf_common.cc => bcc_common.cc} | 2 +- src/cc/{bpf_common.h => bcc_common.h} | 4 ++-- src/python/bcc/libbcc.py | 2 +- tests/cc/test_static.c | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) rename src/cc/{bpf_common.cc => bcc_common.cc} (99%) rename src/cc/{bpf_common.h => bcc_common.h} (98%) diff --git a/src/cc/CMakeLists.txt b/src/cc/CMakeLists.txt index dd5d91c41..59a598560 100644 --- a/src/cc/CMakeLists.txt +++ b/src/cc/CMakeLists.txt @@ -35,7 +35,7 @@ add_library(bpf-shared SHARED libbpf.c perf_reader.c ${libbpf_sources}) set_target_properties(bpf-shared PROPERTIES VERSION ${REVISION_LAST} SOVERSION 0) set_target_properties(bpf-shared PROPERTIES OUTPUT_NAME bpf) -set(bcc_common_sources bpf_common.cc bpf_module.cc bcc_btf.cc exported_files.cc) +set(bcc_common_sources bcc_common.cc 
bpf_module.cc bcc_btf.cc exported_files.cc) if (${LLVM_PACKAGE_VERSION} VERSION_EQUAL 6 OR ${LLVM_PACKAGE_VERSION} VERSION_GREATER 6) set(bcc_common_sources ${bcc_common_sources} bcc_debug.cc) endif() @@ -51,7 +51,7 @@ set(bcc_util_sources ns_guard.cc common.cc) set(bcc_sym_sources bcc_syms.cc bcc_elf.c bcc_perf_map.c bcc_proc.c) set(bcc_common_headers libbpf.h perf_reader.h) set(bcc_table_headers file_desc.h table_desc.h table_storage.h) -set(bcc_api_headers bpf_common.h bpf_module.h bcc_exception.h bcc_syms.h) +set(bcc_api_headers bcc_common.h bpf_module.h bcc_exception.h bcc_syms.h) if(ENABLE_CLANG_JIT) add_library(bcc-shared SHARED diff --git a/src/cc/bpf_common.cc b/src/cc/bcc_common.cc similarity index 99% rename from src/cc/bpf_common.cc rename to src/cc/bcc_common.cc index e65ef9d07..182281119 100644 --- a/src/cc/bpf_common.cc +++ b/src/cc/bcc_common.cc @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "bpf_common.h" +#include "bcc_common.h" #include "bpf_module.h" extern "C" { diff --git a/src/cc/bpf_common.h b/src/cc/bcc_common.h similarity index 98% rename from src/cc/bpf_common.h rename to src/cc/bcc_common.h index 9ad414288..2504ea237 100644 --- a/src/cc/bpf_common.h +++ b/src/cc/bcc_common.h @@ -14,8 +14,8 @@ * limitations under the License. 
*/ -#ifndef BPF_COMMON_H -#define BPF_COMMON_H +#ifndef BCC_COMMON_H +#define BCC_COMMON_H #include #include diff --git a/src/python/bcc/libbcc.py b/src/python/bcc/libbcc.py index 23d0b11ed..48c6eb602 100644 --- a/src/python/bcc/libbcc.py +++ b/src/python/bcc/libbcc.py @@ -16,7 +16,7 @@ lib = ct.CDLL("libbcc.so.0", use_errno=True) -# keep in sync with bpf_common.h +# keep in sync with bcc_common.h lib.bpf_module_create_b.restype = ct.c_void_p lib.bpf_module_create_b.argtypes = [ct.c_char_p, ct.c_char_p, ct.c_uint] lib.bpf_module_create_c.restype = ct.c_void_p diff --git a/tests/cc/test_static.c b/tests/cc/test_static.c index ff675fb92..919b07420 100644 --- a/tests/cc/test_static.c +++ b/tests/cc/test_static.c @@ -1,4 +1,4 @@ -#include "bpf_common.h" +#include "bcc_common.h" int main(int argc, char **argv) { void *mod = bpf_module_create_c_from_string("BPF_TABLE(\"array\", int, int, stats, 10);\n", 4, NULL, 0, true); From 1d1aa3f0389516561cb03a600ae75cb17c21202d Mon Sep 17 00:00:00 2001 From: Xiaozhou Liu Date: Sat, 16 Feb 2019 12:37:51 +0800 Subject: [PATCH 059/135] docs/reference_guide: add auto-generation of perf event data structure in Python (#2211) Follow-up to #2198. --- docs/reference_guide.md | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/docs/reference_guide.md b/docs/reference_guide.md index b0fa8d759..d735e8149 100644 --- a/docs/reference_guide.md +++ b/docs/reference_guide.md @@ -1263,7 +1263,7 @@ while 1: b.perf_buffer_poll() ``` -Note that the data structure transferred will need to be declared in C in the BPF program, and in Python. For example: +Note that the data structure transferred will need to be declared in C in the BPF program. For example: ```C // define output data structure in C @@ -1272,8 +1272,20 @@ struct data_t { u64 ts; char comm[TASK_COMM_LEN]; }; +BPF_PERF_OUTPUT(events); +[...] 
+``` + +In Python, you can either let bcc generate the data structure from C declaration automatically (recommanded): + +```Python +def print_event(cpu, data, size): + event = b["events"].event(data) +[...] ``` +or define it manually: + ```Python # define output data structure in Python TASK_COMM_LEN = 16 # linux/sched.h @@ -1281,9 +1293,11 @@ class Data(ct.Structure): _fields_ = [("pid", ct.c_ulonglong), ("ts", ct.c_ulonglong), ("comm", ct.c_char * TASK_COMM_LEN)] -``` -Perhaps in a future bcc version, the Python data structure will be automatically generated from the C declaration. +def print_event(cpu, data, size): + event = ct.cast(data, ct.POINTER(Data)).contents +[...] +``` Examples in situ: [code](https://github.com/iovisor/bcc/blob/08fbceb7e828f0e3e77688497727c5b2405905fd/examples/tracing/hello_perf_output.py#L59), From d3c32b14a857fad8d88c8d06739326d141fb87a1 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Sat, 16 Feb 2019 20:52:35 +0100 Subject: [PATCH 060/135] ext4dist: code cleanup (#2181) (#2206) * ext4dist: code cleanup (#2181) - Dynamically handle different ext4_file_operations (avoid wasting extra cycles). - make the code pep8 compliant - clarify comments * ext4dist: fix: properly support kernels without ext4_file_read_iter() Signed-off-by: Andrea Righi --- tools/ext4dist.py | 90 +++++++++++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 43 deletions(-) diff --git a/tools/ext4dist.py b/tools/ext4dist.py index bc797fb03..384a4c147 100755 --- a/tools/ext4dist.py +++ b/tools/ext4dist.py @@ -81,24 +81,7 @@ return 0; } -// The current ext4 (Linux 4.5) uses generic_file_read_iter(), instead of it's -// own function, for reads. So we need to trace that and then filter on ext4, -// which I do by checking file->f_op. 
-int trace_read_entry(struct pt_regs *ctx, struct kiocb *iocb) -{ - u32 pid = bpf_get_current_pid_tgid(); - if (FILTER_PID) - return 0; - - // ext4 filter on file->f_op == ext4_file_operations - struct file *fp = iocb->ki_filp; - if ((u64)fp->f_op != EXT4_FILE_OPERATIONS) - return 0; - - u64 ts = bpf_ktime_get_ns(); - start.update(&pid, &ts); - return 0; -} +EXT4_TRACE_READ_CODE static int trace_return(struct pt_regs *ctx, const char *op) { @@ -152,20 +135,51 @@ } """ +# Starting from Linux 4.10 ext4_file_operations.read_iter has been changed from +# using generic_file_read_iter() to its own ext4_file_read_iter(). +# +# To detect the proper function to trace check if ext4_file_read_iter() is +# defined in /proc/kallsyms, if it's defined attach to that function, otherwise +# use generic_file_read_iter() and inside the trace hook filter on ext4 read +# events (checking if file->f_op == ext4_file_operations). +if BPF.get_kprobe_functions(b'ext4_file_read_iter'): + ext4_read_fn = 'ext4_file_read_iter' + ext4_trace_read_fn = 'trace_entry' + ext4_trace_read_code = '' +else: + ext4_read_fn = 'generic_file_read_iter' + ext4_trace_read_fn = 'trace_read_entry' + ext4_file_ops_addr = '' + with open(kallsyms) as syms: + for line in syms: + (addr, size, name) = line.rstrip().split(" ", 2) + name = name.split("\t")[0] + if name == "ext4_file_operations": + ext4_file_ops_addr = "0x" + addr + break + if ext4_file_ops_addr == '': + print("ERROR: no ext4_file_operations in /proc/kallsyms. 
Exiting.") + print("HINT: the kernel should be built with CONFIG_KALLSYMS_ALL.") + exit() + ext4_trace_read_code = """ +int trace_read_entry(struct pt_regs *ctx, struct kiocb *iocb) +{ + u32 pid = bpf_get_current_pid_tgid(); + if (FILTER_PID) + return 0; + + // ext4 filter on file->f_op == ext4_file_operations + struct file *fp = iocb->ki_filp; + if ((u64)fp->f_op != %s) + return 0; + + u64 ts = bpf_ktime_get_ns(); + start.update(&pid, &ts); + return 0; +}""" % ext4_file_ops_addr + # code replacements -with open(kallsyms) as syms: - ops = '' - for line in syms: - (addr, size, name) = line.rstrip().split(" ", 2) - name = name.split("\t")[0] - if name == "ext4_file_operations": - ops = "0x" + addr - break - if ops == '': - print("ERROR: no ext4_file_operations in /proc/kallsyms. Exiting.") - print("HINT: the kernel should be built with CONFIG_KALLSYMS_ALL.") - exit() - bpf_text = bpf_text.replace('EXT4_FILE_OPERATIONS', ops) +bpf_text = bpf_text.replace('EXT4_TRACE_READ_CODE', ext4_trace_read_code) bpf_text = bpf_text.replace('FACTOR', str(factor)) if args.pid: bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid) @@ -179,21 +193,11 @@ # load BPF program b = BPF(text=bpf_text) -# Common file functions. See earlier comment about generic_file_read_iter(). -# Comment by Joe Yin -# From Linux 4.10, the function .read_iter at the ext4_file_operations has -# changed to ext4_file_read_iter. -# So, I add get_kprobe_functions(b'ext4_file_read_iter'),it will first to attach ext4_file_read_iter, -# if fails and will attach the generic_file_read_iter which used to pre-4.10. 
- -if BPF.get_kprobe_functions(b'ext4_file_read_iter'): - b.attach_kprobe(event="ext4_file_read_iter", fn_name="trace_entry") -else: - b.attach_kprobe(event="generic_file_read_iter", fn_name="trace_read_entry") +b.attach_kprobe(event=ext4_read_fn, fn_name=ext4_trace_read_fn) b.attach_kprobe(event="ext4_file_write_iter", fn_name="trace_entry") b.attach_kprobe(event="ext4_file_open", fn_name="trace_entry") b.attach_kprobe(event="ext4_sync_file", fn_name="trace_entry") -b.attach_kretprobe(event="generic_file_read_iter", fn_name="trace_read_return") +b.attach_kretprobe(event=ext4_read_fn, fn_name='trace_read_return') b.attach_kretprobe(event="ext4_file_write_iter", fn_name="trace_write_return") b.attach_kretprobe(event="ext4_file_open", fn_name="trace_open_return") b.attach_kretprobe(event="ext4_sync_file", fn_name="trace_fsync_return") From bc0d472ec9d741a061e9ae5b0a7a4053a4d9125a Mon Sep 17 00:00:00 2001 From: Joel Date: Sun, 17 Feb 2019 01:15:41 -0500 Subject: [PATCH 061/135] Fix BCC on arm64 by allowing missing ausyscall (#2213) ausyscall is not available on ARM64 systems I am testing. The system is running debian buster. All BCC tools fail as a result. Let us not fail if ausyscall is missing. 
Signed-off-by: Joel Fernandes --- src/python/bcc/syscall.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/python/bcc/syscall.py b/src/python/bcc/syscall.py index 752b64ed8..0ba282a53 100644 --- a/src/python/bcc/syscall.py +++ b/src/python/bcc/syscall.py @@ -381,10 +381,7 @@ def _parse_syscall(line): out = out.split(b'\n',1)[1] syscalls = dict(map(_parse_syscall, out.strip().split(b'\n'))) except Exception as e: - if platform.machine() == "x86_64": - pass - else: - raise Exception("ausyscall: command not found") + pass def syscall_name(syscall_num): """Return the syscall name for the particular syscall number.""" From b26e26b94cb5bb8527e5e543002dcc593e9208a5 Mon Sep 17 00:00:00 2001 From: JayceCao Date: Mon, 18 Feb 2019 14:55:12 +0800 Subject: [PATCH 062/135] fix #1851 for Arch Linux users (#2214) * fix #1851 for Arch Linux users --- man/man8/bashreadline.8 | 11 ++++++++++- tools/bashreadline.py | 19 ++++++++++++++++++- tools/bashreadline_example.txt | 10 ++++++++++ 3 files changed, 38 insertions(+), 2 deletions(-) diff --git a/man/man8/bashreadline.8 b/man/man8/bashreadline.8 index a70fc5884..185598aa9 100644 --- a/man/man8/bashreadline.8 +++ b/man/man8/bashreadline.8 @@ -2,7 +2,7 @@ .SH NAME bashreadline \- Print entered bash commands system wide. Uses Linux eBPF/bcc. .SH SYNOPSIS -.B bashreadline +.B bashreadline [\-h] [\-s SHARED] .SH DESCRIPTION bashreadline traces the return of the readline() function using uprobes, to show the bash commands that were entered interactively, system wide. The @@ -17,6 +17,15 @@ which uses an older mechanism Since this uses BPF, only the root user can use this tool. .SH REQUIREMENTS CONFIG_BPF and bcc. +.SH OPTIONS +.TP +\-h +Print usage message. +.TP +\-s +Specify the location of libreadline.so shared library when you failed to run the +script directly with error: "Exception: could not determine address of symbol +\'readline\'". Default value is /lib/libreadline.so. 
.SH EXAMPLES .TP Trace bash commands system wide: diff --git a/tools/bashreadline.py b/tools/bashreadline.py index af4f18ec8..4cc1f9653 100755 --- a/tools/bashreadline.py +++ b/tools/bashreadline.py @@ -3,7 +3,12 @@ # bashreadline Print entered bash commands from all running shells. # For Linux, uses BCC, eBPF. Embedded C. # +# USAGE: bashreadline [-s SHARED] # This works by tracing the readline() function using a uretprobe (uprobes). +# When you failed to run the script directly with error: +# `Exception: could not determine address of symbol b'readline'`, +# you may need specify the location of libreadline.so library +# with `-s` option. # # Copyright 2016 Netflix, Inc. # Licensed under the Apache License, Version 2.0 (the "License") @@ -14,6 +19,18 @@ from __future__ import print_function from bcc import BPF from time import strftime +import argparse + +parser = argparse.ArgumentParser( + description="Print entered bash commands from all running shells", + formatter_class=argparse.RawDescriptionHelpFormatter) +parser.add_argument("-s", "--shared", nargs="?", + const="/lib/libreadline.so", type=str, + help="specify the location of libreadline.so library.\ + Default is /lib/libreadline.so") +args = parser.parse_args() + +name = args.shared if args.shared else "/bin/bash" # load BPF program bpf_text = """ @@ -41,7 +58,7 @@ """ b = BPF(text=bpf_text) -b.attach_uretprobe(name="/bin/bash", sym="readline", fn_name="printret") +b.attach_uretprobe(name=name, sym="readline", fn_name="printret") # header print("%-9s %-6s %s" % ("TIME", "PID", "COMMAND")) diff --git a/tools/bashreadline_example.txt b/tools/bashreadline_example.txt index 861a89bf1..f8543b712 100644 --- a/tools/bashreadline_example.txt +++ b/tools/bashreadline_example.txt @@ -14,6 +14,16 @@ TIME PID COMMAND 05:29:04 3059 echo another shell 05:29:13 21176 echo first shell again +When running the script on Arch Linux, you may need to specify the location +of libreadline.so library: + +# ./bashreadline -s 
/lib/libreadline.so +TIME PID COMMAND +11:17:34 28796 whoami +11:17:41 28796 ps -ef +11:17:51 28796 echo "Hello eBPF!" + + The entered command may fail. This is just showing what command lines were entered interactively for bash to process. From cd9334c7ba2ef95acfbf25c8ff05c5c676a980ab Mon Sep 17 00:00:00 2001 From: yonghong-song Date: Mon, 18 Feb 2019 09:08:28 -0800 Subject: [PATCH 063/135] fix a bug related to syscall.py (#2217) Commit 218f7482f8ae refactored syscall number=>name mapping into a separate file src/python/bcc/syscall.py. The commit added a reference in python __init__.py, and this will cause virtually all non x64 arch python tools failure. Commit bc0d472ec9d7 attempted to fix the issue by removing the failure in syscall.py but this is not the correct fix for arm64 as the syscall numbers won't match. Removing the syscall.py reference in __init__.py should be enough to restore the previous working behavior. Fixes: bc0d472ec9d7 ("Fix BCC on arm64 by allowing missing ausyscall") Fixes: 218f7482f8ae ("Wcohen/efficiency (#2063)") Signed-off-by: Yonghong Song --- src/python/bcc/__init__.py | 1 - src/python/bcc/syscall.py | 5 ++++- tools/lib/ucalls.py | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/python/bcc/__init__.py b/src/python/bcc/__init__.py index 2c45d8c6d..7a48438da 100644 --- a/src/python/bcc/__init__.py +++ b/src/python/bcc/__init__.py @@ -27,7 +27,6 @@ from .libbcc import lib, bcc_symbol, bcc_symbol_option, bcc_stacktrace_build_id, _SYM_CB_TYPE from .table import Table, PerfEventArray from .perf import Perf -from .syscall import syscall_name from .utils import get_online_cpus, printb, _assert_is_bytes, ArgString from .version import __version__ diff --git a/src/python/bcc/syscall.py b/src/python/bcc/syscall.py index 0ba282a53..1346b4e81 100644 --- a/src/python/bcc/syscall.py +++ b/src/python/bcc/syscall.py @@ -381,7 +381,10 @@ def _parse_syscall(line): out = out.split(b'\n',1)[1] syscalls = dict(map(_parse_syscall, 
out.strip().split(b'\n'))) except Exception as e: - pass + if platform.machine() == "x86_64": + pass + else: + raise Exception("ausyscall: command not found") def syscall_name(syscall_num): """Return the syscall name for the particular syscall number.""" diff --git a/tools/lib/ucalls.py b/tools/lib/ucalls.py index 352e4d70b..d072af09c 100755 --- a/tools/lib/ucalls.py +++ b/tools/lib/ucalls.py @@ -15,7 +15,8 @@ from __future__ import print_function import argparse from time import sleep -from bcc import BPF, USDT, utils, syscall_name +from bcc import BPF, USDT, utils +from bcc.syscall import syscall_name languages = ["java", "perl", "php", "python", "ruby", "tcl"] From 1d6d45cd2604af307c5f0e502df57fe66fd869d5 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Thu, 21 Feb 2019 19:07:56 +0100 Subject: [PATCH 064/135] Expose kprobe's maxactive parameter in bcc (#2224) When a kretprobe is installed on a kernel function, there is a limit on how many parallel calls it can catch. You can change that limit with the maxactive parameter. Since commit 696ced4 ("tracing/kprobes: expose maxactive for kretprobe in kprobe_events") in the Linux kernel, debugfs exposes that parameter. This commit exposes that parameter in bcc as well, through bpf_attach_kprobe in libbcc and BPF.attach_kretprobe at the Python level. If a maxactive value is given, we fallback to using debugfs to install the kprobe event, instead of the usual perf_event_open. For kernels without maxactive supports in debugfs, the error resolution is not straightforward. The kernel will still install a probe, but under a different name. We therefore need to check for our expected name, delete the probe with the different name, and try to attach the probe again, without the maxactive value. 
Signed-off-by: Paul Chaignon --- docs/reference_guide.md | 4 +- src/cc/api/BPF.cc | 6 +- src/cc/api/BPF.h | 3 +- src/cc/libbpf.c | 115 +++++++++++++++++++-------- src/cc/libbpf.h | 3 +- src/lua/bcc/bpf.lua | 3 +- src/lua/bcc/libbcc.lua | 2 +- src/python/bcc/__init__.py | 9 ++- src/python/bcc/libbcc.py | 2 +- tests/python/CMakeLists.txt | 2 + tests/python/test_trace_maxactive.py | 37 +++++++++ 11 files changed, 143 insertions(+), 43 deletions(-) create mode 100755 tests/python/test_trace_maxactive.py diff --git a/docs/reference_guide.md b/docs/reference_guide.md index d735e8149..4b37a247c 100644 --- a/docs/reference_guide.md +++ b/docs/reference_guide.md @@ -987,7 +987,7 @@ Examples in situ: ### 2. attach_kretprobe() -Syntax: ```BPF.attach_kretprobe(event="event", fn_name="name")``` +Syntax: ```BPF.attach_kretprobe(event="event", fn_name="name" [, maxactive=int])``` Instruments the return of the kernel function ```event()``` using kernel dynamic tracing of the function return, and attaches our C defined function ```name()``` to be called when the kernel function returns. @@ -1001,6 +1001,8 @@ This will instrument the kernel ```vfs_read()``` function, which will then run o You can call attach_kretprobe() more than once, and attach your BPF function to multiple kernel function returns. +When a kretprobe is installed on a kernel function, there is a limit on how many parallel calls it can catch. You can change that limit with ```maxactive```. See the kprobes documentation for its default value. + See the previous kretprobes section for how to instrument the return value from BPF. 
Examples in situ: diff --git a/src/cc/api/BPF.cc b/src/cc/api/BPF.cc index 17ac9b384..473784c0a 100644 --- a/src/cc/api/BPF.cc +++ b/src/cc/api/BPF.cc @@ -167,7 +167,8 @@ StatusTuple BPF::detach_all() { StatusTuple BPF::attach_kprobe(const std::string& kernel_func, const std::string& probe_func, uint64_t kernel_func_offset, - bpf_probe_attach_type attach_type) { + bpf_probe_attach_type attach_type, + int maxactive) { std::string probe_event = get_kprobe_event(kernel_func, attach_type); if (kprobes_.find(probe_event) != kprobes_.end()) return StatusTuple(-1, "kprobe %s already attached", probe_event.c_str()); @@ -176,7 +177,8 @@ StatusTuple BPF::attach_kprobe(const std::string& kernel_func, TRY2(load_func(probe_func, BPF_PROG_TYPE_KPROBE, probe_fd)); int res_fd = bpf_attach_kprobe(probe_fd, attach_type, probe_event.c_str(), - kernel_func.c_str(), kernel_func_offset); + kernel_func.c_str(), kernel_func_offset, + maxactive); if (res_fd < 0) { TRY2(unload_func(probe_func)); diff --git a/src/cc/api/BPF.h b/src/cc/api/BPF.h index 38d54b299..4b260887e 100644 --- a/src/cc/api/BPF.h +++ b/src/cc/api/BPF.h @@ -64,7 +64,8 @@ class BPF { StatusTuple attach_kprobe(const std::string& kernel_func, const std::string& probe_func, uint64_t kernel_func_offset = 0, - bpf_probe_attach_type = BPF_PROBE_ENTRY); + bpf_probe_attach_type = BPF_PROBE_ENTRY, + int maxactive = 0); StatusTuple detach_kprobe( const std::string& kernel_func, bpf_probe_attach_type attach_type = BPF_PROBE_ENTRY); diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c index 18506d311..9b0024dc4 100644 --- a/src/cc/libbpf.c +++ b/src/cc/libbpf.c @@ -856,47 +856,100 @@ static int bpf_attach_tracing_event(int progfd, const char *event_path, int pid, return 0; } +static int create_kprobe_event(char *buf, const char *ev_name, + enum bpf_probe_attach_type attach_type, + const char *fn_name, uint64_t fn_offset, + int maxactive) +{ + int kfd; + char ev_alias[128]; + static unsigned int buf_size = 256; + + kfd = 
open("/sys/kernel/debug/tracing/kprobe_events", O_WRONLY | O_APPEND, 0); + if (kfd < 0) { + fprintf(stderr, "open(/sys/kernel/debug/tracing/kprobe_events): %s\n", buf, + strerror(errno)); + return -1; + } + + snprintf(ev_alias, sizeof(ev_alias), "%s_bcc_%d", ev_name, getpid()); + + if (fn_offset > 0 && attach_type == BPF_PROBE_ENTRY) + snprintf(buf, buf_size, "p:kprobes/%s %s+%"PRIu64, + ev_alias, fn_name, fn_offset); + else if (maxactive > 0 && attach_type == BPF_PROBE_RETURN) + snprintf(buf, buf_size, "r%d:kprobes/%s %s", + maxactive, ev_alias, fn_name); + else + snprintf(buf, buf_size, "%c:kprobes/%s %s", + attach_type == BPF_PROBE_ENTRY ? 'p' : 'r', + ev_alias, fn_name); + + if (write(kfd, buf, strlen(buf)) < 0) { + if (errno == ENOENT) + fprintf(stderr, "cannot attach kprobe, probe entry may not exist\n"); + else + fprintf(stderr, "cannot attach kprobe, %s\n", strerror(errno)); + close(kfd); + return -1; + } + close(kfd); + snprintf(buf, buf_size, "/sys/kernel/debug/tracing/events/kprobes/%s", + ev_alias); + return 0; +} + int bpf_attach_kprobe(int progfd, enum bpf_probe_attach_type attach_type, - const char *ev_name, const char *fn_name, uint64_t fn_offset) + const char *ev_name, const char *fn_name, uint64_t fn_offset, + int maxactive) { int kfd, pfd = -1; - char buf[256]; - char event_alias[128]; - static char *event_type = "kprobe"; + char buf[256], fname[256]; + + if (maxactive <= 0) + // Try create the kprobe Perf Event with perf_event_open API. + pfd = bpf_try_perf_event_open_with_probe(fn_name, fn_offset, -1, "kprobe", + attach_type != BPF_PROBE_ENTRY); - // Try create the kprobe Perf Event with perf_event_open API. - pfd = bpf_try_perf_event_open_with_probe(fn_name, fn_offset, -1, event_type, - attach_type != BPF_PROBE_ENTRY); // If failed, most likely Kernel doesn't support the new perf_event_open API // yet. Try create the event using debugfs. 
if (pfd < 0) { - snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", event_type); - kfd = open(buf, O_WRONLY | O_APPEND, 0); - if (kfd < 0) { - fprintf(stderr, "open(%s): %s\n", buf, strerror(errno)); + if (create_kprobe_event(buf, ev_name, attach_type, fn_name, fn_offset, + maxactive) < 0) goto error; - } - - snprintf(event_alias, sizeof(event_alias), "%s_bcc_%d", ev_name, getpid()); - if (fn_offset > 0 && attach_type == BPF_PROBE_ENTRY) - snprintf(buf, sizeof(buf), "p:%ss/%s %s+%"PRIu64, - event_type, event_alias, fn_name, fn_offset); - else - snprintf(buf, sizeof(buf), "%c:%ss/%s %s", - attach_type == BPF_PROBE_ENTRY ? 'p' : 'r', - event_type, event_alias, fn_name); - - if (write(kfd, buf, strlen(buf)) < 0) { - if (errno == ENOENT) - fprintf(stderr, "cannot attach kprobe, probe entry may not exist\n"); - else - fprintf(stderr, "cannot attach kprobe, %s\n", strerror(errno)); - close(kfd); - goto error; + // If we're using maxactive, we need to check that the event was created + // under the expected name. If debugfs doesn't support maxactive yet + // (kernel < 4.12), the event is created under a different name; we need to + // delete that event and start again without maxactive. + if (maxactive > 0 && attach_type == BPF_PROBE_RETURN) { + snprintf(fname, sizeof(fname), "%s/id", buf); + if (access(fname, F_OK) == -1) { + // Deleting kprobe event with incorrect name. + kfd = open("/sys/kernel/debug/tracing/kprobe_events", + O_WRONLY | O_APPEND, 0); + if (kfd < 0) { + fprintf(stderr, "open(/sys/kernel/debug/tracing/kprobe_events): %s\n", + strerror(errno)); + return -1; + } + snprintf(fname, sizeof(fname), "-:kprobes/%s_0", ev_name); + if (write(kfd, fname, strlen(fname)) < 0) { + if (errno == ENOENT) + fprintf(stderr, "cannot detach kprobe, probe entry may not exist\n"); + else + fprintf(stderr, "cannot detach kprobe, %s\n", strerror(errno)); + close(kfd); + goto error; + } + close(kfd); + + // Re-creating kprobe event without maxactive. 
+ if (create_kprobe_event(buf, ev_name, attach_type, fn_name, + fn_offset, 0) < 0) + goto error; + } } - close(kfd); - snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%ss/%s", event_type, event_alias); } // If perf_event_open succeeded, bpf_attach_tracing_event will use the created // Perf Event FD directly and buf would be empty and unused. diff --git a/src/cc/libbpf.h b/src/cc/libbpf.h index e4c1b77ec..024db2186 100644 --- a/src/cc/libbpf.h +++ b/src/cc/libbpf.h @@ -79,7 +79,8 @@ typedef void (*perf_reader_raw_cb)(void *cb_cookie, void *raw, int raw_size); typedef void (*perf_reader_lost_cb)(void *cb_cookie, uint64_t lost); int bpf_attach_kprobe(int progfd, enum bpf_probe_attach_type attach_type, - const char *ev_name, const char *fn_name, uint64_t fn_offset); + const char *ev_name, const char *fn_name, uint64_t fn_offset, + int maxactive); int bpf_detach_kprobe(const char *ev_name); int bpf_attach_uprobe(int progfd, enum bpf_probe_attach_type attach_type, diff --git a/src/lua/bcc/bpf.lua b/src/lua/bcc/bpf.lua index 1fe862a20..123590079 100644 --- a/src/lua/bcc/bpf.lua +++ b/src/lua/bcc/bpf.lua @@ -213,8 +213,9 @@ function Bpf:attach_kprobe(args) local ev_name = string.format("%s_%s", ptype, event:gsub("[%+%.]", "_")) local offset = args.fn_offset or 0 local retprobe = args.retprobe and 1 or 0 + local maxactive = args.maxactive or 0 - local res = libbcc.bpf_attach_kprobe(fn.fd, retprobe, ev_name, event, offset) + local res = libbcc.bpf_attach_kprobe(fn.fd, retprobe, ev_name, event, offset, maxactive) assert(res >= 0, "failed to attach BPF to kprobe") self:probe_store("kprobe", ev_name, res) diff --git a/src/lua/bcc/libbcc.lua b/src/lua/bcc/libbcc.lua index f7a1b7aef..4b7fee585 100644 --- a/src/lua/bcc/libbcc.lua +++ b/src/lua/bcc/libbcc.lua @@ -43,7 +43,7 @@ typedef void (*perf_reader_raw_cb)(void *cb_cookie, void *raw, int raw_size); typedef void (*perf_reader_lost_cb)(void *cb_cookie, uint64_t lost); int bpf_attach_kprobe(int progfd, int 
attach_type, const char *ev_name, - const char *fn_name, uint64_t fn_offset); + const char *fn_name, uint64_t fn_offset, int maxactive); int bpf_detach_kprobe(const char *ev_name); diff --git a/src/python/bcc/__init__.py b/src/python/bcc/__init__.py index 7a48438da..c9e1c14ad 100644 --- a/src/python/bcc/__init__.py +++ b/src/python/bcc/__init__.py @@ -629,14 +629,14 @@ def attach_kprobe(self, event=b"", event_off=0, fn_name=b"", event_re=b""): self._check_probe_quota(1) fn = self.load_func(fn_name, BPF.KPROBE) ev_name = b"p_" + event.replace(b"+", b"_").replace(b".", b"_") - fd = lib.bpf_attach_kprobe(fn.fd, 0, ev_name, event, event_off) + fd = lib.bpf_attach_kprobe(fn.fd, 0, ev_name, event, event_off, 0) if fd < 0: raise Exception("Failed to attach BPF program %s to kprobe %s" % (fn_name, event)) self._add_kprobe_fd(ev_name, fd) return self - def attach_kretprobe(self, event=b"", fn_name=b"", event_re=b""): + def attach_kretprobe(self, event=b"", fn_name=b"", event_re=b"", maxactive=0): event = _assert_is_bytes(event) fn_name = _assert_is_bytes(fn_name) event_re = _assert_is_bytes(event_re) @@ -645,7 +645,8 @@ def attach_kretprobe(self, event=b"", fn_name=b"", event_re=b""): if event_re: for line in BPF.get_kprobe_functions(event_re): try: - self.attach_kretprobe(event=line, fn_name=fn_name) + self.attach_kretprobe(event=line, fn_name=fn_name, + maxactive=maxactive) except: pass return @@ -653,7 +654,7 @@ def attach_kretprobe(self, event=b"", fn_name=b"", event_re=b""): self._check_probe_quota(1) fn = self.load_func(fn_name, BPF.KPROBE) ev_name = b"r_" + event.replace(b"+", b"_").replace(b".", b"_") - fd = lib.bpf_attach_kprobe(fn.fd, 1, ev_name, event, 0) + fd = lib.bpf_attach_kprobe(fn.fd, 1, ev_name, event, 0, maxactive) if fd < 0: raise Exception("Failed to attach BPF program %s to kretprobe %s" % (fn_name, event)) diff --git a/src/python/bcc/libbcc.py b/src/python/bcc/libbcc.py index 48c6eb602..e98bb1401 100644 --- a/src/python/bcc/libbcc.py +++ 
b/src/python/bcc/libbcc.py @@ -93,7 +93,7 @@ _LOST_CB_TYPE = ct.CFUNCTYPE(None, ct.py_object, ct.c_ulonglong) lib.bpf_attach_kprobe.restype = ct.c_int lib.bpf_attach_kprobe.argtypes = [ct.c_int, ct.c_int, ct.c_char_p, ct.c_char_p, - ct.c_ulonglong] + ct.c_ulonglong, ct.c_int] lib.bpf_detach_kprobe.restype = ct.c_int lib.bpf_detach_kprobe.argtypes = [ct.c_char_p] lib.bpf_attach_uprobe.restype = ct.c_int diff --git a/tests/python/CMakeLists.txt b/tests/python/CMakeLists.txt index a16f76d32..3862e505d 100644 --- a/tests/python/CMakeLists.txt +++ b/tests/python/CMakeLists.txt @@ -37,6 +37,8 @@ add_test(NAME py_test_trace3_c WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND ${TEST_WRAPPER} py_trace3_c sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_trace3.py test_trace3.c) add_test(NAME py_test_trace4 WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND ${TEST_WRAPPER} py_trace4 sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_trace4.py) +add_test(NAME py_test_trace_maxactive WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + COMMAND ${TEST_WRAPPER} py_trace_maxactive sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_trace_maxactive.py) add_test(NAME py_test_probe_count WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND ${TEST_WRAPPER} py_probe_count sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_probe_count.py) add_test(NAME py_test_debuginfo WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} diff --git a/tests/python/test_trace_maxactive.py b/tests/python/test_trace_maxactive.py new file mode 100755 index 000000000..b0d4d68eb --- /dev/null +++ b/tests/python/test_trace_maxactive.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python +# Copyright (c) PLUMgrid, Inc. 
+# Licensed under the Apache License, Version 2.0 (the "License") + +from bcc import BPF +import os +import sys +from unittest import main, TestCase + +class TestKprobeMaxactive(TestCase): + def setUp(self): + self.b = BPF(text=b""" + typedef struct { int idx; } Key; + typedef struct { u64 val; } Val; + BPF_HASH(stats, Key, Val, 3); + int hello(void *ctx) { + stats.lookup_or_init(&(Key){1}, &(Val){0})->val++; + return 0; + } + int goodbye(void *ctx) { + stats.lookup_or_init(&(Key){2}, &(Val){0})->val++; + return 0; + } + """) + self.b.attach_kprobe(event_re=self.b.get_syscall_prefix() + b"bpf", + fn_name=b"hello") + self.b.attach_kretprobe(event_re=self.b.get_syscall_prefix() + b"bpf", + fn_name=b"goodbye", maxactive=128) + + def test_send1(self): + k1 = self.b[b"stats"].Key(1) + k2 = self.b[b"stats"].Key(2) + self.assertTrue(self.b[b"stats"][k1].val >= 2) + self.assertTrue(self.b[b"stats"][k2].val == 1) + +if __name__ == "__main__": + main() From 549432857f78a21932846a50abf2c01c4d984483 Mon Sep 17 00:00:00 2001 From: Oriol Arcas Date: Fri, 22 Feb 2019 18:15:51 +0100 Subject: [PATCH 065/135] Python BPF disassembler and map layout parser (#2209) * Python BPF disassembler and map layout parser Debugging eBPF programs can be tricky. The clang debug flags are not supported in all the code-loading branches yet - e.g., only load_prog() supports BPF_DEBUG or DEBUG_BPF_REGISTER_STATE, but compiling a kprobe with BPF(...) doesn't. This built-in disassembler can disassemble and print the BPF code in a similar syntax than the kernel, whenever and the number of times the user needs it. The BPF ISA is relatively stable so it doesn't require much maintenance. In addition, this parser is agnostic from the original source language (C, B, Go, etc.), and doesn't depend on a particular compiler. 
Example output for trace_pid_start() in biotop: Disassemble of BPF program trace_pid_start: 0: (79) r1 = *(u64*)(r1 +112) 1: (7b) *(u64*)(r10 -8) = r1 2: (b7) r1 = 0 3: (63) *(u32*)(r10 -16) = r1 4: (7b) *(u64*)(r10 -24) = r1 5: (7b) *(u64*)(r10 -32) = r1 6: (bf) r1 = r10 7: (07) r1 += -28 8: (b7) r2 = 16 9: (85) call bpf_get_current_comm#16 10: (67) r0 <<= 32 11: (77) r0 >>= 32 12: (55) if r0 != 0 goto +10 <23> 13: (85) call bpf_get_current_pid_tgid#14 14: (63) *(u32*)(r10 -32) = r0 15: (18) r1 = 17: (64-bit upper word) 17: (bf) r2 = r10 18: (07) r2 += -8 19: (bf) r3 = r10 20: (07) r3 += -32 21: (b7) r4 = 0 22: (85) call bpf_map_update_elem#2 23: (b7) r0 = 0 24: (95) exit The fields, types and memory layouts of maps can also be printed, which is something that can be really helpful when dealing with unaligned accesses or packed vs unpacked structures, and currently not supported by clang. For a map with key: struct {int a; short b; struct {int c:4; int d:8;};}); and value u64 the example output is: Layout of BPF type HASH map test_map (ID 0): struct { [0 +4] int a; [4 +2] short b; [6 +2] char[2] __pad_2; [8 +4] struct { int c:4; int d:8; } __anon0; } key; unsigned long long value; The [X +Y] is optional and denotes the offset and the size of each field. Note that bit-fields and padding fields are shown. 
Signed-off-by: Oriol Arcas --- src/python/bcc/__init__.py | 10 + src/python/bcc/disassembler.py | 455 ++++++++++++++++++++++++++++++ src/python/bcc/table.py | 26 ++ tests/python/CMakeLists.txt | 2 + tests/python/test_disassembler.py | 179 ++++++++++++ 5 files changed, 672 insertions(+) create mode 100644 src/python/bcc/disassembler.py create mode 100755 tests/python/test_disassembler.py diff --git a/src/python/bcc/__init__.py b/src/python/bcc/__init__.py index c9e1c14ad..64070c092 100644 --- a/src/python/bcc/__init__.py +++ b/src/python/bcc/__init__.py @@ -29,6 +29,7 @@ from .perf import Perf from .utils import get_online_cpus, printb, _assert_is_bytes, ArgString from .version import __version__ +from .disassembler import disassemble_prog, decode_map _probe_limit = 1000 _num_open_probes = 0 @@ -399,6 +400,15 @@ def dump_func(self, func_name): size, = lib.bpf_function_size(self.module, func_name), return ct.string_at(start, size) + def disassemble_func(self, func_name): + bpfstr = self.dump_func(func_name) + return disassemble_prog(func_name, bpfstr) + + def decode_table(self, table_name, sizeinfo=False): + table_obj = self[table_name] + table_type = lib.bpf_table_type_id(self.module, table_obj.map_id) + return decode_map(table_name, table_obj, table_type, sizeinfo=sizeinfo) + str2ctype = { u"_Bool": ct.c_bool, u"char": ct.c_char, diff --git a/src/python/bcc/disassembler.py b/src/python/bcc/disassembler.py new file mode 100644 index 000000000..6e1593d52 --- /dev/null +++ b/src/python/bcc/disassembler.py @@ -0,0 +1,455 @@ +# Copyright 2019 Clevernet +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from os import linesep +import ctypes as ct +from .table import get_table_type_name +from .libbcc import lib + +class OffsetUnion(ct.Union): + _fields_ = [('offsetu', ct.c_uint16), ('offset', ct.c_int16)] + +class ImmUnion(ct.Union): + _fields_ = [('immu', ct.c_uint32), ('imm', ct.c_int32)] + +class BPFInstrFields(ct.Structure): + _pack_ = 1 + _anonymous_ = ('o', 'i') + _fields_ = [('opcode', ct.c_uint8), + ('dst', ct.c_uint8, 4), + ('src', ct.c_uint8, 4), + ('o', OffsetUnion), + ('i', ImmUnion)] + +class BPFInstr(ct.Union): + _pack_ = 1 + _anonymous_ = ('s') + _fields_ = [('s', BPFInstrFields), ('instr', ct.c_uint64)] + +class BPFDecoder(): + BPF_PSEUDO_CALL = 1 + bpf_helpers = ['unspec', + 'map_lookup_elem', + 'map_update_elem', + 'map_delete_elem', + 'probe_read', + 'ktime_get_ns', + 'trace_printk', + 'get_prandom_u32', + 'get_smp_processor_id', + 'skb_store_bytes', + 'l3_csum_replace', + 'l4_csum_replace', + 'tail_call', + 'clone_redirect', + 'get_current_pid_tgid', + 'get_current_uid_gid', + 'get_current_comm', + 'get_cgroup_classid', + 'skb_vlan_push', + 'skb_vlan_pop', + 'skb_get_tunnel_key', + 'skb_set_tunnel_key', + 'perf_event_read', + 'redirect', + 'get_route_realm', + 'perf_event_output', + 'skb_load_bytes', + 'get_stackid', + 'csum_diff', + 'skb_get_tunnel_opt', + 'skb_set_tunnel_opt', + 'skb_change_proto', + 'skb_change_type', + 'skb_under_cgroup', + 'get_hash_recalc', + 'get_current_task', + 'probe_write_user', + 'current_task_under_cgroup', + 'skb_change_tail', + 'skb_pull_data', + 'csum_update', + 'set_hash_invalid', + 
'get_numa_node_id', + 'skb_change_head', + 'xdp_adjust_head', + 'probe_read_str', + 'get_socket_cookie', + 'get_socket_uid', + 'set_hash', + 'setsockopt', + 'skb_adjust_room', + 'redirect_map', + 'sk_redirect_map', + 'sock_map_update', + 'xdp_adjust_meta', + 'perf_event_read_value', + 'perf_prog_read_value', + 'getsockopt', + 'override_return', + 'sock_ops_cb_flags_set', + 'msg_redirect_map', + 'msg_apply_bytes', + 'msg_cork_bytes', + 'msg_pull_data', + 'bind', + 'xdp_adjust_tail', + 'skb_get_xfrm_state', + 'get_stack', + 'skb_load_bytes_relative', + 'fib_lookup', + 'sock_hash_update', + 'msg_redirect_hash', + 'sk_redirect_hash', + 'lwt_push_encap', + 'lwt_seg6_store_bytes', + 'lwt_seg6_adjust_srh', + 'lwt_seg6_action', + 'rc_repeat', + 'rc_keydown', + 'skb_cgroup_id', + 'get_current_cgroup_id', + 'get_local_storage', + 'sk_select_reuseport', + 'skb_ancestor_cgroup_id', + 'sk_lookup_tcp', + 'sk_lookup_udp', + 'sk_release', + 'map_push_elem', + 'map_pop_elem', + 'map_peek_elem', + 'msg_push_data', + 'msg_pop_data', + 'rc_pointer_rel'] + + opcodes = {0x04: ('add32', 'dstimm', '+=', 32), + 0x05: ('ja', 'joff', None, 64), + 0x07: ('add', 'dstimm', '+=', 64), + 0x0c: ('add32', 'dstsrc', '+=', 32), + 0x0f: ('add', 'dstsrc', '+=', 64), + 0x14: ('sub32', 'dstimm', '-=', 32), + 0x15: ('jeq', 'jdstimmoff', '==', 64), + 0x17: ('sub', 'dstimm', '-=', 64), + 0x18: ('lddw', 'lddw', None, 64), + 0x1c: ('sub32', 'dstsrc', '-=', 32), + 0x1d: ('jeq', 'jdstsrcoff', '==', 64), + 0x1f: ('sub', 'dstsrc', '-=', 64), + 0x20: ('ldabsw', 'ldabs', None, 32), + 0x24: ('mul32', 'dstimm', '*=', 32), + 0x25: ('jgt', 'jdstimmoff', '>', 64), + 0x27: ('mul', 'dstimm', '*=', 64), + 0x28: ('ldabsh', 'ldabs', None, 16), + 0x2c: ('mul32', 'dstsrc', '*=', 32), + 0x2d: ('jgt', 'jdstsrcoff', '>', 64), + 0x2f: ('mul', 'dstsrc', '*=', 64), + 0x30: ('ldabsb', 'ldabs', None, 8), + 0x34: ('div32', 'dstimm', '/=', 32), + 0x35: ('jge', 'jdstimmoff', '>=', 64), + 0x37: ('div', 'dstimm', '/=', 64), + 0x38: 
('ldabsdw', 'ldabs', None, 64), + 0x3c: ('div32', 'dstsrc', '/=', 32), + 0x3d: ('jge', 'jdstsrcoff', '>=', 64), + 0x3f: ('div', 'dstsrc', '/=', 64), + 0x40: ('ldindw', 'ldind', None, 32), + 0x44: ('or32', 'dstimm_bw', '|=', 32), + 0x45: ('jset', 'jdstimmoff', '&', 64), + 0x47: ('or', 'dstimm_bw', '|=', 64), + 0x48: ('ldindh', 'ldind', None, 16), + 0x4c: ('or32', 'dstsrc', '|=', 32), + 0x4d: ('jset', 'jdstsrcoff', '&', 64), + 0x4f: ('or', 'dstsrc', '|=', 64), + 0x50: ('ldindb', 'ldind', None, 8), + 0x54: ('and32', 'dstimm_bw', '&=', 32), + 0x55: ('jne', 'jdstimmoff', '!=', 64), + 0x57: ('and', 'dstimm_bw', '&=', 64), + 0x58: ('ldinddw', 'ldind', None, 64), + 0x5c: ('and32', 'dstsrc', '&=', 32), + 0x5d: ('jne', 'jdstsrcoff', '!=', 64), + 0x5f: ('and', 'dstsrc', '&=', 64), + 0x61: ('ldxw', 'ldstsrcoff', None, 32), + 0x62: ('stw', 'sdstoffimm', None, 32), + 0x63: ('stxw', 'sdstoffsrc', None, 32), + 0x64: ('lsh32', 'dstimm', '<<=', 32), + 0x65: ('jsgt', 'jdstimmoff', 's>', 64), + 0x67: ('lsh', 'dstimm', '<<=', 64), + 0x69: ('ldxh', 'ldstsrcoff', None, 16), + 0x6a: ('sth', 'sdstoffimm', None, 16), + 0x6b: ('stxh', 'sdstoffsrc', None, 16), + 0x6c: ('lsh32', 'dstsrc', '<<=', 32), + 0x6d: ('jsgt', 'jdstsrcoff', 's>', 64), + 0x6f: ('lsh', 'dstsrc', '<<=', 64), + 0x71: ('ldxb', 'ldstsrcoff', None, 8), + 0x72: ('stb', 'sdstoffimm', None, 8), + 0x73: ('stxb', 'sdstoffsrc', None, 8), + 0x74: ('rsh32', 'dstimm', '>>=', 32), + 0x75: ('jsge', 'jdstimmoff', 's>=', 64), + 0x77: ('rsh', 'dstimm', '>>=', 64), + 0x79: ('ldxdw', 'ldstsrcoff', None, 64), + 0x7a: ('stdw', 'sdstoffimm', None, 64), + 0x7b: ('stxdw', 'sdstoffsrc', None, 64), + 0x7c: ('rsh32', 'dstsrc', '>>=', 32), + 0x7d: ('jsge', 'jdstsrcoff', 's>=', 64), + 0x7f: ('rsh', 'dstsrc', '>>=', 64), + 0x84: ('neg32', 'dst', '~', 32), + 0x85: ('call', 'call', None, 64), + 0x87: ('neg', 'dst', '~', 64), + 0x94: ('mod32', 'dstimm', '%=', 32), + 0x95: ('exit', 'exit', None, 64), + 0x97: ('mod', 'dstimm', '%=', 64), + 0x9c: ('mod32', 
'dstsrc', '%=', 32), + 0x9f: ('mod', 'dstsrc', '%=', 64), + 0xa4: ('xor32', 'dstimm_bw', '^=', 32), + 0xa5: ('jlt', 'jdstimmoff', '<', 64), + 0xa7: ('xor', 'dstimm_bw', '^=', 64), + 0xac: ('xor32', 'dstsrc', '^=', 32), + 0xad: ('jlt', 'jdstsrcoff', '<', 64), + 0xaf: ('xor', 'dstsrc', '^=', 64), + 0xb4: ('mov32', 'dstimm', '=', 32), + 0xb5: ('jle', 'jdstimmoff', '<=', 64), + 0xb7: ('mov', 'dstimm', '=', 64), + 0xbc: ('mov32', 'dstsrc', '=', 32), + 0xbd: ('jle', 'jdstsrcoff', '<=', 64), + 0xbf: ('mov', 'dstsrc', '=', 64), + 0xc4: ('arsh32', 'dstimm', 's>>=', 32), + 0xc5: ('jslt', 'jdstimmoff', 's<', 64), + 0xc7: ('arsh', 'dstimm', 's>>=', 64), + 0xcc: ('arsh32', 'dstsrc', 's>>=', 32), + 0xcd: ('jslt', 'jdstsrcoff', 's<', 64), + 0xcf: ('arsh', 'dstsrc', 's>>=', 64), + 0xd5: ('jsle', 'jdstimmoff', 's<=', 64), + 0xdc: ('endian32', 'dstsrc', 'endian', 32), + 0xdd: ('jsle', 'jdstimmoff', 's<=', 64),} + + @classmethod + def decode(cls, i, w, w1): + try: + name, opclass, op, bits = cls.opcodes[w.opcode] + if opclass == 'dstimm': + return 'r%d %s %d' % (w.dst, op, w.imm), 0 + + elif opclass == 'dstimm_bw': + return 'r%d %s 0x%x' % (w.dst, op, w.immu), 0 + + elif opclass == 'joff': + return 'goto %s <%d>' % ('%+d' % (w.offset), + i + w.offset + 1), 0 + + elif opclass == 'dstsrc': + return 'r%d %s r%d' % (w.dst, op, w.src), 0 + + elif opclass == 'jdstimmoff': + return 'if r%d %s %d goto pc%s <%d>' % (w.dst, op, w.imm, + '%+d' % (w.offset), + i + w.offset + 1), 0 + + elif opclass == 'jdstsrcoff': + return 'if r%d %s r%d goto pc%s <%d>' % (w.dst, op, w.src, + '%+d' % (w.offset), + i + w.offset + 1), 0 + + elif opclass == 'lddw': + # imm contains the file descriptor (FD) of the map being loaded; + # the kernel will translate this into the proper address + if w1 is None: + raise Exception("lddw requires two instructions to be disassembled") + if w1.imm == 0: + return 'r%d = ' % (w.dst, w.imm), 1 + imm = (w1.imm << 32) | w.imm + return 'r%d = 0x%x' % (w.dst, imm), 1 + + elif 
opclass == 'ldabs': + return 'r0 = *(u%s*)skb[%s]' % (bits, w.imm), 0 + + elif opclass == 'ldind': + return 'r0 = *(u%d*)skb[r%d %s]' % (bits, w.src, + '%+d' % (w.imm)), 0 + + elif opclass == 'ldstsrcoff': + return 'r%d = *(u%d*)(r%d %s)' % (w.dst, bits, w.src, + '%+d' % (w.offset)), 0 + + elif opclass == 'sdstoffimm': + return '*(u%d*)(r%d %s) = %d' % (bits, w.dst, + '%+d' % (w.offset), w.imm), 0 + + elif opclass == 'sdstoffsrc': + return '*(u%d*)(r%d %s) = r%d' % (bits, w.dst, + '%+d' % (w.offset), w.src), 0 + + elif opclass == 'dst': + return 'r%d = %s (u%s)r%d' % (w.dst, op, bits, w.dst), 0 + + elif opclass == 'call': + if w.src != cls.BPF_PSEUDO_CALL: + try: + return '%s bpf_%s#%d' % (name, cls.bpf_helpers[w.immu], w.immu), 0 + except IndexError: + return '%s ' % (op, w.immu), 0 + return '%s %s' % (name, '%+d' % (w.imm)), 0 + elif opclass == 'exit': + return name, 0 + else: + raise Exception('unknown opcode class') + + except KeyError: + return 'unknown <0x%x>' % (w.opcode) + +def disassemble_instruction(i, w0, w1=None): + instr, skip = BPFDecoder.decode(i, w0, w1) + return "%4d: (%02x) %s" % (i, w0.opcode, instr), skip + +def disassemble_str(bpfstr): + ptr = ct.cast(ct.c_char_p(bpfstr), ct.POINTER(BPFInstr)) + numinstr = int(len(bpfstr) / 8) + w0 = ptr[0] + skip = 0 + instr_list = [] + for i in range(1, numinstr): + w1 = ptr[i] + if skip: + skip -= 1 + instr_str = "%4d: (64-bit upper word)" % (i) + else: + instr_str, skip = disassemble_instruction(i - 1, w0, w1) + instr_list.append(instr_str) + w0 = w1 + instr_str, skip = disassemble_instruction(numinstr - 1, w0, None) + instr_list.append(instr_str) + return instr_list + +def disassemble_prog(func_name, bpfstr): + instr_list = ["Disassemble of BPF program %s:" % (func_name)] + instr_list += disassemble_str(bpfstr) + return linesep.join(instr_list) + +class MapDecoder (): + ctype2str = {ct.c_bool: u"_Bool", + ct.c_char: u"char", + ct.c_wchar: u"wchar_t", + ct.c_ubyte: u"unsigned char", + ct.c_short: u"short", 
+ ct.c_ushort: u"unsigned short", + ct.c_int: u"int", + ct.c_uint: u"unsigned int", + ct.c_long: u"long", + ct.c_ulong: u"unsigned long", + ct.c_longlong: u"long long", + ct.c_ulonglong: u"unsigned long long", + ct.c_float: u"float", + ct.c_double: u"double", + ct.c_longdouble: u"long double", + ct.c_int64 * 2: u"__int128", + ct.c_uint64 * 2: u"unsigned __int128",} + + @classmethod + def get_ct_name(cls, t): + try: + if issubclass(t, ct.Structure): + field_type_name = "struct" + elif issubclass(t, ct.Union): + field_type_name = "union" + elif issubclass(t, ct.Array): + field_type_name = cls.ctype2str[t._type_] + "[" + str(t._length_) + "]" + else: + field_type_name = cls.ctype2str[t] + except KeyError: + field_type_name = str(t) + return field_type_name + + @classmethod + def format_size_info(cls, offset, size, enabled=False, bitoffset=None): + if not enabled: + return "" + if bitoffset is not None: + return "[%d,%d +%d bit]" % (offset, bitoffset, size) + return "[%d +%d] " % (offset, size) + + @classmethod + def print_ct_map(cls, t, indent="", offset=0, sizeinfo=False): + map_lines = [] + try: + for field_name, field_type in t._fields_: + is_structured = (issubclass(field_type, ct.Structure) or + issubclass(field_type, ct.Union)) + field_type_name = cls.get_ct_name(field_type) + field_offset = getattr(t, field_name).offset + field_size = ct.sizeof(field_type) + sizedesc = cls.format_size_info(offset + field_offset, + field_size, sizeinfo) + if is_structured: + map_lines.append("%s%s%s {" % (indent, sizedesc, field_type_name)) + map_lines += cls.print_ct_map(field_type, + indent + " ", + offset + field_offset) + map_lines.append("%s} %s;" % (indent, field_name)) + else: + map_lines.append("%s%s%s %s;" % (indent, sizedesc, + field_type_name, + field_name)) + except ValueError: + # is a bit field + offset_bits = 0 + for field in t._fields_: + if len(field) == 3: + field_name, field_type, field_bits = field + field_type_name = cls.get_ct_name(field_type) + sizedesc = 
cls.format_size_info(offset, offset_bits, + sizeinfo, field_bits) + map_lines.append("%s%s%s %s:%d;" % (indent, sizedesc, + field_type_name, + field_name, + field_bits)) + else: + # end of previous bit field + field_name, field_type = field + field_type_name = cls.get_ct_name(field_type) + field_offset = getattr(t, field_name).offset + field_size = ct.sizeof(field_type) + field_bits = 0 + offset_bits = 0 + sizedesc = cls.format_size_info(offset + field_offset, + field_size, sizeinfo) + map_lines.append("%s%s%s %s;" % (indent, sizedesc, + field_type_name, + field_name)) + offset += field_offset + offset_bits += field_bits + return map_lines + + @classmethod + def print_map_ctype(cls, t, field_name, sizeinfo): + is_structured = (issubclass(t, ct.Structure) or + issubclass(t, ct.Union)) + type_name = cls.get_ct_name(t); + if is_structured: + map_lines = [" %s {" % (type_name)] + map_lines += cls.print_ct_map(t, " ", sizeinfo=sizeinfo) + map_lines.append(" } %s;" % (field_name)) + else: + map_lines = [" %s %s;" % (type_name, field_name)] + return map_lines + + @classmethod + def decode_map(cls, map_name, map_obj, map_type, sizeinfo=False): + map_lines = ['Layout of BPF map %s (type %s, FD %d, ID %d):' % (map_name, + map_type, + map_obj.map_fd, + map_obj.map_id)] + map_lines += cls.print_map_ctype(map_obj.Key, 'key', sizeinfo=sizeinfo) + map_lines += cls.print_map_ctype(map_obj.Leaf, 'value', sizeinfo=sizeinfo) + return linesep.join(map_lines) + +def decode_map(map_name, map_obj, map_type, sizeinfo=False): + map_type_name = get_table_type_name(map_type) + return MapDecoder.decode_map(map_name, map_obj, map_type_name, sizeinfo=sizeinfo) diff --git a/src/python/bcc/table.py b/src/python/bcc/table.py index f2462570a..d33d46eb0 100644 --- a/src/python/bcc/table.py +++ b/src/python/bcc/table.py @@ -45,6 +45,25 @@ BPF_MAP_TYPE_XSKMAP = 17 BPF_MAP_TYPE_SOCKHASH = 18 +map_type_name = {BPF_MAP_TYPE_HASH: "HASH", + BPF_MAP_TYPE_ARRAY: "ARRAY", + BPF_MAP_TYPE_PROG_ARRAY: 
"PROG_ARRAY", + BPF_MAP_TYPE_PERF_EVENT_ARRAY: "PERF_EVENT_ARRAY", + BPF_MAP_TYPE_PERCPU_HASH: "PERCPU_HASH", + BPF_MAP_TYPE_PERCPU_ARRAY: "PERCPU_ARRAY", + BPF_MAP_TYPE_STACK_TRACE: "STACK_TRACE", + BPF_MAP_TYPE_CGROUP_ARRAY: "CGROUP_ARRAY", + BPF_MAP_TYPE_LRU_HASH: "LRU_HASH", + BPF_MAP_TYPE_LRU_PERCPU_HASH: "LRU_PERCPU_HASH", + BPF_MAP_TYPE_LPM_TRIE: "LPM_TRIE", + BPF_MAP_TYPE_ARRAY_OF_MAPS: "ARRAY_OF_MAPS", + BPF_MAP_TYPE_HASH_OF_MAPS: "HASH_OF_MAPS", + BPF_MAP_TYPE_DEVMAP: "DEVMAP", + BPF_MAP_TYPE_SOCKMAP: "SOCKMAP", + BPF_MAP_TYPE_CPUMAP: "CPUMAP", + BPF_MAP_TYPE_XSKMAP: "XSKMAP", + BPF_MAP_TYPE_SOCKHASH: "SOCKHASH",} + stars_max = 40 log2_index_max = 65 linear_index_max = 1025 @@ -123,6 +142,13 @@ def _print_linear_hist(vals, val_type): _stars(val, val_max, stars))) +def get_table_type_name(ttype): + try: + return map_type_name[ttype] + except KeyError: + return "" + + def Table(bpf, map_id, map_fd, keytype, leaftype, name, **kwargs): """Table(bpf, map_id, map_fd, keytype, leaftype, **kwargs) diff --git a/tests/python/CMakeLists.txt b/tests/python/CMakeLists.txt index 3862e505d..f323ac1b3 100644 --- a/tests/python/CMakeLists.txt +++ b/tests/python/CMakeLists.txt @@ -67,6 +67,8 @@ add_test(NAME py_test_percpu WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND ${TEST_WRAPPER} py_test_percpu sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_percpu.py) add_test(NAME py_test_dump_func WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND ${TEST_WRAPPER} py_dump_func simple ${CMAKE_CURRENT_SOURCE_DIR}/test_dump_func.py) +add_test(NAME py_test_disassembler WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + COMMAND ${TEST_WRAPPER} py_test_disassembler sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_disassembler.py) add_test(NAME py_test_tools_smoke WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND ${TEST_WRAPPER} py_test_tools_smoke sudo ${CMAKE_CURRENT_SOURCE_DIR}/test_tools_smoke.py) add_test(NAME py_test_tools_memleak WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} diff --git 
a/tests/python/test_disassembler.py b/tests/python/test_disassembler.py new file mode 100755 index 000000000..89c4ec560 --- /dev/null +++ b/tests/python/test_disassembler.py @@ -0,0 +1,179 @@ +#!/usr/bin/env python +# Copyright (c) Clevernet +# Licensed under the Apache License, Version 2.0 (the "License") + +# test program for the 'disassemble_func' and 'decode_table' methods + +from bcc import BPF +from bcc import disassembler +import ctypes as ct +import random +from unittest import main, TestCase + +class BPFInstr(ct.Structure): + _pack_ = 1 + _fields_ = [('opcode', ct.c_uint8), + ('dst', ct.c_uint8, 4), + ('src', ct.c_uint8, 4), + ('offset', ct.c_int16), + ('imm', ct.c_int32)] + +class TestDisassembler(TestCase): + opcodes = [(0x04, "%dst += %imm"), + (0x05, "goto %off <%jmp>"), + (0x07, "%dst += %imm"), + (0x0c, "%dst += %src"), + (0x0f, "%dst += %src"), + (0x14, "%dst -= %imm"), + (0x15, "if %dst == %imm goto pc%off <%jmp>"), + (0x17, "%dst -= %imm"), + #(0x18, "lddw"), + (0x1c, "%dst -= %src"), + (0x1d, "if %dst == %src goto pc%off <%jmp>"), + (0x1f, "%dst -= %src"), + (0x20, "r0 = *(u32*)skb[%imm]"), + (0x24, "%dst *= %imm"), + (0x25, "if %dst > %imm goto pc%off <%jmp>"), + (0x27, "%dst *= %imm"), + (0x28, "r0 = *(u16*)skb[%imm]"), + (0x2c, "%dst *= %src"), + (0x2d, "if %dst > %src goto pc%off <%jmp>"), + (0x2f, "%dst *= %src"), + (0x30, "r0 = *(u8*)skb[%imm]"), + (0x34, "%dst /= %imm"), + (0x35, "if %dst >= %imm goto pc%off <%jmp>"), + (0x37, "%dst /= %imm"), + (0x38, "r0 = *(u64*)skb[%imm]"), + (0x3c, "%dst /= %src"), + (0x3d, "if %dst >= %src goto pc%off <%jmp>"), + (0x3f, "%dst /= %src"), + (0x40, "r0 = *(u32*)skb[%src %sim]"), + (0x44, "%dst |= %ibw"), + (0x45, "if %dst & %imm goto pc%off <%jmp>"), + (0x47, "%dst |= %ibw"), + (0x48, "r0 = *(u16*)skb[%src %sim]"), + (0x4c, "%dst |= %src"), + (0x4d, "if %dst & %src goto pc%off <%jmp>"), + (0x4f, "%dst |= %src"), + (0x50, "r0 = *(u8*)skb[%src %sim]"), + (0x54, "%dst &= %ibw"), + (0x55, "if %dst != %imm 
goto pc%off <%jmp>"), + (0x57, "%dst &= %ibw"), + (0x58, "r0 = *(u64*)skb[%src %sim]"), + (0x5c, "%dst &= %src"), + (0x5d, "if %dst != %src goto pc%off <%jmp>"), + (0x5f, "%dst &= %src"), + (0x61, "%dst = *(u32*)(%src %off)"), + (0x62, "*(u32*)(%dst %off) = %imm"), + (0x63, "*(u32*)(%dst %off) = %src"), + (0x64, "%dst <<= %imm"), + (0x65, "if %dst s> %imm goto pc%off <%jmp>"), + (0x67, "%dst <<= %imm"), + (0x69, "%dst = *(u16*)(%src %off)"), + (0x6a, "*(u16*)(%dst %off) = %imm"), + (0x6b, "*(u16*)(%dst %off) = %src"), + (0x6c, "%dst <<= %src"), + (0x6d, "if %dst s> %src goto pc%off <%jmp>"), + (0x6f, "%dst <<= %src"), + (0x71, "%dst = *(u8*)(%src %off)"), + (0x72, "*(u8*)(%dst %off) = %imm"), + (0x73, "*(u8*)(%dst %off) = %src"), + (0x74, "%dst >>= %imm"), + (0x75, "if %dst s>= %imm goto pc%off <%jmp>"), + (0x77, "%dst >>= %imm"), + (0x79, "%dst = *(u64*)(%src %off)"), + (0x7a, "*(u64*)(%dst %off) = %imm"), + (0x7b, "*(u64*)(%dst %off) = %src"), + (0x7c, "%dst >>= %src"), + (0x7d, "if %dst s>= %src goto pc%off <%jmp>"), + (0x7f, "%dst >>= %src"), + (0x84, "%dst = ~ (u32)%dst"), + #(0x85, "call"), + (0x87, "%dst = ~ (u64)%dst"), + (0x94, "%dst %= %imm"), + (0x95, "exit"), + (0x97, "%dst %= %imm"), + (0x9c, "%dst %= %src"), + (0x9f, "%dst %= %src"), + (0xa4, "%dst ^= %ibw"), + (0xa5, "if %dst < %imm goto pc%off <%jmp>"), + (0xa7, "%dst ^= %ibw"), + (0xac, "%dst ^= %src"), + (0xad, "if %dst < %src goto pc%off <%jmp>"), + (0xaf, "%dst ^= %src"), + (0xb4, "%dst = %imm"), + (0xb5, "if %dst <= %imm goto pc%off <%jmp>"), + (0xb7, "%dst = %imm"), + (0xbc, "%dst = %src"), + (0xbd, "if %dst <= %src goto pc%off <%jmp>"), + (0xbf, "%dst = %src"), + (0xc4, "%dst s>>= %imm"), + (0xc5, "if %dst s< %imm goto pc%off <%jmp>"), + (0xc7, "%dst s>>= %imm"), + (0xcc, "%dst s>>= %src"), + (0xcd, "if %dst s< %src goto pc%off <%jmp>"), + (0xcf, "%dst s>>= %src"), + (0xd5, "if %dst s<= %imm goto pc%off <%jmp>"), + (0xdc, "%dst endian %src"), + (0xdd, "if %dst s<= %imm goto pc%off <%jmp>"),] 
+ + @classmethod + def build_instr(cls, op): + dst = random.randint(0, 0xf) + src = random.randint(0, 0xf) + offset = random.randint(0, 0xffff) + imm = random.randint(0, 0xffffffff) + return BPFInstr(op, dst, src, offset, imm) + + @classmethod + def format_instr(cls, instr, fmt): + uimm = ct.c_uint32(instr.imm).value + return (fmt.replace("%dst", "r%d" % (instr.dst)) + .replace("%src", "r%d" % (instr.src)) + .replace("%imm", "%d" % (instr.imm)) + .replace("%ibw", "0x%x" % (uimm)) + .replace("%sim", "%+d" % (instr.imm)) + .replace("%off", "%+d" % (instr.offset)) + .replace("%jmp", "%d" % (instr.offset + 1))) + + def test_func(self): + b = BPF(text=""" + struct key_t {int a; short b; struct {int c:4; int d:8;} e;} __attribute__((__packed__)); + BPF_HASH(test_map, struct key_t); + int test_func(void) + { + return 1; + }""") + + self.assertEqual( + """Disassemble of BPF program test_func: + 0: (b7) r0 = 1 + 1: (95) exit""", + b.disassemble_func("test_func")) + + self.assertEqual( + """Layout of BPF map test_map (type HASH, FD 3, ID 0): + struct { + int a; + short b; + struct { + int c:4; + int d:8; + } e; + } key; + unsigned long long value;""", + b.decode_table("test_map")) + + def test_bpf_isa(self): + for op, instr_fmt in self.opcodes: + instr_fmt + if instr_fmt is None: + continue + instr = self.build_instr(op) + instr_str = ct.string_at(ct.addressof(instr), ct.sizeof(instr)) + target_text = self.format_instr(instr, instr_fmt) + self.assertEqual(disassembler.disassemble_str(instr_str)[0], + "%4d: (%02x) %s" % (0, op, target_text)) + +if __name__ == "__main__": + main() From eba6beb48cee66753a7546088b9077f3b3c97119 Mon Sep 17 00:00:00 2001 From: Xiaozhou Liu Date: Sun, 24 Feb 2019 05:40:27 +0800 Subject: [PATCH 066/135] tutorial_bcc_python_developer: new way to get the event data (#2226) Follow-up to #2198. 
--- docs/tutorial_bcc_python_developer.md | 12 ++---------- examples/tracing/hello_perf_output.py | 10 +--------- 2 files changed, 3 insertions(+), 19 deletions(-) diff --git a/docs/tutorial_bcc_python_developer.md b/docs/tutorial_bcc_python_developer.md index 192902eb0..f5d2ff986 100644 --- a/docs/tutorial_bcc_python_developer.md +++ b/docs/tutorial_bcc_python_developer.md @@ -250,7 +250,6 @@ Code is [examples/tracing/hello_perf_output.py](../examples/tracing/hello_perf_o ```Python from bcc import BPF -import ctypes as ct # define BPF program prog = """ @@ -281,13 +280,6 @@ int hello(struct pt_regs *ctx) { b = BPF(text=prog) b.attach_kprobe(event=b.get_syscall_fnname("clone"), fn_name="hello") -# define output data structure in Python -TASK_COMM_LEN = 16 # linux/sched.h -class Data(ct.Structure): - _fields_ = [("pid", ct.c_uint), - ("ts", ct.c_ulonglong), - ("comm", ct.c_char * TASK_COMM_LEN)] - # header print("%-18s %-16s %-6s %s" % ("TIME(s)", "COMM", "PID", "MESSAGE")) @@ -295,7 +287,7 @@ print("%-18s %-16s %-6s %s" % ("TIME(s)", "COMM", "PID", "MESSAGE")) start = 0 def print_event(cpu, data, size): global start - event = ct.cast(data, ct.POINTER(Data)).contents + event = b["events"].event(data) if start == 0: start = event.ts time_s = (float(event.ts - start)) / 1000000000 @@ -316,8 +308,8 @@ Things to learn: 1. ```bpf_get_current_pid_tgid()```: Returns the process ID in the lower 32 bits (kernel's view of the PID, which in user space is usually presented as the thread ID), and the thread group ID in the upper 32 bits (what user space often thinks of as the PID). By directly setting this to a u32, we discard the upper 32 bits. Should you be presenting the PID or the TGID? For a multi-threaded app, the TGID will be the same, so you need the PID to differentiate them, if that's what you want. It's also a question of expectations for the end user. 1. ```bpf_get_current_comm()```: Populates the first argument address with the current process name. 1. 
```events.perf_submit()```: Submit the event for user space to read via a perf ring buffer. -1. ```class Data(ct.Structure)```: Now define the Python version of the C data structure. 1. ```def print_event()```: Define a Python function that will handle reading events from the ```events``` stream. +1. ```b["events"].event(data)```: Now get the event as a Python object. 1. ```b["events"].open_perf_buffer(print_event)```: Associate the Python ```print_event``` function with the ```events``` stream. 1. ```while 1: b.perf_buffer_poll()```: Block waiting for events. diff --git a/examples/tracing/hello_perf_output.py b/examples/tracing/hello_perf_output.py index 64cfb63fc..bcf8b4388 100755 --- a/examples/tracing/hello_perf_output.py +++ b/examples/tracing/hello_perf_output.py @@ -3,7 +3,6 @@ # This is a Hello World example that uses BPF_PERF_OUTPUT. from bcc import BPF -import ctypes as ct # define BPF program prog = """ @@ -34,13 +33,6 @@ b = BPF(text=prog) b.attach_kprobe(event=b.get_syscall_fnname("clone"), fn_name="hello") -# define output data structure in Python -TASK_COMM_LEN = 16 # linux/sched.h -class Data(ct.Structure): - _fields_ = [("pid", ct.c_uint), - ("ts", ct.c_ulonglong), - ("comm", ct.c_char * TASK_COMM_LEN)] - # header print("%-18s %-16s %-6s %s" % ("TIME(s)", "COMM", "PID", "MESSAGE")) @@ -48,7 +40,7 @@ class Data(ct.Structure): start = 0 def print_event(cpu, data, size): global start - event = ct.cast(data, ct.POINTER(Data)).contents + event = b["events"].event(data) if start == 0: start = event.ts time_s = (float(event.ts - start)) / 1000000000 From e8ece5660ff5f4829dd156e831f79ce02109034c Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Mon, 25 Feb 2019 23:23:54 +0100 Subject: [PATCH 067/135] support gcc 8's cold subfunctions (#2228) * support gcc 8's cold subfunctions GCC 8 can split functions bodies into hot and cold regions, causing extra symbols with a ".cold.N" suffix to be emitted. 
Exclude these extra symbols and only allow the parent function to be traced. Signed-off-by: Andrea Righi * use byte string with function names read from /proc/kallsyms Function names are read from /proc/kallsyms, that is opened in binary mode. Regex pattern must be bytes as well. Signed-off-by: Andrea Righi --- src/python/bcc/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/python/bcc/__init__.py b/src/python/bcc/__init__.py index 64070c092..01d5604b3 100644 --- a/src/python/bcc/__init__.py +++ b/src/python/bcc/__init__.py @@ -564,6 +564,9 @@ def get_kprobe_functions(event_re): # non-attachable. elif fn.startswith(b'__perf') or fn.startswith(b'perf_'): continue + # Exclude all gcc 8's extra .cold functions + elif re.match(b'^.*\.cold\.\d+$', fn): + continue if (t.lower() in [b't', b'w']) and re.match(event_re, fn) \ and fn not in blacklist: fns.append(fn) From 2ebdf164c28fce81620b783cb73744b76da0714c Mon Sep 17 00:00:00 2001 From: Xiaozhou Liu Date: Wed, 27 Feb 2019 05:20:12 +0800 Subject: [PATCH 068/135] tcpaccept: only show TCP accept event (#2232) pr #1842 forgot to add filtering in the new tracepoint code so it's incorrectly showing every sock:inet_sock_set_state event. This patch fixes that. --- tools/tcpaccept.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/tcpaccept.py b/tools/tcpaccept.py index 5f7b48c33..f606b734e 100755 --- a/tools/tcpaccept.py +++ b/tools/tcpaccept.py @@ -161,6 +161,8 @@ { if (args->protocol != IPPROTO_TCP) return 0; + if (args->oldstate != TCP_SYN_RECV || args->newstate != TCP_ESTABLISHED) + return 0; u32 pid = bpf_get_current_pid_tgid(); ##FILTER_PID## From bb65bea69a6486efd50dd57331eed0f6435b2358 Mon Sep 17 00:00:00 2001 From: Gary Lin Date: Wed, 27 Feb 2019 16:52:35 +0800 Subject: [PATCH 069/135] examples/tracing: Use printb to improve python3 compatibility When using python3, 'print' will output the bytes arrays with the "b" prefix. Switch to printb to get rid of the prefix. 
Signed-off-by: Gary Lin --- examples/tracing/disksnoop.py | 9 +++++---- examples/tracing/hello_fields.py | 3 ++- examples/tracing/hello_perf_output.py | 5 +++-- examples/tracing/mallocstacks.py | 3 ++- examples/tracing/mysqld_query.py | 3 ++- examples/tracing/nodejs_http_server.py | 3 ++- examples/tracing/strlen_count.py | 3 ++- examples/tracing/sync_timing.py | 3 ++- examples/tracing/tcpv4connect.py | 11 ++++++----- examples/tracing/trace_perf_output.py | 3 ++- examples/tracing/urandomread-explicit.py | 3 ++- examples/tracing/urandomread.py | 3 ++- 12 files changed, 32 insertions(+), 20 deletions(-) diff --git a/examples/tracing/disksnoop.py b/examples/tracing/disksnoop.py index 17d911a12..e181235b6 100755 --- a/examples/tracing/disksnoop.py +++ b/examples/tracing/disksnoop.py @@ -12,6 +12,7 @@ from __future__ import print_function from bcc import BPF +from bcc.utils import printb REQ_WRITE = 1 # from include/linux/blk_types.h @@ -56,13 +57,13 @@ (bytes_s, bflags_s, us_s) = msg.split() if int(bflags_s, 16) & REQ_WRITE: - type_s = "W" + type_s = b"W" elif bytes_s == "0": # see blk_fill_rwbs() for logic - type_s = "M" + type_s = b"M" else: - type_s = "R" + type_s = b"R" ms = float(int(us_s, 10)) / 1000 - print("%-18.9f %-2s %-7s %8.2f" % (ts, type_s, bytes_s, ms)) + printb(b"%-18.9f %-2s %-7s %8.2f" % (ts, type_s, bytes_s, ms)) except KeyboardInterrupt: exit() diff --git a/examples/tracing/hello_fields.py b/examples/tracing/hello_fields.py index be53e6222..b8ee6db47 100755 --- a/examples/tracing/hello_fields.py +++ b/examples/tracing/hello_fields.py @@ -3,6 +3,7 @@ # This is a Hello World example that formats output as fields. 
from bcc import BPF +from bcc.utils import printb # define BPF program prog = """ @@ -25,4 +26,4 @@ (task, pid, cpu, flags, ts, msg) = b.trace_fields() except ValueError: continue - print("%-18.9f %-16s %-6d %s" % (ts, task, pid, msg)) + printb(b"%-18.9f %-16s %-6d %s" % (ts, task, pid, msg)) diff --git a/examples/tracing/hello_perf_output.py b/examples/tracing/hello_perf_output.py index bcf8b4388..fdc74deb3 100755 --- a/examples/tracing/hello_perf_output.py +++ b/examples/tracing/hello_perf_output.py @@ -3,6 +3,7 @@ # This is a Hello World example that uses BPF_PERF_OUTPUT. from bcc import BPF +from bcc.utils import printb # define BPF program prog = """ @@ -44,8 +45,8 @@ def print_event(cpu, data, size): if start == 0: start = event.ts time_s = (float(event.ts - start)) / 1000000000 - print("%-18.9f %-16s %-6d %s" % (time_s, event.comm, event.pid, - "Hello, perf_output!")) + printb(b"%-18.9f %-16s %-6d %s" % (time_s, event.comm, event.pid, + b"Hello, perf_output!")) # loop with callback to print_event b["events"].open_perf_buffer(print_event) diff --git a/examples/tracing/mallocstacks.py b/examples/tracing/mallocstacks.py index 2f3eb2594..84b435e10 100755 --- a/examples/tracing/mallocstacks.py +++ b/examples/tracing/mallocstacks.py @@ -12,6 +12,7 @@ from __future__ import print_function from bcc import BPF +from bcc.utils import printb from time import sleep import sys @@ -56,4 +57,4 @@ for k, v in reversed(sorted(calls.items(), key=lambda c: c[1].value)): print("%d bytes allocated at:" % v.value) for addr in stack_traces.walk(k.value): - print("\t%s" % b.sym(addr, pid, show_offset=True)) + printb(b"\t%s" % b.sym(addr, pid, show_offset=True)) diff --git a/examples/tracing/mysqld_query.py b/examples/tracing/mysqld_query.py index 15ff297af..aa453ce68 100755 --- a/examples/tracing/mysqld_query.py +++ b/examples/tracing/mysqld_query.py @@ -12,6 +12,7 @@ from __future__ import print_function from bcc import BPF, USDT +from bcc.utils import printb import sys if 
len(sys.argv) < 2: @@ -58,4 +59,4 @@ except ValueError: print("value error") continue - print("%-18.9f %-16s %-6d %s" % (ts, task, pid, msg)) + printb(b"%-18.9f %-16s %-6d %s" % (ts, task, pid, msg)) diff --git a/examples/tracing/nodejs_http_server.py b/examples/tracing/nodejs_http_server.py index 1017de563..1f6a7b906 100755 --- a/examples/tracing/nodejs_http_server.py +++ b/examples/tracing/nodejs_http_server.py @@ -10,6 +10,7 @@ from __future__ import print_function from bcc import BPF, USDT +from bcc.utils import printb import sys if len(sys.argv) < 2: @@ -51,4 +52,4 @@ except ValueError: print("value error") continue - print("%-18.9f %-16s %-6d %s" % (ts, task, pid, msg)) + printb(b"%-18.9f %-16s %-6d %s" % (ts, task, pid, msg)) diff --git a/examples/tracing/strlen_count.py b/examples/tracing/strlen_count.py index 49d70809e..eab67109f 100755 --- a/examples/tracing/strlen_count.py +++ b/examples/tracing/strlen_count.py @@ -12,6 +12,7 @@ from __future__ import print_function from bcc import BPF +from bcc.utils import printb from time import sleep # load BPF program @@ -52,4 +53,4 @@ print("%10s %s" % ("COUNT", "STRING")) counts = b.get_table("counts") for k, v in sorted(counts.items(), key=lambda counts: counts[1].value): - print("%10d \"%s\"" % (v.value, k.c.encode('string-escape'))) + printb(b"%10d \"%s\"" % (v.value, k.c)) diff --git a/examples/tracing/sync_timing.py b/examples/tracing/sync_timing.py index 675ad14c8..4fad777c4 100755 --- a/examples/tracing/sync_timing.py +++ b/examples/tracing/sync_timing.py @@ -10,6 +10,7 @@ from __future__ import print_function from bcc import BPF +from bcc.utils import printb # load BPF program b = BPF(text=""" @@ -48,4 +49,4 @@ if start == 0: start = ts ts = ts - start - print("At time %.2f s: multiple syncs detected, last %s ms ago" % (ts, ms)) + printb(b"At time %.2f s: multiple syncs detected, last %s ms ago" % (ts, ms)) diff --git a/examples/tracing/tcpv4connect.py b/examples/tracing/tcpv4connect.py index 
8a89469de..81385e8f8 100755 --- a/examples/tracing/tcpv4connect.py +++ b/examples/tracing/tcpv4connect.py @@ -16,6 +16,7 @@ from __future__ import print_function from bcc import BPF +from bcc.utils import printb # define BPF program bpf_text = """ @@ -76,11 +77,11 @@ "DPORT")) def inet_ntoa(addr): - dq = '' + dq = b'' for i in range(0, 4): - dq = dq + str(addr & 0xff) + dq = dq + str(addr & 0xff).encode() if (i != 3): - dq = dq + '.' + dq = dq + b'.' addr = addr >> 8 return dq @@ -89,7 +90,7 @@ def inet_ntoa(addr): # Read messages from kernel pipe try: (task, pid, cpu, flags, ts, msg) = b.trace_fields() - (_tag, saddr_hs, daddr_hs, dport_s) = msg.split(" ") + (_tag, saddr_hs, daddr_hs, dport_s) = msg.split(b" ") except ValueError: # Ignore messages from other tracers continue @@ -98,7 +99,7 @@ def inet_ntoa(addr): if _tag != "trace_tcp4connect": continue - print("%-6d %-12.12s %-16s %-16s %-4s" % (pid, task, + printb(b"%-6d %-12.12s %-16s %-16s %-4s" % (pid, task, inet_ntoa(int(saddr_hs, 16)), inet_ntoa(int(daddr_hs, 16)), dport_s)) diff --git a/examples/tracing/trace_perf_output.py b/examples/tracing/trace_perf_output.py index 35a579573..635be129c 100755 --- a/examples/tracing/trace_perf_output.py +++ b/examples/tracing/trace_perf_output.py @@ -8,6 +8,7 @@ import atexit from bcc import BPF +from bcc.utils import printb import ctypes as ct class Data(ct.Structure): @@ -50,7 +51,7 @@ def print_counter(): global b print("counter = %d vs %d" % (counter, b["counters"][ct.c_int(0)].value)) -print("Tracing " + event_name + ", try `dd if=/dev/zero of=/dev/null`") +printb(b"Tracing " + event_name + b", try `dd if=/dev/zero of=/dev/null`") print("Tracing... 
Hit Ctrl-C to end.") while 1: try: diff --git a/examples/tracing/urandomread-explicit.py b/examples/tracing/urandomread-explicit.py index 448ffdfc4..0706092a7 100755 --- a/examples/tracing/urandomread-explicit.py +++ b/examples/tracing/urandomread-explicit.py @@ -17,6 +17,7 @@ from __future__ import print_function from bcc import BPF +from bcc.utils import printb # define BPF program bpf_text = """ @@ -49,4 +50,4 @@ (task, pid, cpu, flags, ts, msg) = b.trace_fields() except ValueError: continue - print("%-18.9f %-16s %-6d %s" % (ts, task, pid, msg)) + printb(b"%-18.9f %-16s %-6d %s" % (ts, task, pid, msg)) diff --git a/examples/tracing/urandomread.py b/examples/tracing/urandomread.py index 319db2ca5..c1468c8cd 100755 --- a/examples/tracing/urandomread.py +++ b/examples/tracing/urandomread.py @@ -13,6 +13,7 @@ from __future__ import print_function from bcc import BPF +from bcc.utils import printb # load BPF program b = BPF(text=""" @@ -32,4 +33,4 @@ (task, pid, cpu, flags, ts, msg) = b.trace_fields() except ValueError: continue - print("%-18.9f %-16s %-6d %s" % (ts, task, pid, msg)) + printb(b"%-18.9f %-16s %-6d %s" % (ts, task, pid, msg)) From eb0b5155aacfe15fbb3aeaf52b82ba3ba8e59d56 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Wed, 27 Feb 2019 18:06:55 +0700 Subject: [PATCH 070/135] docs/reference_guide: Update kernel macro name 540adea3809f "error-injection: Separate error-injection from kprobe", merged in v4.16, changed the name of BPF_ALLOW_ERROR_INJECTION to just ALLOW_ERROR_INJECTION. Update the name to help readers grepping the kernel code. --- docs/reference_guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/reference_guide.md b/docs/reference_guide.md index 4b37a247c..26fcf0c7e 100644 --- a/docs/reference_guide.md +++ b/docs/reference_guide.md @@ -417,7 +417,7 @@ This is used for targeted error injection. bpf_override_return will only work when the kprobed function is whitelisted to allow error injections. 
Whitelisting entails tagging a function with -`BPF_ALLOW_ERROR_INJECTION()` in the kernel source tree; see `io_ctl_init` for +`ALLOW_ERROR_INJECTION()` in the kernel source tree; see `io_ctl_init` for an example. If the kprobed function is not whitelisted, the bpf program will fail to attach with ` ioctl(PERF_EVENT_IOC_SET_BPF): Invalid argument` From 4f690c2bbb8c3da4ef209e8e12a04bb7c24f1940 Mon Sep 17 00:00:00 2001 From: Allan McAleavy Date: Wed, 27 Feb 2019 18:03:42 +0000 Subject: [PATCH 071/135] added DNAME_INLINE_LEN to be 32 as per filetop (#2236) added DNAME_INLINE_LEN to be 32 as per filetop --- tools/fileslower.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/fileslower.py b/tools/fileslower.py index 6fa0c26cc..31e3adf9a 100755 --- a/tools/fileslower.py +++ b/tools/fileslower.py @@ -220,7 +220,7 @@ "BYTES", "LAT(ms)", "FILENAME")) start_ts = time.time() - +DNAME_INLINE_LEN = 32 def print_event(cpu, data, size): event = b["events"].event(data) From 14ab8aa44f52db5a50dcc8704a7f395b802fa4f8 Mon Sep 17 00:00:00 2001 From: Hui Peng Date: Wed, 27 Feb 2019 13:42:32 -0500 Subject: [PATCH 072/135] Fix documentation error for Ubuntu 18.04 (#2238) Fix documentation error for Ubuntu 18.04 --- INSTALL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/INSTALL.md b/INSTALL.md index 10445b4a1..f2ece9f82 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -67,7 +67,7 @@ found at [packages.ubuntu.com](https://packages.ubuntu.com/search?suite=default& sudo apt-get install bpfcc-tools linux-headers-$(uname -r) ``` -The tools are installed in `/sbin` with a `-bpfcc` extension. Try running `sudo opensnoop-bpfcc`. +The tools are installed in `/sbin` (`/usr/sbin` in Ubuntu 18.04) with a `-bpfcc` extension. Try running `sudo opensnoop-bpfcc`. **_Note_**: the Ubuntu packages have different names but the package contents, in most cases, conflict and as such _cannot_ be installed alongside upstream packages. 
Should one choose to use From 0f5849187972a50adf0d9eaa8788c11f9fd926ea Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 28 Feb 2019 11:18:04 -0800 Subject: [PATCH 073/135] fix b.support_raw_tracepoint for 5.0 kernel Fix issue #2240. In 5.0, the following commit commit a38d1107f937ca95dcf820161ef44ea683d6a0b1 Author: Matt Mullins Date: Wed Dec 12 16:42:37 2018 -0800 bpf: support raw tracepoints in modules renamed the function bpf_find_raw_tracepoint() to bpf_get_raw_tracepoint(). The bcc relies on checking bpf_find_raw_tracepoint() in /proc/kallsyms to detect whether raw_tracepoint is supported in kernel or not. We do not have better mechanism to detect raw_tracepoint support without additional syscalls. So tentatively, let us just check bpf_get_raw_tracepoint() ksym as well for raw_tracepoint support. Signed-off-by: Yonghong Song --- src/python/bcc/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/python/bcc/__init__.py b/src/python/bcc/__init__.py index 01d5604b3..69a048f3d 100644 --- a/src/python/bcc/__init__.py +++ b/src/python/bcc/__init__.py @@ -858,7 +858,8 @@ def detach_raw_tracepoint(self, tp=b""): @staticmethod def support_raw_tracepoint(): # kernel symbol "bpf_find_raw_tracepoint" indicates raw_tracepint support - if BPF.ksymname("bpf_find_raw_tracepoint") != -1: + if BPF.ksymname("bpf_find_raw_tracepoint") != -1 or \ + BPF.ksymname("bpf_get_raw_tracepoint") != -1: return True return False From 1eab542d1a0be5fed66add126d1668ef648fe08a Mon Sep 17 00:00:00 2001 From: yonghong-song Date: Thu, 28 Feb 2019 14:15:35 -0800 Subject: [PATCH 074/135] fix a compilation issue with latest llvm trunk (9.0) (#2242) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The following llvm change in trunk (9.0) https://reviews.llvm.org/D58194 breaks bcc build. 
[ 76%] Building CXX object src/cc/CMakeFiles/bcc-static.dir/bpf_module.cc.o /home/yhs/work/bcc/src/cc/bcc_debug.cc: In member function ‘void ebpf::SourceDebugger::dump()’: /home/yhs/work/bcc/src/cc/bcc_debug.cc:207:23: error: no matching function for call to ‘llvm::DWARFDebugLine::LineTable::getFileLineInfoForAddress(uint64_t, const char*, llvm::DILineInfoSpecifier::FileLineInfoKind, llvm::DILineInfo&) const’ LineInfo); ^ /home/yhs/work/bcc/src/cc/bcc_debug.cc:207:23: note: candidate is: In file included from /home/yhs/work/llvm/build/install/include/llvm/DebugInfo/DWARF/DWARFContext.h:24:0, from /home/yhs/work/bcc/src/cc/bcc_debug.cc:22: /home/yhs/work/llvm/build/install/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h:253:10: note: bool llvm::DWARFDebugLin e::LineTable::getFileLineInfoForAddress(llvm::object::SectionedAddress, const char*, llvm::DILineInfoSpecifier::FileLi neInfoKind, llvm::DILineInfo&) const bool getFileLineInfoForAddress(object::SectionedAddress Address, The reason is that function getFileLineInfoForAddress() signature changed. The first argument used to be "uint64_t" and now "object::SectionedAddress" which includes both address and section ID. This patch fixed this issue by specializing getFileLineInfoForAddress() for LLVM 9. There exists no variant for getFileLineInfoForAddress() working for all LLVM versions. 
Signed-off-by: Yonghong Song --- src/cc/bcc_debug.cc | 9 ++++++++- src/cc/bcc_debug.h | 7 +++++-- src/cc/bpf_module.cc | 16 ++++++++-------- src/cc/bpf_module.h | 8 +++++--- 4 files changed, 26 insertions(+), 14 deletions(-) diff --git a/src/cc/bcc_debug.cc b/src/cc/bcc_debug.cc index 786074a45..b856cd02d 100644 --- a/src/cc/bcc_debug.cc +++ b/src/cc/bcc_debug.cc @@ -183,6 +183,7 @@ void SourceDebugger::dump() { uint64_t Size; uint8_t *FuncStart = get<0>(section.second); uint64_t FuncSize = get<1>(section.second); + unsigned SectionID = get<2>(section.second); ArrayRef Data(FuncStart, FuncSize); uint32_t CurrentSrcLine = 0; string func_name = section.first.substr(fn_prefix_.size()); @@ -201,8 +202,14 @@ void SourceDebugger::dump() { break; } else { DILineInfo LineInfo; + LineTable->getFileLineInfoForAddress( - (uint64_t)FuncStart + Index, CU->getCompilationDir(), +#if LLVM_MAJOR_VERSION >= 9 + {(uint64_t)FuncStart + Index, SectionID}, +#else + (uint64_t)FuncStart + Index, +#endif + CU->getCompilationDir(), DILineInfoSpecifier::FileLineInfoKind::AbsoluteFilePath, LineInfo); diff --git a/src/cc/bcc_debug.h b/src/cc/bcc_debug.h index 9b195be65..1467ca800 100644 --- a/src/cc/bcc_debug.h +++ b/src/cc/bcc_debug.h @@ -13,13 +13,16 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + +#include "bpf_module.h" + namespace ebpf { class SourceDebugger { public: SourceDebugger( llvm::Module *mod, - std::map> §ions, + sec_map_def §ions, const std::string &fn_prefix, const std::string &mod_src, std::map &src_dbg_fmap) : mod_(mod), @@ -52,7 +55,7 @@ class SourceDebugger { private: llvm::Module *mod_; - const std::map> §ions_; + const sec_map_def §ions_; const std::string &fn_prefix_; const std::string &mod_src_; std::map &src_dbg_fmap_; diff --git a/src/cc/bpf_module.cc b/src/cc/bpf_module.cc index f0d399784..cf6ea8f3b 100644 --- a/src/cc/bpf_module.cc +++ b/src/cc/bpf_module.cc @@ -63,7 +63,7 @@ const string BPFModule::FN_PREFIX = BPF_FN_PREFIX; class MyMemoryManager : public SectionMemoryManager { public: - explicit MyMemoryManager(map> *sections) + explicit MyMemoryManager(sec_map_def *sections) : sections_(sections) { } @@ -75,7 +75,7 @@ class MyMemoryManager : public SectionMemoryManager { uint8_t *Addr = SectionMemoryManager::allocateDataSection(Size, Alignment, SectionID, SectionName, false); //printf("allocateDataSection: %s Addr %p Size %ld Alignment %d SectionID %d\n", // SectionName.str().c_str(), (void *)Addr, Size, Alignment, SectionID); - (*sections_)[SectionName.str()] = make_tuple(Addr, Size); + (*sections_)[SectionName.str()] = make_tuple(Addr, Size, SectionID); return Addr; } uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment, @@ -86,10 +86,10 @@ class MyMemoryManager : public SectionMemoryManager { uint8_t *Addr = SectionMemoryManager::allocateDataSection(Size, Alignment, SectionID, SectionName, false); //printf("allocateDataSection: %s Addr %p Size %ld Alignment %d SectionID %d\n", // SectionName.str().c_str(), (void *)Addr, Size, Alignment, SectionID); - (*sections_)[SectionName.str()] = make_tuple(Addr, Size); + (*sections_)[SectionName.str()] = make_tuple(Addr, Size, SectionID); return Addr; } - map> *sections_; + sec_map_def *sections_; }; BPFModule::BPFModule(unsigned flags, TableStorage *ts, bool 
rw_engine_enabled, @@ -223,7 +223,7 @@ int BPFModule::run_pass_manager(Module &mod) { return 0; } -void BPFModule::load_btf(std::map> §ions) { +void BPFModule::load_btf(sec_map_def §ions) { uint8_t *btf_sec = nullptr, *btf_ext_sec = nullptr; uintptr_t btf_sec_size = 0, btf_ext_sec_size = 0; @@ -268,7 +268,7 @@ void BPFModule::load_btf(std::map> btf_ = btf; } -int BPFModule::load_maps(std::map> §ions) { +int BPFModule::load_maps(sec_map_def §ions) { // find .maps. sections and retrieve all map key/value type id's std::map> map_tids; if (btf_) { @@ -386,7 +386,7 @@ int BPFModule::load_maps(std::map> int BPFModule::finalize() { Module *mod = &*mod_; - std::map> tmp_sections, + sec_map_def tmp_sections, *sections_p; mod->setTargetTriple("bpf-pc-linux"); @@ -443,7 +443,7 @@ int BPFModule::finalize() { tmp_p = new uint8_t[size]; memcpy(tmp_p, addr, size); } - sections_[fname] = make_tuple(tmp_p, size); + sections_[fname] = make_tuple(tmp_p, size, get<2>(section.second)); } engine_.reset(); ctx_.reset(); diff --git a/src/cc/bpf_module.h b/src/cc/bpf_module.h index 18c71d3d5..63d998c7b 100644 --- a/src/cc/bpf_module.h +++ b/src/cc/bpf_module.h @@ -35,6 +35,8 @@ class Type; namespace ebpf { +typedef std::map> sec_map_def; + // Options to enable different debug logging. enum { // Debug output compiled LLVM IR. 
@@ -82,8 +84,8 @@ class BPFModule { StatusTuple sscanf(std::string fn_name, const char *str, void *val); StatusTuple snprintf(std::string fn_name, char *str, size_t sz, const void *val); - void load_btf(std::map> §ions); - int load_maps(std::map> §ions); + void load_btf(sec_map_def §ions); + int load_maps(sec_map_def §ions); public: BPFModule(unsigned flags, TableStorage *ts = nullptr, bool rw_engine_enabled = true, @@ -150,7 +152,7 @@ class BPFModule { std::unique_ptr rw_engine_; std::unique_ptr mod_; std::unique_ptr func_src_; - std::map> sections_; + sec_map_def sections_; std::vector tables_; std::map table_names_; std::vector function_names_; From ef9d83f289222df78f0b16a04ade7c393bf4d83d Mon Sep 17 00:00:00 2001 From: Teng Qin Date: Thu, 28 Feb 2019 15:29:55 -0800 Subject: [PATCH 075/135] Add PyPerf, example of profiling Python using BPF (#2239) This is a tool attaches BPF program to CPU Perf Events for profiling. The BPF program understands CPython internal data structure and hence able to walk actual Python stack-trace, as oppose to strac-trace of the CPython runtime itself as we would normally get with Linux perf. To use the tool, just run the PyPerf binary: Use -d / --duration to specify intended profiling duration, in milliseconds. Default value, if not specified, is 1000ms. Use -c / --sample-rate to specify intended profiling sample rate, same as -c argument of Linux perf. Default value, if not specified, is 1e6. You can also use -v / --verbose to specify logging verbosity 1 or 2 for more detailed information during profiling. The tool is a prototype at this point is by no mean mature. It currently has follow limitation: It only runs on CPU Cycles event. It only works on Python 3.6 at this point. In fact all Python version from 3.0 to 3.6 should work, I just need to verify and change the constant value. However in Python 3.7 there are some internal data structure changes that the actual parsing logic needs to be updated. 
It currently hard-codes the Python internal data structure offsets. It would be better to get a dependency of python-devel and get them directly from the header files. The output is pretty horrible. No de-duplication across same stack, and we always output the GIL state, Thread state and output them in raw integer value. I will need to work on prettifying the output and make better sense of the enum values. Landing it in C++ example for now, once it's mature enough I will move it to tools/. --- examples/cpp/CMakeLists.txt | 2 + examples/cpp/pyperf/CMakeLists.txt | 13 + examples/cpp/pyperf/Py36Offsets.cc | 29 ++ examples/cpp/pyperf/PyPerf.cc | 75 ++++ examples/cpp/pyperf/PyPerfBPFProgram.cc | 496 +++++++++++++++++++++ examples/cpp/pyperf/PyPerfLoggingHelper.cc | 32 ++ examples/cpp/pyperf/PyPerfLoggingHelper.h | 15 + examples/cpp/pyperf/PyPerfType.h | 103 +++++ examples/cpp/pyperf/PyPerfUtil.cc | 398 +++++++++++++++++ examples/cpp/pyperf/PyPerfUtil.h | 74 +++ 10 files changed, 1237 insertions(+) create mode 100644 examples/cpp/pyperf/CMakeLists.txt create mode 100644 examples/cpp/pyperf/Py36Offsets.cc create mode 100644 examples/cpp/pyperf/PyPerf.cc create mode 100644 examples/cpp/pyperf/PyPerfBPFProgram.cc create mode 100644 examples/cpp/pyperf/PyPerfLoggingHelper.cc create mode 100644 examples/cpp/pyperf/PyPerfLoggingHelper.h create mode 100644 examples/cpp/pyperf/PyPerfType.h create mode 100644 examples/cpp/pyperf/PyPerfUtil.cc create mode 100644 examples/cpp/pyperf/PyPerfUtil.h diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt index 0b49a6a56..906c9aaf1 100644 --- a/examples/cpp/CMakeLists.txt +++ b/examples/cpp/CMakeLists.txt @@ -41,3 +41,5 @@ if(INSTALL_CPP_EXAMPLES) install (TARGETS FollyRequestContextSwitch DESTINATION share/bcc/examples/cpp) install (TARGETS UseExternalMap DESTINATION share/bcc/examples/cpp) endif(INSTALL_CPP_EXAMPLES) + +add_subdirectory(pyperf) diff --git a/examples/cpp/pyperf/CMakeLists.txt 
b/examples/cpp/pyperf/CMakeLists.txt new file mode 100644 index 000000000..8b8027516 --- /dev/null +++ b/examples/cpp/pyperf/CMakeLists.txt @@ -0,0 +1,13 @@ +# Copyright (c) Facebook, Inc. +# Licensed under the Apache License, Version 2.0 (the "License") + +include_directories(${CMAKE_SOURCE_DIR}/src/cc) +include_directories(${CMAKE_SOURCE_DIR}/src/cc/api) +include_directories(${CMAKE_SOURCE_DIR}/src/cc/libbpf/include/uapi) + +add_executable(PyPerf PyPerf.cc PyPerfUtil.cc PyPerfBPFProgram.cc PyPerfLoggingHelper.cc Py36Offsets.cc) +target_link_libraries(PyPerf bcc-static) + +if(INSTALL_CPP_EXAMPLES) + install (TARGETS PyPerf DESTINATION share/bcc/examples/cpp) +endif(INSTALL_CPP_EXAMPLES) diff --git a/examples/cpp/pyperf/Py36Offsets.cc b/examples/cpp/pyperf/Py36Offsets.cc new file mode 100644 index 000000000..0f6cbdad5 --- /dev/null +++ b/examples/cpp/pyperf/Py36Offsets.cc @@ -0,0 +1,29 @@ +/* + * Copyright (c) Facebook, Inc. + * Licensed under the Apache License, Version 2.0 (the "License") + */ + +#include "PyPerfType.h" + +namespace ebpf { +namespace pyperf { + +extern const OffsetConfig kPy36OffsetConfig = { + .PyObject_type = 8, // offsetof(PyObject, ob_type) + .PyTypeObject_name = 24, // offsetof(PyTypeObject, tp_name) + .PyThreadState_frame = 24, // offsetof(PyThreadState, frame) + .PyThreadState_thread = 152, // offsetof(PyThreadState, thread_id) + .PyFrameObject_back = 24, // offsetof(PyFrameObject, f_back) + .PyFrameObject_code = 32, // offsetof(PyFrameObject, f_code) + .PyFrameObject_lineno = 124, // offsetof(PyFrameObject, f_lineno) + .PyFrameObject_localsplus = 376, // offsetof(PyFrameObject, f_localsplus) + .PyCodeObject_filename = 96, // offsetof(PyCodeObject, co_filename) + .PyCodeObject_name = 104, // offsetof(PyCodeObject, co_name) + .PyCodeObject_varnames = 64, // offsetof(PyCodeObject, co_varnames) + .PyTupleObject_item = 24, // offsetof(PyTupleObject, ob_item) + .String_data = 48, // sizeof(PyASCIIObject) + .String_size = 16, // 
offsetof(PyVarObject, ob_size) +}; + +} +} // namespace ebpf diff --git a/examples/cpp/pyperf/PyPerf.cc b/examples/cpp/pyperf/PyPerf.cc new file mode 100644 index 000000000..bee9b59fc --- /dev/null +++ b/examples/cpp/pyperf/PyPerf.cc @@ -0,0 +1,75 @@ +/* + * PyPerf Profile Python Processes with Python stack-trace. + * For Linux, uses BCC, eBPF. Embedded C. + * + * Example of using BPF to profile Python Processes with Python stack-trace. + * + * USAGE: PyPerf [-d|--duration DURATION_MS] [-c|--sample-rate SAMPLE_RATE] + * [-v|--verbosity LOG_VERBOSITY] + * + * Copyright (c) Facebook, Inc. + * Licensed under the Apache License, Version 2.0 (the "License") + */ + +#include +#include +#include +#include + +#include "PyPerfLoggingHelper.h" +#include "PyPerfUtil.h" + +int main(int argc, char** argv) { + int pos = 1; + + auto parseIntArg = [&](std::vector argNames, uint64_t& target) { + std::string arg(argv[pos]); + for (const auto& name : argNames) { + if (arg == name) { + if (pos == argc) { + std::fprintf(stderr, "Expect value after %s\n", arg.c_str()); + std::exit(1); + } + pos++; + std::string value(argv[pos]); + try { + target = std::stoi(value); + } catch (const std::exception& e) { + std::fprintf(stderr, "Expect integer value after %s, got %s: %s\n", + arg.c_str(), value.c_str(), e.what()); + std::exit(1); + } + return true; + } + } + return false; + }; + + uint64_t sampleRate = 1000000; + uint64_t durationMs = 1000; + uint64_t verbosityLevel = 0; + while (true) { + if (pos >= argc) { + break; + } + bool found = false; + found = found || parseIntArg({"-c", "--sample-rate"}, sampleRate); + found = found || parseIntArg({"-d", "--duration"}, durationMs); + found = found || parseIntArg({"-v", "--verbose"}, verbosityLevel); + if (!found) { + std::fprintf(stderr, "Unexpected argument: %s\n", argv[pos]); + std::exit(1); + } + pos++; + } + + ebpf::pyperf::setVerbosity(verbosityLevel); + ebpf::pyperf::logInfo(1, "Profiling Sample Rate: %" PRIu64 "\n", sampleRate); + 
ebpf::pyperf::logInfo(1, "Profiling Duration: %" PRIu64 "ms\n", durationMs); + + ebpf::pyperf::PyPerfUtil util; + util.init(); + util.profile(sampleRate, durationMs); + + return 0; +} diff --git a/examples/cpp/pyperf/PyPerfBPFProgram.cc b/examples/cpp/pyperf/PyPerfBPFProgram.cc new file mode 100644 index 000000000..e04f743ee --- /dev/null +++ b/examples/cpp/pyperf/PyPerfBPFProgram.cc @@ -0,0 +1,496 @@ +/* + * Copyright (c) Facebook, Inc. + * Licensed under the Apache License, Version 2.0 (the "License") + */ + +#include + +namespace ebpf { +namespace pyperf { + +extern const std::string PYPERF_BPF_PROGRAM = R"( +#include +#include + +#define PYTHON_STACK_FRAMES_PER_PROG 25 +#define PYTHON_STACK_PROG_CNT 3 +#define STACK_MAX_LEN (PYTHON_STACK_FRAMES_PER_PROG * PYTHON_STACK_PROG_CNT) +#define CLASS_NAME_LEN 32 +#define FUNCTION_NAME_LEN 64 +#define FILE_NAME_LEN 128 +#define TASK_COMM_LEN 16 + +enum { + STACK_STATUS_COMPLETE = 0, + STACK_STATUS_ERROR = 1, + STACK_STATUS_TRUNCATED = 2, +}; + +enum { + GIL_STATE_NO_INFO = 0, + GIL_STATE_ERROR = 1, + GIL_STATE_UNINITIALIZED = 2, + GIL_STATE_NOT_LOCKED = 3, + GIL_STATE_THIS_THREAD = 4, + GIL_STATE_GLOBAL_CURRENT_THREAD = 5, + GIL_STATE_OTHER_THREAD = 6, + GIL_STATE_NULL = 7, +}; + +enum { + THREAD_STATE_UNKNOWN = 0, + THREAD_STATE_MATCH = 1, + THREAD_STATE_MISMATCH = 2, + THREAD_STATE_THIS_THREAD_NULL = 3, + THREAD_STATE_GLOBAL_CURRENT_THREAD_NULL = 4, + THREAD_STATE_BOTH_NULL = 5, +}; + +enum { + PTHREAD_ID_UNKNOWN = 0, + PTHREAD_ID_MATCH = 1, + PTHREAD_ID_MISMATCH = 2, + PTHREAD_ID_THREAD_STATE_NULL = 3, + PTHREAD_ID_NULL = 4, + PTHREAD_ID_ERROR = 5, +}; + +typedef struct { + int64_t PyObject_type; + int64_t PyTypeObject_name; + int64_t PyThreadState_frame; + int64_t PyThreadState_thread; + int64_t PyFrameObject_back; + int64_t PyFrameObject_code; + int64_t PyFrameObject_lineno; + int64_t PyFrameObject_localsplus; + int64_t PyCodeObject_filename; + int64_t PyCodeObject_name; + int64_t PyCodeObject_varnames; + int64_t 
PyTupleObject_item; + int64_t String_data; + int64_t String_size; +} OffsetConfig; + +typedef struct { + uintptr_t current_state_addr; // virtual address of _PyThreadState_Current + uintptr_t tls_key_addr; // virtual address of autoTLSkey for pthreads TLS + uintptr_t gil_locked_addr; // virtual address of gil_locked + uintptr_t gil_last_holder_addr; // virtual address of gil_last_holder + OffsetConfig offsets; +} PidData; + +typedef struct { + char classname[CLASS_NAME_LEN]; + char name[FUNCTION_NAME_LEN]; + char file[FILE_NAME_LEN]; + // NOTE: PyFrameObject also has line number but it is typically just the + // first line of that function and PyCode_Addr2Line needs to be called + // to get the actual line +} Symbol; + +typedef struct { + uint32_t pid; + uint32_t tid; + char comm[TASK_COMM_LEN]; + uint8_t thread_state_match; + uint8_t gil_state; + uint8_t pthread_id_match; + uint8_t stack_status; + // instead of storing symbol name here directly, we add it to another + // hashmap with Symbols and only store the ids here + int64_t stack_len; + int32_t stack[STACK_MAX_LEN]; +} Event; + +#define _STR_CONCAT(str1, str2) str1##str2 +#define STR_CONCAT(str1, str2) _STR_CONCAT(str1, str2) +#define FAIL_COMPILATION_IF(condition) \ + typedef struct { \ + char _condition_check[1 - 2 * !!(condition)]; \ + } STR_CONCAT(compile_time_condition_check, __COUNTER__); +// See comments in get_frame_data +FAIL_COMPILATION_IF(sizeof(Symbol) == sizeof(struct bpf_perf_event_value)) + +typedef struct { + OffsetConfig offsets; + uint64_t cur_cpu; + int64_t symbol_counter; + void* frame_ptr; + int64_t python_stack_prog_call_cnt; + Event event; +} sample_state_t; + +BPF_PERCPU_ARRAY(state_heap, sample_state_t, 1); +BPF_HASH(symbols, Symbol, int32_t, __SYMBOLS_SIZE__); +BPF_HASH(pid_config, pid_t, PidData); +BPF_PROG_ARRAY(progs, 1); + +BPF_PERF_OUTPUT(events); + +static inline __attribute__((__always_inline__)) void* get_thread_state( + void* tls_base, + PidData* pid_data) { + // Python sets 
the thread_state using pthread_setspecific with the key + // stored in a global variable autoTLSkey. + // We read the value of the key from the global variable and then read + // the value in the thread-local storage. This relies on pthread implementation. + // This is basically the same as running the following in GDB: + // p *(PyThreadState*)((struct pthread*)pthread_self())-> + // specific_1stblock[autoTLSkey]->data + int key; + bpf_probe_read(&key, sizeof(key), (void*)pid_data->tls_key_addr); + // This assumes autoTLSkey < 32, which means that the TLS is stored in + // pthread->specific_1stblock[autoTLSkey] + // 0x310 is offsetof(struct pthread, specific_1stblock), + // 0x10 is sizeof(pthread_key_data) + // 0x8 is offsetof(struct pthread_key_data, data) + // 'struct pthread' is not in the public API so we have to hardcode + // the offsets here + void* thread_state; + bpf_probe_read( + &thread_state, + sizeof(thread_state), + tls_base + 0x310 + key * 0x10 + 0x08); + return thread_state; +} + +static inline __attribute__((__always_inline__)) int submit_sample( + struct pt_regs* ctx, + sample_state_t* state) { + events.perf_submit(ctx, &state->event, sizeof(Event)); + return 0; +} + +// this function is trivial, but we need to do map lookup in separate function, +// because BCC doesn't allow direct map calls (including lookups) from inside +// a macro (which we want to do in GET_STATE() macro below) +static inline __attribute__((__always_inline__)) sample_state_t* get_state() { + int zero = 0; + return state_heap.lookup(&zero); +} + +#define GET_STATE() \ + sample_state_t* state = get_state(); \ + if (!state) { \ + return 0; /* should never happen */ \ + } + +static inline __attribute__((__always_inline__)) int get_thread_state_match( + void* this_thread_state, + void* global_thread_state) { + if (this_thread_state == 0 && global_thread_state == 0) { + return THREAD_STATE_BOTH_NULL; + } + if (this_thread_state == 0) { + return THREAD_STATE_THIS_THREAD_NULL; + } + 
if (global_thread_state == 0) { + return THREAD_STATE_GLOBAL_CURRENT_THREAD_NULL; + } + if (this_thread_state == global_thread_state) { + return THREAD_STATE_MATCH; + } else { + return THREAD_STATE_MISMATCH; + } +} + +static inline __attribute__((__always_inline__)) int get_gil_state( + void* this_thread_state, + void* global_thread_state, + PidData* pid_data) { + // Get information of GIL state + if (pid_data->gil_locked_addr == 0 || pid_data->gil_last_holder_addr == 0) { + return GIL_STATE_NO_INFO; + } + + int gil_locked = 0; + void* gil_thread_state = 0; + if (bpf_probe_read( + &gil_locked, sizeof(gil_locked), (void*)pid_data->gil_locked_addr)) { + return GIL_STATE_ERROR; + } + + switch (gil_locked) { + case -1: + return GIL_STATE_UNINITIALIZED; + case 0: + return GIL_STATE_NOT_LOCKED; + case 1: + // GIL is held by some Thread + bpf_probe_read( + &gil_thread_state, + sizeof(void*), + (void*)pid_data->gil_last_holder_addr); + if (gil_thread_state == this_thread_state) { + return GIL_STATE_THIS_THREAD; + } else if (gil_thread_state == global_thread_state) { + return GIL_STATE_GLOBAL_CURRENT_THREAD; + } else if (gil_thread_state == 0) { + return GIL_STATE_NULL; + } else { + return GIL_STATE_OTHER_THREAD; + } + default: + return GIL_STATE_ERROR; + } +} + +static inline __attribute__((__always_inline__)) int +get_pthread_id_match(void* thread_state, void* tls_base, PidData* pid_data) { + if (thread_state == 0) { + return PTHREAD_ID_THREAD_STATE_NULL; + } + + uint64_t pthread_self, pthread_created; + + bpf_probe_read( + &pthread_created, + sizeof(pthread_created), + thread_state + pid_data->offsets.PyThreadState_thread); + if (pthread_created == 0) { + return PTHREAD_ID_NULL; + } + + // 0x10 = offsetof(struct pthread, header.self) + bpf_probe_read(&pthread_self, sizeof(pthread_self), tls_base + 0x10); + if (pthread_self == 0) { + return PTHREAD_ID_ERROR; + } + + if (pthread_self == pthread_created) { + return PTHREAD_ID_MATCH; + } else { + return PTHREAD_ID_MISMATCH; 
+ } +} + +int on_event(struct pt_regs* ctx) { + uint64_t pid_tgid = bpf_get_current_pid_tgid(); + pid_t pid = (pid_t)(pid_tgid >> 32); + PidData* pid_data = pid_config.lookup(&pid); + if (!pid_data) { + return 0; + } + + GET_STATE(); + + state->offsets = pid_data->offsets; + state->cur_cpu = bpf_get_smp_processor_id(); + state->python_stack_prog_call_cnt = 0; + + Event* event = &state->event; + event->pid = pid; + event->tid = (pid_t)pid_tgid; + bpf_get_current_comm(&event->comm, sizeof(event->comm)); + + // Get pointer of global PyThreadState, which should belong to the Thread + // currently holds the GIL + void* global_current_thread = (void*)0; + bpf_probe_read( + &global_current_thread, + sizeof(global_current_thread), + (void*)pid_data->current_state_addr); + + struct task_struct* task = (struct task_struct*)bpf_get_current_task(); +#if __x86_64__ +// thread_struct->fs was renamed to fsbase in +// https://github.com/torvalds/linux/commit/296f781a4b7801ad9c1c0219f9e87b6c25e196fe +// so depending on kernel version, we need to account for that +#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 7, 0) + void* tls_base = (void*)task->thread.fs; +#else + void* tls_base = (void*)task->thread.fsbase; +#endif +#elif __aarch64__ + void* tls_base = (void*)task->thread.tp_value; +#else +#error "Unsupported platform" +#endif + + // Read PyThreadState of this Thread from TLS + void* thread_state = get_thread_state(tls_base, pid_data); + + // Check for matching between TLS PyThreadState and + // the global _PyThreadState_Current + event->thread_state_match = + get_thread_state_match(thread_state, global_current_thread); + + // Read GIL state + event->gil_state = + get_gil_state(thread_state, global_current_thread, pid_data); + + // Check for matching between pthread ID created current PyThreadState and + // pthread of actual current pthread + event->pthread_id_match = + get_pthread_id_match(thread_state, tls_base, pid_data); + + // pre-initialize event struct in case any subprogram 
below fails + event->stack_status = STACK_STATUS_COMPLETE; + event->stack_len = 0; + + if (thread_state != 0) { + // Get pointer to top frame from PyThreadState + bpf_probe_read( + &state->frame_ptr, + sizeof(void*), + thread_state + pid_data->offsets.PyThreadState_frame); + // jump to reading first set of Python frames + progs.call(ctx, PYTHON_STACK_PROG_IDX); + // we won't ever get here + } + + return submit_sample(ctx, state); +} + +static inline __attribute__((__always_inline__)) void get_names( + void* cur_frame, + void* code_ptr, + OffsetConfig* offsets, + Symbol* symbol, + void* ctx) { + // Figure out if we want to parse class name, basically checking the name of + // the first argument, + // ((PyTupleObject*)$frame->f_code->co_varnames)->ob_item[0] + // If it's 'self', we get the type and it's name, if it's cls, we just get + // the name. This is not perfect but there is no better way to figure this + // out from the code object. + void* args_ptr; + bpf_probe_read( + &args_ptr, sizeof(void*), code_ptr + offsets->PyCodeObject_varnames); + bpf_probe_read( + &args_ptr, sizeof(void*), args_ptr + offsets->PyTupleObject_item); + bpf_probe_read_str( + &symbol->name, sizeof(symbol->name), args_ptr + offsets->String_data); + + // compare strings as ints to save instructions + char self_str[4] = {'s', 'e', 'l', 'f'}; + char cls_str[4] = {'c', 'l', 's', '\0'}; + bool first_self = *(int32_t*)symbol->name == *(int32_t*)self_str; + bool first_cls = *(int32_t*)symbol->name == *(int32_t*)cls_str; + + // We re-use the same Symbol instance across loop iterations, which means + // we will have left-over data in the struct. Although this won't affect + // correctness of the result because we have '\0' at end of the strings read, + // it would affect effectiveness of the deduplication. + // Helper bpf_perf_prog_read_value clears the buffer on error, so here we + // (ab)use this behavior to clear the memory. 
It requires the size of Symbol + // to be different from struct bpf_perf_event_value, which we check at + // compilation time using the FAIL_COMPILATION_IF macro. + bpf_perf_prog_read_value(ctx, symbol, sizeof(Symbol)); + + // Read class name from $frame->f_localsplus[0]->ob_type->tp_name. + if (first_self || first_cls) { + void* ptr; + bpf_probe_read( + &ptr, sizeof(void*), cur_frame + offsets->PyFrameObject_localsplus); + if (first_self) { + // we are working with an instance, first we need to get type + bpf_probe_read(&ptr, sizeof(void*), ptr + offsets->PyObject_type); + } + bpf_probe_read(&ptr, sizeof(void*), ptr + offsets->PyTypeObject_name); + bpf_probe_read_str(&symbol->classname, sizeof(symbol->classname), ptr); + } + + void* pystr_ptr; + // read PyCodeObject's filename into symbol + bpf_probe_read( + &pystr_ptr, sizeof(void*), code_ptr + offsets->PyCodeObject_filename); + bpf_probe_read_str( + &symbol->file, sizeof(symbol->file), pystr_ptr + offsets->String_data); + // read PyCodeObject's name into symbol + bpf_probe_read( + &pystr_ptr, sizeof(void*), code_ptr + offsets->PyCodeObject_name); + bpf_probe_read_str( + &symbol->name, sizeof(symbol->name), pystr_ptr + offsets->String_data); +} + +// get_frame_data reads current PyFrameObject filename/name and updates +// stack_info->frame_ptr with pointer to next PyFrameObject +static inline __attribute__((__always_inline__)) bool get_frame_data( + void** frame_ptr, + OffsetConfig* offsets, + Symbol* symbol, + // ctx is only used to call helper to clear symbol, see documentation below + void* ctx) { + void* cur_frame = *frame_ptr; + if (!cur_frame) { + return false; + } + void* code_ptr; + // read PyCodeObject first, if that fails, then no point reading next frame + bpf_probe_read( + &code_ptr, sizeof(void*), cur_frame + offsets->PyFrameObject_code); + if (!code_ptr) { + return false; + } + + get_names(cur_frame, code_ptr, offsets, symbol, ctx); + + // read next PyFrameObject pointer, update in place + 
bpf_probe_read( + frame_ptr, sizeof(void*), cur_frame + offsets->PyFrameObject_back); + + return true; +} + +// To avoid duplicate ids, every CPU needs to use different ids when inserting +// into the hashmap. NUM_CPUS is defined at PyPerf backend side and passed +// through CFlag. +static inline __attribute__((__always_inline__)) int64_t get_symbol_id( + sample_state_t* state, + Symbol* sym) { + int32_t* symbol_id_ptr = symbols.lookup(sym); + if (symbol_id_ptr) { + return *symbol_id_ptr; + } + // the symbol is new, bump the counter + int32_t symbol_id = state->symbol_counter * NUM_CPUS + state->cur_cpu; + state->symbol_counter++; + symbols.update(sym, &symbol_id); + return symbol_id; +} + +int read_python_stack(struct pt_regs* ctx) { + GET_STATE(); + + state->python_stack_prog_call_cnt++; + Event* sample = &state->event; + + Symbol sym = {}; + bool last_res = false; +#pragma unroll + for (int i = 0; i < PYTHON_STACK_FRAMES_PER_PROG; i++) { + last_res = get_frame_data(&state->frame_ptr, &state->offsets, &sym, ctx); + if (last_res) { + uint32_t symbol_id = get_symbol_id(state, &sym); + int64_t cur_len = sample->stack_len; + if (cur_len >= 0 && cur_len < STACK_MAX_LEN) { + sample->stack[cur_len] = symbol_id; + sample->stack_len++; + } + } + } + + if (!state->frame_ptr) { + sample->stack_status = STACK_STATUS_COMPLETE; + } else { + if (!last_res) { + sample->stack_status = STACK_STATUS_ERROR; + } else { + sample->stack_status = STACK_STATUS_TRUNCATED; + } + } + + if (sample->stack_status == STACK_STATUS_TRUNCATED && + state->python_stack_prog_call_cnt < PYTHON_STACK_PROG_CNT) { + // read next batch of frames + progs.call(ctx, PYTHON_STACK_PROG_IDX); + } + + return submit_sample(ctx, state); +} +)"; + +} +} // namespace ebpf diff --git a/examples/cpp/pyperf/PyPerfLoggingHelper.cc b/examples/cpp/pyperf/PyPerfLoggingHelper.cc new file mode 100644 index 000000000..61e96ad88 --- /dev/null +++ b/examples/cpp/pyperf/PyPerfLoggingHelper.cc @@ -0,0 +1,32 @@ +/* + * Copyright 
(c) Facebook, Inc. + * Licensed under the Apache License, Version 2.0 (the "License") + */ + +#include +#include + +#include "PyPerfLoggingHelper.h" + +namespace ebpf { +namespace pyperf { + +static uint64_t setVerbosityLevel = 0; + +void setVerbosity(uint64_t verbosityLevel) { + setVerbosityLevel = verbosityLevel; +} + +void logInfo(uint64_t logLevel, const char* fmt, ...) { + if (logLevel > setVerbosityLevel) { + return; + } + + va_list va; + va_start(va, fmt); + std::vfprintf(stderr, fmt, va); + va_end(va); +} + +} // namespace pyperf +} // namespace ebpf diff --git a/examples/cpp/pyperf/PyPerfLoggingHelper.h b/examples/cpp/pyperf/PyPerfLoggingHelper.h new file mode 100644 index 000000000..d08d93e26 --- /dev/null +++ b/examples/cpp/pyperf/PyPerfLoggingHelper.h @@ -0,0 +1,15 @@ +/* + * Copyright (c) Facebook, Inc. + * Licensed under the Apache License, Version 2.0 (the "License") + */ + +#include + +namespace ebpf { +namespace pyperf { + +void setVerbosity(uint64_t verbosityLevel); +void logInfo(uint64_t logLevel, const char* fmt, ...); + +} // namespace pyperf +} // namespace ebpf diff --git a/examples/cpp/pyperf/PyPerfType.h b/examples/cpp/pyperf/PyPerfType.h new file mode 100644 index 000000000..9a54e9e0c --- /dev/null +++ b/examples/cpp/pyperf/PyPerfType.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) Facebook, Inc. 
+ * Licensed under the Apache License, Version 2.0 (the "License") + */ + +#include + +#define PYTHON_STACK_FRAMES_PER_PROG 25 +#define PYTHON_STACK_PROG_CNT 3 +#define STACK_MAX_LEN (PYTHON_STACK_FRAMES_PER_PROG * PYTHON_STACK_PROG_CNT) +#define CLASS_NAME_LEN 32 +#define FUNCTION_NAME_LEN 64 +#define FILE_NAME_LEN 128 +#define TASK_COMM_LEN 16 + +namespace ebpf { +namespace pyperf { + +enum { + STACK_STATUS_COMPLETE = 0, + STACK_STATUS_ERROR = 1, + STACK_STATUS_TRUNCATED = 2, +}; + +enum { + GIL_STATE_NO_INFO = 0, + GIL_STATE_ERROR = 1, + GIL_STATE_UNINITIALIZED = 2, + GIL_STATE_NOT_LOCKED = 3, + GIL_STATE_THIS_THREAD = 4, + GIL_STATE_GLOBAL_CURRENT_THREAD = 5, + GIL_STATE_OTHER_THREAD = 6, + GIL_STATE_NULL = 7, +}; + +enum { + THREAD_STATE_UNKNOWN = 0, + THREAD_STATE_MATCH = 1, + THREAD_STATE_MISMATCH = 2, + THREAD_STATE_THIS_THREAD_NULL = 3, + THREAD_STATE_GLOBAL_CURRENT_THREAD_NULL = 4, + THREAD_STATE_BOTH_NULL = 5, +}; + +enum { + PTHREAD_ID_UNKNOWN = 0, + PTHREAD_ID_MATCH = 1, + PTHREAD_ID_MISMATCH = 2, + PTHREAD_ID_THREAD_STATE_NULL = 3, + PTHREAD_ID_NULL = 4, + PTHREAD_ID_ERROR = 5, +}; + +typedef struct { + int64_t PyObject_type; + int64_t PyTypeObject_name; + int64_t PyThreadState_frame; + int64_t PyThreadState_thread; + int64_t PyFrameObject_back; + int64_t PyFrameObject_code; + int64_t PyFrameObject_lineno; + int64_t PyFrameObject_localsplus; + int64_t PyCodeObject_filename; + int64_t PyCodeObject_name; + int64_t PyCodeObject_varnames; + int64_t PyTupleObject_item; + int64_t String_data; + int64_t String_size; +} OffsetConfig; + +typedef struct { + uintptr_t current_state_addr; // virtual address of _PyThreadState_Current + uintptr_t tls_key_addr; // virtual address of autoTLSkey for pthreads TLS + uintptr_t gil_locked_addr; // virtual address of gil_locked + uintptr_t gil_last_holder_addr; // virtual address of gil_last_holder + OffsetConfig offsets; +} PidData; + +typedef struct { + char classname[CLASS_NAME_LEN]; + char name[FUNCTION_NAME_LEN]; + 
char file[FILE_NAME_LEN]; + // NOTE: PyFrameObject also has line number but it is typically just the + // first line of that function and PyCode_Addr2Line needs to be called + // to get the actual line +} Symbol; + +typedef struct { + uint32_t pid; + uint32_t tid; + char comm[TASK_COMM_LEN]; + uint8_t thread_state_match; + uint8_t gil_state; + uint8_t pthread_id_match; + uint8_t stack_status; + // instead of storing symbol name here directly, we add it to another + // hashmap with Symbols and only store the ids here + int64_t stack_len; + int32_t stack[STACK_MAX_LEN]; +} Event; + +} // namespace pyperf +} // namespace ebpf diff --git a/examples/cpp/pyperf/PyPerfUtil.cc b/examples/cpp/pyperf/PyPerfUtil.cc new file mode 100644 index 000000000..d4390831b --- /dev/null +++ b/examples/cpp/pyperf/PyPerfUtil.cc @@ -0,0 +1,398 @@ +/* + * Copyright (c) Facebook, Inc. + * Licensed under the Apache License, Version 2.0 (the "License") + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "PyPerfLoggingHelper.h" +#include "PyPerfUtil.h" +#include "bcc_elf.h" +#include "bcc_proc.h" +#include "bcc_syms.h" + +namespace ebpf { +namespace pyperf { + +extern OffsetConfig kPy36OffsetConfig; +extern std::string PYPERF_BPF_PROGRAM; + +const static std::string kLostSymbol = "[Lost Symbol]"; +const static std::string kIncompleteStack = "[Truncated Stack]"; +const static std::string kErrorStack = "[Stack Error]"; +const static std::string kNonPythonStack = "[Non-Python Code]"; +const static int kPerfBufSizePages = 32; + +const static std::string kPidCfgTableName("pid_config"); +const static std::string kProgsTableName("progs"); +const static std::string kSamplePerfBufName("events"); + +const static std::string kOnEventFuncName("on_event"); + +const static std::string kPythonStackFuncName("read_python_stack"); +const static std::string kPythonStackProgIdxFlag("-DPYTHON_STACK_PROG_IDX="); +const static 
int kPythonStackProgIdx = 0; + +const static std::string kNumCpusFlag("-DNUM_CPUS="); +const static std::string kSymbolsHashSizeFlag("-D__SYMBOLS_SIZE__="); +const static int kSymbolsHashSize = 16384; + +namespace { + +bool getRunningPids(std::vector& output) { + auto dir = ::opendir("/proc/"); + if (!dir) { + std::fprintf(stderr, "Open /proc failed: %d\n", errno); + return false; + } + + dirent* result = nullptr; + do { + if ((result = readdir(dir))) { + std::string basename = result->d_name; + if (basename == "." || basename == "..") { + continue; + } + + std::string fullpath = "/proc/" + basename; + struct stat st; + if (::stat(fullpath.c_str(), &st) != 0 || !S_ISDIR(st.st_mode)) { + continue; + } + + try { + auto pid = std::stoi(basename); + output.push_back(pid); + } catch (const std::exception& e) { + continue; + } + } + } while (result); + + if (::closedir(dir) == -1) { + std::fprintf(stderr, "Close /proc failed: %d\n", errno); + return false; + } + + return true; +} + +typedef struct { + int pid; + bool found; + uint64_t st; + uint64_t en; +} FindPythonPathHelper; + +const static std::string kPy36LibName = "libpython3.6"; + +int findPythonPathCallback(const char* name, uint64_t st, uint64_t en, uint64_t, + bool, void* payload) { + auto helper = static_cast(payload); + std::string file = name; + auto pos = file.rfind("/"); + if (pos != std::string::npos) { + file = file.substr(pos + 1); + } + if (file.find(kPy36LibName) == 0) { + logInfo(1, "Found Python library %s loaded at %lx-%lx for PID %d\n", name, st, en, helper->pid); + helper->found = true; + helper->st = st; + helper->en = en; + return -1; + } + return 0; +} + +bool allAddrFound(const PidData& data) { + return (data.current_state_addr > 0) && (data.tls_key_addr > 0) && + (data.gil_locked_addr > 0) && (data.gil_last_holder_addr > 0); +} + +int getAddrOfPythonBinaryCallback(const char* name, uint64_t addr, uint64_t, + void* payload) { + PidData& data = *static_cast(payload); + + auto checkAndGetAddr = 
[&](uintptr_t& targetAddr, const char* targetName) { + if (targetAddr == 0 && std::strcmp(name, targetName) == 0) { + targetAddr = addr; + } + }; + + checkAndGetAddr(data.tls_key_addr, "autoTLSkey"); + checkAndGetAddr(data.current_state_addr, "_PyThreadState_Current"); + checkAndGetAddr(data.gil_locked_addr, "gil_locked"); + checkAndGetAddr(data.gil_last_holder_addr, "gil_last_holder"); + + if (allAddrFound(data)) { + return -1; + } + return 0; +} + +bool getAddrOfPythonBinary(const std::string& path, PidData& data) { + std::memset(&data, 0, sizeof(data)); + + struct bcc_symbol_option option = {.use_debug_file = 0, + .check_debug_file_crc = 0, + .use_symbol_type = (1 << STT_OBJECT)}; + + bcc_elf_foreach_sym(path.c_str(), &getAddrOfPythonBinaryCallback, &option, + &data); + + return allAddrFound(data); +} +} // namespace + +void handleSampleCallback(void* cb_cookie, void* raw_data, int data_size) { + auto profiler = static_cast(cb_cookie); + profiler->handleSample(raw_data, data_size); +} + +void handleLostSamplesCallback(void* cb_cookie, uint64_t lost_cnt) { + auto profiler = static_cast(cb_cookie); + profiler->handleLostSamples(lost_cnt); +} + +PyPerfUtil::PyPerfResult PyPerfUtil::init() { + std::vector cflags; + cflags.emplace_back(kNumCpusFlag + + std::to_string(::sysconf(_SC_NPROCESSORS_ONLN))); + cflags.emplace_back(kSymbolsHashSizeFlag + std::to_string(kSymbolsHashSize)); + cflags.emplace_back(kPythonStackProgIdxFlag + + std::to_string(kPythonStackProgIdx)); + + auto initRes = bpf_.init(PYPERF_BPF_PROGRAM, cflags); + if (initRes.code() != 0) { + std::fprintf(stderr, "Failed to compiled PyPerf BPF programs: %s\n", + initRes.msg().c_str()); + return PyPerfResult::INIT_FAIL; + } + + int progFd = -1; + auto loadRes = + bpf_.load_func(kPythonStackFuncName, BPF_PROG_TYPE_PERF_EVENT, progFd); + if (loadRes.code() != 0) { + std::fprintf(stderr, "Failed to load BPF program %s: %s\n", + kPythonStackFuncName.c_str(), loadRes.msg().c_str()); + return 
PyPerfResult::INIT_FAIL; + } + + auto progTable = bpf_.get_prog_table(kProgsTableName); + auto updateRes = progTable.update_value(kPythonStackProgIdx, progFd); + if (updateRes.code() != 0) { + std::fprintf(stderr, + "Failed to set BPF program %s FD %d to program table: %s\n", + kPythonStackFuncName.c_str(), progFd, updateRes.msg().c_str()); + return PyPerfResult::INIT_FAIL; + } + + std::vector pids; + if (!getRunningPids(pids)) { + std::fprintf(stderr, "Failed getting running Processes\n"); + return PyPerfResult::INIT_FAIL; + } + + // Populate config for each Python Process + auto pid_hash = bpf_.get_hash_table(kPidCfgTableName); + PidData pidData; + for (const auto pid : pids) { + if (!tryTargetPid(pid, pidData)) { + // Not a Python Process + continue; + } + pid_hash.update_value(pid, pidData); + } + + // Open perf buffer + auto openRes = bpf_.open_perf_buffer( + kSamplePerfBufName, &handleSampleCallback, &handleLostSamplesCallback, + this, kPerfBufSizePages); + if (openRes.code() != 0) { + std::fprintf(stderr, "Unable to open Perf Buffer: %s\n", + openRes.msg().c_str()); + return PyPerfResult::PERF_BUF_OPEN_FAIL; + } + + initCompleted_ = true; + return PyPerfResult::SUCCESS; +} + +void PyPerfUtil::handleSample(const void* data, int dataSize) { + const Event* raw = static_cast(data); + samples_.emplace_back(raw, dataSize); + totalSamples_++; +} + +void PyPerfUtil::handleLostSamples(int lostCnt) { lostSamples_ += lostCnt; } + +PyPerfUtil::PyPerfResult PyPerfUtil::profile(int64_t sampleRate, + int64_t durationMs) { + if (!initCompleted_) { + std::fprintf(stderr, "PyPerfUtil::init not invoked or failed\n"); + return PyPerfResult::NO_INIT; + } + + // Attach to CPU cycles + auto attachRes = + bpf_.attach_perf_event(0, 0, kOnEventFuncName, sampleRate, 0); + if (attachRes.code() != 0) { + std::fprintf(stderr, "Attach to CPU cycles event failed: %s\n", + attachRes.msg().c_str()); + return PyPerfResult::EVENT_ATTACH_FAIL; + } + logInfo(2, "Attached to profiling event\n"); 
+ + // Get Perf Buffer and poll in a loop for a given duration + auto perfBuffer = bpf_.get_perf_buffer(kSamplePerfBufName); + if (!perfBuffer) { + std::fprintf(stderr, "Failed to get Perf Buffer: %s\n", + kSamplePerfBufName.c_str()); + return PyPerfResult::PERF_BUF_OPEN_FAIL; + } + logInfo(2, "Started polling Perf Buffer\n"); + auto start = std::chrono::steady_clock::now(); + while (std::chrono::steady_clock::now() < + start + std::chrono::milliseconds(durationMs)) { + perfBuffer->poll(50 /* 50ms timeout */); + } + logInfo(2, "Profiling duration finished\n"); + + // Detach the event + auto detachRes = bpf_.detach_perf_event(0, 0); + if (detachRes.code() != 0) { + std::fprintf(stderr, "Detach CPU cycles event failed: %s\n", + detachRes.msg().c_str()); + return PyPerfResult::EVENT_DETACH_FAIL; + } + logInfo(2, "Detached from profiling event\n"); + + // Drain remaining samples + logInfo(2, "Draining remaining samples\n"); + while (perfBuffer->poll(0) > 0) { + } + logInfo(2, "Finished draining remaining samples\n"); + + // Get symbol names and output samples + auto symbolTable = bpf_.get_hash_table("symbols"); + std::unordered_map symbols; + for (auto& x : symbolTable.get_table_offline()) { + auto symbolName = getSymbolName(x.first); + logInfo(2, "Symbol ID %d is %s\n", x.second, symbolName.c_str()); + symbols.emplace(x.second, std::move(symbolName)); + } + logInfo(1, "Total %d unique Python symbols\n", symbols.size()); + + for (auto& sample : samples_) { + if (sample.threadStateMatch != THREAD_STATE_THIS_THREAD_NULL && + sample.threadStateMatch != THREAD_STATE_BOTH_NULL) { + for (const auto stackId : sample.pyStackIds) { + auto symbIt = symbols.find(stackId); + if (symbIt != symbols.end()) { + std::printf(" %s\n", symbIt->second.c_str()); + } else { + std::printf(" %s\n", kLostSymbol.c_str()); + lostSymbols_++; + } + } + switch (sample.stackStatus) { + case STACK_STATUS_TRUNCATED: + std::printf(" %s\n", kIncompleteStack.c_str()); + truncatedStack_++; + break; + case 
STACK_STATUS_ERROR: + std::printf(" %s\n", kErrorStack.c_str()); + break; + default: + break; + } + } else { + std::printf(" %s\n", kNonPythonStack.c_str()); + } + + std::printf("PID: %d TID: %d (%s)\n", sample.pid, sample.tid, + sample.comm.c_str()); + std::printf("GIL State: %d Thread State: %d PthreadID Match State: %d\n\n", + sample.threadStateMatch, sample.gilState, + sample.pthreadIDMatch); + } + + logInfo(0, "%d samples collected\n", totalSamples_); + logInfo(0, "%d samples lost\n", lostSamples_); + logInfo(0, "%d samples with truncated stack\n", truncatedStack_); + logInfo(0, "%d times Python symbol lost\n", lostSymbols_); + + return PyPerfResult::SUCCESS; +} + +std::string PyPerfUtil::getSymbolName(Symbol& sym) const { + std::string nameStr = std::string(sym.name).substr(0, FUNCTION_NAME_LEN); + std::string classStr = std::string(sym.classname).substr(0, CLASS_NAME_LEN); + if (classStr.size() > 0) { + nameStr = classStr + "." + nameStr; + } + + std::string file = std::string(sym.file).substr(0, FILE_NAME_LEN); + if (file.empty()) { + return nameStr; + } + if (file[0] == '/') { + file = file.substr(1); + } + if (file.find("./") == 0) { + file = file.substr(2); + } + if (file.find(".py", file.size() - 3) == (file.size() - 3)) { + file = file.substr(0, file.size() - 3); + } + std::replace(file.begin(), file.end(), '/', '.'); + + return file + "." 
+ nameStr; +} + +bool PyPerfUtil::tryTargetPid(int pid, PidData& data) { + FindPythonPathHelper helper{pid, false, 0, 0}; + bcc_procutils_each_module(pid, &findPythonPathCallback, &helper); + if (!helper.found) { + logInfo(2, "PID %d does not contain Python library\n", pid); + return false; + } + + char path[256]; + int res = std::snprintf(path, sizeof(path), "/proc/%d/map_files/%lx-%lx", pid, + helper.st, helper.en); + if (res < 0 || size_t(res) >= sizeof(path)) { + return false; + } + + if (!getAddrOfPythonBinary(path, data)) { + std::fprintf(stderr, "Failed getting addresses in potential Python library in PID %d\n", pid); + return false; + } + data.offsets = kPy36OffsetConfig; + data.current_state_addr += helper.st; + logInfo(2, "PID %d has _PyThreadState_Current at %lx\n", pid, data.current_state_addr); + data.tls_key_addr += helper.st; + logInfo(2, "PID %d has autoTLSKey at %lx\n", pid, data.current_state_addr); + data.gil_locked_addr += helper.st; + logInfo(2, "PID %d has gil_locked at %lx\n", pid, data.current_state_addr); + data.gil_last_holder_addr += helper.st; + logInfo(2, "PID %d has gil_last_holder at %lx\n", pid, data.current_state_addr); + + return true; +} + +} // namespace pyperf +} // namespace ebpf diff --git a/examples/cpp/pyperf/PyPerfUtil.h b/examples/cpp/pyperf/PyPerfUtil.h new file mode 100644 index 000000000..3e69a292e --- /dev/null +++ b/examples/cpp/pyperf/PyPerfUtil.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) Facebook, Inc. 
+ * Licensed under the Apache License, Version 2.0 (the "License") + */ + +#pragma once + +#include +#include + +#include +#include + +#include "BPF.h" +#include "PyPerfType.h" + +namespace ebpf { +namespace pyperf { + +class PyPerfUtil { + public: + enum class PyPerfResult : int { + SUCCESS = 0, + INIT_FAIL, + PERF_BUF_OPEN_FAIL, + NO_INIT, + EVENT_ATTACH_FAIL, + EVENT_DETACH_FAIL + }; + + struct Sample { + pid_t pid; + pid_t tid; + std::string comm; + uint8_t threadStateMatch; + uint8_t gilState; + uint8_t pthreadIDMatch; + uint8_t stackStatus; + std::vector pyStackIds; + + explicit Sample(const Event* raw, int rawSize) + : pid(raw->pid), + tid(raw->tid), + comm(raw->comm), + threadStateMatch(raw->thread_state_match), + gilState(raw->gil_state), + pthreadIDMatch(raw->pthread_id_match), + stackStatus(raw->stack_status), + pyStackIds(raw->stack, raw->stack + raw->stack_len) {} + }; + + // init must be invoked exactly once before invoking profile + PyPerfResult init(); + + PyPerfResult profile(int64_t sampleRate, int64_t durationMs); + + private: + uint32_t lostSymbols_ = 0, totalSamples_ = 0, lostSamples_ = 0, truncatedStack_ = 0; + + ebpf::BPF bpf_{0, nullptr, false, "", true}; + std::vector samples_; + bool initCompleted_{false}; + + void handleSample(const void* data, int dataSize); + void handleLostSamples(int lostCnt); + friend void handleLostSamplesCallback(void*, uint64_t); + friend void handleSampleCallback(void*, void*, int); + + std::string getSymbolName(Symbol& sym) const; + + bool tryTargetPid(int pid, PidData& data); +}; +} // namespace pyperf +} // namespace ebpf From eca17a0c7831d3da6f5e32fc0209b99c512ba602 Mon Sep 17 00:00:00 2001 From: mephi42 Date: Fri, 1 Mar 2019 19:29:13 +0100 Subject: [PATCH 076/135] Explain how to use addr argument of BPF.attach_uprobe (#2231) (#2244) Explain how to use addr argument of BPF.attach_uprobe --- src/python/bcc/__init__.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git 
a/src/python/bcc/__init__.py b/src/python/bcc/__init__.py index 69a048f3d..6f114de89 100644 --- a/src/python/bcc/__init__.py +++ b/src/python/bcc/__init__.py @@ -968,9 +968,13 @@ def attach_uprobe(self, name=b"", sym=b"", sym_re=b"", addr=None, pid=-1) Run the bpf function denoted by fn_name every time the symbol sym in - the library or binary 'name' is encountered. The real address addr may - be supplied in place of sym. Optional parameters pid, cpu, and group_fd - can be used to filter the probe. + the library or binary 'name' is encountered. Optional parameters pid, + cpu, and group_fd can be used to filter the probe. + + The real address addr may be supplied in place of sym, in which case sym + must be set to its default value. If the file is a non-PIE executable, + addr must be a virtual address, otherwise it must be an offset relative + to the file load address. Instead of a symbol name, a regular expression can be provided in sym_re. The uprobe will then attach to symbols that match the provided From 552658edda09298afdccc8a4b5e17311a2d8a771 Mon Sep 17 00:00:00 2001 From: Zwb Date: Sun, 3 Mar 2019 06:08:35 +0800 Subject: [PATCH 077/135] Add source installation instructions for Centos 7 (#2248) Add source installation instructions for Centos 7 --- INSTALL.md | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/INSTALL.md b/INSTALL.md index f2ece9f82..c043d4c3e 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -14,6 +14,7 @@ - [Ubuntu](#ubuntu---source) - [Fedora](#fedora---source) - [openSUSE](#opensuse---source) + - [Centos](#centos---source) - [Amazon Linux](#amazon-linux---source) * [Older Instructions](#older-instructions) @@ -388,6 +389,55 @@ sudo make install popd ``` +## Centos - Source + +For Centos 7.6 only + +### Install build dependencies + +``` +sudo yum install -y epel-release +sudo yum update -y +sudo yum groupinstall -y "Development tools" +sudo yum install -y elfutils-libelf-devel cmake3 +sudo yum 
install -y luajit luajit-devel # for Lua support +``` + +### Install and compile LLVM + +``` +curl -LO http://releases.llvm.org/7.0.1/llvm-7.0.1.src.tar.xz +curl -LO http://releases.llvm.org/7.0.1/cfe-7.0.1.src.tar.xz +tar -xf cfe-7.0.1.src.tar.xz +tar -xf llvm-7.0.1.src.tar.xz + +mkdir clang-build +mkdir llvm-build + +cd llvm-build +cmake3 -G "Unix Makefiles" -DLLVM_TARGETS_TO_BUILD="BPF;X86" \ + -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr ../llvm-7.0.1.src +make +sudo make install + +cd ../clang-build +cmake3 -G "Unix Makefiles" -DLLVM_TARGETS_TO_BUILD="BPF;X86" \ + -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr ../cfe-7.0.1.src +make +sudo make install +cd .. +``` + +### Install and compile BCC + +``` +git clone https://github.com/iovisor/bcc.git +mkdir bcc/build; cd bcc/build +cmake .. -DCMAKE_INSTALL_PREFIX=/usr +make +sudo make install +``` + ## Amazon Linux - Source Tested on Amazon Linux AMI release 2018.03 (kernel 4.14.47-56.37.amzn1.x86_64) From 593339d8db2e2d9dd2eabca2b65638b975a813b5 Mon Sep 17 00:00:00 2001 From: Gary Lin Date: Wed, 27 Feb 2019 16:54:44 +0800 Subject: [PATCH 078/135] examples/tracing: Handle KeyboardInterrupt Signed-off-by: Gary Lin --- examples/tracing/hello_fields.py | 2 ++ examples/tracing/mysqld_query.py | 2 ++ examples/tracing/nodejs_http_server.py | 2 ++ examples/tracing/sync_timing.py | 13 ++++++++----- examples/tracing/tcpv4connect.py | 2 ++ examples/tracing/trace_fields.py | 5 ++++- examples/tracing/urandomread-explicit.py | 2 ++ examples/tracing/urandomread.py | 2 ++ 8 files changed, 24 insertions(+), 6 deletions(-) diff --git a/examples/tracing/hello_fields.py b/examples/tracing/hello_fields.py index b8ee6db47..9ed6da5df 100755 --- a/examples/tracing/hello_fields.py +++ b/examples/tracing/hello_fields.py @@ -26,4 +26,6 @@ (task, pid, cpu, flags, ts, msg) = b.trace_fields() except ValueError: continue + except KeyboardInterrupt: + exit() printb(b"%-18.9f %-16s %-6d %s" % (ts, task, pid, msg)) diff --git 
a/examples/tracing/mysqld_query.py b/examples/tracing/mysqld_query.py index aa453ce68..73c7f26f0 100755 --- a/examples/tracing/mysqld_query.py +++ b/examples/tracing/mysqld_query.py @@ -59,4 +59,6 @@ except ValueError: print("value error") continue + except KeyboardInterrupt: + exit() printb(b"%-18.9f %-16s %-6d %s" % (ts, task, pid, msg)) diff --git a/examples/tracing/nodejs_http_server.py b/examples/tracing/nodejs_http_server.py index 1f6a7b906..a86ca956c 100755 --- a/examples/tracing/nodejs_http_server.py +++ b/examples/tracing/nodejs_http_server.py @@ -52,4 +52,6 @@ except ValueError: print("value error") continue + except KeyboardInterrupt: + exit() printb(b"%-18.9f %-16s %-6d %s" % (ts, task, pid, msg)) diff --git a/examples/tracing/sync_timing.py b/examples/tracing/sync_timing.py index 4fad777c4..1d89ce554 100755 --- a/examples/tracing/sync_timing.py +++ b/examples/tracing/sync_timing.py @@ -45,8 +45,11 @@ # format output start = 0 while 1: - (task, pid, cpu, flags, ts, ms) = b.trace_fields() - if start == 0: - start = ts - ts = ts - start - printb(b"At time %.2f s: multiple syncs detected, last %s ms ago" % (ts, ms)) + try: + (task, pid, cpu, flags, ts, ms) = b.trace_fields() + if start == 0: + start = ts + ts = ts - start + printb(b"At time %.2f s: multiple syncs detected, last %s ms ago" % (ts, ms)) + except KeyboardInterrupt: + exit() diff --git a/examples/tracing/tcpv4connect.py b/examples/tracing/tcpv4connect.py index 81385e8f8..26d937636 100755 --- a/examples/tracing/tcpv4connect.py +++ b/examples/tracing/tcpv4connect.py @@ -94,6 +94,8 @@ def inet_ntoa(addr): except ValueError: # Ignore messages from other tracers continue + except KeyboardInterrupt: + exit() # Ignore messages from other tracers if _tag != "trace_tcp4connect": diff --git a/examples/tracing/trace_fields.py b/examples/tracing/trace_fields.py index 8b57f9a21..1c5beef21 100755 --- a/examples/tracing/trace_fields.py +++ b/examples/tracing/trace_fields.py @@ -18,4 +18,7 @@ b = 
BPF(text=prog) b.attach_kprobe(event=b.get_syscall_fnname("clone"), fn_name="hello") print("PID MESSAGE") -b.trace_print(fmt="{1} {5}") +try: + b.trace_print(fmt="{1} {5}") +except KeyboardInterrupt: + exit() diff --git a/examples/tracing/urandomread-explicit.py b/examples/tracing/urandomread-explicit.py index 0706092a7..9291402ba 100755 --- a/examples/tracing/urandomread-explicit.py +++ b/examples/tracing/urandomread-explicit.py @@ -50,4 +50,6 @@ (task, pid, cpu, flags, ts, msg) = b.trace_fields() except ValueError: continue + except KeyboardInterrupt: + exit() printb(b"%-18.9f %-16s %-6d %s" % (ts, task, pid, msg)) diff --git a/examples/tracing/urandomread.py b/examples/tracing/urandomread.py index c1468c8cd..69bcf8d11 100755 --- a/examples/tracing/urandomread.py +++ b/examples/tracing/urandomread.py @@ -33,4 +33,6 @@ (task, pid, cpu, flags, ts, msg) = b.trace_fields() except ValueError: continue + except KeyboardInterrupt: + exit() printb(b"%-18.9f %-16s %-6d %s" % (ts, task, pid, msg)) From d4f5da4a8cffc74762c7020ed9daa3a4818be56c Mon Sep 17 00:00:00 2001 From: JayceCao Date: Mon, 4 Mar 2019 13:45:19 +0800 Subject: [PATCH 079/135] Add the part of tracing system calls according to #2245 for the doc (#2250) Add the part of tracing system calls according to #2245 for the doc --- docs/reference_guide.md | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/docs/reference_guide.md b/docs/reference_guide.md index 26fcf0c7e..24f7c8189 100644 --- a/docs/reference_guide.md +++ b/docs/reference_guide.md @@ -15,6 +15,7 @@ This guide is incomplete. If something feels missing, check the bcc and kernel s - [5. uretprobes](#5-uretprobes) - [6. USDT probes](#6-usdt-probes) - [7. Raw Tracepoints](#7-raw-tracepoints) + - [8. system call tracepoints](#8-system-call-tracepoints) - [Data](#data) - [1. bpf_probe_read()](#1-bpf_probe_read) - [2. 
bpf_probe_read_str()](#2-bpf_probe_read_str) @@ -277,6 +278,40 @@ This instruments the sched:sched_switch tracepoint, and prints the prev and next Examples in situ: [search /tools](https://github.com/iovisor/bcc/search?q=RAW_TRACEPOINT_PROBE+path%3Atools&type=Code) +### 8. system call tracepoints + +Syntax: ```syscall__SYSCALLNAME``` + +```syscall__``` is a special prefix that creates a kprobe for the system call name provided as the remainder. You can use it by declaring a normal C function, then using the Python ```BPF.get_syscall_name(SYSCALLNAME)``` and ```BPF.attach_kprobe()``` to associate it. + +Arguments are specified on the function declaration: ```syscall__SYSCALLNAME(struct pt_regs *ctx, [, argument1 ...])```. + +For example: +```C +int syscall__execve(struct pt_regs *ctx, + const char __user *filename, + const char __user *const __user *__argv, + const char __user *const __user *__envp) +{ + [...] +} +``` + +This instruments the execve system call. + +The first argument is always ```struct pt_regs *```, the remainder are the arguments to the function (they don't need to be specified, if you don't intend to use them). + +Corresponding Python code: +```Python +b = BPF(text=bpf_text) +execve_fnname = b.get_syscall_name("execve") +b.attach_kprobe(event=execve_fnname, fn_name="syscall__execve") +``` + +Examples in situ: +[code](https://github.com/iovisor/bcc/blob/552658edda09298afdccc8a4b5e17311a2d8a771/tools/execsnoop.py#L101) ([output](https://github.com/iovisor/bcc/blob/552658edda09298afdccc8a4b5e17311a2d8a771/tools/execsnoop_example.txt#L8)) + + ## Data ### 1. bpf_probe_read() From 7fdaa9c53450338bb831c0217d35494233b04e32 Mon Sep 17 00:00:00 2001 From: Gary Ching-Pang Lin Date: Fri, 8 Mar 2019 01:52:06 +0800 Subject: [PATCH 080/135] Fix network examples (#2256) * examples/networking: remove the deprecated pyroute2 functions link_create() and link_remove() are deprecated since pyroute2 0.5.2. Replace them with the equivalent link() commands. 
Signed-off-by: Gary Lin --- examples/networking/simple_tc.py | 4 ++-- examples/networking/simulation.py | 2 +- examples/networking/tc_perf_event.py | 7 +++++-- examples/networking/xdp/CMakeLists.txt | 3 ++- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/examples/networking/simple_tc.py b/examples/networking/simple_tc.py index ec0a3e74d..38180132b 100755 --- a/examples/networking/simple_tc.py +++ b/examples/networking/simple_tc.py @@ -16,7 +16,7 @@ try: b = BPF(text=text, debug=0) fn = b.load_func("hello", BPF.SCHED_CLS) - ipr.link_create(ifname="t1a", kind="veth", peer="t1b") + ipr.link("add", ifname="t1a", kind="veth", peer="t1b") idx = ipr.link_lookup(ifname="t1a")[0] ipr.tc("add", "ingress", idx, "ffff:") @@ -26,5 +26,5 @@ ipr.tc("add-filter", "bpf", idx, ":1", fd=fn.fd, name=fn.name, parent="1:", action="ok", classid=1) finally: - if "idx" in locals(): ipr.link_remove(idx) + if "idx" in locals(): ipr.link("del", index=idx) print("BPF tc functionality - SCHED_CLS: OK") diff --git a/examples/networking/simulation.py b/examples/networking/simulation.py index 2c6a0f3d6..5395d5dfe 100644 --- a/examples/networking/simulation.py +++ b/examples/networking/simulation.py @@ -49,7 +49,7 @@ def _ns_add_ifc(self, name, ns_ifc, ifc_base_name=None, in_ifc=None, else: # delete the potentially leaf-over veth interfaces ipr = IPRoute() - for i in ipr.link_lookup(ifname='%sa' % ifc_base_name): ipr.link_remove(i) + for i in ipr.link_lookup(ifname='%sa' % ifc_base_name): ipr.link("del", index=i) ipr.close() try: out_ifc = self.ipdb.create(ifname="%sa" % ifc_base_name, kind="veth", diff --git a/examples/networking/tc_perf_event.py b/examples/networking/tc_perf_event.py index 4a1b754e9..99f0e9ebe 100755 --- a/examples/networking/tc_perf_event.py +++ b/examples/networking/tc_perf_event.py @@ -79,7 +79,10 @@ class SkbEvent(ct.Structure): b["skb_events"].open_perf_buffer(print_skb_event) print('Try: "ping6 ff02::1%me"\n') print("%-3s %-32s %-12s %-10s" % ("CPU", "SRC 
IP", "DST IP", "Magic")) - while True: - b.perf_buffer_poll() + try: + while True: + b.perf_buffer_poll() + except KeyboardInterrupt: + pass finally: if "me" in locals(): ipr.link("del", index=me) diff --git a/examples/networking/xdp/CMakeLists.txt b/examples/networking/xdp/CMakeLists.txt index ebe523966..9a29fc19e 100644 --- a/examples/networking/xdp/CMakeLists.txt +++ b/examples/networking/xdp/CMakeLists.txt @@ -1 +1,2 @@ -install(PROGRAMS xdp_drop_count.py DESTINATION share/bcc/examples/networking/xdp) +file(GLOB PY_FILES *.py) +install(PROGRAMS ${PY_FILES} DESTINATION share/bcc/examples/networking/xdp) From 0ec2d4fd02bfd894b3fdb44ecb175344197a33dc Mon Sep 17 00:00:00 2001 From: Brenden Blanco Date: Thu, 7 Mar 2019 12:30:43 -0800 Subject: [PATCH 081/135] Prepare debian changelog for v0.9.0 tag (#2258) debian changelog for v0.9.0 tag Signed-off-by: Brenden Blanco --- debian/changelog | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/debian/changelog b/debian/changelog index 0f0e89b1c..a53c9b9da 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,11 @@ +bcc (0.9.0-1) unstable; urgency=low + + * Adds support for BTF + * Uses libbpf common library to wrap syscall API + * Many bugfixes and new tools + + -- Brenden Blanco Thu, 07 Mar 2019 17:00:00 +0000 + bcc (0.8.0-1) unstable; urgency=low * Support for kernel up to 5.0 From 1b76d6c3822b9244782ea385f9b3f20769a2db49 Mon Sep 17 00:00:00 2001 From: Teng Qin Date: Thu, 7 Mar 2019 17:42:45 -0800 Subject: [PATCH 082/135] Fix error message printing in create_kprobe_event (#2259) Fix error message printing in create_kprobe_event --- src/cc/libbpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c index 9b0024dc4..50986493b 100644 --- a/src/cc/libbpf.c +++ b/src/cc/libbpf.c @@ -867,7 +867,7 @@ static int create_kprobe_event(char *buf, const char *ev_name, kfd = open("/sys/kernel/debug/tracing/kprobe_events", O_WRONLY | O_APPEND, 0); if (kfd < 0) { - 
fprintf(stderr, "open(/sys/kernel/debug/tracing/kprobe_events): %s\n", buf, + fprintf(stderr, "open(/sys/kernel/debug/tracing/kprobe_events): %s\n", strerror(errno)); return -1; } From 701bd73a3b1baf6d8b1a35e736522625fa1e47e0 Mon Sep 17 00:00:00 2001 From: Xiaozhou Liu Date: Fri, 8 Mar 2019 13:46:28 +0800 Subject: [PATCH 083/135] enhance tools/tcpaccept (#2254) - add option `-T': include time column on output (HH:MM:SS) - add option `-P PORT': only trace port(s) specified - add RPORT colume on output --- man/man8/tcpaccept.8 | 20 ++++++++++++++-- tools/tcpaccept.py | 48 +++++++++++++++++++++++++++++++------ tools/tcpaccept_example.txt | 29 ++++++++++++---------- 3 files changed, 75 insertions(+), 22 deletions(-) diff --git a/man/man8/tcpaccept.8 b/man/man8/tcpaccept.8 index 837717b80..6e340bd09 100644 --- a/man/man8/tcpaccept.8 +++ b/man/man8/tcpaccept.8 @@ -1,8 +1,8 @@ -.TH tcpaccept 8 "2015-08-25" "USER COMMANDS" +.TH tcpaccept 8 "2019-03-08" "USER COMMANDS" .SH NAME tcpaccept \- Trace TCP passive connections (accept()). Uses Linux eBPF/bcc. .SH SYNOPSIS -.B tcpaccept [\-h] [\-t] [\-x] [\-p PID] +.B tcpaccept [\-h] [\-T] [\-t] [\-p PID] [\-P PORTS] .SH DESCRIPTION This tool traces passive TCP connections (eg, via an accept() syscall; connect() are active connections). This can be useful for general @@ -22,11 +22,17 @@ CONFIG_BPF and bcc. \-h Print usage message. .TP +\-T +Include a time column on output (HH:MM:SS). +.TP \-t Include a timestamp column. .TP \-p PID Trace this process ID only (filtered in-kernel). +.TP +\-P PORTS +Comma-separated list of local ports to trace (filtered in-kernel). .SH EXAMPLES .TP Trace all passive TCP connections (accept()s): @@ -37,11 +43,18 @@ Trace all TCP accepts, and include timestamps: # .B tcpaccept \-t .TP +Trace connections to local ports 80 and 81 only: +# +.B tcpaccept \-P 80,81 +.TP Trace PID 181 only: # .B tcpaccept \-p 181 .SH FIELDS .TP +TIME +Time of the event, in HH:MM:SS format. 
+.TP TIME(s) Time of the event, in seconds. .TP @@ -57,6 +70,9 @@ IP address family (4 or 6) RADDR Remote IP address. .TP +RPORT +Remote port +.TP LADDR Local IP address. .TP diff --git a/tools/tcpaccept.py b/tools/tcpaccept.py index f606b734e..169b0f31d 100755 --- a/tools/tcpaccept.py +++ b/tools/tcpaccept.py @@ -4,7 +4,7 @@ # tcpaccept Trace TCP accept()s. # For Linux, uses BCC, eBPF. Embedded C. # -# USAGE: tcpaccept [-h] [-t] [-p PID] +# USAGE: tcpaccept [-h] [-T] [-t] [-p PID] [-P PORTS] # # This uses dynamic tracing of the kernel inet_csk_accept() socket function # (from tcp_prot.accept), and will need to be modified to match kernel changes. @@ -21,21 +21,27 @@ from struct import pack import argparse from bcc.utils import printb +from time import strftime # arguments examples = """examples: ./tcpaccept # trace all TCP accept()s ./tcpaccept -t # include timestamps + ./tcpaccept -P 80,81 # only trace port 80 and 81 ./tcpaccept -p 181 # only trace PID 181 """ parser = argparse.ArgumentParser( description="Trace TCP accepts", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=examples) +parser.add_argument("-T", "--time", action="store_true", + help="include time column on output (HH:MM:SS)") parser.add_argument("-t", "--timestamp", action="store_true", help="include timestamp on output") parser.add_argument("-p", "--pid", help="trace this PID only") +parser.add_argument("-P", "--port", + help="comma-separated list of local ports to trace") parser.add_argument("--ebpf", action="store_true", help=argparse.SUPPRESS) args = parser.parse_args() @@ -55,6 +61,7 @@ u32 daddr; u64 ip; u16 lport; + u16 dport; char task[TASK_COMM_LEN]; }; BPF_PERF_OUTPUT(ipv4_events); @@ -66,6 +73,7 @@ unsigned __int128 daddr; u64 ip; u16 lport; + u16 dport; char task[TASK_COMM_LEN]; }; BPF_PERF_OUTPUT(ipv6_events); @@ -126,9 +134,13 @@ return 0; // pull in details - u16 family = 0, lport = 0; + u16 family = 0, lport = 0, dport; family = newsk->__sk_common.skc_family; lport = 
newsk->__sk_common.skc_num; + dport = newsk->__sk_common.skc_dport; + dport = ntohs(dport); + + ##FILTER_PORT## if (family == AF_INET) { struct ipv4_data_t data4 = {.pid = pid, .ip = 4}; @@ -136,6 +148,7 @@ data4.saddr = newsk->__sk_common.skc_rcv_saddr; data4.daddr = newsk->__sk_common.skc_daddr; data4.lport = lport; + data4.dport = dport; bpf_get_current_comm(&data4.task, sizeof(data4.task)); ipv4_events.perf_submit(ctx, &data4, sizeof(data4)); @@ -147,6 +160,7 @@ bpf_probe_read(&data6.daddr, sizeof(data6.daddr), &newsk->__sk_common.skc_v6_daddr.in6_u.u6_addr32); data6.lport = lport; + data6.dport = dport; bpf_get_current_comm(&data6.task, sizeof(data6.task)); ipv6_events.perf_submit(ctx, &data6, sizeof(data6)); } @@ -168,9 +182,12 @@ ##FILTER_PID## // pull in details - u16 family = 0, lport = 0; + u16 family = 0, lport = 0, dport; family = args->family; lport = args->sport; + dport = args->dport; + + ##FILTER_PORT## if (family == AF_INET) { struct ipv4_data_t data4 = {.pid = pid, .ip = 4}; @@ -178,6 +195,7 @@ __builtin_memcpy(&data4.saddr, args->saddr, sizeof(data4.saddr)); __builtin_memcpy(&data4.daddr, args->daddr, sizeof(data4.daddr)); data4.lport = lport; + data4.dport = dport; bpf_get_current_comm(&data4.task, sizeof(data4.task)); ipv4_events.perf_submit(args, &data4, sizeof(data4)); } else if (family == AF_INET6) { @@ -186,6 +204,7 @@ __builtin_memcpy(&data6.saddr, args->saddr, sizeof(data6.saddr)); __builtin_memcpy(&data6.daddr, args->daddr, sizeof(data6.daddr)); data6.lport = lport; + data6.dport = dport; bpf_get_current_comm(&data6.task, sizeof(data6.task)); ipv6_events.perf_submit(args, &data6, sizeof(data6)); } @@ -207,35 +226,48 @@ 'if (pid != %s) { return 0; }' % args.pid) else: bpf_text = bpf_text.replace('##FILTER_PID##', '') +if args.port: + lports = [int(lport) for lport in args.port.split(',')] + lports_if = ' && '.join(['lport != %d' % lport for lport in lports]) + bpf_text = bpf_text.replace('##FILTER_PORT##', + 'if (%s) { return 0; }' % 
lports_if) if debug or args.ebpf: print(bpf_text) if args.ebpf: exit() +bpf_text = bpf_text.replace('##FILTER_PORT##', '') + # process event def print_ipv4_event(cpu, data, size): event = b["ipv4_events"].event(data) global start_ts + if args.time: + print("%-9s" % strftime("%H:%M:%S"), end="") if args.timestamp: if start_ts == 0: start_ts = event.ts_us print("%-9.3f" % ((float(event.ts_us) - start_ts) / 1000000), end="") - printb(b"%-6d %-12.12s %-2d %-16s %-16s %-4d" % (event.pid, + printb(b"%-7d %-12.12s %-2d %-16s %-5d %-16s %-5d" % (event.pid, event.task, event.ip, inet_ntop(AF_INET, pack("I", event.daddr)).encode(), + event.dport, inet_ntop(AF_INET, pack("I", event.saddr)).encode(), event.lport)) def print_ipv6_event(cpu, data, size): event = b["ipv6_events"].event(data) global start_ts + if args.time: + print("%-9s" % strftime("%H:%M:%S"), end="") if args.timestamp: if start_ts == 0: start_ts = event.ts_us print("%-9.3f" % ((float(event.ts_us) - start_ts) / 1000000), end="") - printb(b"%-6d %-12.12s %-2d %-16s %-16s %-4d" % (event.pid, + printb(b"%-7d %-12.12s %-2d %-16s %-5d %-16s %-5d" % (event.pid, event.task, event.ip, inet_ntop(AF_INET6, event.daddr).encode(), + event.dport, inet_ntop(AF_INET6, event.saddr).encode(), event.lport)) @@ -243,10 +275,12 @@ def print_ipv6_event(cpu, data, size): b = BPF(text=bpf_text) # header +if args.time: + print("%-9s" % ("TIME"), end="") if args.timestamp: print("%-9s" % ("TIME(s)"), end="") -print("%-6s %-12s %-2s %-16s %-16s %-4s" % ("PID", "COMM", "IP", "RADDR", - "LADDR", "LPORT")) +print("%-7s %-12s %-2s %-16s %-5s %-16s %-5s" % ("PID", "COMM", "IP", "RADDR", + "RPORT", "LADDR", "LPORT")) start_ts = 0 diff --git a/tools/tcpaccept_example.txt b/tools/tcpaccept_example.txt index f86c4392a..2adee452e 100644 --- a/tools/tcpaccept_example.txt +++ b/tools/tcpaccept_example.txt @@ -6,10 +6,10 @@ passive connection via accept(); not connect()). 
Some example output (IP addresses changed to protect the innocent): # ./tcpaccept -PID COMM IP RADDR LADDR LPORT -907 sshd 4 192.168.56.1 192.168.56.102 22 -907 sshd 4 127.0.0.1 127.0.0.1 22 -5389 perl 6 1234:ab12:2040:5020:2299:0:5:0 1234:ab12:2040:5020:2299:0:5:0 7001 +PID COMM IP RADDR RPORT LADDR LPORT +907 sshd 4 192.168.56.1 32324 192.168.56.102 22 +907 sshd 4 127.0.0.1 39866 127.0.0.1 22 +5389 perl 6 1234:ab12:2040:5020:2299:0:5:0 52352 1234:ab12:2040:5020:2299:0:5:0 7001 This output shows three connections, two IPv4 connections to PID 907, an "sshd" process listening on port 22, and one IPv6 connection to a "perl" process @@ -26,26 +26,29 @@ ports will not be shown (those can be traced via other functions). The -t option prints a timestamp column: # ./tcpaccept -t -TIME(s) PID COMM IP RADDR LADDR LPORT -0.000 907 sshd 4 127.0.0.1 127.0.0.1 22 -0.010 5389 perl 6 1234:ab12:2040:5020:2299:0:5:0 1234:ab12:2040:5020:2299:0:5:0 7001 -0.992 907 sshd 4 127.0.0.1 127.0.0.1 22 -1.984 907 sshd 4 127.0.0.1 127.0.0.1 22 +TIME(s) PID COMM IP RADDR RPORT LADDR LPORT +0.000 907 sshd 4 127.0.0.1 53700 127.0.0.1 22 +0.010 5389 perl 6 1234:ab12:2040:5020:2299:0:5:0 40614 1234:ab12:2040:5020:2299:0:5:0 7001 +0.992 907 sshd 4 127.0.0.1 32548 127.0.0.1 22 +1.984 907 sshd 4 127.0.0.1 51250 127.0.0.1 22 USAGE message: # ./tcpaccept -h -usage: tcpaccept [-h] [-t] [-p PID] +usage: tcpaccept [-h] [-T] [-t] [-p PID] [-P PORTS] Trace TCP accepts optional arguments: - -h, --help show this help message and exit - -t, --timestamp include timestamp on output - -p PID, --pid PID trace this PID only + -h, --help show this help message and exit + -T, --time include time column on output (HH:MM:SS) + -t, --timestamp include timestamp on output + -p PID, --pid PID trace this PID only + -P PORTS, --port PORTS comma-separated list of local ports to trace examples: ./tcpaccept # trace all TCP accept()s ./tcpaccept -t # include timestamps + ./tcpaccept -P 80,81 # only trace port 80 and 81 
./tcpaccept -p 181 # only trace PID 181 From 873e93958fc4a3da1997d241c86cb8689c4be408 Mon Sep 17 00:00:00 2001 From: Teng Qin Date: Thu, 7 Mar 2019 21:56:10 -0800 Subject: [PATCH 084/135] Improve PyPerf sample handling and output (#2260) * Add common interface for PyPerf sample handling * Better printing for enum values --- examples/cpp/pyperf/CMakeLists.txt | 2 +- examples/cpp/pyperf/PyPerf.cc | 35 ++++++- examples/cpp/pyperf/PyPerfDefaultPrinter.cc | 106 ++++++++++++++++++++ examples/cpp/pyperf/PyPerfDefaultPrinter.h | 31 ++++++ examples/cpp/pyperf/PyPerfLoggingHelper.h | 2 + examples/cpp/pyperf/PyPerfSampleProcessor.h | 24 +++++ examples/cpp/pyperf/PyPerfType.h | 26 +++++ examples/cpp/pyperf/PyPerfUtil.cc | 71 ++++--------- examples/cpp/pyperf/PyPerfUtil.h | 36 +++---- 9 files changed, 255 insertions(+), 78 deletions(-) create mode 100644 examples/cpp/pyperf/PyPerfDefaultPrinter.cc create mode 100644 examples/cpp/pyperf/PyPerfDefaultPrinter.h create mode 100644 examples/cpp/pyperf/PyPerfSampleProcessor.h diff --git a/examples/cpp/pyperf/CMakeLists.txt b/examples/cpp/pyperf/CMakeLists.txt index 8b8027516..6f963c66b 100644 --- a/examples/cpp/pyperf/CMakeLists.txt +++ b/examples/cpp/pyperf/CMakeLists.txt @@ -5,7 +5,7 @@ include_directories(${CMAKE_SOURCE_DIR}/src/cc) include_directories(${CMAKE_SOURCE_DIR}/src/cc/api) include_directories(${CMAKE_SOURCE_DIR}/src/cc/libbpf/include/uapi) -add_executable(PyPerf PyPerf.cc PyPerfUtil.cc PyPerfBPFProgram.cc PyPerfLoggingHelper.cc Py36Offsets.cc) +add_executable(PyPerf PyPerf.cc PyPerfUtil.cc PyPerfBPFProgram.cc PyPerfLoggingHelper.cc PyPerfDefaultPrinter.cc Py36Offsets.cc) target_link_libraries(PyPerf bcc-static) if(INSTALL_CPP_EXAMPLES) diff --git a/examples/cpp/pyperf/PyPerf.cc b/examples/cpp/pyperf/PyPerf.cc index bee9b59fc..bad2ba098 100644 --- a/examples/cpp/pyperf/PyPerf.cc +++ b/examples/cpp/pyperf/PyPerf.cc @@ -16,10 +16,12 @@ #include #include +#include "PyPerfDefaultPrinter.h" #include "PyPerfLoggingHelper.h" 
#include "PyPerfUtil.h" int main(int argc, char** argv) { + // Argument parsing helpers int pos = 1; auto parseIntArg = [&](std::vector argNames, uint64_t& target) { @@ -45,9 +47,29 @@ int main(int argc, char** argv) { return false; }; + auto parseBoolArg = [&](std::vector argNames, bool& target) { + std::string arg(argv[pos]); + for (const auto& name : argNames) { + if (arg == ("--" + name)) { + target = true; + return true; + } + if (arg == "--no-" + name) { + target = false; + return true; + } + } + return false; + }; + + // Default argument values uint64_t sampleRate = 1000000; uint64_t durationMs = 1000; uint64_t verbosityLevel = 0; + bool showGILState = true; + bool showThreadState = true; + bool showPthreadIDState = false; + while (true) { if (pos >= argc) { break; @@ -56,6 +78,10 @@ int main(int argc, char** argv) { found = found || parseIntArg({"-c", "--sample-rate"}, sampleRate); found = found || parseIntArg({"-d", "--duration"}, durationMs); found = found || parseIntArg({"-v", "--verbose"}, verbosityLevel); + found = found || parseBoolArg({"show-gil-state"}, showGILState); + found = found || parseBoolArg({"show-thread-state"}, showThreadState); + found = + found || parseBoolArg({"show-pthread-id-state"}, showPthreadIDState); if (!found) { std::fprintf(stderr, "Unexpected argument: %s\n", argv[pos]); std::exit(1); @@ -66,10 +92,17 @@ int main(int argc, char** argv) { ebpf::pyperf::setVerbosity(verbosityLevel); ebpf::pyperf::logInfo(1, "Profiling Sample Rate: %" PRIu64 "\n", sampleRate); ebpf::pyperf::logInfo(1, "Profiling Duration: %" PRIu64 "ms\n", durationMs); + ebpf::pyperf::logInfo(1, "Showing GIL state: %d\n", showGILState); + ebpf::pyperf::logInfo(1, "Showing Thread state: %d\n", showThreadState); + ebpf::pyperf::logInfo(1, "Showing Pthread ID state: %d\n", + showPthreadIDState); ebpf::pyperf::PyPerfUtil util; util.init(); - util.profile(sampleRate, durationMs); + + ebpf::pyperf::PyPerfDefaultPrinter printer(showGILState, showThreadState, + 
showPthreadIDState); + util.profile(sampleRate, durationMs, &printer); return 0; } diff --git a/examples/cpp/pyperf/PyPerfDefaultPrinter.cc b/examples/cpp/pyperf/PyPerfDefaultPrinter.cc new file mode 100644 index 000000000..22ec2c327 --- /dev/null +++ b/examples/cpp/pyperf/PyPerfDefaultPrinter.cc @@ -0,0 +1,106 @@ +/* + * Copyright (c) Facebook, Inc. + * Licensed under the Apache License, Version 2.0 (the "License") + */ + +#include +#include + +#include "PyPerfDefaultPrinter.h" +#include "PyPerfUtil.h" + +namespace ebpf { +namespace pyperf { + +const static std::string kLostSymbol = "[Lost Symbol]"; +const static std::string kIncompleteStack = "[Truncated Stack]"; +const static std::string kErrorStack = "[Stack Error]"; +const static std::string kNonPythonStack = "[Non-Python Code]"; + +const static std::map kGILStateValues = { + {GIL_STATE_NO_INFO, "No GIL Info"}, + {GIL_STATE_ERROR, "Error Reading GIL State"}, + {GIL_STATE_UNINITIALIZED, "GIL Uninitialized"}, + {GIL_STATE_NOT_LOCKED, "GIL Not Locked"}, + {GIL_STATE_THIS_THREAD, "GIL on This Thread"}, + {GIL_STATE_GLOBAL_CURRENT_THREAD, + "GIL on Global _PyThreadState_Current Thread"}, + {GIL_STATE_OTHER_THREAD, "GIL on Unexpected Thread"}, + {GIL_STATE_NULL, "GIL State Empty"}}; + +const static std::map kThreadStateValues = { + {THREAD_STATE_UNKNOWN, "ThreadState Unknown"}, + {THREAD_STATE_MATCH, "TLS ThreadState is Global _PyThreadState_Current"}, + {THREAD_STATE_MISMATCH, + "TLS ThreadState is not Global _PyThreadState_Current"}, + {THREAD_STATE_THIS_THREAD_NULL, "TLS ThreadState is NULL"}, + {THREAD_STATE_GLOBAL_CURRENT_THREAD_NULL, + "Global _PyThreadState_Current is NULL"}, + {THREAD_STATE_BOTH_NULL, + "Both TLS ThreadState and Global _PyThreadState_Current is NULL"}, +}; + +const static std::map kPthreadIDStateValues = { + {PTHREAD_ID_UNKNOWN, "Pthread ID Unknown"}, + {PTHREAD_ID_MATCH, "System Pthread ID is Python ThreadState Pthread ID"}, + {PTHREAD_ID_MISMATCH, + "System Pthread ID is not Python 
ThreadState Pthread ID"}, + {PTHREAD_ID_THREAD_STATE_NULL, "No Pthread ID: TLS ThreadState is NULL"}, + {PTHREAD_ID_NULL, "Pthread ID on TLS ThreadState is NULL"}, + {PTHREAD_ID_ERROR, "Error Reading System Pthread ID"}}; + +void PyPerfDefaultPrinter::processSamples( + const std::vector& samples, PyPerfUtil* util) { + auto symbols = util->getSymbolMapping(); + uint32_t lostSymbols = 0; + uint32_t truncatedStack = 0; + + for (auto& sample : samples) { + if (sample.threadStateMatch != THREAD_STATE_THIS_THREAD_NULL && + sample.threadStateMatch != THREAD_STATE_BOTH_NULL) { + for (const auto stackId : sample.pyStackIds) { + auto symbIt = symbols.find(stackId); + if (symbIt != symbols.end()) { + std::printf(" %s\n", symbIt->second.c_str()); + } else { + std::printf(" %s\n", kLostSymbol.c_str()); + lostSymbols++; + } + } + switch (sample.stackStatus) { + case STACK_STATUS_TRUNCATED: + std::printf(" %s\n", kIncompleteStack.c_str()); + truncatedStack++; + break; + case STACK_STATUS_ERROR: + std::printf(" %s\n", kErrorStack.c_str()); + break; + default: + break; + } + } else { + std::printf(" %s\n", kNonPythonStack.c_str()); + } + + std::printf("PID: %d TID: %d (%s)\n", sample.pid, sample.tid, + sample.comm.c_str()); + if (showGILState_) + std::printf("GIL State: %s\n", kGILStateValues.at(sample.gilState)); + if (showThreadState_) + std::printf("Thread State: %s\n", + kThreadStateValues.at(sample.threadStateMatch)); + if (showPthreadIDState_) + std::printf("Pthread ID State: %s\n", + kPthreadIDStateValues.at(sample.pthreadIDMatch)); + + std::printf("\n"); + } + + std::printf("%d samples collected\n", util->getTotalSamples()); + std::printf("%d samples lost\n", util->getLostSamples()); + std::printf("%d samples with truncated stack\n", truncatedStack); + std::printf("%d times Python symbol lost\n", lostSymbols); +} + +} // namespace pyperf +} // namespace ebpf diff --git a/examples/cpp/pyperf/PyPerfDefaultPrinter.h b/examples/cpp/pyperf/PyPerfDefaultPrinter.h new file mode 
100644 index 000000000..89c8153e2 --- /dev/null +++ b/examples/cpp/pyperf/PyPerfDefaultPrinter.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) Facebook, Inc. + * Licensed under the Apache License, Version 2.0 (the "License") + */ + +#pragma once + +#include "PyPerfSampleProcessor.h" + +namespace ebpf { +namespace pyperf { + +class PyPerfDefaultPrinter : public PyPerfSampleProcessor { + public: + PyPerfDefaultPrinter(bool showGILState, bool showThreadState, + bool showPthreadIDState) + : showGILState_(showGILState), + showThreadState_(showThreadState), + showPthreadIDState_(showPthreadIDState) {} + + void processSamples(const std::vector& samples, + PyPerfUtil* util) override; + + private: + bool showGILState_; + bool showThreadState_; + bool showPthreadIDState_; +}; + +} // namespace pyperf +} // namespace ebpf diff --git a/examples/cpp/pyperf/PyPerfLoggingHelper.h b/examples/cpp/pyperf/PyPerfLoggingHelper.h index d08d93e26..c10166601 100644 --- a/examples/cpp/pyperf/PyPerfLoggingHelper.h +++ b/examples/cpp/pyperf/PyPerfLoggingHelper.h @@ -3,6 +3,8 @@ * Licensed under the Apache License, Version 2.0 (the "License") */ +#pragma once + #include namespace ebpf { diff --git a/examples/cpp/pyperf/PyPerfSampleProcessor.h b/examples/cpp/pyperf/PyPerfSampleProcessor.h new file mode 100644 index 000000000..5f2fe5e66 --- /dev/null +++ b/examples/cpp/pyperf/PyPerfSampleProcessor.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) Facebook, Inc. 
+ * Licensed under the Apache License, Version 2.0 (the "License") + */ + +#pragma once + +#include + +#include "PyPerfType.h" + +namespace ebpf { +namespace pyperf { + +class PyPerfUtil; + +class PyPerfSampleProcessor { + public: + virtual void processSamples(const std::vector& samples, + PyPerfUtil* util) = 0; +}; + +} // namespace pyperf +} // namespace ebpf diff --git a/examples/cpp/pyperf/PyPerfType.h b/examples/cpp/pyperf/PyPerfType.h index 9a54e9e0c..7df07c70d 100644 --- a/examples/cpp/pyperf/PyPerfType.h +++ b/examples/cpp/pyperf/PyPerfType.h @@ -3,7 +3,12 @@ * Licensed under the Apache License, Version 2.0 (the "License") */ +#pragma once + +#include #include +#include +#include #define PYTHON_STACK_FRAMES_PER_PROG 25 #define PYTHON_STACK_PROG_CNT 3 @@ -99,5 +104,26 @@ typedef struct { int32_t stack[STACK_MAX_LEN]; } Event; +struct PyPerfSample { + pid_t pid; + pid_t tid; + std::string comm; + uint8_t threadStateMatch; + uint8_t gilState; + uint8_t pthreadIDMatch; + uint8_t stackStatus; + std::vector pyStackIds; + + explicit PyPerfSample(const Event* raw, int rawSize) + : pid(raw->pid), + tid(raw->tid), + comm(raw->comm), + threadStateMatch(raw->thread_state_match), + gilState(raw->gil_state), + pthreadIDMatch(raw->pthread_id_match), + stackStatus(raw->stack_status), + pyStackIds(raw->stack, raw->stack + raw->stack_len) {} +}; + } // namespace pyperf } // namespace ebpf diff --git a/examples/cpp/pyperf/PyPerfUtil.cc b/examples/cpp/pyperf/PyPerfUtil.cc index d4390831b..252a0fed5 100644 --- a/examples/cpp/pyperf/PyPerfUtil.cc +++ b/examples/cpp/pyperf/PyPerfUtil.cc @@ -9,7 +9,6 @@ #include #include #include -#include #include #include @@ -29,10 +28,6 @@ namespace pyperf { extern OffsetConfig kPy36OffsetConfig; extern std::string PYPERF_BPF_PROGRAM; -const static std::string kLostSymbol = "[Lost Symbol]"; -const static std::string kIncompleteStack = "[Truncated Stack]"; -const static std::string kErrorStack = "[Stack Error]"; -const static std::string 
kNonPythonStack = "[Non-Python Code]"; const static int kPerfBufSizePages = 32; const static std::string kPidCfgTableName("pid_config"); @@ -107,7 +102,8 @@ int findPythonPathCallback(const char* name, uint64_t st, uint64_t en, uint64_t, file = file.substr(pos + 1); } if (file.find(kPy36LibName) == 0) { - logInfo(1, "Found Python library %s loaded at %lx-%lx for PID %d\n", name, st, en, helper->pid); + logInfo(1, "Found Python library %s loaded at %lx-%lx for PID %d\n", name, + st, en, helper->pid); helper->found = true; helper->st = st; helper->en = en; @@ -239,7 +235,8 @@ void PyPerfUtil::handleSample(const void* data, int dataSize) { void PyPerfUtil::handleLostSamples(int lostCnt) { lostSamples_ += lostCnt; } PyPerfUtil::PyPerfResult PyPerfUtil::profile(int64_t sampleRate, - int64_t durationMs) { + int64_t durationMs, + PyPerfSampleProcessor* processor) { if (!initCompleted_) { std::fprintf(stderr, "PyPerfUtil::init not invoked or failed\n"); return PyPerfResult::NO_INIT; @@ -285,7 +282,12 @@ PyPerfUtil::PyPerfResult PyPerfUtil::profile(int64_t sampleRate, } logInfo(2, "Finished draining remaining samples\n"); - // Get symbol names and output samples + processor->processSamples(samples_, this); + + return PyPerfResult::SUCCESS; +} + +std::unordered_map PyPerfUtil::getSymbolMapping() { auto symbolTable = bpf_.get_hash_table("symbols"); std::unordered_map symbols; for (auto& x : symbolTable.get_table_offline()) { @@ -294,47 +296,7 @@ PyPerfUtil::PyPerfResult PyPerfUtil::profile(int64_t sampleRate, symbols.emplace(x.second, std::move(symbolName)); } logInfo(1, "Total %d unique Python symbols\n", symbols.size()); - - for (auto& sample : samples_) { - if (sample.threadStateMatch != THREAD_STATE_THIS_THREAD_NULL && - sample.threadStateMatch != THREAD_STATE_BOTH_NULL) { - for (const auto stackId : sample.pyStackIds) { - auto symbIt = symbols.find(stackId); - if (symbIt != symbols.end()) { - std::printf(" %s\n", symbIt->second.c_str()); - } else { - std::printf(" %s\n", 
kLostSymbol.c_str()); - lostSymbols_++; - } - } - switch (sample.stackStatus) { - case STACK_STATUS_TRUNCATED: - std::printf(" %s\n", kIncompleteStack.c_str()); - truncatedStack_++; - break; - case STACK_STATUS_ERROR: - std::printf(" %s\n", kErrorStack.c_str()); - break; - default: - break; - } - } else { - std::printf(" %s\n", kNonPythonStack.c_str()); - } - - std::printf("PID: %d TID: %d (%s)\n", sample.pid, sample.tid, - sample.comm.c_str()); - std::printf("GIL State: %d Thread State: %d PthreadID Match State: %d\n\n", - sample.threadStateMatch, sample.gilState, - sample.pthreadIDMatch); - } - - logInfo(0, "%d samples collected\n", totalSamples_); - logInfo(0, "%d samples lost\n", lostSamples_); - logInfo(0, "%d samples with truncated stack\n", truncatedStack_); - logInfo(0, "%d times Python symbol lost\n", lostSymbols_); - - return PyPerfResult::SUCCESS; + return symbols; } std::string PyPerfUtil::getSymbolName(Symbol& sym) const { @@ -378,18 +340,23 @@ bool PyPerfUtil::tryTargetPid(int pid, PidData& data) { } if (!getAddrOfPythonBinary(path, data)) { - std::fprintf(stderr, "Failed getting addresses in potential Python library in PID %d\n", pid); + std::fprintf( + stderr, + "Failed getting addresses in potential Python library in PID %d\n", + pid); return false; } data.offsets = kPy36OffsetConfig; data.current_state_addr += helper.st; - logInfo(2, "PID %d has _PyThreadState_Current at %lx\n", pid, data.current_state_addr); + logInfo(2, "PID %d has _PyThreadState_Current at %lx\n", pid, + data.current_state_addr); data.tls_key_addr += helper.st; logInfo(2, "PID %d has autoTLSKey at %lx\n", pid, data.current_state_addr); data.gil_locked_addr += helper.st; logInfo(2, "PID %d has gil_locked at %lx\n", pid, data.current_state_addr); data.gil_last_holder_addr += helper.st; - logInfo(2, "PID %d has gil_last_holder at %lx\n", pid, data.current_state_addr); + logInfo(2, "PID %d has gil_last_holder at %lx\n", pid, + data.current_state_addr); return true; } diff --git 
a/examples/cpp/pyperf/PyPerfUtil.h b/examples/cpp/pyperf/PyPerfUtil.h index 3e69a292e..c3396f407 100644 --- a/examples/cpp/pyperf/PyPerfUtil.h +++ b/examples/cpp/pyperf/PyPerfUtil.h @@ -6,12 +6,14 @@ #pragma once #include +#include #include #include #include #include "BPF.h" +#include "PyPerfSampleProcessor.h" #include "PyPerfType.h" namespace ebpf { @@ -28,37 +30,23 @@ class PyPerfUtil { EVENT_DETACH_FAIL }; - struct Sample { - pid_t pid; - pid_t tid; - std::string comm; - uint8_t threadStateMatch; - uint8_t gilState; - uint8_t pthreadIDMatch; - uint8_t stackStatus; - std::vector pyStackIds; - - explicit Sample(const Event* raw, int rawSize) - : pid(raw->pid), - tid(raw->tid), - comm(raw->comm), - threadStateMatch(raw->thread_state_match), - gilState(raw->gil_state), - pthreadIDMatch(raw->pthread_id_match), - stackStatus(raw->stack_status), - pyStackIds(raw->stack, raw->stack + raw->stack_len) {} - }; - // init must be invoked exactly once before invoking profile PyPerfResult init(); - PyPerfResult profile(int64_t sampleRate, int64_t durationMs); + PyPerfResult profile(int64_t sampleRate, int64_t durationMs, + PyPerfSampleProcessor* processor); + + std::unordered_map getSymbolMapping(); + + uint32_t getTotalSamples() const { return totalSamples_; } + + uint32_t getLostSamples() const { return lostSamples_; } private: - uint32_t lostSymbols_ = 0, totalSamples_ = 0, lostSamples_ = 0, truncatedStack_ = 0; + uint32_t totalSamples_ = 0, lostSamples_ = 0; ebpf::BPF bpf_{0, nullptr, false, "", true}; - std::vector samples_; + std::vector samples_; bool initCompleted_{false}; void handleSample(const void* data, int dataSize); From c2b371d56b8bbaf9c7d01b88830193ecd1ee4e12 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Fri, 8 Mar 2019 11:39:42 -0500 Subject: [PATCH 085/135] filetop: Avoid missing important entries While testing, it is noticed that sometimes filetop can miss certain heavy-weight activity. This is because it sorts by only a single column. 
So for example, if writes are flooding, then it will be way down in the filetop list and not displayed at all for the default maxrows of 20 and default sort of rbytes. It is better instead to sort by all columns, by default. This will help catch them. Test: Start a bash and do: while true; do echo "asdfasdf" >> testfile; done Run filetop. Without patch no activity is displayed due to other small reads on the system. With the path they are. Fixes issue: https://github.com/iovisor/bcc/issues/2252 Signed-off-by: Joel Fernandes (Google) --- tools/filetop.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/tools/filetop.py b/tools/filetop.py index 03c01f413..ccc5a1079 100755 --- a/tools/filetop.py +++ b/tools/filetop.py @@ -38,8 +38,8 @@ help="don't clear the screen") parser.add_argument("-r", "--maxrows", default=20, help="maximum rows to print, default 20") -parser.add_argument("-s", "--sort", default="rbytes", - choices=["reads", "writes", "rbytes", "wbytes"], +parser.add_argument("-s", "--sort", default="all", + choices=["all", "reads", "writes", "rbytes", "wbytes"], help="sort column, default rbytes") parser.add_argument("-p", "--pid", type=int, metavar="PID", dest="tgid", help="trace this PID only") @@ -166,6 +166,12 @@ def signal_ignore(signal_value, frame): print('Tracing... Output every %d secs. Hit Ctrl-C to end' % interval) +def sort_fn(counts): + if args.sort == "all": + return (counts[1].rbytes + counts[1].wbytes + counts[1].reads + counts[1].writes) + else: + return getattr(counts[1], args.sort) + # output exiting = 0 while 1: @@ -188,8 +194,7 @@ def signal_ignore(signal_value, frame): counts = b.get_table("counts") line = 0 for k, v in reversed(sorted(counts.items(), - key=lambda counts: - getattr(counts[1], args.sort))): + key=sort_fn)): name = k.name.decode('utf-8', 'replace') if k.name_len > DNAME_INLINE_LEN: name = name[:-3] + "..." 
From 6d147e65ce1dba22ec49cd10fe067430bc04b876 Mon Sep 17 00:00:00 2001 From: Zwb Date: Sun, 10 Mar 2019 01:59:53 +0800 Subject: [PATCH 086/135] add a new bcc tool drsnoop (#2220) Trace direct reclaim and print details including issuing PID and number of system free pages in verbose mode. --- README.md | 1 + man/man8/drsnoop.8 | 110 +++++++++++++++ tests/python/test_tools_smoke.py | 4 + tools/drsnoop.py | 235 +++++++++++++++++++++++++++++++ tools/drsnoop_example.txt | 149 ++++++++++++++++++++ 5 files changed, 499 insertions(+) create mode 100644 man/man8/drsnoop.8 create mode 100755 tools/drsnoop.py create mode 100644 tools/drsnoop_example.txt diff --git a/README.md b/README.md index b09c380da..595971431 100644 --- a/README.md +++ b/README.md @@ -103,6 +103,7 @@ pair of .c and .py files, and some are directories of files. - tools/[dcsnoop](tools/dcsnoop.py): Trace directory entry cache (dcache) lookups. [Examples](tools/dcsnoop_example.txt). - tools/[dcstat](tools/dcstat.py): Directory entry cache (dcache) stats. [Examples](tools/dcstat_example.txt). - tools/[deadlock](tools/deadlock.py): Detect potential deadlocks on a running process. [Examples](tools/deadlock_example.txt). +- tools/[drsnoop](tools/drsnoop.py): Trace direct reclaim events with PID and latency. [Examples](tools/drsnoop_example.txt). - tools/[execsnoop](tools/execsnoop.py): Trace new processes via exec() syscalls. [Examples](tools/execsnoop_example.txt). - tools/[ext4dist](tools/ext4dist.py): Summarize ext4 operation latency distribution as a histogram. [Examples](tools/ext4dist_example.txt). - tools/[ext4slower](tools/ext4slower.py): Trace slow ext4 operations. [Examples](tools/ext4slower_example.txt). diff --git a/man/man8/drsnoop.8 b/man/man8/drsnoop.8 new file mode 100644 index 000000000..98e27e56f --- /dev/null +++ b/man/man8/drsnoop.8 @@ -0,0 +1,110 @@ +.TH drsnoop 8 "2019-02-20" "USER COMMANDS" +.SH NAME +drsnoop \- Trace direct reclaim events. Uses Linux eBPF/bcc. 
+.SH SYNOPSIS +.B drsnoop.py [\-h] [\-T] [\-U] [\-p PID] [\-t TID] [\-u UID] [\-d DURATION] [-n name] [-v] +.SH DESCRIPTION +drsnoop trace direct reclaim events, showing which processes are allocing pages +with direct reclaiming. This can be useful for discovering when allocstall (/p- +roc/vmstat) continues to increase, whether it is caused by some critical proc- +esses or not. + +This works by tracing the direct reclaim events using kernel tracepoints. + +This makes use of a Linux 4.5 feature (bpf_perf_event_output()); +for kernels older than 4.5, see the version under tools/old, +which uses an older mechanism. + +Since this uses BPF, only the root user can use this tool. +.SH REQUIREMENTS +CONFIG_BPF and bcc. +.SH OPTIONS +.TP +\-h +Print usage message. +.TP +\-T +Include a timestamp column. +.TP +\-U +Show UID. +.TP +\-p PID +Trace this process ID only (filtered in-kernel). +.TP +\-t TID +Trace this thread ID only (filtered in-kernel). +.TP +\-u UID +Trace this UID only (filtered in-kernel). +.TP +\-d DURATION +Total duration of trace in seconds. +.TP +\-n name +Only print processes where its name partially matches 'name' +\-v verbose +Run in verbose mode. Will output system memory state +.SH EXAMPLES +.TP +Trace all direct reclaim events: +# +.B drsnoop +.TP +Trace all direct reclaim events, for 10 seconds only: +# +.B drsnoop -d 10 +.TP +Trace all direct reclaim events, and include timestamps: +# +.B drsnoop \-T +.TP +Show UID: +# +.B drsnoop \-U +.TP +Trace PID 181 only: +# +.B drsnoop \-p 181 +.TP +Trace UID 1000 only: +# +.B drsnoop \-u 1000 +.TP +Trace all direct reclaim events from processes where its name partially match- +es 'mond': +# +.B drnsnoop \-n mond +.SH FIELDS +.TP +TIME(s) +Time of the call, in seconds. +.TP +UID +User ID +.TP +PID +Process ID +.TP +TID +Thread ID +.TP +COMM +Process name +.SH OVERHEAD +This traces the kernel direct reclaim tracepoints and prints output for each +event. 
As the rate of this is generally expected to be low (< 1000/s), the +overhead is also expected to be negligible. +.SH SOURCE +This is from bcc. +.IP +https://github.com/iovisor/bcc +.PP +Also look in the bcc distribution for a companion _examples.txt file containing +example usage, output, and commentary for this tool. +.SH OS +Linux +.SH STABILITY +Unstable - in development. +.SH AUTHOR +Ethercflow diff --git a/tests/python/test_tools_smoke.py b/tests/python/test_tools_smoke.py index 13667d909..e1b10ae28 100755 --- a/tests/python/test_tools_smoke.py +++ b/tests/python/test_tools_smoke.py @@ -144,6 +144,10 @@ def test_deadlock(self): # self.run_with_int("deadlock.py $(pgrep -n bash)", timeout=10) pass + @skipUnless(kernel_version_ge(4,7), "requires kernel >= 4.7") + def test_drsnoop(self): + self.run_with_int("drsnoop.py") + @skipUnless(kernel_version_ge(4,8), "requires kernel >= 4.8") def test_execsnoop(self): self.run_with_int("execsnoop.py") diff --git a/tools/drsnoop.py b/tools/drsnoop.py new file mode 100755 index 000000000..c77f52064 --- /dev/null +++ b/tools/drsnoop.py @@ -0,0 +1,235 @@ +#!/usr/bin/python +# @lint-avoid-python-3-compatibility-imports +# +# drsnoop Trace direct reclaim and print details including issuing PID. +# For Linux, uses BCC, eBPF. +# +# This uses in-kernel eBPF maps to cache process details (PID and comm) by +# direct reclaim begin, as well as a starting timestamp for calculating +# latency. +# +# Copyright (c) 2019 Ethercflow +# Licensed under the Apache License, Version 2.0 (the "License") +# +# 20-Feb-2019 Ethercflow Created this. +# 09-Mar-2019 Ethercflow Updated for show sys mem info. 
+ +from __future__ import print_function +from bcc import ArgString, BPF +import argparse +from datetime import datetime, timedelta +import os +import math + +# symbols +kallsyms = "/proc/kallsyms" + +# arguments +examples = """examples: + ./drsnoop # trace all direct reclaim + ./drsnoop -T # include timestamps + ./drsnoop -U # include UID + ./drsnoop -P 181 # only trace PID 181 + ./drsnoop -t 123 # only trace TID 123 + ./drsnoop -u 1000 # only trace UID 1000 + ./drsnoop -d 10 # trace for 10 seconds only + ./drsnoop -n main # only print process names containing "main" +""" +parser = argparse.ArgumentParser( + description="Trace direct reclaim", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=examples) +parser.add_argument("-T", "--timestamp", action="store_true", + help="include timestamp on output") +parser.add_argument("-U", "--print-uid", action="store_true", + help="print UID column") +parser.add_argument("-p", "--pid", + help="trace this PID only") +parser.add_argument("-t", "--tid", + help="trace this TID only") +parser.add_argument("-u", "--uid", + help="trace this UID only") +parser.add_argument("-d", "--duration", + help="total duration of trace in seconds") +parser.add_argument("-n", "--name", + type=ArgString, + help="only print process names containing this name") +parser.add_argument("-v", "--verbose", action="store_true", + help="show system memory state") +parser.add_argument("--ebpf", action="store_true", + help=argparse.SUPPRESS) +args = parser.parse_args() +debug = 0 +if args.duration: + args.duration = timedelta(seconds=int(args.duration)) + + +# vm_stat +vm_stat_addr = '' +with open(kallsyms) as syms: + for line in syms: + (addr, size, name) = line.rstrip().split(" ", 2) + name = name.split("\t")[0] + if name == "vm_stat": + vm_stat_addr = "0x" + addr + break + if name == "vm_zone_stat": + vm_stat_addr = "0x" + addr + break + if vm_stat_addr == '': + print("ERROR: no vm_stat or vm_zone_stat in /proc/kallsyms. 
Exiting.") + print("HINT: the kernel should be built with CONFIG_KALLSYMS_ALL.") + exit() + +NR_FREE_PAGES = 0 + +PAGE_SIZE = os.sysconf("SC_PAGE_SIZE") +PAGE_SHIFT = int(math.log(PAGE_SIZE) / math.log(2)) + +def K(x): + return x << (PAGE_SHIFT - 10) + +# load BPF program +bpf_text = """ +#include +#include +#include + +struct val_t { + u64 id; + u64 ts; // start time + char name[TASK_COMM_LEN]; + u64 vm_stat[NR_VM_ZONE_STAT_ITEMS]; +}; + +struct data_t { + u64 id; + u32 uid; + u64 nr_reclaimed; + u64 delta; + u64 ts; // end time + char name[TASK_COMM_LEN]; + u64 vm_stat[NR_VM_ZONE_STAT_ITEMS]; +}; + +BPF_HASH(start, u64, struct val_t); +BPF_PERF_OUTPUT(events); + +TRACEPOINT_PROBE(vmscan, mm_vmscan_direct_reclaim_begin) { + struct val_t val = {}; + u64 id = bpf_get_current_pid_tgid(); + u32 pid = id >> 32; // PID is higher part + u32 tid = id; // Cast and get the lower part + u32 uid = bpf_get_current_uid_gid(); + u64 ts; + + PID_TID_FILTER + UID_FILTER + if (bpf_get_current_comm(&val.name, sizeof(val.name)) == 0) { + val.id = id; + val.ts = bpf_ktime_get_ns(); + bpf_probe_read(&val.vm_stat, sizeof(val.vm_stat), (const void *)%s); + start.update(&id, &val); + } + return 0; +} + +TRACEPOINT_PROBE(vmscan, mm_vmscan_direct_reclaim_end) { + u64 id = bpf_get_current_pid_tgid(); + struct val_t *valp; + struct data_t data = {}; + u64 ts = bpf_ktime_get_ns(); + + valp = start.lookup(&id); + if (valp == NULL) { + // missed entry + return 0; + } + + data.delta = ts - valp->ts; + data.ts = ts / 1000; + data.id = valp->id; + data.uid = bpf_get_current_uid_gid(); + bpf_probe_read(&data.name, sizeof(data.name), valp->name); + bpf_probe_read(&data.vm_stat, sizeof(data.vm_stat), valp->vm_stat); + data.nr_reclaimed = args->nr_reclaimed; + + events.perf_submit(args, &data, sizeof(data)); + start.delete(&id); + + return 0; +} +""" % vm_stat_addr + +if args.tid: # TID trumps PID + bpf_text = bpf_text.replace('PID_TID_FILTER', + 'if (tid != %s) { return 0; }' % args.tid) +elif 
args.pid: + bpf_text = bpf_text.replace('PID_TID_FILTER', + 'if (pid != %s) { return 0; }' % args.pid) +else: + bpf_text = bpf_text.replace('PID_TID_FILTER', '') +if args.uid: + bpf_text = bpf_text.replace('UID_FILTER', + 'if (uid != %s) { return 0; }' % args.uid) +else: + bpf_text = bpf_text.replace('UID_FILTER', '') +if debug or args.ebpf: + print(bpf_text) + if args.ebpf: + exit() + +# initialize BPF +b = BPF(text=bpf_text) + +initial_ts = 0 + +# header +if args.timestamp: + print("%-14s" % ("TIME(s)"), end="") +if args.print_uid: + print("%-6s" % ("UID"), end="") +print("%-14s %-6s %8s %5s" % + ("COMM", "TID" if args.tid else "PID", "LAT(ms)", "PAGES"), end="") +if args.verbose: + print("%10s" % ("FREE(KB)")) +else: + print("") + +# process event +def print_event(cpu, data, size): + event = b["events"].event(data) + + global initial_ts + + if not initial_ts: + initial_ts = event.ts + + if args.name and bytes(args.name) not in event.name: + return + + if args.timestamp: + delta = event.ts - initial_ts + print("%-14.9f" % (float(delta) / 1000000), end="") + + if args.print_uid: + print("%-6d" % event.uid, end="") + + print("%-14.14s %-6s %8.2f %5d" % + (event.name.decode('utf-8', 'replace'), + event.id & 0xffffffff if args.tid else event.id >> 32, + float(event.delta) / 1000000, event.nr_reclaimed), end="") + if args.verbose: + print("%10d" % K(event.vm_stat[NR_FREE_PAGES])) + else: + print("") + + +# loop with callback to print_event +b["events"].open_perf_buffer(print_event, page_cnt=64) +start_time = datetime.now() +while not args.duration or datetime.now() - start_time < args.duration: + try: + b.perf_buffer_poll() + except KeyboardInterrupt: + exit() diff --git a/tools/drsnoop_example.txt b/tools/drsnoop_example.txt new file mode 100644 index 000000000..3171ef266 --- /dev/null +++ b/tools/drsnoop_example.txt @@ -0,0 +1,149 @@ +Demonstrations of drsnoop, the Linux eBPF/bcc version. 
+ + +drsnoop traces the direct reclaim system-wide, and prints various details. +Example output: + +# ./drsnoop +COMM PID LAT(ms) PAGES +summond 17678 0.19 143 +summond 17669 0.55 313 +summond 17669 0.15 145 +summond 17669 0.27 237 +summond 17669 0.48 111 +summond 17669 0.16 75 +head 17821 0.29 339 +head 17825 0.17 109 +summond 17669 0.14 73 +summond 17496 104.84 40 +summond 17678 0.32 167 +summond 17678 0.14 106 +summond 17678 0.16 67 +summond 17678 0.29 267 +summond 17678 0.27 69 +summond 17678 0.32 46 +base64 17816 0.16 85 +summond 17678 0.43 283 +summond 17678 0.14 182 +head 17736 0.57 135 +^C + +While tracing, the processes alloc pages,due to insufficient memory available +in the system, direct reclaim events happened, which will increase the waiting +delay of the processes. + +drsnoop can be useful for discovering when allocstall(/proc/vmstat) continues to increase, +whether it is caused by some critical processes or not. + +The -p option can be used to filter on a PID, which is filtered in-kernel. Here +I've used it with -T to print timestamps: + +# ./drsnoop -Tp +TIME(s) COMM PID LAT(ms) PAGES +107.364115000 summond 17491 0.24 50 +107.364550000 summond 17491 0.26 38 +107.365266000 summond 17491 0.36 72 +107.365753000 summond 17491 0.22 49 +^C + +This shows the summond process allocs pages, and direct reclaim events happening, +and the delays are not affected much. 
The -U option includes the UID in the output:
+ + +The -v option can be used to show system memory state (now only free mem) at +the beginning of direct reclaiming: + +# ./drsnoop.py -v +COMM PID LAT(ms) PAGES FREE(KB) +base64 34924 0.23 151 86260 +base64 34962 0.26 149 86260 +head 34931 0.24 150 86260 +base64 34902 0.19 148 86260 +head 34963 0.19 151 86228 +base64 34959 0.17 151 86228 +head 34965 0.29 190 86228 +base64 34957 0.24 152 86228 +summond 34870 0.15 151 86080 +summond 34870 0.12 115 86184 + +USAGE message: + +# ./drsnoop -h +usage: drsnoop.py [-h] [-T] [-U] [-p PID] [-t TID] [-u UID] [-d DURATION] + [-n NAME] + +Trace direct reclaim + +optional arguments: + -h, --help show this help message and exit + -T, --timestamp include timestamp on output + -U, --print-uid print UID column + -p PID, --pid PID trace this PID only + -t TID, --tid TID trace this TID only + -u UID, --uid UID trace this UID only + -d DURATION, --duration DURATION + total duration of trace in seconds + -n NAME, --name NAME only print process names containing this name + +examples: + ./drsnoop # trace all direct reclaim + ./drsnoop -T # include timestamps + ./drsnoop -U # include UID + ./drsnoop -P 181 # only trace PID 181 + ./drsnoop -t 123 # only trace TID 123 + ./drsnoop -u 1000 # only trace UID 1000 + ./drsnoop -d 10 # trace for 10 seconds only + ./drsnoop -n main # only print process names containing "main" From ba41501bb2ca89312061b31c08e570a11c092370 Mon Sep 17 00:00:00 2001 From: mephi42 Date: Tue, 12 Mar 2019 07:02:56 +0100 Subject: [PATCH 087/135] Add basic USDT support for s390x (#2266) Approach and code shamelessly borrowed from "422db709: Add basic USDT support for powerpc64". 
--- src/cc/usdt.h | 7 +++++ src/cc/usdt/usdt.cc | 2 ++ src/cc/usdt/usdt_args.cc | 53 ++++++++++++++++++++++++++++++++++++++ tests/cc/test_usdt_args.cc | 46 +++++++++++++++++++++++++++++++++ 4 files changed, 108 insertions(+) diff --git a/src/cc/usdt.h b/src/cc/usdt.h index 6d89fd644..406cfd546 100644 --- a/src/cc/usdt.h +++ b/src/cc/usdt.h @@ -81,6 +81,7 @@ class Argument { friend class ArgumentParser; friend class ArgumentParser_aarch64; friend class ArgumentParser_powerpc64; + friend class ArgumentParser_s390x; friend class ArgumentParser_x64; }; @@ -130,6 +131,12 @@ class ArgumentParser_powerpc64 : public ArgumentParser { ArgumentParser_powerpc64(const char *arg) : ArgumentParser(arg) {} }; +class ArgumentParser_s390x : public ArgumentParser { +public: + bool parse(Argument *dest); + ArgumentParser_s390x(const char *arg) : ArgumentParser(arg) {} +}; + class ArgumentParser_x64 : public ArgumentParser { private: enum Register { diff --git a/src/cc/usdt/usdt.cc b/src/cc/usdt/usdt.cc index 0914fe3a5..c91faa016 100644 --- a/src/cc/usdt/usdt.cc +++ b/src/cc/usdt/usdt.cc @@ -40,6 +40,8 @@ Location::Location(uint64_t addr, const std::string &bin_path, const char *arg_f ArgumentParser_aarch64 parser(arg_fmt); #elif __powerpc64__ ArgumentParser_powerpc64 parser(arg_fmt); +#elif __s390x__ + ArgumentParser_s390x parser(arg_fmt); #else ArgumentParser_x64 parser(arg_fmt); #endif diff --git a/src/cc/usdt/usdt_args.cc b/src/cc/usdt/usdt_args.cc index b27e515f4..3e2045575 100644 --- a/src/cc/usdt/usdt_args.cc +++ b/src/cc/usdt/usdt_args.cc @@ -285,6 +285,59 @@ bool ArgumentParser_powerpc64::parse(Argument *dest) { return true; } +bool ArgumentParser_s390x::parse(Argument *dest) { + if (done()) + return false; + + bool matched; + std::cmatch matches; +#define S390X_IMM "(-?[0-9]+)" + std::regex arg_n_regex("^" S390X_IMM "@"); + // + std::regex arg_op_regex_imm("^" S390X_IMM "(?: +|$)"); + // %r +#define S390X_REG "%r([0-9]|1[0-5])" + std::regex arg_op_regex_reg("^" S390X_REG "(?: 
+|$)"); + // (%r,%r) + std::regex arg_op_regex_mem("^" S390X_IMM "?\\(" S390X_REG + "(?:," S390X_REG ")?\\)(?: +|$)"); +#undef S390X_IMM +#undef S390X_REG + + matched = std::regex_search(arg_ + cur_pos_, matches, arg_n_regex); + if (matched) { + dest->arg_size_ = stoi(matches.str(1)); + cur_pos_ += matches.length(0); + + if (std::regex_search(arg_ + cur_pos_, matches, arg_op_regex_imm)) { + dest->constant_ = stoi(matches.str(1)); + } else if (std::regex_search(arg_ + cur_pos_, matches, arg_op_regex_reg)) { + dest->base_register_name_ = "gprs[" + matches.str(1) + "]"; + } else if (std::regex_search(arg_ + cur_pos_, matches, arg_op_regex_mem)) { + if (matches.length(1) > 0) { + dest->deref_offset_ = stoi(matches.str(1)); + } + dest->base_register_name_ = "gprs[" + matches.str(2) + "]"; + if (matches.length(3) > 0) { + dest->index_register_name_ = "gprs[" + matches.str(3) + "]"; + } + } else { + matched = false; + } + } + + if (!matched) { + print_error(cur_pos_); + skip_until_whitespace_from(cur_pos_); + skip_whitespace_from(cur_pos_); + return false; + } + + cur_pos_ += matches.length(0); + skip_whitespace_from(cur_pos_); + return true; +} + ssize_t ArgumentParser_x64::parse_identifier(ssize_t pos, optional *result) { if (isalpha(arg_[pos]) || arg_[pos] == '_') { diff --git a/tests/cc/test_usdt_args.cc b/tests/cc/test_usdt_args.cc index 3a96c5aac..db1f8c8e6 100644 --- a/tests/cc/test_usdt_args.cc +++ b/tests/cc/test_usdt_args.cc @@ -58,6 +58,8 @@ TEST_CASE("test usdt argument parsing", "[usdt]") { USDT::ArgumentParser_aarch64 parser("4@[x32,200]"); #elif __powerpc64__ USDT::ArgumentParser_powerpc64 parser("4@-12(42)"); +#elif __s390x__ + USDT::ArgumentParser_s390x parser("4@-12(%r42)"); #elif defined(__x86_64__) USDT::ArgumentParser_x64 parser("4@i%ra+1r"); #endif @@ -121,6 +123,50 @@ TEST_CASE("test usdt argument parsing", "[usdt]") { verify_register(parser, 2, 1097); verify_register(parser, 4, "gpr[30]", 108); verify_register(parser, -2, "gpr[31]", -4); +#elif 
__s390x__ + USDT::ArgumentParser_s390x parser( + "-4@%r0 8@%r0 8@0 4@0(%r0) -2@0(%r0) " + "1@%r0 -2@%r3 -8@9 -1@0(%r4) -4@16(%r6) " + "2@%r7 4@%r11 4@-67 8@-16(%r15) 1@-52(%r11) " + "-8@%r4 -8@%r14 2@-11 -2@14(%r13) -8@-32(%r12) " + "4@%r5 2@%r11 -8@-693 -1@-23(%r10) 4@28(%r9) " + "-2@%r3 -4@%r8 2@1097 4@108(%r7) -2@-4(%r6)"); + + verify_register(parser, -4, "gprs[0]"); + verify_register(parser, 8, "gprs[0]"); + verify_register(parser, 8, 0); + verify_register(parser, 4, "gprs[0]", 0); + verify_register(parser, -2, "gprs[0]", 0); + + verify_register(parser, 1, "gprs[0]"); + verify_register(parser, -2, "gprs[3]"); + verify_register(parser, -8, 9); + verify_register(parser, -1, "gprs[4]", 0); + verify_register(parser, -4, "gprs[6]", 16); + + verify_register(parser, 2, "gprs[7]"); + verify_register(parser, 4, "gprs[11]"); + verify_register(parser, 4, -67); + verify_register(parser, 8, "gprs[15]", -16); + verify_register(parser, 1, "gprs[11]", -52); + + verify_register(parser, -8, "gprs[4]"); + verify_register(parser, -8, "gprs[14]"); + verify_register(parser, 2, -11); + verify_register(parser, -2, "gprs[13]", 14); + verify_register(parser, -8, "gprs[12]", -32); + + verify_register(parser, 4, "gprs[5]"); + verify_register(parser, 2, "gprs[11]"); + verify_register(parser, -8, -693); + verify_register(parser, -1, "gprs[10]", -23); + verify_register(parser, 4, "gprs[9]", 28); + + verify_register(parser, -2, "gprs[3]"); + verify_register(parser, -4, "gprs[8]"); + verify_register(parser, 2, 1097); + verify_register(parser, 4, "gprs[7]", 108); + verify_register(parser, -2, "gprs[6]", -4); #elif defined(__x86_64__) USDT::ArgumentParser_x64 parser( "-4@$0 8@$1234 %rdi %rax %rsi " From ef14cfb82ac51a71d67cf01b0e26327592d1d02a Mon Sep 17 00:00:00 2001 From: Terence Namusonge Date: Tue, 12 Mar 2019 19:08:11 +0300 Subject: [PATCH 088/135] explicit warning when source won't be saved (#2267) explicit warning when source won't be saved --- src/cc/api/BPF.cc | 4 +++- 1 file changed, 3 
insertions(+), 1 deletion(-) diff --git a/src/cc/api/BPF.cc b/src/cc/api/BPF.cc index 473784c0a..db2436d84 100644 --- a/src/cc/api/BPF.cc +++ b/src/cc/api/BPF.cc @@ -560,8 +560,10 @@ StatusTuple BPF::load_func(const std::string& func_name, bpf_prog_type type, if (fd < 0) return StatusTuple(-1, "Failed to load %s: %d", func_name.c_str(), fd); - bpf_module_->annotate_prog_tag( + int ret = bpf_module_->annotate_prog_tag( func_name, fd, reinterpret_cast(func_start), func_size); + if (ret < 0) + fprintf(stderr, "WARNING: cannot get prog tag, ignore saving source with program tag\n"); funcs_[func_name] = fd; return StatusTuple(0); } From 81d17be64b6ac66588bb1798506e61fa1d8de790 Mon Sep 17 00:00:00 2001 From: Gary Ching-Pang Lin Date: Thu, 14 Mar 2019 00:35:26 +0800 Subject: [PATCH 089/135] docs/reference_guide: update the examples to reflect the recent change (#2268) Since KeyboardInterrupt is not handled anymore, update the example code and the link to the example scripts to reflect the change. Also fix the link to 'trace_fields' example scripts. 
Signed-off-by: Gary Lin --- docs/reference_guide.md | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/docs/reference_guide.md b/docs/reference_guide.md index 24f7c8189..a9bd02d77 100644 --- a/docs/reference_guide.md +++ b/docs/reference_guide.md @@ -1230,8 +1230,8 @@ while 1: ``` Examples in situ: -[search /examples](https://github.com/iovisor/bcc/search?q=trace_print+path%3Aexamples+language%3Apython&type=Code), -[search /tools](https://github.com/iovisor/bcc/search?q=trace_print+path%3Atools+language%3Apython&type=Code) +[search /examples](https://github.com/iovisor/bcc/search?q=trace_fields+path%3Aexamples+language%3Apython&type=Code), +[search /tools](https://github.com/iovisor/bcc/search?q=trace_fields+path%3Atools+language%3Apython&type=Code) ## Output @@ -1252,11 +1252,14 @@ Example: # loop with callback to print_event b["events"].open_perf_buffer(print_event) while 1: - b.perf_buffer_poll() + try: + b.perf_buffer_poll() + except KeyboardInterrupt: + exit(); ``` Examples in situ: -[code](https://github.com/iovisor/bcc/blob/08fbceb7e828f0e3e77688497727c5b2405905fd/examples/tracing/hello_perf_output.py#L61), +[code](https://github.com/iovisor/bcc/blob/v0.9.0/examples/tracing/hello_perf_output.py#L55), [search /examples](https://github.com/iovisor/bcc/search?q=perf_buffer_poll+path%3Aexamples+language%3Apython&type=Code), [search /tools](https://github.com/iovisor/bcc/search?q=perf_buffer_poll+path%3Atools+language%3Apython&type=Code) @@ -1297,7 +1300,10 @@ def print_event(cpu, data, size): # loop with callback to print_event b["events"].open_perf_buffer(print_event) while 1: - b.perf_buffer_poll() + try: + b.perf_buffer_poll() + except KeyboardInterrupt: + exit() ``` Note that the data structure transferred will need to be declared in C in the BPF program. 
For example: @@ -1337,7 +1343,7 @@ def print_event(cpu, data, size): ``` Examples in situ: -[code](https://github.com/iovisor/bcc/blob/08fbceb7e828f0e3e77688497727c5b2405905fd/examples/tracing/hello_perf_output.py#L59), +[code](https://github.com/iovisor/bcc/blob/v0.9.0/examples/tracing/hello_perf_output.py#L52), [search /examples](https://github.com/iovisor/bcc/search?q=open_perf_buffer+path%3Aexamples+language%3Apython&type=Code), [search /tools](https://github.com/iovisor/bcc/search?q=open_perf_buffer+path%3Atools+language%3Apython&type=Code) From 3b04ac6f1ddd25ecb3b287bf303ef9a5688af415 Mon Sep 17 00:00:00 2001 From: yonghong-song Date: Wed, 13 Mar 2019 23:26:04 -0700 Subject: [PATCH 090/135] add forward declaration of bpf_insn in bpf_module.h (#2269) struct bpf_insn is used (as a pointee) in bpf_module.h, but no definition or forward declaration can be found by just looking at this file. definition is not needed as it is used as a pointee. providing a forward declaration here. Signed-off-by: Yonghong Song --- src/cc/bpf_module.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/cc/bpf_module.h b/src/cc/bpf_module.h index 63d998c7b..03e4f6d63 100644 --- a/src/cc/bpf_module.h +++ b/src/cc/bpf_module.h @@ -33,6 +33,8 @@ class Module; class Type; } +struct bpf_insn; + namespace ebpf { typedef std::map> sec_map_def; From 6cf63612b0be07b718c146b337b00e94a8e601fc Mon Sep 17 00:00:00 2001 From: yonghong-song Date: Thu, 14 Mar 2019 08:21:56 -0700 Subject: [PATCH 091/135] avoid overflowing btf stringtable during btf line adjustment (#2270) For remapped files (the main bpf program file and helpers.h), the llvm compiler does not have the source so bcc did the adjustment/patching after the compilation for lines in .btf.ext line_info. If a particular line in the main file or helpers.h is referenced in the line_info table, the line itself will be added to string table. 
If too many lines are added into the string table, the string table may become too big (>= 64KB), and libbpf/kernel will reject it. In my instance with a Facebook internal bpf program, after all referenced lines are added, the string table is 67KB. This patch added checking during string table adjustment to avoid overflow against the kernel limit. Signed-off-by: Yonghong Song --- src/cc/bcc_btf.cc | 25 +++++++++++++++++++------ src/cc/bcc_btf.h | 5 +++-- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/src/cc/bcc_btf.cc b/src/cc/bcc_btf.cc index 881959afa..e46cb9ae0 100644 --- a/src/cc/bcc_btf.cc +++ b/src/cc/bcc_btf.cc @@ -29,12 +29,17 @@ namespace ebpf { -uint32_t BTFStringTable::addString(std::string S) { +int32_t BTFStringTable::addString(std::string S) { // Check whether the string already exists. for (auto &OffsetM : OffsetToIdMap) { if (Table[OffsetM.second] == S) return OffsetM.first; } + + // Make sure we do not overflow the string table. + if (OrigTblLen + Size + S.size() + 1 >= BTF_MAX_NAME_OFFSET) + return -1; + // Not find, add to the string table. uint32_t Offset = Size; OffsetToIdMap[Offset] = Table.size(); @@ -108,18 +113,26 @@ void BTF::adjust(uint8_t *btf_sec, uintptr_t btf_sec_size, // Go through all line info. For any line number whose line is in the LineCaches, // Correct the line_off and record the corresponding source line in BTFStringTable, // which later will be merged into .BTF string section. 
- BTFStringTable new_strings; - while (linfo_len) { + BTFStringTable new_strings(orig_strings_len); + bool overflow = false; + while (!overflow && linfo_len) { unsigned num_recs = linfo_s[1]; linfo_s += 2; - for (unsigned i = 0; i < num_recs; i++) { + for (unsigned i = 0; !overflow && i < num_recs; i++) { struct bpf_line_info *linfo = (struct bpf_line_info *)linfo_s; if (linfo->line_off == 0) { for (auto it = LineCaches.begin(); it != LineCaches.end(); ++it) { if (strcmp(strings + linfo->file_name_off, it->first.c_str()) == 0) { unsigned line_num = BPF_LINE_INFO_LINE_NUM(linfo->line_col); - if (line_num > 0 && line_num <= it->second.size()) - linfo->line_off = orig_strings_len + new_strings.addString(it->second[line_num - 1]); + if (line_num > 0 && line_num <= it->second.size()) { + int offset = new_strings.addString(it->second[line_num - 1]); + if (offset < 0) { + overflow = true; + warning(".BTF string table overflowed, some lines missing\n"); + break; + } + linfo->line_off = orig_strings_len + offset; + } } } } diff --git a/src/cc/bcc_btf.h b/src/cc/bcc_btf.h index 5204b016c..008e25d4f 100644 --- a/src/cc/bcc_btf.h +++ b/src/cc/bcc_btf.h @@ -31,14 +31,15 @@ namespace ebpf { class BTFStringTable { private: uint32_t Size; + uint32_t OrigTblLen; std::map OffsetToIdMap; std::vector Table; public: - BTFStringTable(): Size(0) {} + BTFStringTable(uint32_t TblLen): Size(0), OrigTblLen(TblLen) {} uint32_t getSize() { return Size; } std::vector &getTable() { return Table; } - uint32_t addString(std::string Str); + int32_t addString(std::string Str); }; class BTF { From 2d182dc60d8fab347bf98963773a1aa7619917d8 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 18 Mar 2019 20:55:52 +0000 Subject: [PATCH 092/135] snapcraft: updates to snapcraft 2.37 and core18 snaps (#2277) Make changes to accommodate the move to core18 snaps. Rename the wrapper script to bcc-wrapper so it is less generic. Fix up the wrappering of bcc tools. 
Remove libc cruft from snap and explicitly state the staging packages required. Signed-off-by: Colin Ian King --- snapcraft/{wrapper => bcc-wrapper} | 2 +- snapcraft/snapcraft.yaml | 231 +++++++++++++++-------------- 2 files changed, 121 insertions(+), 112 deletions(-) rename snapcraft/{wrapper => bcc-wrapper} (82%) diff --git a/snapcraft/wrapper b/snapcraft/bcc-wrapper similarity index 82% rename from snapcraft/wrapper rename to snapcraft/bcc-wrapper index 02569621e..6e5c6f3d7 100755 --- a/snapcraft/wrapper +++ b/snapcraft/bcc-wrapper @@ -7,7 +7,7 @@ cmd="$1" if [ `id -u` = 0 ] ; then shift - stdbuf -oL $SNAP/usr/bin/python "$SNAP/usr/share/bcc/tools/$cmd" $@ + stdbuf -oL $SNAP/usr/bin/python2.7 "$SNAP/usr/share/bcc/tools/$cmd" $@ else echo "Need to run $cmd as root (use sudo $@)" exit 1 diff --git a/snapcraft/snapcraft.yaml b/snapcraft/snapcraft.yaml index 4be910732..5044b2a2f 100644 --- a/snapcraft/snapcraft.yaml +++ b/snapcraft/snapcraft.yaml @@ -16,7 +16,7 @@ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. # name: bcc -version: 0.7.0-20181122-2831-166fba57 +version: 0.9.0-20190318-2991-6cf63612 summary: BPF Compiler Collection (BCC) description: A toolkit for creating efficient kernel tracing and manipulation programs confinement: strict @@ -25,7 +25,8 @@ plugs: mount-observe: null system-observe: null system-trace: null -assumes: [snapd2.23] +assumes: [snapd2.37] +base: core18 parts: bcc: @@ -34,6 +35,12 @@ parts: - '-DCMAKE_INSTALL_PREFIX=/usr' source: .. 
source-type: git + stage-packages: + - libbz2-1.0 + - liblzma5 + - libncursesw5 + - libtinfo5 + - libzzip-0-13 build-packages: - bison - build-essential @@ -47,227 +54,229 @@ parts: - zlib1g-dev - libelf-dev - iperf - stage-packages: - - libc6 prime: - usr/share/bcc/tools - - usr/lib/*/lib*.so* - usr/lib/python2.7 - + - usr/lib/*/lib*.so* - -usr/share/bcc/tools/doc python-deps: plugin: python python-version: python2 - stage-packages: - - libc6 + + wrapper: + plugin: dump + after: [bcc] + source: . + organize: + wrapper: bin/bcc-wrapper apps: argdist: - command: usr/share/bcc/tools/argdist + command: bcc-wrapper argdist bashreadline: - command: usr/share/bcc/tools/bashreadline + command: bcc-wrapper bashreadline biolatency: - command: usr/share/bcc/tools/biolatency + command: bcc-wrapper biolatency biosnoop: - command: usr/share/bcc/tools/biosnoop + command: bcc-wrapper biosnoop biotop: - command: usr/share/bcc/tools/biotop + command: bcc-wrapper biotop bitesize: - command: usr/share/bcc/tools/bitesize + command: bcc-wrapper bitesize bpflist: - command: usr/share/bcc/tools/bpflist + command: bcc-wrapper bpflist btrfsdist: - command: usr/share/bcc/tools/btrfsdist + command: bcc-wrapper btrfsdist btrfsslower: - command: usr/share/bcc/tools/btrfsslower + command: bcc-wrapper btrfsslower cachestat: - command: usr/share/bcc/tools/cachestat + command: bcc-wrapper cachestat cachetop: - command: usr/share/bcc/tools/cachetop + command: bcc-wrapper cachetop capable: - command: usr/share/bcc/tools/capable + command: bcc-wrapper capable cobjnew: - command: usr/share/bcc/tools/cobjnew + command: bcc-wrapper cobjnew cpudist: - command: usr/share/bcc/tools/cpudist + command: bcc-wrapper cpudist cpuunclaimed: - command: usr/share/bcc/tools/cpuunclaimed + command: bcc-wrapper cpuunclaimed dbslower: - command: usr/share/bcc/tools/dbslower + command: bcc-wrapper dbslower dbstat: - command: usr/share/bcc/tools/dbstat + command: bcc-wrapper dbstat dcsnoop: - command: 
usr/share/bcc/tools/dcsnoop + command: bcc-wrapper dcsnoop dcstat: - command: usr/share/bcc/tools/dcstat + command: bcc-wrapper dcstat deadlock: - command: usr/share/bcc/tools/deadlock + command: bcc-wrapper deadlock execsnoop: - command: usr/share/bcc/tools/execsnoop + command: bcc-wrapper execsnoop ext4dist: - command: usr/share/bcc/tools/ext4dist + command: bcc-wrapper ext4dist ext4slower: - command: usr/share/bcc/tools/ext4slower + command: bcc-wrapper ext4slower filelife: - command: usr/share/bcc/tools/filelife + command: bcc-wrapper filelife fileslower: - command: usr/share/bcc/tools/fileslower + command: bcc-wrapper fileslower filetop: - command: usr/share/bcc/tools/filetop + command: bcc-wrapper filetop funccount: - command: usr/share/bcc/tools/funccount + command: bcc-wrapper funccount funclatency: - command: usr/share/bcc/tools/funclatency + command: bcc-wrapper funclatency funcslower: - command: usr/share/bcc/tools/funcslower + command: bcc-wrapper funcslower gethostlatency: - command: usr/share/bcc/tools/gethostlatency + command: bcc-wrapper gethostlatency hardirqs: - command: usr/share/bcc/tools/hardirqs + command: bcc-wrapper hardirqs javacalls: - command: usr/share/bcc/tools/javacalls + command: bcc-wrapper javacalls javaflow: - command: usr/share/bcc/tools/javaflow + command: bcc-wrapper javaflow javagc: - command: usr/share/bcc/tools/javagc + command: bcc-wrapper javagc javaobjnew: - command: usr/share/bcc/tools/javaobjnew + command: bcc-wrapper javaobjnew javastat: - command: usr/share/bcc/tools/javastat + command: bcc-wrapper javastat javathreads: - command: usr/share/bcc/tools/javathreads + command: bcc-wrapper javathreads killsnoop: - command: usr/share/bcc/tools/killsnoop + command: bcc-wrapper killsnoop llcstat: - command: usr/share/bcc/tools/llcstat + command: bcc-wrapper llcstat mdflush: - command: usr/share/bcc/tools/mdflush + command: bcc-wrapper mdflush memleak: - command: usr/share/bcc/tools/memleak + command: bcc-wrapper memleak 
mountsnoop: - command: usr/share/bcc/tools/mountsnoop + command: bcc-wrapper mountsnoop mysqld-qslower: - command: usr/share/bcc/tools/mysqld_qslower + command: bcc-wrapper mysqld_qslower nfsdist: - command: usr/share/bcc/tools/nfsdist + command: bcc-wrapper nfsdist nfsslower: - command: usr/share/bcc/tools/nfsslower + command: bcc-wrapper nfsslower nodegc: - command: usr/share/bcc/tools/nodegc + command: bcc-wrapper nodegc nodestat: - command: usr/share/bcc/tools/nodestat + command: bcc-wrapper nodestat offcputime: - command: usr/share/bcc/tools/offcputime + command: bcc-wrapper offcputime offwaketime: - command: usr/share/bcc/tools/offwaketime + command: bcc-wrapper offwaketime oomkill: - command: usr/share/bcc/tools/oomkill + command: bcc-wrapper oomkill opensnoop: - command: usr/share/bcc/tools/opensnoop + command: bcc-wrapper opensnoop perlcalls: - command: usr/share/bcc/tools/perlcalls + command: bcc-wrapper perlcalls perlflow: - command: usr/share/bcc/tools/perlflow + command: bcc-wrapper perlflow perlstat: - command: usr/share/bcc/tools/perlstat + command: bcc-wrapper perlstat shmsnoop: - command: usr/share/bcc/tools/shmsnoop + command: bcc-wrapper shmsnoop sofdsnoop: - command: usr/share/bcc/tools/sofdsnoop + command: bcc-wrapper sofdsnoop phpcalls: - command: usr/share/bcc/tools/phpcalls + command: bcc-wrapper phpcalls phpflow: - command: usr/share/bcc/tools/phpflow + command: bcc-wrapper phpflow phpstat: - command: usr/share/bcc/tools/phpstat + command: bcc-wrapper phpstat pidpersec: - command: usr/share/bcc/tools/pidpersec + command: bcc-wrapper pidpersec profile: - command: usr/share/bcc/tools/profile + command: bcc-wrapper profile pythoncalls: - command: usr/share/bcc/tools/pythoncalls + command: bcc-wrapper pythoncalls pythonflow: - command: usr/share/bcc/tools/pythonflow + command: bcc-wrapper pythonflow pythongc: - command: usr/share/bcc/tools/pythongc + command: bcc-wrapper pythongc pythonstat: - command: usr/share/bcc/tools/pythonstat + command: 
bcc-wrapper pythonstat rubycalls: - command: usr/share/bcc/tools/rubycalls + command: bcc-wrapper rubycalls rubyflow: - command: usr/share/bcc/tools/rubyflow + command: bcc-wrapper rubyflow rubygc: - command: usr/share/bcc/tools/rubygc + command: bcc-wrapper rubygc rubyobjnew: - command: usr/share/bcc/tools/rubyobjnew + command: bcc-wrapper rubyobjnew rubystat: - command: usr/share/bcc/tools/rubystat + command: bcc-wrapper rubystat runqlat: - command: usr/share/bcc/tools/runqlat + command: bcc-wrapper runqlat runqlen: - command: usr/share/bcc/tools/runqlen + command: bcc-wrapper runqlen slabratetop: - command: usr/share/bcc/tools/slabratetop + command: bcc-wrapper slabratetop softirqs: - command: usr/share/bcc/tools/softirqs + command: bcc-wrapper softirqs solisten: - command: usr/share/bcc/tools/solisten + command: bcc-wrapper solisten sslsniff: - command: usr/share/bcc/tools/sslsniff + command: bcc-wrapper sslsniff stackcount: - command: usr/share/bcc/tools/stackcount + command: bcc-wrapper stackcount statsnoop: - command: usr/share/bcc/tools/statsnoop + command: bcc-wrapper statsnoop syncsnoop: - command: usr/share/bcc/tools/syncsnoop + command: bcc-wrapper syncsnoop syscount: - command: usr/share/bcc/tools/syscount + command: bcc-wrapper syscount tcpaccept: - command: usr/share/bcc/tools/tcpaccept + command: bcc-wrapper tcpaccept tcpconnect: - command: usr/share/bcc/tools/tcpconnect + command: bcc-wrapper tcpconnect tcpconnlat: - command: usr/share/bcc/tools/tcpconnlat + command: bcc-wrapper tcpconnlat tcplife: - command: usr/share/bcc/tools/tcplife + command: bcc-wrapper tcplife tcpretrans: - command: usr/share/bcc/tools/tcpretrans + command: bcc-wrapper tcpretrans tcptop: - command: usr/share/bcc/tools/tcptop + command: bcc-wrapper tcptop tcptracer: - command: usr/share/bcc/tools/tcptracer + command: bcc-wrapper tcptracer tplist: - command: usr/share/bcc/tools/tplist + command: bcc-wrapper tplist trace: - command: usr/share/bcc/tools/trace + command: 
bcc-wrapper trace ttysnoop: - command: usr/share/bcc/tools/ttysnoop + command: bcc-wrapper ttysnoop ucalls: - command: usr/share/bcc/tools/lib/ucalls + command: bcc-wrapper lib/ucalls uflow: - command: usr/share/bcc/tools/lib/uflow + command: bcc-wrapper lib/uflow ugc: - command: usr/share/bcc/tools/lib/ugc + command: bcc-wrapper lib/ugc uobjnew: - command: usr/share/bcc/tools/lib/uobjnew + command: bcc-wrapper lib/uobjnew ustat: - command: usr/share/bcc/tools/lib/ustat + command: bcc-wrapper lib/ustat uthreads: - command: usr/share/bcc/tools/lib/uthreads + command: bcc-wrapper lib/uthreads vfscount: - command: usr/share/bcc/tools/vfscount + command: bcc-wrapper vfscount vfsstat: - command: usr/share/bcc/tools/vfsstat + command: bcc-wrapper vfsstat wakeuptime: - command: usr/share/bcc/tools/wakeuptime + command: bcc-wrapper wakeuptime xfsdist: - command: usr/share/bcc/tools/xfsdist + command: bcc-wrapper xfsdist xfsslower: - command: usr/share/bcc/tools/xfsslower + command: bcc-wrapper xfsslower zfsdist: - command: usr/share/bcc/tools/zfsdist + command: bcc-wrapper zfsdist zfsslower: - command: usr/share/bcc/tools/zfsslower + command: bcc-wrapper zfsslower # vim: set ai et sts=4 tabstop=4 sw=4: From eb32c1587fd5f4186e7dcc4597326c42e476528e Mon Sep 17 00:00:00 2001 From: Brendan Gregg Date: Tue, 19 Mar 2019 22:29:49 -0700 Subject: [PATCH 093/135] biosnoop: add -Q for queued time (#2279) * biosnoop: add -Q for queued time * biotop: fix PID --- man/man8/biosnoop.8 | 15 ++++++- tools/biosnoop.py | 91 +++++++++++++++++++++++++------------- tools/biosnoop_example.txt | 60 ++++++++++++++++++------- tools/biotop.py | 2 +- 4 files changed, 118 insertions(+), 50 deletions(-) diff --git a/man/man8/biosnoop.8 b/man/man8/biosnoop.8 index e5dbeaa4c..2a41348cd 100644 --- a/man/man8/biosnoop.8 +++ b/man/man8/biosnoop.8 @@ -2,7 +2,7 @@ .SH NAME biosnoop \- Trace block device I/O and print details incl. issuing PID. 
.SH SYNOPSIS -.B biosnoop +.B biosnoop [\-hQ] .SH DESCRIPTION This tools traces block device I/O (disk I/O), and prints a one-line summary for each I/O showing various details. These include the latency from the time of @@ -22,6 +22,13 @@ which uses an older mechanism Since this uses BPF, only the root user can use this tool. .SH REQUIREMENTS CONFIG_BPF and bcc. +.SH OPTIONS +.TP +\-h +Print usage message. +.TP +\-Q +Include a column showing the time spent queued in the OS. .SH EXAMPLES .TP Trace all block device I/O and print a summary line per I/O: @@ -30,7 +37,7 @@ Trace all block device I/O and print a summary line per I/O: .SH FIELDS .TP TIME(s) -Time of the I/O, in seconds since the first I/O was seen. +Time of the I/O completion, in seconds since the first I/O was seen. .TP COMM Cached process name, if present. This usually (but isn't guaranteed) to identify @@ -52,6 +59,10 @@ Device sector for the I/O. BYTES Size of the I/O, in bytes. .TP +QUE(ms) +Time the I/O was queued in the OS before being issued to the device, +in milliseconds. +.TP LAT(ms) Time for the I/O (latency) from the issue to the device, to its completion, in milliseconds. diff --git a/tools/biosnoop.py b/tools/biosnoop.py index 97f478587..e6f708fae 100755 --- a/tools/biosnoop.py +++ b/tools/biosnoop.py @@ -2,7 +2,7 @@ # @lint-avoid-python-3-compatibility-imports # # biosnoop Trace block device I/O and print details including issuing PID. -# For Linux, uses BCC, eBPF. +# For Linux, uses BCC, eBPF. # # This uses in-kernel eBPF maps to cache process details (PID and comm) by I/O # request, as well as a starting timestamp for calculating I/O latency. 
@@ -16,13 +16,31 @@ from __future__ import print_function from bcc import BPF import re - -# load BPF program -b = BPF(text=""" +import argparse + +# arguments +examples = """examples: + ./biosnoop # trace all block I/O + ./biosnoop -Q # include OS queued time +""" +parser = argparse.ArgumentParser( + description="Trace block I/O", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=examples) +parser.add_argument("-Q", "--queue", action="store_true", + help="include OS queued time") +parser.add_argument("--ebpf", action="store_true", + help=argparse.SUPPRESS) +args = parser.parse_args() +debug = 0 + +# define BPF program +bpf_text=""" #include #include struct val_t { + u64 ts; u32 pid; char name[TASK_COMM_LEN]; }; @@ -31,6 +49,7 @@ u32 pid; u64 rwflag; u64 delta; + u64 qdelta; u64 sector; u64 len; u64 ts; @@ -46,9 +65,13 @@ int trace_pid_start(struct pt_regs *ctx, struct request *req) { struct val_t val = {}; + u64 ts; if (bpf_get_current_comm(&val.name, sizeof(val.name)) == 0) { - val.pid = bpf_get_current_pid_tgid(); + val.pid = bpf_get_current_pid_tgid() >> 32; + if (##QUEUE##) { + val.ts = bpf_ktime_get_ns(); + } infobyreq.update(&req, &val); } return 0; @@ -58,18 +81,15 @@ int trace_req_start(struct pt_regs *ctx, struct request *req) { u64 ts; - ts = bpf_ktime_get_ns(); start.update(&req, &ts); - return 0; } // output int trace_req_completion(struct pt_regs *ctx, struct request *req) { - u64 *tsp, delta; - u32 *pidp = 0; + u64 *tsp; struct val_t *valp; struct data_t data = {}; u64 ts; @@ -83,12 +103,16 @@ ts = bpf_ktime_get_ns(); data.delta = ts - *tsp; data.ts = ts / 1000; + data.qdelta = 0; valp = infobyreq.lookup(&req); if (valp == 0) { data.len = req->__data_len; strcpy(data.name, "?"); } else { + if (##QUEUE##) { + data.qdelta = *tsp - valp->ts; + } data.pid = valp->pid; data.len = req->__data_len; data.sector = req->__sector; @@ -119,7 +143,18 @@ return 0; } -""", debug=0) +""" +if args.queue: + bpf_text = bpf_text.replace('##QUEUE##', '1') 
+else: + bpf_text = bpf_text.replace('##QUEUE##', '0') +if debug or args.ebpf: + print(bpf_text) + if args.ebpf: + exit() + +# initialize BPF +b = BPF(text=bpf_text) b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start") if BPF.get_kprobe_functions(b'blk_start_request'): b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start") @@ -128,8 +163,11 @@ fn_name="trace_req_completion") # header -print("%-14s %-14s %-6s %-7s %-2s %-9s %-7s %7s" % ("TIME(s)", "COMM", "PID", - "DISK", "T", "SECTOR", "BYTES", "LAT(ms)")) +print("%-11s %-14s %-6s %-7s %-1s %-10s %-7s" % ("TIME(s)", "COMM", "PID", + "DISK", "T", "SECTOR", "BYTES"), end="") +if args.queue: + print("%7s " % ("QUE(ms)"), end="") +print("%7s" % "LAT(ms)") rwflg = "" start_ts = 0 @@ -140,33 +178,24 @@ def print_event(cpu, data, size): event = b["events"].event(data) - val = -1 global start_ts - global prev_ts - global delta + if start_ts == 0: + start_ts = event.ts if event.rwflag == 1: rwflg = "W" - - if event.rwflag == 0: + else: rwflg = "R" - if not re.match(b'\?', event.name): - val = event.sector - - if start_ts == 0: - prev_ts = start_ts - - if start_ts == 1: - delta = float(delta) + (event.ts - prev_ts) + delta = float(event.ts) - start_ts - print("%-14.9f %-14.14s %-6s %-7s %-2s %-9s %-7s %7.2f" % ( + print("%-11.6f %-14.14s %-6s %-7s %-1s %-10s %-7s" % ( delta / 1000000, event.name.decode('utf-8', 'replace'), event.pid, - event.disk_name.decode('utf-8', 'replace'), rwflg, val, - event.len, float(event.delta) / 1000000)) - - prev_ts = event.ts - start_ts = 1 + event.disk_name.decode('utf-8', 'replace'), rwflg, event.sector, + event.len), end="") + if args.queue: + print("%7.2f " % (float(event.qdelta) / 1000000), end="") + print("%7.2f" % (float(event.delta) / 1000000)) # loop with callback to print_event b["events"].open_perf_buffer(print_event, page_cnt=64) diff --git a/tools/biosnoop_example.txt b/tools/biosnoop_example.txt index b5cee7f0a..d8be0624c 100644 --- 
a/tools/biosnoop_example.txt +++ b/tools/biosnoop_example.txt @@ -5,22 +5,22 @@ biosnoop traces block device I/O (disk I/O), and prints a line of output per I/O. Example: # ./biosnoop -TIME(s) COMM PID DISK T SECTOR BYTES LAT(ms) -0.000004001 supervise 1950 xvda1 W 13092560 4096 0.74 -0.000178002 supervise 1950 xvda1 W 13092432 4096 0.61 -0.001469001 supervise 1956 xvda1 W 13092440 4096 1.24 -0.001588002 supervise 1956 xvda1 W 13115128 4096 1.09 -1.022346001 supervise 1950 xvda1 W 13115272 4096 0.98 -1.022568002 supervise 1950 xvda1 W 13188496 4096 0.93 -1.023534000 supervise 1956 xvda1 W 13188520 4096 0.79 -1.023585003 supervise 1956 xvda1 W 13189512 4096 0.60 -2.003920000 xfsaild/md0 456 xvdc W 62901512 8192 0.23 -2.003931001 xfsaild/md0 456 xvdb W 62901513 512 0.25 -2.004034001 xfsaild/md0 456 xvdb W 62901520 8192 0.35 -2.004042000 xfsaild/md0 456 xvdb W 63542016 4096 0.36 -2.004204001 kworker/0:3 26040 xvdb W 41950344 65536 0.34 -2.044352002 supervise 1950 xvda1 W 13192672 4096 0.65 -2.044574000 supervise 1950 xvda1 W 13189072 4096 0.58 +TIME(s) COMM PID DISK T SECTOR BYTES LAT(ms) +0.000004 supervise 1950 xvda1 W 13092560 4096 0.74 +0.000178 supervise 1950 xvda1 W 13092432 4096 0.61 +0.001469 supervise 1956 xvda1 W 13092440 4096 1.24 +0.001588 supervise 1956 xvda1 W 13115128 4096 1.09 +1.022346 supervise 1950 xvda1 W 13115272 4096 0.98 +1.022568 supervise 1950 xvda1 W 13188496 4096 0.93 +1.023534 supervise 1956 xvda1 W 13188520 4096 0.79 +1.023585 supervise 1956 xvda1 W 13189512 4096 0.60 +2.003920 xfsaild/md0 456 xvdc W 62901512 8192 0.23 +2.003931 xfsaild/md0 456 xvdb W 62901513 512 0.25 +2.004034 xfsaild/md0 456 xvdb W 62901520 8192 0.35 +2.004042 xfsaild/md0 456 xvdb W 63542016 4096 0.36 +2.004204 kworker/0:3 26040 xvdb W 41950344 65536 0.34 +2.044352 supervise 1950 xvda1 W 13192672 4096 0.65 +2.044574 supervise 1950 xvda1 W 13189072 4096 0.58 This includes the PID and comm (process name) that were on-CPU at the time of issue (which usually means the 
process responsible). @@ -47,3 +47,31 @@ There are 4 write IOPS. The output of biosnoop identifies the reason: multiple supervise processes are issuing writes to the xvda1 disk. I can now drill down on supervise using other tools to understand its file system workload. + + +The -Q option includes a column to show the time spent queued in the OS: + +# biosnoop.py -Q +TIME(s) COMM PID DISK T SECTOR BYTES QUE(ms) LAT(ms) +0.000000 kworker/u72:1 13379 nvme1n1 W 1142400 4096 0.01 0.55 +0.000771 sync 22177 nvme1n1 W 41963894 3072 0.11 0.47 +5.332998 xfsaild/nvme1n 1061 nvme1n1 W 545728 16384 0.01 0.61 +5.333044 xfsaild/nvme1n 1061 nvme1n1 W 2349728 16384 0.02 0.64 +5.333065 xfsaild/nvme1n 1061 nvme1n1 W 20971521 512 0.02 0.65 +5.333067 xfsaild/nvme1n 1061 nvme1n1 W 20971528 8192 0.00 0.65 +[...] + + +USAGE message: + +usage: biosnoop.py [-h] [-Q] + +Trace block I/O + +optional arguments: + -h, --help show this help message and exit + -Q, --queue include OS queued time + +examples: + ./biosnoop # trace all block I/O + ./biosnoop -Q # include OS queued time diff --git a/tools/biotop.py b/tools/biotop.py index 62c295d16..6c959f671 100755 --- a/tools/biotop.py +++ b/tools/biotop.py @@ -93,7 +93,7 @@ def signal_ignore(signal_value, frame): struct who_t who = {}; if (bpf_get_current_comm(&who.name, sizeof(who.name)) == 0) { - who.pid = bpf_get_current_pid_tgid(); + who.pid = bpf_get_current_pid_tgid() >> 32; whobyreq.update(&req, &who); } From df481a4d724b407be2f30fc34a2a649c949f8171 Mon Sep 17 00:00:00 2001 From: lilydjwg Date: Wed, 20 Mar 2019 14:42:06 +0800 Subject: [PATCH 094/135] fix string re being used on bytes for Python 3 --- src/python/bcc/table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/bcc/table.py b/src/python/bcc/table.py index d33d46eb0..d32400e91 100644 --- a/src/python/bcc/table.py +++ b/src/python/bcc/table.py @@ -621,7 +621,7 @@ def _get_event_class(self): num_fields = lib.bpf_perf_event_fields(self.bpf.module, 
self._name) i = 0 while i < num_fields: - field = lib.bpf_perf_event_field(self.bpf.module, self._name, i) + field = lib.bpf_perf_event_field(self.bpf.module, self._name, i).decode() m = re.match(r"(.*)#(.*)", field) field_name = m.group(1) field_type = m.group(2) From c7736eec1808687172ccb008f94b27aa43b8ecda Mon Sep 17 00:00:00 2001 From: Brendan Gregg Date: Wed, 20 Mar 2019 09:01:57 -0700 Subject: [PATCH 095/135] bitesize: switch to tracepoints (#2281) bitesize: switch to tracepoints --- man/man8/bitesize.8 | 5 ++--- tools/bitesize.py | 37 ++++++------------------------------- 2 files changed, 8 insertions(+), 34 deletions(-) diff --git a/man/man8/bitesize.8 b/man/man8/bitesize.8 index 07046e8bf..7dc8c937f 100644 --- a/man/man8/bitesize.8 +++ b/man/man8/bitesize.8 @@ -6,8 +6,7 @@ bitesize \- Summarize block device I/O size as a histogram \- Linux eBPF/bcc. .SH DESCRIPTION Show I/O distribution for requested block sizes, by process name. -This works by tracing block I/O kernel functions using dynamic -tracing and prints a historgram of I/O size. +This works by tracing block:block_rq_insert and prints a historgram of I/O size. Since this uses BPF, only the root user can use this tool. .SH REQUIREMENTS @@ -29,7 +28,7 @@ distribution An ASCII bar chart to visualize the distribution (count column) .SH OVERHEAD -This traces kernel block I/O functions to update a histgroam, which are +This traces a block I/O tracepoint to update a histogram, which is asynchronously copied to user-space. This method is very efficient, and the overhead for most storage I/O rates (< 10k IOPS) should be negligible. 
If you have a higher IOPS storage environment, test and quantify the overhead diff --git a/tools/bitesize.py b/tools/bitesize.py index f70f09148..f23feec10 100755 --- a/tools/bitesize.py +++ b/tools/bitesize.py @@ -11,6 +11,7 @@ # Licensed under the Apache License, Version 2.0 (the "License") # # 05-Feb-2016 Allan McAleavy ran pep8 against file +# 19-Mar-2019 Brendan Gregg Switched to use tracepoints. from bcc import BPF from time import sleep @@ -24,47 +25,21 @@ u64 slot; }; -struct val_t { - char name[TASK_COMM_LEN]; -}; - BPF_HISTOGRAM(dist, struct proc_key_t); -BPF_HASH(commbyreq, struct request *, struct val_t); - -int trace_pid_start(struct pt_regs *ctx, struct request *req) -{ - struct val_t val = {}; - if (bpf_get_current_comm(&val.name, sizeof(val.name)) == 0) { - commbyreq.update(&req, &val); - } - return 0; -} - -int do_count(struct pt_regs *ctx, struct request *req) +TRACEPOINT_PROBE(block, block_rq_insert) { - struct val_t *valp; - - valp = commbyreq.lookup(&req); - if (valp == 0) { - return 0; - } - - if (req->__data_len > 0) { - struct proc_key_t key = {.slot = bpf_log2l(req->__data_len / 1024)}; - bpf_probe_read(&key.name, sizeof(key.name),valp->name); - dist.increment(key); - } + struct proc_key_t key = {.slot = bpf_log2l(args->bytes / 1024)}; + bpf_probe_read(&key.name, sizeof(key.name), args->comm); + dist.increment(key); return 0; } """ # load BPF program b = BPF(text=bpf_text) -b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start") -b.attach_kprobe(event="blk_account_io_completion", fn_name="do_count") -print("Tracing... Hit Ctrl-C to end.") +print("Tracing block I/O... 
Hit Ctrl-C to end.") # trace until Ctrl-C dist = b.get_table("dist") From c6cded439f77d345d2ff132798f8dde00dbdc60a Mon Sep 17 00:00:00 2001 From: Brendan Gregg Date: Wed, 20 Mar 2019 09:32:20 -0700 Subject: [PATCH 096/135] biolatency: add -F for I/O flags (#2280) biolatency: add -F for I/O flags --- man/man8/biolatency.8 | 5 +- tools/biolatency.py | 74 ++++++++++++++++++++++++++- tools/biolatency_example.txt | 99 +++++++++++++++++++++++++++++++++++- 3 files changed, 175 insertions(+), 3 deletions(-) diff --git a/man/man8/biolatency.8 b/man/man8/biolatency.8 index 84d261e24..7aa3dd897 100644 --- a/man/man8/biolatency.8 +++ b/man/man8/biolatency.8 @@ -2,7 +2,7 @@ .SH NAME biolatency \- Summarize block device I/O latency as a histogram. .SH SYNOPSIS -.B biolatency [\-h] [\-T] [\-Q] [\-m] [\-D] [interval [count]] +.B biolatency [\-h] [\-F] [\-T] [\-Q] [\-m] [\-D] [interval [count]] .SH DESCRIPTION biolatency traces block device I/O (disk I/O), and records the distribution of I/O latency (time). This is printed as a histogram either on Ctrl-C, or @@ -33,6 +33,9 @@ Output histogram in milliseconds. \-D Print a histogram per disk device. .TP +\-F +Print a histogram per set of I/O flags. +.TP interval Output interval, in seconds. 
.TP diff --git a/tools/biolatency.py b/tools/biolatency.py index a265c3182..86d994370 100755 --- a/tools/biolatency.py +++ b/tools/biolatency.py @@ -23,6 +23,7 @@ ./biolatency -mT 1 # 1s summaries, milliseconds, and timestamps ./biolatency -Q # include OS queued time in I/O time ./biolatency -D # show each disk device separately + ./biolatency -F # show I/O flags separately """ parser = argparse.ArgumentParser( description="Summarize block device I/O latency as a histogram", @@ -36,6 +37,8 @@ help="millisecond histogram") parser.add_argument("-D", "--disks", action="store_true", help="print a histogram per disk device") +parser.add_argument("-F", "--flags", action="store_true", + help="print a histogram per set of I/O flags") parser.add_argument("interval", nargs="?", default=99999999, help="output interval, in seconds") parser.add_argument("count", nargs="?", default=99999999, @@ -45,6 +48,9 @@ args = parser.parse_args() countdown = int(args.count) debug = 0 +if args.flags and args.disks: + print("ERROR: can only use -D or -F. Exiting.") + exit() # define BPF program bpf_text = """ @@ -55,6 +61,12 @@ char disk[DISK_NAME_LEN]; u64 slot; } disk_key_t; + +typedef struct flag_key { + u64 flags; + u64 slot; +} flag_key_t; + BPF_HASH(start, struct request *); STORAGE @@ -102,6 +114,13 @@ 'void *__tmp = (void *)req->rq_disk->disk_name; ' + 'bpf_probe_read(&key.disk, sizeof(key.disk), __tmp); ' + 'dist.increment(key);') +elif args.flags: + bpf_text = bpf_text.replace('STORAGE', + 'BPF_HISTOGRAM(dist, flag_key_t);') + bpf_text = bpf_text.replace('STORE', + 'flag_key_t key = {.slot = bpf_log2l(delta)}; ' + + 'key.flags = req->cmd_flags; ' + + 'dist.increment(key);') else: bpf_text = bpf_text.replace('STORAGE', 'BPF_HISTOGRAM(dist);') bpf_text = bpf_text.replace('STORE', @@ -124,6 +143,56 @@ print("Tracing block device I/O... 
Hit Ctrl-C to end.") +# see blk_fill_rwbs(): +req_opf = { + 0: "Read", + 1: "Write", + 2: "Flush", + 3: "Discard", + 5: "SecureErase", + 6: "ZoneReset", + 7: "WriteSame", + 9: "WriteZeros" +} +REQ_OP_BITS = 8 +REQ_OP_MASK = ((1 << REQ_OP_BITS) - 1) +REQ_SYNC = 1 << (REQ_OP_BITS + 3) +REQ_META = 1 << (REQ_OP_BITS + 4) +REQ_PRIO = 1 << (REQ_OP_BITS + 5) +REQ_NOMERGE = 1 << (REQ_OP_BITS + 6) +REQ_IDLE = 1 << (REQ_OP_BITS + 7) +REQ_FUA = 1 << (REQ_OP_BITS + 9) +REQ_RAHEAD = 1 << (REQ_OP_BITS + 11) +REQ_BACKGROUND = 1 << (REQ_OP_BITS + 12) +REQ_NOWAIT = 1 << (REQ_OP_BITS + 13) +def flags_print(flags): + desc = "" + # operation + if flags & REQ_OP_MASK in req_opf: + desc = req_opf[flags & REQ_OP_MASK] + else: + desc = "Unknown" + # flags + if flags & REQ_SYNC: + desc = "Sync-" + desc + if flags & REQ_META: + desc = "Metadata-" + desc + if flags & REQ_FUA: + desc = "ForcedUnitAccess-" + desc + if flags & REQ_PRIO: + desc = "Priority-" + desc + if flags & REQ_NOMERGE: + desc = "NoMerge-" + desc + if flags & REQ_IDLE: + desc = "Idle-" + desc + if flags & REQ_RAHEAD: + desc = "ReadAhead-" + desc + if flags & REQ_BACKGROUND: + desc = "Background-" + desc + if flags & REQ_NOWAIT: + desc = "NoWait-" + desc + return desc + # output exiting = 0 if args.interval else 1 dist = b.get_table("dist") @@ -137,7 +206,10 @@ if args.timestamp: print("%-8s\n" % strftime("%H:%M:%S"), end="") - dist.print_log2_hist(label, "disk") + if args.flags: + dist.print_log2_hist(label, "flags", flags_print) + else: + dist.print_log2_hist(label, "disk") dist.clear() countdown -= 1 diff --git a/tools/biolatency_example.txt b/tools/biolatency_example.txt index 5d39b7ec5..933d9a7fc 100644 --- a/tools/biolatency_example.txt +++ b/tools/biolatency_example.txt @@ -198,10 +198,105 @@ This output sows that xvda1 has much higher latency, usually between 0.5 ms and 32 ms, whereas xvdc is usually between 0.2 ms and 4 ms. +The -F option prints a separate histogram for each unique set of request +flags. 
For example: + +./biolatency.py -Fm +Tracing block device I/O... Hit Ctrl-C to end. +^C + +flags = Read + msecs : count distribution + 0 -> 1 : 180 |************* | + 2 -> 3 : 519 |****************************************| + 4 -> 7 : 60 |**** | + 8 -> 15 : 123 |********* | + 16 -> 31 : 68 |***** | + 32 -> 63 : 0 | | + 64 -> 127 : 2 | | + 128 -> 255 : 12 | | + 256 -> 511 : 0 | | + 512 -> 1023 : 1 | | + +flags = Sync-Write + msecs : count distribution + 0 -> 1 : 5 |****************************************| + +flags = Flush + msecs : count distribution + 0 -> 1 : 2 |****************************************| + +flags = Metadata-Read + msecs : count distribution + 0 -> 1 : 3 |****************************************| + 2 -> 3 : 2 |************************** | + 4 -> 7 : 0 | | + 8 -> 15 : 1 |************* | + 16 -> 31 : 1 |************* | + +flags = Write + msecs : count distribution + 0 -> 1 : 103 |******************************* | + 2 -> 3 : 106 |******************************** | + 4 -> 7 : 130 |****************************************| + 8 -> 15 : 79 |************************ | + 16 -> 31 : 5 |* | + 32 -> 63 : 0 | | + 64 -> 127 : 0 | | + 128 -> 255 : 0 | | + 256 -> 511 : 1 | | + +flags = NoMerge-Read + msecs : count distribution + 0 -> 1 : 0 | | + 2 -> 3 : 5 |****************************************| + 4 -> 7 : 0 | | + 8 -> 15 : 0 | | + 16 -> 31 : 1 |******** | + +flags = NoMerge-Write + msecs : count distribution + 0 -> 1 : 30 |** | + 2 -> 3 : 293 |******************** | + 4 -> 7 : 564 |****************************************| + 8 -> 15 : 463 |******************************** | + 16 -> 31 : 21 |* | + 32 -> 63 : 0 | | + 64 -> 127 : 0 | | + 128 -> 255 : 0 | | + 256 -> 511 : 5 | | + +flags = Priority-Metadata-Read + msecs : count distribution + 0 -> 1 : 1 |****************************************| + 2 -> 3 : 0 | | + 4 -> 7 : 1 |****************************************| + 8 -> 15 : 1 |****************************************| + +flags = 
ForcedUnitAccess-Metadata-Sync-Write + msecs : count distribution + 0 -> 1 : 2 |****************************************| + +flags = ReadAhead-Read + msecs : count distribution + 0 -> 1 : 15 |*************************** | + 2 -> 3 : 22 |****************************************| + 4 -> 7 : 14 |************************* | + 8 -> 15 : 8 |************** | + 16 -> 31 : 1 |* | + +flags = Priority-Metadata-Write + msecs : count distribution + 0 -> 1 : 9 |****************************************| + +These can be handled differently by the storage device, and this mode lets us +examine their performance in isolation. + + USAGE message: # ./biolatency -h -usage: biolatency [-h] [-T] [-Q] [-m] [-D] [interval] [count] +usage: biolatency [-h] [-T] [-Q] [-m] [-D] [-F] [interval] [count] Summarize block device I/O latency as a histogram @@ -215,6 +310,7 @@ optional arguments: -Q, --queued include OS queued time in I/O time -m, --milliseconds millisecond histogram -D, --disks print a histogram per disk device + -F, --flags print a histogram per set of I/O flags examples: ./biolatency # summarize block I/O latency as a histogram @@ -222,3 +318,4 @@ examples: ./biolatency -mT 1 # 1s summaries, milliseconds, and timestamps ./biolatency -Q # include OS queued time in I/O time ./biolatency -D # show each disk device separately + ./biolatency -F # show I/O flags separately From 6b05e616464715e0e152d84b52ecdbc1bfccb76b Mon Sep 17 00:00:00 2001 From: Brendan Gregg Date: Wed, 20 Mar 2019 21:13:59 -0700 Subject: [PATCH 097/135] bitesize: switch to issue tracepoint (#2283) bitesize: switch to trace_block_rq_issue tracepoint --- man/man8/bitesize.8 | 2 +- tools/bitesize.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/man/man8/bitesize.8 b/man/man8/bitesize.8 index 7dc8c937f..99cdbaab0 100644 --- a/man/man8/bitesize.8 +++ b/man/man8/bitesize.8 @@ -6,7 +6,7 @@ bitesize \- Summarize block device I/O size as a histogram \- Linux eBPF/bcc. 
.SH DESCRIPTION Show I/O distribution for requested block sizes, by process name. -This works by tracing block:block_rq_insert and prints a historgram of I/O size. +This works by tracing block:block_rq_issue and prints a historgram of I/O size. Since this uses BPF, only the root user can use this tool. .SH REQUIREMENTS diff --git a/tools/bitesize.py b/tools/bitesize.py index f23feec10..f4cea7cdd 100755 --- a/tools/bitesize.py +++ b/tools/bitesize.py @@ -27,7 +27,7 @@ BPF_HISTOGRAM(dist, struct proc_key_t); -TRACEPOINT_PROBE(block, block_rq_insert) +TRACEPOINT_PROBE(block, block_rq_issue) { struct proc_key_t key = {.slot = bpf_log2l(args->bytes / 1024)}; bpf_probe_read(&key.name, sizeof(key.name), args->comm); From 5570f70f363f75c97e1e6393d999ef5c0c8910c4 Mon Sep 17 00:00:00 2001 From: yonghong-song Date: Wed, 20 Mar 2019 22:49:48 -0700 Subject: [PATCH 098/135] sync with latest libbpf repo (#2284) Sync with latest libbpf repo. Two new helpers are added, bpf_get_listener_sock() and bpf_skb_ecn_set_ce(). 
Signed-off-by: Yonghong Song --- docs/kernel-versions.md | 2 + src/cc/compat/linux/virtual_bpf.h | 186 +++++++++++++++++++++--------- src/cc/export/helpers.h | 4 + src/cc/libbpf | 2 +- src/cc/libbpf.c | 2 + 5 files changed, 140 insertions(+), 56 deletions(-) diff --git a/docs/kernel-versions.md b/docs/kernel-versions.md index c2ab17b64..6890d10ac 100644 --- a/docs/kernel-versions.md +++ b/docs/kernel-versions.md @@ -162,6 +162,7 @@ Helper | Kernel version | License | Commit | `BPF_FUNC_get_current_task()` | 4.8 | GPL | [`606274c5abd8`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=606274c5abd8e245add01bc7145a8cbb92b69ba8) `BPF_FUNC_get_current_uid_gid()` | 4.2 | | [`ffeedafbf023`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=ffeedafbf0236f03aeb2e8db273b3e5ae5f5bc89) `BPF_FUNC_get_hash_recalc()` | 4.8 | | [`13c5c240f789`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=13c5c240f789bbd2bcacb14a23771491485ae61f) +`BPF_FUNC_get_listener_sock()` | 5.1 | | [`dbafd7ddd623`](https://kernel.googlesource.com/pub/scm/linux/kernel/git/davem/net-next/+/dbafd7ddd62369b2f3926ab847cbf8fc40e800b7) `BPF_FUNC_get_local_storage()` | 4.19 | | [`cd3394317653`](https://github.com/torvalds/linux/commit/cd3394317653837e2eb5c5d0904a8996102af9fc) `BPF_FUNC_get_numa_node_id()` | 4.10 | | [`2d0e30c30f84`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=2d0e30c30f84d08dc16f0f2af41f1b8a85f0755e) `BPF_FUNC_get_prandom_u32()` | 4.1 | | [`03e69b508b6f`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=03e69b508b6f7c51743055c9f61d1dfeadf4b635) @@ -221,6 +222,7 @@ Helper | Kernel version | License | Commit | `BPF_FUNC_skb_change_tail()` | 4.9 | | [`5293efe62df8`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=5293efe62df81908f2e90c9820c7edcc8e61f5e9) `BPF_FUNC_skb_change_type()` | 4.8 | | 
[`d2485c4242a8`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=d2485c4242a826fdf493fd3a27b8b792965b9b9e) `BPF_FUNC_skb_cgroup_id()` | 4.18 | | [`cb20b08ead40`](https://github.com/torvalds/linux/commit/cb20b08ead401fd17627a36f035c0bf5bfee5567) +`BPF_FUNC_skb_ecn_set_ce()` | 5.1 | | [`f7c917ba11a6`](https://github.com/torvalds/linux/commit/f7c917ba11a67632a8452ea99fe132f626a7a2cc) `BPF_FUNC_skb_get_tunnel_key()` | 4.3 | | [`d3aa45ce6b94`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=d3aa45ce6b94c65b83971257317867db13e5f492) `BPF_FUNC_skb_get_tunnel_opt()` | 4.6 | | [`14ca0751c96f`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=14ca0751c96f8d3d0f52e8ed3b3236f8b34d3460) `BPF_FUNC_skb_get_xfrm_state()` | 4.18 | | [`12bed760a78d`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=12bed760a78da6e12ac8252fec64d019a9eac523) diff --git a/src/cc/compat/linux/virtual_bpf.h b/src/cc/compat/linux/virtual_bpf.h index 1b53fbd68..eac9c14f7 100644 --- a/src/cc/compat/linux/virtual_bpf.h +++ b/src/cc/compat/linux/virtual_bpf.h @@ -503,16 +503,6 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) - * Description - * Push an element *value* in *map*. *flags* is one of: - * - * **BPF_EXIST** - * If the queue/stack is full, the oldest element is removed to - * make room for this. - * Return - * 0 on success, or a negative error in case of failure. - * * int bpf_probe_read(void *dst, u32 size, const void *src) * Description * For tracing programs, safely attempt to read *size* bytes from @@ -1436,14 +1426,14 @@ union bpf_attr { * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx) * Description * Equivalent to bpf_get_socket_cookie() helper that accepts - * *skb*, but gets socket from **struct bpf_sock_addr** contex. 
+ * *skb*, but gets socket from **struct bpf_sock_addr** context. * Return * A 8-byte long non-decreasing number. * * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx) * Description * Equivalent to bpf_get_socket_cookie() helper that accepts - * *skb*, but gets socket from **struct bpf_sock_ops** contex. + * *skb*, but gets socket from **struct bpf_sock_ops** context. * Return * A 8-byte long non-decreasing number. * @@ -2099,52 +2089,52 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) + * int bpf_rc_repeat(void *ctx) * Description * This helper is used in programs implementing IR decoding, to - * report a successfully decoded key press with *scancode*, - * *toggle* value in the given *protocol*. The scancode will be - * translated to a keycode using the rc keymap, and reported as - * an input key down event. After a period a key up event is - * generated. This period can be extended by calling either - * **bpf_rc_keydown**\ () again with the same values, or calling - * **bpf_rc_repeat**\ (). + * report a successfully decoded repeat key message. This delays + * the generation of a key up event for previously generated + * key down event. * - * Some protocols include a toggle bit, in case the button was - * released and pressed again between consecutive scancodes. + * Some IR protocols like NEC have a special IR message for + * repeating last button, for when a button is held down. * * The *ctx* should point to the lirc sample as passed into * the program. * - * The *protocol* is the decoded protocol number (see - * **enum rc_proto** for some predefined values). - * * This helper is only available is the kernel was compiled with * the **CONFIG_BPF_LIRC_MODE2** configuration option set to * "**y**". 
* Return * 0 * - * int bpf_rc_repeat(void *ctx) + * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) * Description * This helper is used in programs implementing IR decoding, to - * report a successfully decoded repeat key message. This delays - * the generation of a key up event for previously generated - * key down event. + * report a successfully decoded key press with *scancode*, + * *toggle* value in the given *protocol*. The scancode will be + * translated to a keycode using the rc keymap, and reported as + * an input key down event. After a period a key up event is + * generated. This period can be extended by calling either + * **bpf_rc_keydown**\ () again with the same values, or calling + * **bpf_rc_repeat**\ (). * - * Some IR protocols like NEC have a special IR message for - * repeating last button, for when a button is held down. + * Some protocols include a toggle bit, in case the button was + * released and pressed again between consecutive scancodes. * * The *ctx* should point to the lirc sample as passed into * the program. * + * The *protocol* is the decoded protocol number (see + * **enum rc_proto** for some predefined values). + * * This helper is only available is the kernel was compiled with * the **CONFIG_BPF_LIRC_MODE2** configuration option set to * "**y**". * Return * 0 * - * uint64_t bpf_skb_cgroup_id(struct sk_buff *skb) + * u64 bpf_skb_cgroup_id(struct sk_buff *skb) * Description * Return the cgroup v2 id of the socket associated with the *skb*. * This is roughly similar to the **bpf_get_cgroup_classid**\ () @@ -2160,30 +2150,12 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level) - * Description - * Return id of cgroup v2 that is ancestor of cgroup associated - * with the *skb* at the *ancestor_level*. 
The root cgroup is at - * *ancestor_level* zero and each step down the hierarchy - * increments the level. If *ancestor_level* == level of cgroup - * associated with *skb*, then return value will be same as that - * of **bpf_skb_cgroup_id**\ (). - * - * The helper is useful to implement policies based on cgroups - * that are upper in hierarchy than immediate cgroup associated - * with *skb*. - * - * The format of returned id and helper limitations are same as in - * **bpf_skb_cgroup_id**\ (). - * Return - * The id is returned or 0 in case the id could not be retrieved. - * * u64 bpf_get_current_cgroup_id(void) * Return * A 64-bit integer containing the current cgroup id based * on the cgroup within which the current task is running. * - * void* get_local_storage(void *map, u64 flags) + * void *bpf_get_local_storage(void *map, u64 flags) * Description * Get the pointer to the local storage area. * The type and the size of the local storage is defined @@ -2210,6 +2182,24 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * + * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *skb* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *skb*, then return value will be same as that + * of **bpf_skb_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *skb*. + * + * The format of returned id and helper limitations are same as in + * **bpf_skb_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. 
+ * * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) * Description * Look for TCP socket matching *tuple*, optionally in a child @@ -2290,6 +2280,16 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * + * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) + * Description + * Push an element *value* in *map*. *flags* is one of: + * + * **BPF_EXIST** + * If the queue/stack is full, the oldest element is + * removed to make room for this. + * Return + * 0 on success, or a negative error in case of failure. + * * int bpf_map_pop_elem(struct bpf_map *map, void *value) * Description * Pop an element from *map*. @@ -2344,21 +2344,93 @@ union bpf_attr { * Return * 0 * + * int bpf_spin_lock(struct bpf_spin_lock *lock) + * Description + * Acquire a spinlock represented by the pointer *lock*, which is + * stored as part of a value of a map. Taking the lock allows to + * safely update the rest of the fields in that value. The + * spinlock can (and must) later be released with a call to + * **bpf_spin_unlock**\ (\ *lock*\ ). + * + * Spinlocks in BPF programs come with a number of restrictions + * and constraints: + * + * * **bpf_spin_lock** objects are only allowed inside maps of + * types **BPF_MAP_TYPE_HASH** and **BPF_MAP_TYPE_ARRAY** (this + * list could be extended in the future). + * * BTF description of the map is mandatory. + * * The BPF program can take ONE lock at a time, since taking two + * or more could cause dead locks. + * * Only one **struct bpf_spin_lock** is allowed per map element. + * * When the lock is taken, calls (either BPF to BPF or helpers) + * are not allowed. + * * The **BPF_LD_ABS** and **BPF_LD_IND** instructions are not + * allowed inside a spinlock-ed region. + * * The BPF program MUST call **bpf_spin_unlock**\ () to release + * the lock, on all execution paths, before it returns. 
+ * * The BPF program can access **struct bpf_spin_lock** only via + * the **bpf_spin_lock**\ () and **bpf_spin_unlock**\ () + * helpers. Loading or storing data into the **struct + * bpf_spin_lock** *lock*\ **;** field of a map is not allowed. + * * To use the **bpf_spin_lock**\ () helper, the BTF description + * of the map value must be a struct and have **struct + * bpf_spin_lock** *anyname*\ **;** field at the top level. + * Nested lock inside another struct is not allowed. + * * The **struct bpf_spin_lock** *lock* field in a map value must + * be aligned on a multiple of 4 bytes in that value. + * * Syscall with command **BPF_MAP_LOOKUP_ELEM** does not copy + * the **bpf_spin_lock** field to user space. + * * Syscall with command **BPF_MAP_UPDATE_ELEM**, or update from + * a BPF program, do not update the **bpf_spin_lock** field. + * * **bpf_spin_lock** cannot be on the stack or inside a + * networking packet (it can only be inside of a map values). + * * **bpf_spin_lock** is available to root only. + * * Tracing programs and socket filter programs cannot use + * **bpf_spin_lock**\ () due to insufficient preemption checks + * (but this may change in the future). + * * **bpf_spin_lock** is not allowed in inner maps of map-in-map. + * Return + * 0 + * + * int bpf_spin_unlock(struct bpf_spin_lock *lock) + * Description + * Release the *lock* previously locked by a call to + * **bpf_spin_lock**\ (\ *lock*\ ). + * Return + * 0 + * * struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk) * Description * This helper gets a **struct bpf_sock** pointer such - * that all the fields in bpf_sock can be accessed. + * that all the fields in this **bpf_sock** can be accessed. * Return - * A **struct bpf_sock** pointer on success, or NULL in + * A **struct bpf_sock** pointer on success, or **NULL** in * case of failure. 
* * struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk) * Description * This helper gets a **struct bpf_tcp_sock** pointer from a * **struct bpf_sock** pointer. + * Return + * A **struct bpf_tcp_sock** pointer on success, or **NULL** in + * case of failure. + * + * int bpf_skb_ecn_set_ce(struct sk_buf *skb) + * Description + * Set ECN (Explicit Congestion Notification) field of IP header + * to **CE** (Congestion Encountered) if current value is **ECT** + * (ECN Capable Transport). Otherwise, do nothing. Works with IPv6 + * and IPv4. + * Return + * 1 if the **CE** flag is set (either by the current helper call + * or because it was already present), 0 if it is not set. * + * struct bpf_sock *bpf_get_listener_sock(struct bpf_sock *sk) + * Description + * Return a **struct bpf_sock** pointer in **TCP_LISTEN** state. + * **bpf_sk_release**\ () is unnecessary and not allowed. * Return - * A **struct bpf_tcp_sock** pointer on success, or NULL in + * A **struct bpf_sock** pointer on success, or **NULL** in * case of failure. 
*/ #define __BPF_FUNC_MAPPER(FN) \ @@ -2458,7 +2530,9 @@ union bpf_attr { FN(spin_lock), \ FN(spin_unlock), \ FN(sk_fullsock), \ - FN(tcp_sock), + FN(tcp_sock), \ + FN(skb_ecn_set_ce), \ + FN(get_listener_sock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2814,6 +2888,8 @@ struct bpf_prog_info { __u32 jited_line_info_rec_size; __u32 nr_prog_tags; __aligned_u64 prog_tags; + __u64 run_time_ns; + __u64 run_cnt; } __attribute__((aligned(8))); struct bpf_map_info { diff --git a/src/cc/export/helpers.h b/src/cc/export/helpers.h index 527700f9e..20f738718 100644 --- a/src/cc/export/helpers.h +++ b/src/cc/export/helpers.h @@ -480,6 +480,10 @@ static struct bpf_sock *(*bpf_sk_fullsock)(struct bpf_sock *sk) = (void *) BPF_FUNC_sk_fullsock; static struct bpf_tcp_sock *(*bpf_tcp_sock)(struct bpf_sock *sk) = (void *) BPF_FUNC_tcp_sock; +static int (*bpf_skb_ecn_set_ce)(void *ctx) = + (void *) BPF_FUNC_skb_ecn_set_ce; +static struct bpf_sock *(*bpf_get_listener_sock)(struct bpf_sock *sk) = + (void *) BPF_FUNC_get_listener_sock; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions diff --git a/src/cc/libbpf b/src/cc/libbpf index 5beb8a2eb..33b017498 160000 --- a/src/cc/libbpf +++ b/src/cc/libbpf @@ -1 +1 @@ -Subproject commit 5beb8a2ebffd1045e3edb9b522d6ff5bb477c541 +Subproject commit 33b017498543167b65fa948d3a0267794c78787f diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c index 50986493b..122745171 100644 --- a/src/cc/libbpf.c +++ b/src/cc/libbpf.c @@ -193,6 +193,8 @@ static struct bpf_helper helpers[] = { {"spin_unlock", "5.1"}, {"sk_fullsock", "5.1"}, {"tcp_sock", "5.1"}, + {"skb_ecn_set_ce", "5.1"}, + {"get_listener_sock", "5.1"}, }; static uint64_t ptr_to_u64(void *ptr) From c72f6284358294a6271437946a899287281e5a98 Mon Sep 17 00:00:00 2001 From: yonghong-song Date: Thu, 21 Mar 2019 07:04:29 -0700 Subject: [PATCH 099/135] delete allocated array properly 
(#2285) In bcc_btf.cc, new_btf_sec is allocated with something like new_btf_sec = new uint8_t[tmp_sec_size] Since the allocation is an array, the deletion should be delete[] new_btf_sec instead of delete new_btf_sec This patch fixed the problem. Signed-off-by: Yonghong Song --- src/cc/bcc_btf.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cc/bcc_btf.cc b/src/cc/bcc_btf.cc index e46cb9ae0..0eca5db2b 100644 --- a/src/cc/bcc_btf.cc +++ b/src/cc/bcc_btf.cc @@ -179,7 +179,7 @@ int BTF::load(uint8_t *btf_sec, uintptr_t btf_sec_size, if (new_btf_sec) { btf = btf__new(new_btf_sec, new_btf_sec_size); - delete new_btf_sec; + delete[] new_btf_sec; } else { btf = btf__new(btf_sec, btf_sec_size); } From 12bd958186a8c3bfcf2f74d246b8c06dd20bbc20 Mon Sep 17 00:00:00 2001 From: David Cook Date: Sat, 23 Mar 2019 01:46:01 -0500 Subject: [PATCH 100/135] Clarify kernel configuration flags (#2286) Clarify kernel configuration flag for CONFIG_HAVE_EBPF_JIT --- INSTALL.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/INSTALL.md b/INSTALL.md index c043d4c3e..3c27d4f0b 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -32,7 +32,10 @@ CONFIG_NET_CLS_BPF=m # [optional, for tc actions] CONFIG_NET_ACT_BPF=m CONFIG_BPF_JIT=y +# [for Linux kernel versions 4.1 through 4.6] CONFIG_HAVE_BPF_JIT=y +# [for Linux kernel versions 4.7 and later] +CONFIG_HAVE_EBPF_JIT=y # [optional, for kprobes] CONFIG_BPF_EVENTS=y ``` From 0267b4840ba2881583e075bf552e2837f1646042 Mon Sep 17 00:00:00 2001 From: 10ne1 Date: Tue, 26 Mar 2019 03:07:40 +0200 Subject: [PATCH 101/135] tools/cachestat: fix python 3 str/bytes type error (#2287) TypeError: argument should be integer or bytes-like object, not 'str' Signed-off-by: Adrian Ratiu --- tools/cachestat.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/cachestat.py b/tools/cachestat.py index 119fd9cf6..bb949493c 100755 --- a/tools/cachestat.py +++ b/tools/cachestat.py @@ -124,13 +124,13 @@ def get_meminfo(): for 
k, v in sorted(counts.items(), key=lambda counts: counts[1].value): func = b.ksym(k.ip) # partial string matches in case of .isra (necessary?) - if func.find("mark_page_accessed") == 0: + if func.find(b"mark_page_accessed") == 0: mpa = max(0, v.value) - if func.find("mark_buffer_dirty") == 0: + if func.find(b"mark_buffer_dirty") == 0: mbd = max(0, v.value) - if func.find("add_to_page_cache_lru") == 0: + if func.find(b"add_to_page_cache_lru") == 0: apcl = max(0, v.value) - if func.find("account_page_dirtied") == 0: + if func.find(b"account_page_dirtied") == 0: apd = max(0, v.value) # total = total cache accesses without counting dirties From c7859d47b38d418d23403f78c5c413855466d204 Mon Sep 17 00:00:00 2001 From: 10ne1 Date: Tue, 26 Mar 2019 17:53:40 +0200 Subject: [PATCH 102/135] clang/kbuild_helper: fix arm64 cross compilation (#2288) Commit 28949f17e0 ("Translate arch into source directory when ARCH is set") moved $ARCH processing earlier in the function so $ARCH gets processed by the the same logic which parses the uname_machine value. This introduced two bugs breaking ARCH=arm64 cross-compilation: 1. The arch.compare(0, 3, "arm") test matches both $ARCH=arm and $ARCH=arm64 leading to builds which always include from "arch/arm/include" instead of "arch/arm64/include". 2. The only way arch == arm64 is if $ARCH=aarch64. uname returns aarch64 but the rest of the compiler logic expects $ARCH=arm64. This fixes the above bugs by moving the "aarch64" test earlier than the "arm" test and also accepting ARCH=arm64. 
Fixes: 28949f17e0 ("Translate arch into source directory when ARCH is set (#2122)") Signed-off-by: Adrian Ratiu --- src/cc/frontends/clang/kbuild_helper.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cc/frontends/clang/kbuild_helper.cc b/src/cc/frontends/clang/kbuild_helper.cc index acacdd2eb..e631eed5c 100644 --- a/src/cc/frontends/clang/kbuild_helper.cc +++ b/src/cc/frontends/clang/kbuild_helper.cc @@ -46,6 +46,8 @@ int KBuildHelper::get_flags(const char *uname_machine, vector *cflags) { arch = "x86"; } else if (arch[0] == 'i' && !arch.compare(2, 2, "86")) { arch = "x86"; + } else if (!arch.compare(0, 7, "aarch64") || !arch.compare(0, 5, "arm64")) { + arch = "arm64"; } else if (!arch.compare(0, 3, "arm")) { arch = "arm"; } else if (!arch.compare(0, 5, "sa110")) { @@ -60,8 +62,6 @@ int KBuildHelper::get_flags(const char *uname_machine, vector *cflags) { arch = "mips"; } else if (!arch.compare(0, 2, "sh")) { arch = "sh"; - } else if (!arch.compare(0, 7, "aarch64")) { - arch = "arm64"; } cflags->push_back("-nostdinc"); From 3677925107c203d6e959250776cf7afd80e04da0 Mon Sep 17 00:00:00 2001 From: yonghong-song Date: Wed, 27 Mar 2019 08:52:24 -0700 Subject: [PATCH 103/135] rename libbpf.{a,so} to libbcc_bpf.{a,so} (#2290) This is based on discussion in netdev regarding to libbpf repo packaging: https://lore.kernel.org/bpf/20190325202009.GA14511@krava/T/#t The libbpf repo https://github.com/libbpf/libbpf contains the linux:tools/lib/bpf codes plus some other uapi and auxiliary headers. It is natural for libbpf repo to generate a libbpf.{a,so} installable as a package. To avoid conflicts, let us rename bcc libbpf.{a,so} to libbcc_bpf.{a,so}. 
Signed-off-by: Yonghong Song --- src/cc/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cc/CMakeLists.txt b/src/cc/CMakeLists.txt index 59a598560..1c3df632c 100644 --- a/src/cc/CMakeLists.txt +++ b/src/cc/CMakeLists.txt @@ -30,10 +30,10 @@ include(static_libstdc++) file(GLOB libbpf_sources "libbpf/src/*.c") add_library(bpf-static STATIC libbpf.c perf_reader.c ${libbpf_sources}) -set_target_properties(bpf-static PROPERTIES OUTPUT_NAME bpf) +set_target_properties(bpf-static PROPERTIES OUTPUT_NAME bcc_bpf) add_library(bpf-shared SHARED libbpf.c perf_reader.c ${libbpf_sources}) set_target_properties(bpf-shared PROPERTIES VERSION ${REVISION_LAST} SOVERSION 0) -set_target_properties(bpf-shared PROPERTIES OUTPUT_NAME bpf) +set_target_properties(bpf-shared PROPERTIES OUTPUT_NAME bcc_bpf) set(bcc_common_sources bcc_common.cc bpf_module.cc bcc_btf.cc exported_files.cc) if (${LLVM_PACKAGE_VERSION} VERSION_EQUAL 6 OR ${LLVM_PACKAGE_VERSION} VERSION_GREATER 6) From d1e9d2221a754806f463ee950b6cd2b1e1c2c54c Mon Sep 17 00:00:00 2001 From: Oriol Arcas Date: Wed, 27 Mar 2019 19:16:15 +0100 Subject: [PATCH 104/135] doc: explain how to use perf with skb programs (#2289) explain how to use perf with skb programs Signed-off-by: Oriol Arcas --- docs/reference_guide.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/reference_guide.md b/docs/reference_guide.md index a9bd02d77..1eb46995e 100644 --- a/docs/reference_guide.md +++ b/docs/reference_guide.md @@ -521,6 +521,8 @@ Return: 0 on success A method of a BPF_PERF_OUTPUT table, for submitting custom event data to user space. See the BPF_PERF_OUTPUT entry. (This ultimately calls bpf_perf_event_output().) +The ```ctx``` parameter is provided in [kprobes](#1-kprobes) or [kretprobes](#2-kretprobes). For ```SCHED_CLS``` or ```SOCKET_FILTER``` programs, the ```struct __sk_buff *skb``` must be used instead. 
+ Examples in situ: [search /examples](https://github.com/iovisor/bcc/search?q=perf_submit+path%3Aexamples&type=Code), [search /tools](https://github.com/iovisor/bcc/search?q=perf_submit+path%3Atools&type=Code) From e7c8f5561d4172461439b917ebdb1342d5947115 Mon Sep 17 00:00:00 2001 From: Dan Xu Date: Mon, 1 Apr 2019 18:46:53 -0700 Subject: [PATCH 105/135] Add libbcc-loader-static.a symbols into libbcc.a (#2296) It's useful to have all the bcc symbols in one place when statically linking against bcc. This patch adds all the symbols from libbcc-loader-static into libbcc. This is in line with how libbcc-lua-static does it as well. --- src/cc/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cc/CMakeLists.txt b/src/cc/CMakeLists.txt index 1c3df632c..bc3355c5d 100644 --- a/src/cc/CMakeLists.txt +++ b/src/cc/CMakeLists.txt @@ -68,7 +68,7 @@ endif() add_library(bcc-loader-static STATIC ${bcc_sym_sources} ${bcc_util_sources}) target_link_libraries(bcc-loader-static elf) add_library(bcc-static STATIC - ${bcc_common_sources} ${bcc_table_sources} ${bcc_util_sources} ${bcc_usdt_sources}) + ${bcc_common_sources} ${bcc_table_sources} ${bcc_util_sources} ${bcc_usdt_sources} ${bcc_sym_sources} ${bcc_util_sources}) set_target_properties(bcc-static PROPERTIES OUTPUT_NAME bcc) set(bcc-lua-static ${bcc_common_sources} ${bcc_table_sources} ${bcc_sym_sources} ${bcc_util_sources}) From 1b9ae96e6024a8967799d8009912d8d2fb0b8ccf Mon Sep 17 00:00:00 2001 From: Matheus Marchini Date: Thu, 4 Apr 2019 23:11:13 -0700 Subject: [PATCH 106/135] cmake: expose bcc_elf.h in the API headers (#2293) Ref: https://github.com/iovisor/bpftrace/pull/410 --- src/cc/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cc/CMakeLists.txt b/src/cc/CMakeLists.txt index bc3355c5d..bd34fd481 100644 --- a/src/cc/CMakeLists.txt +++ b/src/cc/CMakeLists.txt @@ -51,7 +51,7 @@ set(bcc_util_sources ns_guard.cc common.cc) set(bcc_sym_sources bcc_syms.cc 
bcc_elf.c bcc_perf_map.c bcc_proc.c) set(bcc_common_headers libbpf.h perf_reader.h) set(bcc_table_headers file_desc.h table_desc.h table_storage.h) -set(bcc_api_headers bcc_common.h bpf_module.h bcc_exception.h bcc_syms.h) +set(bcc_api_headers bcc_common.h bpf_module.h bcc_exception.h bcc_syms.h bcc_elf.h) if(ENABLE_CLANG_JIT) add_library(bcc-shared SHARED From 8cd3efd21c1d26509edd24bc608dcfa7825b06f1 Mon Sep 17 00:00:00 2001 From: Brendan Gregg Date: Sun, 7 Apr 2019 12:32:53 -0700 Subject: [PATCH 107/135] tcpaccept/tcpconnect: use PID not TID --- tools/tcpaccept.py | 4 ++-- tools/tcpconnect.py | 16 ++++++++++------ 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/tools/tcpaccept.py b/tools/tcpaccept.py index 169b0f31d..e08a08eb3 100755 --- a/tools/tcpaccept.py +++ b/tools/tcpaccept.py @@ -89,7 +89,7 @@ int kretprobe__inet_csk_accept(struct pt_regs *ctx) { struct sock *newsk = (struct sock *)PT_REGS_RC(ctx); - u32 pid = bpf_get_current_pid_tgid(); + u32 pid = bpf_get_current_pid_tgid() >> 32; ##FILTER_PID## @@ -177,7 +177,7 @@ return 0; if (args->oldstate != TCP_SYN_RECV || args->newstate != TCP_ESTABLISHED) return 0; - u32 pid = bpf_get_current_pid_tgid(); + u32 pid = bpf_get_current_pid_tgid() >> 32; ##FILTER_PID## diff --git a/tools/tcpconnect.py b/tools/tcpconnect.py index e230f6551..e31ff770f 100755 --- a/tools/tcpconnect.py +++ b/tools/tcpconnect.py @@ -89,14 +89,16 @@ int trace_connect_entry(struct pt_regs *ctx, struct sock *sk) { - u32 pid = bpf_get_current_pid_tgid(); + u64 pid_tgid = bpf_get_current_pid_tgid(); + u32 pid = pid_tgid >> 32; + u32 tid = pid_tgid; FILTER_PID u32 uid = bpf_get_current_uid_gid(); FILTER_UID // stash the sock ptr for lookup on return - currsock.update(&pid, &sk); + currsock.update(&tid, &sk); return 0; }; @@ -104,10 +106,12 @@ static int trace_connect_return(struct pt_regs *ctx, short ipver) { int ret = PT_REGS_RC(ctx); - u32 pid = bpf_get_current_pid_tgid(); + u64 pid_tgid = bpf_get_current_pid_tgid(); + u32 pid = 
pid_tgid >> 32; + u32 tid = pid_tgid; struct sock **skpp; - skpp = currsock.lookup(&pid); + skpp = currsock.lookup(&tid); if (skpp == 0) { return 0; // missed entry } @@ -115,7 +119,7 @@ if (ret != 0) { // failed to send SYNC packet, may not have populated // socket __sk_common.{skc_rcv_saddr, ...} - currsock.delete(&pid); + currsock.delete(&tid); return 0; } @@ -148,7 +152,7 @@ ipv6_events.perf_submit(ctx, &data6, sizeof(data6)); } - currsock.delete(&pid); + currsock.delete(&tid); return 0; } From fd73745c87c7082f293bd7b95f97825c2480ea4c Mon Sep 17 00:00:00 2001 From: Xiaozhou Liu Date: Wed, 10 Apr 2019 23:35:16 +0800 Subject: [PATCH 108/135] tcpaccept: remove sock:inet_sock_set_state tracepoint code (#2305) Fixes #2304. On Linux 4.16 and later, sock:inet_sock_set_state tracepoint was used for efficency, but it may output wrong PIDs. This is because sock:inet_sock_set_state may run outside of process context. Hence, we stick to kprobes until we find a proper solution. --- tools/tcpaccept.py | 59 +++++----------------------------------------- 1 file changed, 6 insertions(+), 53 deletions(-) diff --git a/tools/tcpaccept.py b/tools/tcpaccept.py index e08a08eb3..70202dd39 100755 --- a/tools/tcpaccept.py +++ b/tools/tcpaccept.py @@ -80,10 +80,11 @@ """ # -# The following is the code for older kernels(Linux pre-4.16). -# It uses kprobes to instrument inet_csk_accept(). On Linux 4.16 and -# later, the sock:inet_sock_set_state tracepoint should be used instead, as -# is done by the code that follows this. +# The following code uses kprobes to instrument inet_csk_accept(). +# On Linux 4.16 and later, we could use sock:inet_sock_set_state +# tracepoint for efficency, but it may output wrong PIDs. This is +# because sock:inet_sock_set_state may run outside of process context. +# Hence, we stick to kprobes until we find a proper solution. 
# bpf_text_kprobe = """ int kretprobe__inet_csk_accept(struct pt_regs *ctx) @@ -170,55 +171,7 @@ } """ -bpf_text_tracepoint = """ -TRACEPOINT_PROBE(sock, inet_sock_set_state) -{ - if (args->protocol != IPPROTO_TCP) - return 0; - if (args->oldstate != TCP_SYN_RECV || args->newstate != TCP_ESTABLISHED) - return 0; - u32 pid = bpf_get_current_pid_tgid() >> 32; - - ##FILTER_PID## - - // pull in details - u16 family = 0, lport = 0, dport; - family = args->family; - lport = args->sport; - dport = args->dport; - - ##FILTER_PORT## - - if (family == AF_INET) { - struct ipv4_data_t data4 = {.pid = pid, .ip = 4}; - data4.ts_us = bpf_ktime_get_ns() / 1000; - __builtin_memcpy(&data4.saddr, args->saddr, sizeof(data4.saddr)); - __builtin_memcpy(&data4.daddr, args->daddr, sizeof(data4.daddr)); - data4.lport = lport; - data4.dport = dport; - bpf_get_current_comm(&data4.task, sizeof(data4.task)); - ipv4_events.perf_submit(args, &data4, sizeof(data4)); - } else if (family == AF_INET6) { - struct ipv6_data_t data6 = {.pid = pid, .ip = 6}; - data6.ts_us = bpf_ktime_get_ns() / 1000; - __builtin_memcpy(&data6.saddr, args->saddr, sizeof(data6.saddr)); - __builtin_memcpy(&data6.daddr, args->daddr, sizeof(data6.daddr)); - data6.lport = lport; - data6.dport = dport; - bpf_get_current_comm(&data6.task, sizeof(data6.task)); - ipv6_events.perf_submit(args, &data6, sizeof(data6)); - } - // else drop - - return 0; -} -""" - -if (BPF.tracepoint_exists("sock", "inet_sock_set_state")): - bpf_text += bpf_text_tracepoint -else: - bpf_text += bpf_text_kprobe - +bpf_text += bpf_text_kprobe # code substitutions if args.pid: From e75480dab8ef5ca093c4601e3d207c9e53370516 Mon Sep 17 00:00:00 2001 From: dstepanovsrc Date: Wed, 10 Apr 2019 22:49:20 +0300 Subject: [PATCH 109/135] stackcount: add -c option for CPU only events (#2307) Add the -c option to trace events happened only the requested CPU. Update command line, man and example. 
Signed-off-by: Dima Stepanov Signed-off-by: Yury Kotov --- man/man8/stackcount.8 | 11 +++++++++-- tools/stackcount.py | 26 +++++++++++++++++--------- tools/stackcount_example.txt | 6 ++++-- 3 files changed, 30 insertions(+), 13 deletions(-) diff --git a/man/man8/stackcount.8 b/man/man8/stackcount.8 index d6ab993af..f577fa75a 100644 --- a/man/man8/stackcount.8 +++ b/man/man8/stackcount.8 @@ -2,8 +2,8 @@ .SH NAME stackcount \- Count function calls and their stack traces. Uses Linux eBPF/bcc. .SH SYNOPSIS -.B stackcount [\-h] [\-p PID] [\-i INTERVAL] [\-D DURATION] [\-T] [\-r] [\-s] - [\-P] [\-K] [\-U] [\-v] [\-d] [\-f] pattern +.B stackcount [\-h] [\-p PID] [\-c CPU] [\-i INTERVAL] [\-D DURATION] [\-T] [\-r] + [\-s] [\-P] [\-K] [\-U] [\-v] [\-d] [\-f] pattern .SH DESCRIPTION stackcount traces functions and frequency counts them with their entire stack trace, kernel stack and user stack, summarized in-kernel for efficiency. @@ -54,6 +54,9 @@ Folded output format. \-p PID Trace this process ID only (filtered in-kernel). .TP +\-c CPU +Trace this CPU only (filtered in-kernel). +.TP .TP pattern A function name, or a search pattern. Can include wildcards ("*"). If the @@ -104,6 +107,10 @@ Only count stacks when PID 185 is on-CPU: # .B stackcount \-p 185 ip_output .TP +Only count stacks for CPU 1: +# +.B stackcount \-c 1 put_prev_entity +.TP Count user stacks for dynamic heap allocations with malloc in PID 185: # .B stackcount \-p 185 c:malloc diff --git a/tools/stackcount.py b/tools/stackcount.py index 9dfc06f11..894a71a49 100755 --- a/tools/stackcount.py +++ b/tools/stackcount.py @@ -3,8 +3,8 @@ # stackcount Count events and their stack traces. # For Linux, uses BCC, eBPF. 
# -# USAGE: stackcount.py [-h] [-p PID] [-i INTERVAL] [-D DURATION] [-T] [-r] [-s] -# [-P] [-K] [-U] [-v] [-d] [-f] [--debug] +# USAGE: stackcount.py [-h] [-p PID] [-c CPU] [-i INTERVAL] [-D DURATION] [-T] +# [-r] [-s] [-P] [-K] [-U] [-v] [-d] [-f] [--debug] # # The pattern is a string with optional '*' wildcards, similar to file # globbing. If you'd prefer to use regular expressions, use the -r option. @@ -28,7 +28,7 @@ class Probe(object): def __init__(self, pattern, kernel_stack, user_stack, use_regex=False, - pid=None, per_pid=False): + pid=None, per_pid=False, cpu=None): """Init a new probe. Init the probe from the pattern provided by the user. The supported @@ -75,6 +75,7 @@ def __init__(self, pattern, kernel_stack, user_stack, use_regex=False, self.pid = pid self.per_pid = per_pid + self.cpu = cpu self.matched = 0 def is_kernel_probe(self): @@ -149,14 +150,18 @@ def load(self): BPF_STACK_TRACE(stack_traces, 1024); """ + filter_text = [] # We really mean the tgid from the kernel's perspective, which is in # the top 32 bits of bpf_get_current_pid_tgid(). if self.is_kernel_probe() and self.pid: - trace_count_text = trace_count_text.replace('FILTER', - ('u32 pid; pid = bpf_get_current_pid_tgid() >> 32; ' + - 'if (pid != %d) { return 0; }') % (self.pid)) - else: - trace_count_text = trace_count_text.replace('FILTER', '') + filter_text.append('u32 pid; pid = bpf_get_current_pid_tgid() >> 32; ' + + 'if (pid != %d) { return 0; }' % self.pid) + + if self.is_kernel_probe() and self.cpu: + filter_text.append('struct task_struct *task; task = (struct task_struct*)bpf_get_current_task(); ' + + 'if (task->cpu != %d) { return 0; }' % self.cpu) + + trace_count_text = trace_count_text.replace('FILTER', '\n '.join(filter_text)) # We need per-pid statistics when tracing a user-space process, because # the meaning of the symbols depends on the pid. 
We also need them if @@ -211,6 +216,7 @@ def __init__(self): ./stackcount -r '^tcp_send.*' # same as above, using regular expressions ./stackcount -Ti 5 ip_output # output every 5 seconds, with timestamps ./stackcount -p 185 ip_output # count ip_output stacks for PID 185 only + ./stackcount -c 1 put_prev_entity # count put_prev_entity stacks for CPU 1 only ./stackcount -p 185 c:malloc # count stacks for malloc in PID 185 ./stackcount t:sched:sched_fork # count stacks for sched_fork tracepoint ./stackcount -p 185 u:node:* # count stacks for all USDT probes in node @@ -223,6 +229,8 @@ def __init__(self): epilog=examples) parser.add_argument("-p", "--pid", type=int, help="trace this PID only") + parser.add_argument("-c", "--cpu", type=int, + help="trace this CPU only") parser.add_argument("-i", "--interval", help="summary interval, seconds") parser.add_argument("-D", "--duration", @@ -271,7 +279,7 @@ def __init__(self): self.probe = Probe(self.args.pattern, self.kernel_stack, self.user_stack, - self.args.regexp, self.args.pid, self.args.perpid) + self.args.regexp, self.args.pid, self.args.perpid, self.args.cpu) self.need_delimiter = self.args.delimited and not ( self.args.kernel_stacks_only or self.args.user_stacks_only) diff --git a/tools/stackcount_example.txt b/tools/stackcount_example.txt index 92a77a89c..389200f55 100644 --- a/tools/stackcount_example.txt +++ b/tools/stackcount_example.txt @@ -843,8 +843,8 @@ This folded output can be piped directly into flamegraph.pl (the Perl version). 
USAGE message: # ./stackcount -h -usage: stackcount [-h] [-p PID] [-i INTERVAL] [-D DURATION] [-T] [-r] [-s] - [-P] [-K] [-U] [-v] [-d] [-f] [--debug] +usage: stackcount [-h] [-p PID] [-c CPU] [-i INTERVAL] [-D DURATION] [-T] [-r] + [-s] [-P] [-K] [-U] [-v] [-d] [-f] [--debug] pattern Count events and their stack traces @@ -855,6 +855,7 @@ positional arguments: optional arguments: -h, --help show this help message and exit -p PID, --pid PID trace this PID only + -c CPU, --cpu CPU trace this CPU only -i INTERVAL, --interval INTERVAL summary interval, seconds -D DURATION, --duration DURATION @@ -886,5 +887,6 @@ examples: ./stackcount -p 185 c:malloc # count stacks for malloc in PID 185 ./stackcount t:sched:sched_fork # count stacks for sched_fork tracepoint ./stackcount -p 185 u:node:* # count stacks for all USDT probes in node + ./stackcount -c 1 put_prev_entity # count put_prev_entity stacks for CPU 1 only ./stackcount -K t:sched:sched_switch # kernel stacks only ./stackcount -U t:sched:sched_switch # user stacks only From 79a8e3c1303867a42f3c124b853a894c5b85963a Mon Sep 17 00:00:00 2001 From: yonghong-song Date: Thu, 11 Apr 2019 13:59:23 -0700 Subject: [PATCH 110/135] sync with latest libbpf repo (#2309) sync with latest libbpf (https://github.com/libbpf/libbpf) repo. 
Signed-off-by: Yonghong Song --- docs/kernel-versions.md | 2 + src/cc/compat/linux/virtual_bpf.h | 92 +++++++++++++++++++++++++++++-- src/cc/export/helpers.h | 7 +++ src/cc/libbpf | 2 +- src/cc/libbpf.c | 2 + 5 files changed, 98 insertions(+), 7 deletions(-) diff --git a/docs/kernel-versions.md b/docs/kernel-versions.md index 6890d10ac..3ec589bb3 100644 --- a/docs/kernel-versions.md +++ b/docs/kernel-versions.md @@ -235,11 +235,13 @@ Helper | Kernel version | License | Commit | `BPF_FUNC_skb_under_cgroup()` | 4.8 | | [`4a482f34afcc`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=4a482f34afcc162d8456f449b137ec2a95be60d8) `BPF_FUNC_skb_vlan_pop()` | 4.3 | | [`4e10df9a60d9`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=4e10df9a60d96ced321dd2af71da558c6b750078) `BPF_FUNC_skb_vlan_push()` | 4.3 | | [`4e10df9a60d9`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=4e10df9a60d96ced321dd2af71da558c6b750078) +`BPF_FUNC_skc_lookup_tcp()` | 5.2 | | [`edbf8c01de5a`](https://kernel.googlesource.com/pub/scm/linux/kernel/git/davem/net-next/+/edbf8c01de5a104a71ed6df2bf6421ceb2836a8e) `BPF_FUNC_sock_hash_update()` | 4.18 | | [`81110384441a`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=81110384441a59cff47430f20f049e69b98c17f4) `BPF_FUNC_sock_map_update()` | 4.14 | | [`174a79ff9515`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=174a79ff9515f400b9a6115643dafd62a635b7e6) `BPF_FUNC_spin_lock()` | 5.1 | | [`d83525ca62cf`](https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git/commit/?id=d83525ca62cf8ebe3271d14c36fb900c294274a2) `BPF_FUNC_spin_unlock()` | 5.1 | | [`d83525ca62cf`](https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git/commit/?id=d83525ca62cf8ebe3271d14c36fb900c294274a2) `BPF_FUNC_tail_call()` | 4.2 | | 
[`04fd61ab36ec`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=04fd61ab36ec065e194ab5e74ae34a5240d992bb) +`BPF_FUNC_tcp_check_syncookie()` | 5.2 | | [`399040847084`](https://kernel.googlesource.com/pub/scm/linux/kernel/git/davem/net-next/+/399040847084a69f345e0a52fd62f04654e0fce3) `BPF_FUNC_tcp_sock()` | 5.1 | | [`655a51e536c0`](https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit/?id=655a51e536c09d15ffa3603b1b6fce2b45b85a1f) `BPF_FUNC_trace_printk()` | 4.1 | GPL | [`9c959c863f82`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=9c959c863f8217a2ff3d7c296e8223654d240569) `BPF_FUNC_xdp_adjust_head()` | 4.10 | | [`17bedab27231`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=17bedab2723145d17b14084430743549e6943d03) diff --git a/src/cc/compat/linux/virtual_bpf.h b/src/cc/compat/linux/virtual_bpf.h index eac9c14f7..c01ae0c23 100644 --- a/src/cc/compat/linux/virtual_bpf.h +++ b/src/cc/compat/linux/virtual_bpf.h @@ -106,6 +106,7 @@ enum bpf_cmd { BPF_BTF_GET_FD_BY_ID, BPF_TASK_FD_QUERY, BPF_MAP_LOOKUP_AND_DELETE_ELEM, + BPF_MAP_FREEZE, }; enum bpf_map_type { @@ -256,8 +257,19 @@ enum bpf_attach_type { */ #define BPF_F_ANY_ALIGNMENT (1U << 1) -/* when bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_FD, bpf_ldimm64->imm == fd */ +/* When BPF ldimm64's insn[0].src_reg != 0 then this can have + * two extensions: + * + * insn[0].src_reg: BPF_PSEUDO_MAP_FD BPF_PSEUDO_MAP_VALUE + * insn[0].imm: map fd map fd + * insn[1].imm: 0 offset into value + * insn[0].off: 0 0 + * insn[1].off: 0 0 + * ldimm64 rewrite: address of map address of map[0]+offset + * verifier type: CONST_PTR_TO_MAP PTR_TO_MAP_VALUE + */ #define BPF_PSEUDO_MAP_FD 1 +#define BPF_PSEUDO_MAP_VALUE 2 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative * offset to another bpf function @@ -284,7 +296,7 @@ enum bpf_attach_type { #define BPF_OBJ_NAME_LEN 16U -/* Flags for accessing BPF object */ +/* Flags 
for accessing BPF object from syscall side. */ #define BPF_F_RDONLY (1U << 3) #define BPF_F_WRONLY (1U << 4) @@ -294,6 +306,10 @@ enum bpf_attach_type { /* Zero-initialize hash function seed. This should only be used for testing. */ #define BPF_F_ZERO_SEED (1U << 6) +/* Flags for accessing BPF object from program side. */ +#define BPF_F_RDONLY_PROG (1U << 7) +#define BPF_F_WRONLY_PROG (1U << 8) + /* flags for BPF_PROG_QUERY */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) @@ -397,6 +413,13 @@ union bpf_attr { __aligned_u64 data_out; __u32 repeat; __u32 duration; + __u32 ctx_size_in; /* input: len of ctx_in */ + __u32 ctx_size_out; /* input/output: len of ctx_out + * returns ENOSPC if ctx_out + * is too small. + */ + __aligned_u64 ctx_in; + __aligned_u64 ctx_out; } test; struct { /* anonymous struct used by BPF_*_GET_*_ID */ @@ -1479,13 +1502,27 @@ union bpf_attr { * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. * - * There is a single supported mode at this time: + * There are two supported modes at this time: + * + * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer + * (room space is added or removed below the layer 2 header). * * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer * (room space is added or removed below the layer 3 header). * - * All values for *flags* are reserved for future usage, and must - * be left at zero. + * The following flags are supported at this time: + * + * * **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size. + * Adjusting mss in this way is not allowed for datagrams. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 **: + * * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 **: + * Any new space is reserved to hold a tunnel header. + * Configure skb offsets and other fields accordingly. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L4_GRE **: + * * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP **: + * Use with ENCAP_L3 flags to further specify the tunnel type. 
* * A call to this helper is susceptible to change the underlaying * packet buffer. Therefore, at load time, all checks on pointers @@ -2432,6 +2469,38 @@ union bpf_attr { * Return * A **struct bpf_sock** pointer on success, or **NULL** in * case of failure. + * + * struct bpf_sock *bpf_skc_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) + * Description + * Look for TCP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-**NULL**, released via **bpf_sk_release**\ (). + * + * This function is identical to bpf_sk_lookup_tcp, except that it + * also returns timewait or request sockets. Use bpf_sk_fullsock + * or bpf_tcp_socket to access the full structure. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * Return + * Pointer to **struct bpf_sock**, or **NULL** in case of failure. + * For sockets with reuseport option, the **struct bpf_sock** + * result is from **reuse->socks**\ [] using the hash of the tuple. + * + * int bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) + * Description + * Check whether iph and th contain a valid SYN cookie ACK for + * the listening socket in sk. + * + * iph points to the start of the IPv4 or IPv6 header, while + * iph_len contains sizeof(struct iphdr) or sizeof(struct ip6hdr). + * + * th points to the start of the TCP header, while th_len contains + * sizeof(struct tcphdr). + * + * Return + * 0 if iph and th are a valid SYN cookie ACK, or a negative error + * otherwise. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2532,7 +2601,9 @@ union bpf_attr { FN(sk_fullsock), \ FN(tcp_sock), \ FN(skb_ecn_set_ce), \ - FN(get_listener_sock), + FN(get_listener_sock), \ + FN(skc_lookup_tcp), \ + FN(tcp_check_syncookie), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2591,9 +2662,18 @@ enum bpf_func_id { /* Current network namespace */ #define BPF_F_CURRENT_NETNS (-1L) +/* BPF_FUNC_skb_adjust_room flags. */ +#define BPF_F_ADJ_ROOM_FIXED_GSO (1ULL << 0) + +#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 (1ULL << 1) +#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 (1ULL << 2) +#define BPF_F_ADJ_ROOM_ENCAP_L4_GRE (1ULL << 3) +#define BPF_F_ADJ_ROOM_ENCAP_L4_UDP (1ULL << 4) + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, + BPF_ADJ_ROOM_MAC, }; /* Mode for BPF_FUNC_skb_load_bytes_relative helper. */ diff --git a/src/cc/export/helpers.h b/src/cc/export/helpers.h index 20f738718..95bbc47b6 100644 --- a/src/cc/export/helpers.h +++ b/src/cc/export/helpers.h @@ -341,10 +341,17 @@ static int (*bpf_skb_adjust_room)(void *ctx, int len_diff, u32 mode, u64 flags) (void *) BPF_FUNC_skb_adjust_room; static int (*bpf_skb_under_cgroup)(void *ctx, void *map, int index) = (void *) BPF_FUNC_skb_under_cgroup; +static struct bpf_sock *(*bpf_skc_lookup_tcp)(void *ctx, struct bpf_sock_tuple *tuple, int size, + unsigned long long netns_id, + unsigned long long flags) = + (void *) BPF_FUNC_skc_lookup_tcp; static int (*bpf_sk_redirect_map)(void *ctx, void *map, int key, int flags) = (void *) BPF_FUNC_sk_redirect_map; static int (*bpf_sock_map_update)(void *map, void *key, void *value, unsigned long long flags) = (void *) BPF_FUNC_sock_map_update; +static int (*bpf_tcp_check_syncookie)(struct bpf_sock *sk, void *ip, int ip_len, void *tcp, + int tcp_len) = + (void *) BPF_FUNC_tcp_check_syncookie; static int (*bpf_xdp_adjust_meta)(void *ctx, int offset) = (void *) 
BPF_FUNC_xdp_adjust_meta; diff --git a/src/cc/libbpf b/src/cc/libbpf index 33b017498..5844f6e4d 160000 --- a/src/cc/libbpf +++ b/src/cc/libbpf @@ -1 +1 @@ -Subproject commit 33b017498543167b65fa948d3a0267794c78787f +Subproject commit 5844f6e4dd60d8c941417bcaafe0785c61415195 diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c index 122745171..6df244ce7 100644 --- a/src/cc/libbpf.c +++ b/src/cc/libbpf.c @@ -195,6 +195,8 @@ static struct bpf_helper helpers[] = { {"tcp_sock", "5.1"}, {"skb_ecn_set_ce", "5.1"}, {"get_listener_sock", "5.1"}, + {"skc_lookup_tcp", "5.2"}, + {"tcp_check_syncookie", "5.2"}, }; static uint64_t ptr_to_u64(void *ptr) From 89dc38e189be99db59dc61ba7d853a4c1b6513e8 Mon Sep 17 00:00:00 2001 From: Antonio Ospite Date: Fri, 12 Apr 2019 17:57:47 +0200 Subject: [PATCH 111/135] man: remove duplicated description of '-f' option from offcputime.8 (#2311) In commit 66bf2e8e (offcputime: one symbol cache per process, improve pid/tid handling, 2016-07-31) the '-f' option was documented again probably with the intent of following the order of options in the synopsis, but it was already documented. Remove the first instance and keep the new one to follow the same order of the synopsis, but use the original description. --- man/man8/offcputime.8 | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/man/man8/offcputime.8 b/man/man8/offcputime.8 index 440c1dd48..be5387b91 100644 --- a/man/man8/offcputime.8 +++ b/man/man8/offcputime.8 @@ -34,9 +34,6 @@ CONFIG_BPF and bcc. \-h Print usage message. .TP -\-f -Print output in folded stack format. -.TP \-p PID Trace this process ID only (filtered in-kernel). .TP @@ -59,7 +56,7 @@ Show stacks from kernel space only (no user space stacks). Insert delimiter between kernel/user stacks. .TP \-f -Output folded format. +Print output in folded stack format. .TP \-\-stack-storage-size STACK_STORAGE_SIZE Change the number of unique stack traces that can be stored and displayed. 
From 2a9436d76b0082359444cf5216ada20537c526fc Mon Sep 17 00:00:00 2001 From: Xiaozhou Liu Date: Sun, 14 Apr 2019 13:56:39 +0800 Subject: [PATCH 112/135] stackcount: do per-pid statistics only if -P is provided (#2313) Fixes #2308. --- man/man8/stackcount.8 | 13 +++- tools/stackcount.py | 8 +-- tools/stackcount_example.txt | 115 ++++++++++++++++++++++++++++------- 3 files changed, 107 insertions(+), 29 deletions(-) diff --git a/man/man8/stackcount.8 b/man/man8/stackcount.8 index f577fa75a..d24595616 100644 --- a/man/man8/stackcount.8 +++ b/man/man8/stackcount.8 @@ -2,8 +2,8 @@ .SH NAME stackcount \- Count function calls and their stack traces. Uses Linux eBPF/bcc. .SH SYNOPSIS -.B stackcount [\-h] [\-p PID] [\-c CPU] [\-i INTERVAL] [\-D DURATION] [\-T] [\-r] - [\-s] [\-P] [\-K] [\-U] [\-v] [\-d] [\-f] pattern +.B stackcount [\-h] [\-p PID] [\-c CPU] [\-i INTERVAL] [\-D DURATION] [\-T] + [\-r] [\-s] [\-P] [\-K] [\-U] [\-v] [\-d] [\-f] [\-\-debug] pattern .SH DESCRIPTION stackcount traces functions and frequency counts them with their entire stack trace, kernel stack and user stack, summarized in-kernel for efficiency. @@ -31,6 +31,15 @@ wildcards only. \-s Show address offsets. .TP +\-P +Display stacks separately for each process. +.TP +\-K +Show kernel stack only. +.TP +\-U +Show user stack only. +.TP \-T Include a timestamp with interval output. .TP diff --git a/tools/stackcount.py b/tools/stackcount.py index 894a71a49..8c056d684 100755 --- a/tools/stackcount.py +++ b/tools/stackcount.py @@ -163,16 +163,14 @@ def load(self): trace_count_text = trace_count_text.replace('FILTER', '\n '.join(filter_text)) - # We need per-pid statistics when tracing a user-space process, because - # the meaning of the symbols depends on the pid. We also need them if - # per-pid statistics were requested with -P, or for user stacks. 
- if self.per_pid or not self.is_kernel_probe() or self.user_stack: + # Do per-pid statistics iff -P is provided + if self.per_pid: trace_count_text = trace_count_text.replace('GET_TGID', 'bpf_get_current_pid_tgid() >> 32') trace_count_text = trace_count_text.replace('STORE_COMM', 'bpf_get_current_comm(&key.name, sizeof(key.name));') else: - # kernel stacks only. skip splitting on PID so these aggregate + # skip splitting on PID so these aggregate # together, and don't store the process name. trace_count_text = trace_count_text.replace( 'GET_TGID', '0xffffffff') diff --git a/tools/stackcount_example.txt b/tools/stackcount_example.txt index 389200f55..26233d798 100644 --- a/tools/stackcount_example.txt +++ b/tools/stackcount_example.txt @@ -8,6 +8,78 @@ block device I/O: # ./stackcount submit_bio Tracing 1 functions for "submit_bio"... Hit Ctrl-C to end. +^C + submit_bio + submit_bh + journal_submit_commit_record.isra.13 + jbd2_journal_commit_transaction + kjournald2 + kthread + ret_from_fork + mb_cache_list + 1 + + submit_bio + __block_write_full_page.constprop.39 + block_write_full_page + blkdev_writepage + __writepage + write_cache_pages + generic_writepages + do_writepages + __writeback_single_inode + writeback_sb_inodes + __writeback_inodes_wb + 2 + + submit_bio + __block_write_full_page.constprop.39 + block_write_full_page + blkdev_writepage + __writepage + write_cache_pages + generic_writepages + do_writepages + __filemap_fdatawrite_range + filemap_fdatawrite + fdatawrite_one_bdev + 36 + + submit_bio + submit_bh + jbd2_journal_commit_transaction + kjournald2 + kthread + ret_from_fork + mb_cache_list + 38 + + submit_bio + ext4_writepages + do_writepages + __filemap_fdatawrite_range + filemap_flush + ext4_alloc_da_blocks + ext4_rename + ext4_rename2 + vfs_rename + sys_rename + entry_SYSCALL_64_fastpath + 79 + +Detaching... + +The output shows unique stack traces, in order from leaf (on-CPU) to root, +followed by their occurrence count. 
The last stack trace in the above output +shows syscall handling, ext4_rename(), and filemap_flush(): looks like an +application issued file rename has caused back end disk I/O due to ext4 +block allocation and a filemap_flush(). + + +Now adding the -P option to display stacks separately for each process: + +# ./stackcount -P submit_bio +Tracing 1 functions for "submit_bio"... Hit Ctrl-C to end. ^C submit_bio ext4_writepages @@ -64,15 +136,14 @@ Tracing 1 functions for "submit_bio"... Hit Ctrl-C to end. Detaching... -The output shows unique stack traces, in order from leaf (on-CPU) to root, -followed by their occurrence count. The last stack trace in the above output -shows syscall handling, sys_read(), vfs_read(), and then "readahead" functions: -looks like an application issued file read has triggered read ahead. The -application can be seen after the stack trace, in this case, "tar [15069]" -for the "tar" command, PID 15069. +The last stack trace in the above output shows syscall handling, sys_read(), +vfs_read(), and then "readahead" functions: looks like an application issued +file read has triggered read ahead. With "-P", the application can be seen +after the stack trace, in this case, "tar [15069]" for the "tar" command, +PID 15069. The order of printed stack traces is from least to most frequent. The most -frequent in this case, the ext4_rename() stack, was taken 113 times during +frequent in this case, the ext4_readpages() stack, was taken 113 times during tracing. The "[unknown]" frames are from user-level, since this simple workload is @@ -82,7 +153,7 @@ walkers. Similar broken stacks will be seen by other profilers and debuggers that use frame pointers. Hopefully your application preserves them so that the user-level stack trace is visible. So how does one get frame pointers, if your application doesn't have them to start with? 
For the current bcc (until -it supports other stack walkers), you need to be running a application binaries +it supports other stack walkers), you need to be running an application binaries that preserves frame pointers, eg, using gcc's -fno-omit-frame-pointer. That's about all I'll say here: this is a big topic that is not bcc/BPF specific. @@ -92,7 +163,7 @@ disk IOPS. These could have in-kernel origins (eg, background scrub). Now adding the -d option to delimit kernel and user stacks: -# ./stackcount -d submit_bio +# ./stackcount -P -d submit_bio Tracing 1 functions for "submit_bio"... Hit Ctrl-C to end. ^C submit_bio @@ -181,7 +252,7 @@ A "--" is printed between the kernel and user stacks. As a different example, here is the kernel function hrtimer_init_sleeper(): -# ./stackcount.py -d hrtimer_init_sleeper +# ./stackcount.py -P -d hrtimer_init_sleeper Tracing 1 functions for "hrtimer_init_sleeper"... Hit Ctrl-C to end. ^C hrtimer_init_sleeper @@ -294,7 +365,7 @@ JIT symbol translation). dockerd and containerd don't have frame pointers Here's another kernel function, ip_output(): -# ./stackcount.py -d ip_output +# ./stackcount.py -P -d ip_output Tracing 1 functions for "ip_output"... Hit Ctrl-C to end. ^C ip_output @@ -391,7 +462,7 @@ was the same. Here is just the user stacks, fetched during the kernel function ip_output(): -# ./stackcount.py -U ip_output +# ./stackcount.py -P -U ip_output Tracing 1 functions for "ip_output"... Hit Ctrl-C to end. ^C [unknown] @@ -416,7 +487,7 @@ User-space functions can also be traced if a library name is provided. For example, to quickly identify code locations that allocate heap memory for PID 4902 (using -p), by tracing malloc from libc ("c:malloc"): -# ./stackcount -p 4902 c:malloc +# ./stackcount -P -p 4902 c:malloc Tracing 1 functions for "malloc"... Hit Ctrl-C to end. ^C malloc @@ -444,12 +515,12 @@ without debuginfo. 
In addition to kernel and user-space functions, kernel tracepoints and USDT -tracepoints are also supported. +tracepoints are also supported. -For example, to determine where threads are being created in a particular +For example, to determine where threads are being created in a particular process, use the pthread_create USDT tracepoint: -# ./stackcount -p $(pidof parprimes) u:pthread:pthread_create +# ./stackcount -P -p $(pidof parprimes) u:pthread:pthread_create Tracing 1 functions for "u:pthread:pthread_create"... Hit Ctrl-C to end. ^C @@ -463,10 +534,10 @@ Tracing 1 functions for "u:pthread:pthread_create"... Hit Ctrl-C to end. You can use "readelf -n file" to see if it has USDT tracepoints. -Similarly, to determine where context switching is happening in the kernel, +Similarly, to determine where context switching is happening in the kernel, use the sched:sched_switch kernel tracepoint: -# ./stackcount t:sched:sched_switch +# ./stackcount -P t:sched:sched_switch __schedule schedule worker_thread @@ -518,7 +589,7 @@ use the sched:sched_switch kernel tracepoint: A -i option can be used to set an output interval, and -T to include a timestamp. For example: -# ./stackcount.py -Tdi 1 submit_bio +# ./stackcount.py -P -Tdi 1 submit_bio Tracing 1 functions for "submit_bio"... Hit Ctrl-C to end. 06:05:13 @@ -705,7 +776,7 @@ did not span block device I/O. The -s output prints the return instruction offset for each function (aka symbol offset). Eg: -# ./stackcount.py -s tcp_sendmsg +# ./stackcount.py -P -s tcp_sendmsg Tracing 1 functions for "tcp_sendmsg"... Hit Ctrl-C to end. ^C tcp_sendmsg+0x1 @@ -738,7 +809,7 @@ offset can help you locate the lines of code from a disassembly dump. The -v output is verbose, and shows raw addresses: -./stackcount.py -v tcp_sendmsg +./stackcount.py -P -v tcp_sendmsg Tracing 1 functions for "tcp_sendmsg"... Hit Ctrl-C to end. ^C ffffffff817b05c1 tcp_sendmsg @@ -825,7 +896,7 @@ Use -r to allow regular expressions. 
The -f option will emit folded output, which can be used as input to other tools including flame graphs. For example, with delimiters as well: -# ./stackcount.py -df t:sched:sched_switch +# ./stackcount.py -P -df t:sched:sched_switch ^Csnmp-pass;[unknown];[unknown];[unknown];[unknown];[unknown];-;entry_SYSCALL_64_fastpath;SyS_select;core_sys_select;do_select;poll_schedule_timeout;schedule_hrtimeout_range;schedule_hrtimeout_range_clock;schedule;__schedule 1 kworker/7:0;-;ret_from_fork;kthread;worker_thread;schedule;__schedule 1 watchdog/0;-;ret_from_fork;kthread;smpboot_thread_fn;schedule;__schedule 1 From 6b1979037b5fc526224c627537df34b0d7f9ed17 Mon Sep 17 00:00:00 2001 From: Xiaozhou Liu Date: Tue, 16 Apr 2019 05:41:18 +0800 Subject: [PATCH 113/135] tools/profile: add -L option to support filtering on TID (#2315) tools/profile already supports "-p PID" to filter on PID (tgid in kernel). Now add "-L TID" to profile thread with this TID only (pid in kernel). --- man/man8/profile.8 | 14 ++++++++++---- tools/profile.py | 24 +++++++++++++++--------- tools/profile_example.txt | 8 +++++--- 3 files changed, 30 insertions(+), 16 deletions(-) diff --git a/man/man8/profile.8 b/man/man8/profile.8 index e2b6a8438..88311e7e2 100644 --- a/man/man8/profile.8 +++ b/man/man8/profile.8 @@ -2,7 +2,7 @@ .SH NAME profile \- Profile CPU usage by sampling stack traces. Uses Linux eBPF/bcc. .SH SYNOPSIS -.B profile [\-adfh] [\-p PID] [\-U | \-K] [\-F FREQUENCY | \-c COUNT] +.B profile [\-adfh] [\-p PID | \-L TID] [\-U | \-K] [\-F FREQUENCY | \-c COUNT] .B [\-\-stack\-storage\-size COUNT] [duration] .SH DESCRIPTION This is a CPU profiler. It works by taking samples of stack traces at timed @@ -28,8 +28,10 @@ for an older version that may work on Linux 4.6 - 4.8. Print usage message. .TP \-p PID -Trace this process ID only (filtered in-kernel). Without this, all CPUs are -profiled. +Trace this process ID only (filtered in-kernel). 
+.TP +\-L TID +Trace this thread ID only (filtered in-kernel). .TP \-F frequency Frequency to sample stacks. @@ -80,10 +82,14 @@ Profile 1 in a million events for 5 seconds only: # .B profile -c 1000000 5 .TP -Profile PID 181 only: +Profile process with PID 181 only: # .B profile -p 181 .TP +Profile thread with TID 181 only: +# +.B profile -L 181 +.TP Profile for 5 seconds and output in folded stack format (suitable as input for flame graphs), including a delimiter between kernel and user stacks: # .B profile -df 5 diff --git a/tools/profile.py b/tools/profile.py index 958b6323e..dfbced6aa 100755 --- a/tools/profile.py +++ b/tools/profile.py @@ -34,7 +34,6 @@ import signal import os import errno -import multiprocessing # # Process Arguments @@ -69,7 +68,8 @@ def stack_id_err(stack_id): ./profile -c 1000000 # profile stack traces every 1 in a million events ./profile 5 # profile at 49 Hertz for 5 seconds only ./profile -f 5 # output in folded format for flame graphs - ./profile -p 185 # only profile threads for PID 185 + ./profile -p 185 # only profile process with PID 185 + ./profile -L 185 # only profile thread with TID 185 ./profile -U # only show user space stacks (no kernel) ./profile -K # only show kernel space stacks (no user) """ @@ -79,7 +79,9 @@ def stack_id_err(stack_id): epilog=examples) thread_group = parser.add_mutually_exclusive_group() thread_group.add_argument("-p", "--pid", type=positive_int, - help="profile this PID only") + help="profile process with this PID only") +thread_group.add_argument("-L", "--tid", type=positive_int, + help="profile thread with this TID only") # TODO: add options for user/kernel threads only stack_group = parser.add_mutually_exclusive_group() stack_group.add_argument("-U", "--user-stacks-only", action="store_true", @@ -144,7 +146,10 @@ def stack_id_err(stack_id): // This code gets a bit complex. Probably not suitable for casual hacking. 
int do_perf_event(struct bpf_perf_event_data *ctx) { - u32 pid = bpf_get_current_pid_tgid() >> 32; + u64 id = bpf_get_current_pid_tgid(); + u32 tgid = id >> 32; + u32 pid = id; + if (IDLE_FILTER) return 0; @@ -152,7 +157,7 @@ def stack_id_err(stack_id): return 0; // create map key - struct key_t key = {.pid = pid}; + struct key_t key = {.pid = tgid}; bpf_get_current_comm(&key.name, sizeof(key.name)); // get stacks @@ -197,13 +202,14 @@ def stack_id_err(stack_id): idle_filter = "0" bpf_text = bpf_text.replace('IDLE_FILTER', idle_filter) -# set thread filter +# set process/thread filter thread_context = "" -perf_filter = "-a" if args.pid is not None: thread_context = "PID %s" % args.pid - thread_filter = 'pid == %s' % args.pid - perf_filter = '-p %s' % args.pid + thread_filter = 'tgid == %s' % args.pid +elif args.tid is not None: + thread_context = "TID %s" % args.tid + thread_filter = 'pid == %s' % args.tid else: thread_context = "all threads" thread_filter = '1' diff --git a/tools/profile_example.txt b/tools/profile_example.txt index 7b1cc2683..9b1e5c2b9 100644 --- a/tools/profile_example.txt +++ b/tools/profile_example.txt @@ -707,7 +707,7 @@ Run ./profile -h to see the default. 
USAGE message: # ./profile -h -usage: profile.py [-h] [-p PID] [-U | -K] [-F FREQUENCY | -c COUNT] [-d] [-a] +usage: profile.py [-h] [-p PID | -L TID] [-U | -K] [-F FREQUENCY | -c COUNT] [-d] [-a] [-I] [-f] [--stack-storage-size STACK_STORAGE_SIZE] [-C CPU] [duration] @@ -718,7 +718,8 @@ positional arguments: optional arguments: -h, --help show this help message and exit - -p PID, --pid PID profile this PID only + -p PID, --pid PID profile process with this PID only + -L TID, --tid TID profile thread with this TID only -U, --user-stacks-only show stacks from user space only (no kernel space stacks) @@ -745,6 +746,7 @@ examples: ./profile -c 1000000 # profile stack traces every 1 in a million events ./profile 5 # profile at 49 Hertz for 5 seconds only ./profile -f 5 # output in folded format for flame graphs - ./profile -p 185 # only profile threads for PID 185 + ./profile -p 185 # only profile process with PID 185 + ./profile -L 185 # only profile thread with TID 185 ./profile -U # only show user space stacks (no kernel) ./profile -K # only show kernel space stacks (no user) From 47618fedb73e39533398ed8940378b75ceea064e Mon Sep 17 00:00:00 2001 From: Dale Hamel Date: Tue, 16 Apr 2019 12:07:28 -0400 Subject: [PATCH 114/135] Support loading elf notes from memfd backed map (#2314) * Support loading elf notes from file backed on memfd * Use const for arguments to _procutils_memfd_path * Cleanups for procuptils_memfd_path --- src/cc/bcc_proc.c | 52 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 3 deletions(-) diff --git a/src/cc/bcc_proc.c b/src/cc/bcc_proc.c index ccec0fc27..af91d8f09 100644 --- a/src/cc/bcc_proc.c +++ b/src/cc/bcc_proc.c @@ -14,10 +14,8 @@ * limitations under the License. 
*/ -#include -#include -#include #include +#include #include #include #include @@ -26,6 +24,9 @@ #include #include #include +#include +#include +#include #include #include "bcc_perf_map.h" @@ -82,6 +83,42 @@ int bcc_mapping_is_file_backed(const char *mapname) { STARTS_WITH(mapname, "[vsyscall]")); } +/* +Finds a file descriptor for a given inode if it's a memory-backed fd. +*/ +static char *_procutils_memfd_path(const int pid, const uint64_t inum) { + char path_buffer[PATH_MAX + 1]; + char *path = NULL; + char *dirstr; + DIR *dirstream; + struct stat sb; + struct dirent *dent; + + snprintf(path_buffer, (PATH_MAX + 1), "/proc/%d/fd", pid); + dirstr = malloc(strlen(path_buffer) + 1); + strcpy(dirstr, path_buffer); + dirstream = opendir(dirstr); + + if (dirstream == NULL) + return NULL; + + while (path == NULL && (dent = readdir(dirstream)) != NULL) { + snprintf(path_buffer, (PATH_MAX + 1), "%s/%s", dirstr, dent->d_name); + if (stat(path_buffer, &sb) == -1) + continue; + + if (sb.st_ino == inum) { + char *pid_fd_path = malloc(strlen(path_buffer) + 1); + strcpy(pid_fd_path, path_buffer); + path = pid_fd_path; + } + } + closedir(dirstream); + free(dirstr); + + return path; +} + int bcc_procutils_each_module(int pid, bcc_procutils_modulecb callback, void *payload) { char procmap_filename[128]; @@ -112,6 +149,15 @@ int bcc_procutils_each_module(int pid, bcc_procutils_modulecb callback, if (!bcc_mapping_is_file_backed(name)) continue; + if (strstr(name, "/memfd:")) { + char *memfd_name = _procutils_memfd_path(pid, inode); + if (memfd_name != NULL) { + strcpy(buf, memfd_name); + free(memfd_name); + name = buf; + } + } + if (callback(name, begin, end, (uint64_t)offset, true, payload) < 0) break; } From 9d035b86a985f48cc7e9b586419a12ee6631d3b3 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 16 Apr 2019 19:53:58 -0700 Subject: [PATCH 115/135] tools: fix stacktraces formatting in memleak.py (#2316) Fix printing of stacktraces. 
Example: Before this patch: Attaching to kernel allocators, Ctrl+C to quit. [14:11:43] Top 10 stacks with outstanding allocations: 8 bytes in 1 allocations from stack b'pcpu_alloc+0x3d8 [kernel]\n\t\tpcpu_alloc+0x3d8 [kernel]\n\t\tperf_trace_event_init+0xc9 [kernel]\n\t\tperf_trace_init+0x69 [kernel]\n\t\tperf_tp_event_init+0x1b [kernel]\n\t\tperf_try_init_event+0x42 [kernel]\n\t\tperf_event_alloc+0x620 [kernel]\n\t\t__do_sys_perf_event_open+0x188 [kernel]\n\t\tdo_syscall_64+0x48 [kernel]\n\t\tentry_SYSCALL_64_after_hwframe+0x44 [kernel]' 16 bytes in 1 allocations from stack b'pcpu_alloc+0x3d8 [kernel]\n\t\tpcpu_alloc+0x3d8 [kernel]\n\t\tbpf_prog_alloc+0x33 [kernel]\n\t\tbpf_prog_load+0xf5 [kernel]\n\t\tperf_event_for_each_child+0x34 [kernel]\n\t\t_perf_ioctl+0x1d7 [kernel]\n\t\t__switch_to_asm+0x34 [kernel]\n\t\t__switch_to_asm+0x40 [kernel]\n\t\t__switch_to_asm+0x34 [kernel]\n\t\t__switch_to_asm+0x40 [kernel]\n\t\t__switch_to_asm+0x34 [kernel]\n\t\t__switch_to_asm+0x40 [kernel]\n\t\t__do_sys_bpf+0x953 [kernel]\n\t\tperf_ioctl+0x40 [kernel]\n\t\tdo_vfs_ioctl+0xa5 [kernel]\n\t\tdo_syscall_64+0x48 [kernel]\n\t\tentry_SYSCALL_64_after_hwframe+0x44 [kernel]' With this patch: Attaching to kernel allocators, Ctrl+C to quit. 
[14:13:09] Top 10 stacks with outstanding allocations: 576 bytes in 3 allocations from stack kmem_cache_alloc+0x15c [kernel] __d_alloc+0x22 [kernel] kmem_cache_alloc+0x15c [kernel] __cpa_flush_tlb+0x0 [kernel] __d_alloc+0x22 [kernel] alloc_file_pseudo+0x65 [kernel] anon_inode_getfile+0x7f [kernel] anon_inode_getfd+0x35 [kernel] bpf_prog_load+0x3ef [kernel] _perf_ioctl+0x1d7 [kernel] alloc_file_pseudo+0xa7 [kernel] __do_sys_bpf+0x953 [kernel] perf_ioctl+0x40 [kernel] do_vfs_ioctl+0xa5 [kernel] do_syscall_64+0x48 [kernel] entry_SYSCALL_64_after_hwframe+0x44 [kernel] 768 bytes in 4 allocations from stack kmem_cache_alloc+0x15c [kernel] __d_alloc+0x22 [kernel] kmem_cache_alloc+0x15c [kernel] __d_alloc+0x22 [kernel] alloc_file_pseudo+0x65 [kernel] ns_capable_common+0x2b [kernel] anon_inode_getfile+0x7f [kernel] __do_sys_perf_event_open+0x86f [kernel] do_syscall_64+0x48 [kernel] entry_SYSCALL_64_after_hwframe+0x44 [kernel] --- tools/memleak.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/memleak.py b/tools/memleak.py index 64c5972d2..29419b3a1 100755 --- a/tools/memleak.py +++ b/tools/memleak.py @@ -472,7 +472,8 @@ def print_outstanding(): key=lambda a: a.size)[-top_stacks:] for alloc in to_show: print("\t%d bytes in %d allocations from stack\n\t\t%s" % - (alloc.size, alloc.count, b"\n\t\t".join(alloc.stack))) + (alloc.size, alloc.count, + b"\n\t\t".join(alloc.stack).decode("ascii"))) def print_outstanding_combined(): stack_traces = bpf["stack_traces"] From 9c8e2a6ab7a4331ba71b4c7638c9531e430194e4 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 15 Apr 2019 14:08:39 -0700 Subject: [PATCH 116/135] tools: add percpu allocator tracing to memleak Add an ability to trace percpu allocations using the memleak tool. Example: ./memleak.py --percpu Attaching to kernel allocators, Ctrl+C to quit. 
[14:15:46] Top 10 stacks with outstanding allocations: 8 bytes in 1 allocations from stack pcpu_alloc+0x3d8 [kernel] pcpu_alloc+0x3d8 [kernel] perf_trace_event_init+0xc9 [kernel] perf_trace_init+0x69 [kernel] perf_tp_event_init+0x1b [kernel] perf_try_init_event+0x42 [kernel] perf_event_alloc+0x620 [kernel] __do_sys_perf_event_open+0x188 [kernel] do_syscall_64+0x48 [kernel] entry_SYSCALL_64_after_hwframe+0x44 [kernel] 16 bytes in 1 allocations from stack pcpu_alloc+0x3d8 [kernel] pcpu_alloc+0x3d8 [kernel] bpf_prog_alloc+0x33 [kernel] bpf_prog_load+0xf5 [kernel] perf_event_for_each_child+0x34 [kernel] _perf_ioctl+0x1d7 [kernel] alloc_file_pseudo+0xa7 [kernel] __do_sys_bpf+0x953 [kernel] perf_ioctl+0x40 [kernel] do_vfs_ioctl+0xa5 [kernel] do_syscall_64+0x48 [kernel] entry_SYSCALL_64_after_hwframe+0x44 [kernel] --- tools/memleak.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/tools/memleak.py b/tools/memleak.py index 29419b3a1..012ab9108 100755 --- a/tools/memleak.py +++ b/tools/memleak.py @@ -100,6 +100,8 @@ def run_command_get_pid(command): help="attach to allocator functions in the specified object") parser.add_argument("--ebpf", action="store_true", help=argparse.SUPPRESS) +parser.add_argument("--percpu", default=False, action="store_true", + help="trace percpu allocations") args = parser.parse_args() @@ -365,8 +367,23 @@ def run_command_get_pid(command): } """ +bpf_source_percpu = """ + +TRACEPOINT_PROBE(percpu, percpu_alloc_percpu) { + gen_alloc_enter((struct pt_regs *)args, args->size); + return gen_alloc_exit2((struct pt_regs *)args, (size_t)args->ptr); +} + +TRACEPOINT_PROBE(percpu, percpu_free_percpu) { + return gen_free_enter((struct pt_regs *)args, (void *)args->ptr); +} +""" + if kernel_trace: - bpf_source += bpf_source_kernel + if args.percpu: + bpf_source += bpf_source_percpu + else: + bpf_source += bpf_source_kernel bpf_source = bpf_source.replace("SHOULD_PRINT", "1" if trace_all else "0") bpf_source = 
bpf_source.replace("SAMPLE_EVERY_N", str(sample_every_n)) From d2a4626dacc33d3dca9a7e6ea2ef4bdaa6ef3c74 Mon Sep 17 00:00:00 2001 From: Gary Lin Date: Wed, 17 Apr 2019 15:23:16 +0800 Subject: [PATCH 117/135] Convert bytes to string for re in get_tracepoints() When executing funccount with python3, the following error showed. # python3 funccount.py -D 't:block:*' Traceback (most recent call last): File "funccount.py", line 299, in Tool().run() File "funccount.py", line 261, in run self.probe.load() File "funccount.py", line 191, in load bpf_text += self._generate_functions(trace_count_text) File "funccount.py", line 143, in _generate_functions tracepoints = BPF.get_tracepoints(self.pattern) File "/usr/lib/python3.7/site-packages/bcc/__init__.py", line 772, in get_tracepoints if re.match(tp_re, tp): File "/usr/lib64/python3.7/re.py", line 173, in match return _compile(pattern, flags).match(string) TypeError: cannot use a bytes pattern on a string-like object This commit converts 'tp_re' from bytes to string to avoid the crash. 
Signed-off-by: Gary Lin --- src/python/bcc/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/bcc/__init__.py b/src/python/bcc/__init__.py index 6f114de89..bff5f2820 100644 --- a/src/python/bcc/__init__.py +++ b/src/python/bcc/__init__.py @@ -769,7 +769,7 @@ def get_tracepoints(tp_re): evt_dir = os.path.join(cat_dir, event) if os.path.isdir(evt_dir): tp = ("%s:%s" % (category, event)) - if re.match(tp_re, tp): + if re.match(tp_re.decode(), tp): results.append(tp) return results From 7caf21aac8ba2ab3e0b8a004310d424064486607 Mon Sep 17 00:00:00 2001 From: Dale Hamel Date: Wed, 17 Apr 2019 22:21:54 -0400 Subject: [PATCH 118/135] Free local string variable on error path when parsing elf notes from memfd-backed file --- src/cc/bcc_proc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/cc/bcc_proc.c b/src/cc/bcc_proc.c index af91d8f09..6ce63de89 100644 --- a/src/cc/bcc_proc.c +++ b/src/cc/bcc_proc.c @@ -99,8 +99,10 @@ static char *_procutils_memfd_path(const int pid, const uint64_t inum) { strcpy(dirstr, path_buffer); dirstream = opendir(dirstr); - if (dirstream == NULL) + if (dirstream == NULL) { + free(dirstr); return NULL; + } while (path == NULL && (dent = readdir(dirstream)) != NULL) { snprintf(path_buffer, (PATH_MAX + 1), "%s/%s", dirstr, dent->d_name); From e76a7c7126deecb87ddf2d1cbcc7881b9a941b22 Mon Sep 17 00:00:00 2001 From: Ashley Davies Date: Thu, 18 Apr 2019 17:29:33 +0100 Subject: [PATCH 119/135] Add timeout to perf_buffer_poll docs --- docs/reference_guide.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/reference_guide.md b/docs/reference_guide.md index 1eb46995e..07ff4cb12 100644 --- a/docs/reference_guide.md +++ b/docs/reference_guide.md @@ -1244,10 +1244,12 @@ Normal output from a BPF program is either: ### 1. 
perf_buffer_poll() -Syntax: ```BPF.perf_buffer_poll()``` +Syntax: ```BPF.perf_buffer_poll([timeout])``` This polls from all open perf ring buffers, calling the callback function that was provided when calling open_perf_buffer for each entry. +The timeout parameter is optional and measured in milliseconds. + Example: ```Python From 057a16fd206cd32dd8e58f413baebff0be625230 Mon Sep 17 00:00:00 2001 From: Ashley Davies Date: Thu, 18 Apr 2019 17:32:40 +0100 Subject: [PATCH 120/135] Update reference_guide.md --- docs/reference_guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/reference_guide.md b/docs/reference_guide.md index 07ff4cb12..9e1cbcbe6 100644 --- a/docs/reference_guide.md +++ b/docs/reference_guide.md @@ -1248,7 +1248,7 @@ Syntax: ```BPF.perf_buffer_poll([timeout])``` This polls from all open perf ring buffers, calling the callback function that was provided when calling open_perf_buffer for each entry. -The timeout parameter is optional and measured in milliseconds. +The timeout parameter is optional and measured in milliseconds. In its absence, polling continues indefinitely. Example: From 1bbfdb6050903bef8b89c3c2c1d1a8373e2cc04a Mon Sep 17 00:00:00 2001 From: Ashley Davies Date: Thu, 18 Apr 2019 17:35:51 +0100 Subject: [PATCH 121/135] Update reference_guide.md --- docs/reference_guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/reference_guide.md b/docs/reference_guide.md index 9e1cbcbe6..49e9169e1 100644 --- a/docs/reference_guide.md +++ b/docs/reference_guide.md @@ -1244,7 +1244,7 @@ Normal output from a BPF program is either: ### 1. perf_buffer_poll() -Syntax: ```BPF.perf_buffer_poll([timeout])``` +Syntax: ```BPF.perf_buffer_poll(timeout=T)``` This polls from all open perf ring buffers, calling the callback function that was provided when calling open_perf_buffer for each entry. 
From 6c793317dac5866db2899e62504d047a02c089b7 Mon Sep 17 00:00:00 2001 From: Gary Lin Date: Thu, 18 Apr 2019 15:17:56 +0800 Subject: [PATCH 122/135] tools: don't mix print(end="") with printb() While mixing print(end="") with printb(), some messages may miss due to the underlying buffer handling in python 3. For example: # python3 opensnoop.py PID COMM FD ERR PATH /proc/18849/cmdline 4109 tmux: server 67 0 /proc/18849/cmdline 4109 tmux: server 67 0 /proc/18849/cmdline 4109 tmux: server 67 0 /proc/18849/cmdline The PID, COMM, FD, and ERR are printed with print(end=""), and those of the first instance was eaten by printb() which outputs PATH. The following scripts mix print(end="") and printb() for the same line: tools/execsnoop.py tools/opensnoop.py tools/tcpaccept.py tools/tcpconnect.py Those scripts work fine with python 2 but some messages may miss while using python 3. This commit converts print(end="") to printb(nl="") to avoid the inconsistent outputs. Signed-off-by: Gary Lin --- tools/execsnoop.py | 2 +- tools/opensnoop.py | 12 ++++++------ tools/tcpaccept.py | 8 ++++---- tools/tcpconnect.py | 8 ++++---- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/tools/execsnoop.py b/tools/execsnoop.py index 1ce83e07d..7d048d84c 100755 --- a/tools/execsnoop.py +++ b/tools/execsnoop.py @@ -216,7 +216,7 @@ def print_event(cpu, data, size): if not skip: if args.timestamp: - print("%-8.3f" % (time.time() - start_ts), end="") + printb(b"%-8.3f" % (time.time() - start_ts), nl="") ppid = event.ppid if event.ppid > 0 else get_ppid(event.pid) ppid = b"%d" % ppid if ppid > 0 else b"?" 
argv_text = b' '.join(argv[event.pid]).replace(b'\n', b'\\n') diff --git a/tools/opensnoop.py b/tools/opensnoop.py index 4ffedfa93..60d11c63d 100755 --- a/tools/opensnoop.py +++ b/tools/opensnoop.py @@ -218,17 +218,17 @@ def print_event(cpu, data, size): if args.timestamp: delta = event.ts - initial_ts - print("%-14.9f" % (float(delta) / 1000000), end="") + printb(b"%-14.9f" % (float(delta) / 1000000), nl="") if args.print_uid: - print("%-6d" % event.uid, end="") + printb(b"%-6d" % event.uid, nl="") - print("%-6d %-16s %4d %3d " % - (event.id & 0xffffffff if args.tid else event.id >> 32, - event.comm.decode('utf-8', 'replace'), fd_s, err), end="") + printb(b"%-6d %-16s %4d %3d " % + (event.id & 0xffffffff if args.tid else event.id >> 32, + event.comm, fd_s, err), nl="") if args.extended_fields: - print("%08o " % event.flags, end="") + printb(b"%08o " % event.flags, nl="") printb(b'%s' % event.fname) diff --git a/tools/tcpaccept.py b/tools/tcpaccept.py index 70202dd39..914d51837 100755 --- a/tools/tcpaccept.py +++ b/tools/tcpaccept.py @@ -196,11 +196,11 @@ def print_ipv4_event(cpu, data, size): event = b["ipv4_events"].event(data) global start_ts if args.time: - print("%-9s" % strftime("%H:%M:%S"), end="") + printb(b"%-9s" % strftime("%H:%M:%S").encode('ascii'), nl="") if args.timestamp: if start_ts == 0: start_ts = event.ts_us - print("%-9.3f" % ((float(event.ts_us) - start_ts) / 1000000), end="") + printb(b"%-9.3f" % ((float(event.ts_us) - start_ts) / 1000000), nl="") printb(b"%-7d %-12.12s %-2d %-16s %-5d %-16s %-5d" % (event.pid, event.task, event.ip, inet_ntop(AF_INET, pack("I", event.daddr)).encode(), @@ -212,11 +212,11 @@ def print_ipv6_event(cpu, data, size): event = b["ipv6_events"].event(data) global start_ts if args.time: - print("%-9s" % strftime("%H:%M:%S"), end="") + printb(b"%-9s" % strftime("%H:%M:%S").encode('ascii'), nl="") if args.timestamp: if start_ts == 0: start_ts = event.ts_us - print("%-9.3f" % ((float(event.ts_us) - start_ts) / 1000000), 
end="") + printb(b"%-9.3f" % ((float(event.ts_us) - start_ts) / 1000000), nl="") printb(b"%-7d %-12.12s %-2d %-16s %-5d %-16s %-5d" % (event.pid, event.task, event.ip, inet_ntop(AF_INET6, event.daddr).encode(), diff --git a/tools/tcpconnect.py b/tools/tcpconnect.py index e31ff770f..cb3e83b48 100755 --- a/tools/tcpconnect.py +++ b/tools/tcpconnect.py @@ -197,9 +197,9 @@ def print_ipv4_event(cpu, data, size): if args.timestamp: if start_ts == 0: start_ts = event.ts_us - print("%-9.3f" % ((float(event.ts_us) - start_ts) / 1000000), end="") + printb(b"%-9.3f" % ((float(event.ts_us) - start_ts) / 1000000), nl="") if args.print_uid: - print("%-6d" % event.uid, end="") + printb(b"%-6d" % event.uid, nl="") printb(b"%-6d %-12.12s %-2d %-16s %-16s %-4d" % (event.pid, event.task, event.ip, inet_ntop(AF_INET, pack("I", event.saddr)).encode(), @@ -211,9 +211,9 @@ def print_ipv6_event(cpu, data, size): if args.timestamp: if start_ts == 0: start_ts = event.ts_us - print("%-9.3f" % ((float(event.ts_us) - start_ts) / 1000000), end="") + printb(b"%-9.3f" % ((float(event.ts_us) - start_ts) / 1000000), nl="") if args.print_uid: - print("%-6d" % event.uid, end="") + printb(b"%-6d" % event.uid, nl="") printb(b"%-6d %-12.12s %-2d %-16s %-16s %-4d" % (event.pid, event.task, event.ip, inet_ntop(AF_INET6, event.saddr).encode(), inet_ntop(AF_INET6, event.daddr).encode(), From 4a51c75c6c3e39130e35fc39b8702048f728197c Mon Sep 17 00:00:00 2001 From: Dale Hamel Date: Fri, 29 Mar 2019 18:28:59 -0400 Subject: [PATCH 123/135] Add bcc_usdt_enable_fully_specified_probe to avoid usdt provider collisions --- src/cc/bcc_usdt.h | 2 ++ src/cc/usdt.h | 1 + src/cc/usdt/usdt.cc | 46 +++++++++++++++++++++++++++++++++++++-------- 3 files changed, 41 insertions(+), 8 deletions(-) diff --git a/src/cc/bcc_usdt.h b/src/cc/bcc_usdt.h index a031bc6ab..0c548091a 100644 --- a/src/cc/bcc_usdt.h +++ b/src/cc/bcc_usdt.h @@ -70,6 +70,8 @@ int bcc_usdt_get_argument(void *usdt, const char *provider_name, struct 
bcc_usdt_argument *argument); int bcc_usdt_enable_probe(void *, const char *, const char *); +#define BCC_USDT_HAS_FULLY_SPECIFIED_PROBE +int bcc_usdt_enable_fully_specified_probe(void *, const char *, const char *, const char *); const char *bcc_usdt_genargs(void **ctx_array, int len); const char *bcc_usdt_get_probe_argctype( void *ctx, const char* probe_name, const int arg_index diff --git a/src/cc/usdt.h b/src/cc/usdt.h index 406cfd546..f746d03b7 100644 --- a/src/cc/usdt.h +++ b/src/cc/usdt.h @@ -280,6 +280,7 @@ class Context { Probe *get(int pos) { return probes_[pos].get(); } bool enable_probe(const std::string &probe_name, const std::string &fn_name); + bool enable_probe(const std::string &provider_name, const std::string &probe_name, const std::string &fn_name); typedef void (*each_cb)(struct bcc_usdt *); void each(each_cb callback); diff --git a/src/cc/usdt/usdt.cc b/src/cc/usdt/usdt.cc index c91faa016..5f78509d0 100644 --- a/src/cc/usdt/usdt.cc +++ b/src/cc/usdt/usdt.cc @@ -295,24 +295,46 @@ Probe *Context::get(const std::string &provider_name, bool Context::enable_probe(const std::string &probe_name, const std::string &fn_name) { + return enable_probe("", probe_name, fn_name); +} + +bool Context::enable_probe(const std::string &provider_name, + const std::string &probe_name, + const std::string &fn_name) { if (pid_stat_ && pid_stat_->is_stale()) return false; - // FIXME: we may have issues here if the context has two same probes's - // but different providers. For example, libc:setjmp and rtld:setjmp, - // libc:lll_futex_wait and rtld:lll_futex_wait. 
+ unsigned int matches = 0; Probe *found_probe = nullptr; for (auto &p : probes_) { if (p->name_ == probe_name) { - if (found_probe != nullptr) { - fprintf(stderr, "Two same-name probes (%s) but different providers\n", - probe_name.c_str()); - return false; + if (found_probe == nullptr && provider_name == "") + { + found_probe = p.get(); + matches++; + } + else if (found_probe != nullptr && provider_name == "") + { + fprintf(stderr, "Found duplicate provider (%s) for underspecified probe (%s)\n", + p->provider().c_str(), p->name().c_str()); + matches++; + } else if (provider_name != "" && p->provider() == provider_name) + { + found_probe = p.get(); + matches++; } - found_probe = p.get(); } } + if (matches > 1) { + fprintf(stderr, "Found %i duplicate providers for underpecified probe (%s)\n", + matches, fn_name.c_str()); + return false; + } else if(matches < 1) { + fprintf(stderr, "No matches found for probe (%s)\n", fn_name.c_str()); + return false; + } + if (found_probe != nullptr) return found_probe->enable(fn_name); @@ -448,6 +470,14 @@ int bcc_usdt_enable_probe(void *usdt, const char *probe_name, return ctx->enable_probe(probe_name, fn_name) ? 0 : -1; } +int bcc_usdt_enable_fully_specified_probe(void *usdt, + const char *provider_name, + const char *probe_name, + const char *fn_name) { + USDT::Context *ctx = static_cast(usdt); + return ctx->enable_probe(provider_name, probe_name, fn_name) ? 
0 : -1; +} + const char *bcc_usdt_genargs(void **usdt_array, int len) { static std::string storage_; std::ostringstream stream; From 33bffcaadcf3bd70807dc1de1145de54b6b7ab67 Mon Sep 17 00:00:00 2001 From: Dale Hamel Date: Thu, 11 Apr 2019 18:51:50 -0400 Subject: [PATCH 124/135] Simplify provider collision detection --- src/cc/bcc_usdt.h | 3 ++- src/cc/usdt.h | 3 ++- src/cc/usdt/usdt.cc | 35 ++++++++--------------------------- 3 files changed, 12 insertions(+), 29 deletions(-) diff --git a/src/cc/bcc_usdt.h b/src/cc/bcc_usdt.h index 0c548091a..86e24e4ca 100644 --- a/src/cc/bcc_usdt.h +++ b/src/cc/bcc_usdt.h @@ -71,7 +71,8 @@ int bcc_usdt_get_argument(void *usdt, const char *provider_name, int bcc_usdt_enable_probe(void *, const char *, const char *); #define BCC_USDT_HAS_FULLY_SPECIFIED_PROBE -int bcc_usdt_enable_fully_specified_probe(void *, const char *, const char *, const char *); +int bcc_usdt_enable_fully_specified_probe(void *, const char *, const char *, + const char *); const char *bcc_usdt_genargs(void **ctx_array, int len); const char *bcc_usdt_get_probe_argctype( void *ctx, const char* probe_name, const int arg_index diff --git a/src/cc/usdt.h b/src/cc/usdt.h index f746d03b7..f1dac48a6 100644 --- a/src/cc/usdt.h +++ b/src/cc/usdt.h @@ -280,7 +280,8 @@ class Context { Probe *get(int pos) { return probes_[pos].get(); } bool enable_probe(const std::string &probe_name, const std::string &fn_name); - bool enable_probe(const std::string &provider_name, const std::string &probe_name, const std::string &fn_name); + bool enable_probe(const std::string &provider_name, + const std::string &probe_name, const std::string &fn_name); typedef void (*each_cb)(struct bcc_usdt *); void each(each_cb callback); diff --git a/src/cc/usdt/usdt.cc b/src/cc/usdt/usdt.cc index 5f78509d0..09b204ee4 100644 --- a/src/cc/usdt/usdt.cc +++ b/src/cc/usdt/usdt.cc @@ -304,37 +304,19 @@ bool Context::enable_probe(const std::string &provider_name, if (pid_stat_ && pid_stat_->is_stale()) 
return false; - unsigned int matches = 0; Probe *found_probe = nullptr; for (auto &p : probes_) { - if (p->name_ == probe_name) { - if (found_probe == nullptr && provider_name == "") - { - found_probe = p.get(); - matches++; - } - else if (found_probe != nullptr && provider_name == "") - { - fprintf(stderr, "Found duplicate provider (%s) for underspecified probe (%s)\n", - p->provider().c_str(), p->name().c_str()); - matches++; - } else if (provider_name != "" && p->provider() == provider_name) - { - found_probe = p.get(); - matches++; + if (p->name_ == probe_name && + (provider_name.empty() || p->provider() == provider_name)) { + if (found_probe != nullptr) { + fprintf(stderr, "Two same-name probes (%s) but different providers\n", + probe_name.c_str()); + return false; } + found_probe = p.get(); } } - if (matches > 1) { - fprintf(stderr, "Found %i duplicate providers for underpecified probe (%s)\n", - matches, fn_name.c_str()); - return false; - } else if(matches < 1) { - fprintf(stderr, "No matches found for probe (%s)\n", fn_name.c_str()); - return false; - } - if (found_probe != nullptr) return found_probe->enable(fn_name); @@ -470,8 +452,7 @@ int bcc_usdt_enable_probe(void *usdt, const char *probe_name, return ctx->enable_probe(probe_name, fn_name) ? 0 : -1; } -int bcc_usdt_enable_fully_specified_probe(void *usdt, - const char *provider_name, +int bcc_usdt_enable_fully_specified_probe(void *usdt, const char *provider_name, const char *probe_name, const char *fn_name) { USDT::Context *ctx = static_cast(usdt); From 84632859c517ad436ba0f4ea6de1dafd60003e18 Mon Sep 17 00:00:00 2001 From: Christopher Hunt Date: Fri, 26 Apr 2019 12:21:26 -0400 Subject: [PATCH 125/135] tutorial_bcc_python_developer: Finish auto-generated update A sentence was leftover from before we could auto-generate the Python data structure. 
--- docs/tutorial_bcc_python_developer.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/tutorial_bcc_python_developer.md b/docs/tutorial_bcc_python_developer.md index f5d2ff986..817c109e4 100644 --- a/docs/tutorial_bcc_python_developer.md +++ b/docs/tutorial_bcc_python_developer.md @@ -309,12 +309,10 @@ Things to learn: 1. ```bpf_get_current_comm()```: Populates the first argument address with the current process name. 1. ```events.perf_submit()```: Submit the event for user space to read via a perf ring buffer. 1. ```def print_event()```: Define a Python function that will handle reading events from the ```events``` stream. -1. ```b["events"].event(data)```: Now get the event as a Python object. +1. ```b["events"].event(data)```: Now get the event as a Python object, auto-generated from the C declaration. 1. ```b["events"].open_perf_buffer(print_event)```: Associate the Python ```print_event``` function with the ```events``` stream. 1. ```while 1: b.perf_buffer_poll()```: Block waiting for events. -This may be improved in future bcc versions. Eg, the Python data struct could be auto-generated from the C code. - ### Lesson 8. sync_perf_output.py Rewrite sync_timing.py, from a prior lesson, to use ```BPF_PERF_OUTPUT```. From ae92f3ddb6aa5b81c750abf3540b99f24d219e67 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Fri, 12 Apr 2019 11:11:40 -0400 Subject: [PATCH 126/135] loader: Add support for reading kernel headers from /proc BCC right now relies on the filesystem to have kernel headers however this is quite a hindrance to embedded and Android systems, and even distros that may not have the kernel headers for the running kernel. This patch makes use of the new /proc/kheaders.tar.xz archive that is proposed upstream: https://lore.kernel.org/patchwork/patch/1059427/ The approach involves creating a temporary directory containing the headers and compiling from there. 
It is used as a last resort if headers could not be found by other means. Testing shows this adds around 400ms to start up time of BCC, however the cost is 0 if the headers already exists in /lib or was previously extracted into /tmp directory. Signed-off-by: Joel Fernandes (Google) --- src/cc/frontends/clang/kbuild_helper.cc | 86 ++++++++++++++++++++++++- src/cc/frontends/clang/kbuild_helper.h | 3 + src/cc/frontends/clang/loader.cc | 8 +++ 3 files changed, 96 insertions(+), 1 deletion(-) diff --git a/src/cc/frontends/clang/kbuild_helper.cc b/src/cc/frontends/clang/kbuild_helper.cc index e631eed5c..24084bd8e 100644 --- a/src/cc/frontends/clang/kbuild_helper.cc +++ b/src/cc/frontends/clang/kbuild_helper.cc @@ -13,9 +13,15 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include +#include + +#include #include #include -#include +#include +#include + #include "kbuild_helper.h" namespace ebpf { @@ -107,4 +113,82 @@ int KBuildHelper::get_flags(const char *uname_machine, vector *cflags) { return 0; } +static inline int file_exists(const char *f) +{ + struct stat buffer; + return (stat(f, &buffer) == 0); +} + +static inline int proc_kheaders_exists(void) +{ + return file_exists(PROC_KHEADERS_PATH); +} + +static inline int extract_kheaders(const std::string &dirpath, + const struct utsname &uname_data) +{ + char tar_cmd[256], dirpath_tmp[256]; + int ret; + bool module = false; + + if (!proc_kheaders_exists()) { + ret = system("modprobe kheaders"); + if (ret) + return ret; + module = true; + if (!proc_kheaders_exists()) { + ret = -1; + goto cleanup; + } + } + + snprintf(dirpath_tmp, 256, "/tmp/kheaders-%s-XXXXXX", uname_data.release); + if (mkdtemp(dirpath_tmp) == NULL) { + ret = -1; + goto cleanup; + } + + snprintf(tar_cmd, 256, "tar -xf %s -C %s", PROC_KHEADERS_PATH, dirpath_tmp); + ret = system(tar_cmd); + if (ret) { + system(("rm -rf " + std::string(dirpath_tmp)).c_str()); + goto cleanup; + } + + /* + * If 
the new directory exists, it could have raced with a parallel + * extraction, in this case just delete the old directory and ignore. + */ + ret = rename(dirpath_tmp, dirpath.c_str()); + if (ret) + ret = system(("rm -rf " + std::string(dirpath_tmp)).c_str()); + +cleanup: + if (module) { + int ret1 = system("rmmod kheaders"); + if (ret1) + return ret1; + } + + return ret; +} + +int get_proc_kheaders(std::string &dirpath) +{ + struct utsname uname_data; + char dirpath_tmp[256]; + + if (uname(&uname_data)) + return -errno; + + snprintf(dirpath_tmp, 256, "/tmp/kheaders-%s", uname_data.release); + dirpath = std::string(dirpath_tmp); + + if (file_exists(dirpath_tmp)) + return 0; + + // First time so extract it + return extract_kheaders(dirpath, uname_data); +} + } // namespace ebpf diff --git a/src/cc/frontends/clang/kbuild_helper.h b/src/cc/frontends/clang/kbuild_helper.h index 5a271ffa6..be388e843 100644 --- a/src/cc/frontends/clang/kbuild_helper.h +++ b/src/cc/frontends/clang/kbuild_helper.h @@ -21,6 +21,8 @@ #include #include +#define PROC_KHEADERS_PATH "/proc/kheaders.tar.xz" + namespace ebpf { struct FileDeleter { @@ -101,4 +103,5 @@ class KBuildHelper { bool has_source_dir_; }; +int get_proc_kheaders(std::string &dir); } // namespace ebpf diff --git a/src/cc/frontends/clang/loader.cc b/src/cc/frontends/clang/loader.cc index a3e09e6dc..62c8c8abf 100644 --- a/src/cc/frontends/clang/loader.cc +++ b/src/cc/frontends/clang/loader.cc @@ -120,6 +120,7 @@ int ClangLoader::parse(unique_ptr *mod, TableStorage &ts, const char *version_override = ::getenv("BCC_LINUX_VERSION_CODE"); bool has_kpath_source = false; string vmacro; + std::string tmpdir; if (kpath_env) { kpath = string(kpath_env); @@ -130,6 +131,13 @@ int ClangLoader::parse(unique_ptr *mod, TableStorage &ts, kpath = kdir + "/" + kernel_path_info.second; } + // If all attempts to obtain kheaders fail, check for /proc/kheaders.tar.xz + if (!is_dir(kpath)) { + int ret = get_proc_kheaders(tmpdir); + if (!ret) + kpath = 
tmpdir; + } + if (flags_ & DEBUG_PREPROCESSOR) std::cout << "Running from kernel directory at: " << kpath.c_str() << "\n"; From 912d3571fc528776ffeaa18b994403f7777f00ba Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 29 Apr 2019 11:43:22 -0700 Subject: [PATCH 127/135] sync with latest libbpf repo Sync with latest libbpf repo. Signed-off-by: Yonghong Song --- docs/kernel-versions.md | 6 ++ introspection/bps.c | 1 + src/cc/compat/linux/virtual_bpf.h | 149 +++++++++++++++++++++++++++++- src/cc/export/helpers.h | 12 +++ src/cc/libbpf | 2 +- src/cc/libbpf.c | 6 ++ 6 files changed, 174 insertions(+), 2 deletions(-) diff --git a/docs/kernel-versions.md b/docs/kernel-versions.md index 3ec589bb3..bf5828e55 100644 --- a/docs/kernel-versions.md +++ b/docs/kernel-versions.md @@ -240,6 +240,12 @@ Helper | Kernel version | License | Commit | `BPF_FUNC_sock_map_update()` | 4.14 | | [`174a79ff9515`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=174a79ff9515f400b9a6115643dafd62a635b7e6) `BPF_FUNC_spin_lock()` | 5.1 | | [`d83525ca62cf`](https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git/commit/?id=d83525ca62cf8ebe3271d14c36fb900c294274a2) `BPF_FUNC_spin_unlock()` | 5.1 | | [`d83525ca62cf`](https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git/commit/?id=d83525ca62cf8ebe3271d14c36fb900c294274a2) +`BPF_FUNC_strtol()` | 5.2 | | [`d7a4cb9b6705`](https://kernel.googlesource.com/pub/scm/linux/kernel/git/davem/net-next/+/d7a4cb9b6705a89937d12c8158a35a3145dc967a) +`BPF_FUNC_strtoul()` | 5.2 | | [`d7a4cb9b6705`](https://kernel.googlesource.com/pub/scm/linux/kernel/git/davem/net-next/+/d7a4cb9b6705a89937d12c8158a35a3145dc967a) +`BPF_FUNC_sysctl_get_current_value()` | 5.2 | | [`1d11b3016cec`](https://kernel.googlesource.com/pub/scm/linux/kernel/git/davem/net-next/+/1d11b3016cec4ed9770b98e82a61708c8f4926e7) +`BPF_FUNC_sysctl_get_name()` | 5.2 | | 
[`808649fb787d`](https://kernel.googlesource.com/pub/scm/linux/kernel/git/davem/net-next/+/808649fb787d918a48a360a668ee4ee9023f0c11) +`BPF_FUNC_sysctl_get_new_value()` | 5.2 | | [`4e63acdff864`](https://kernel.googlesource.com/pub/scm/linux/kernel/git/davem/net-next/+/4e63acdff864654cee0ac5aaeda3913798ee78f6) +`BPF_FUNC_sysctl_set_new_value()` | 5.2 | | [`4e63acdff864`](https://kernel.googlesource.com/pub/scm/linux/kernel/git/davem/net-next/+/4e63acdff864654cee0ac5aaeda3913798ee78f6) `BPF_FUNC_tail_call()` | 4.2 | | [`04fd61ab36ec`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=04fd61ab36ec065e194ab5e74ae34a5240d992bb) `BPF_FUNC_tcp_check_syncookie()` | 5.2 | | [`399040847084`](https://kernel.googlesource.com/pub/scm/linux/kernel/git/davem/net-next/+/399040847084a69f345e0a52fd62f04654e0fce3) `BPF_FUNC_tcp_sock()` | 5.1 | | [`655a51e536c0`](https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit/?id=655a51e536c09d15ffa3603b1b6fce2b45b85a1f) diff --git a/introspection/bps.c b/introspection/bps.c index 4993b8e70..d3295f85c 100644 --- a/introspection/bps.c +++ b/introspection/bps.c @@ -39,6 +39,7 @@ static const char * const prog_type_strings[] = { [BPF_PROG_TYPE_LIRC_MODE2] = "lirc_mode2", [BPF_PROG_TYPE_SK_REUSEPORT] = "sk_reuseport", [BPF_PROG_TYPE_FLOW_DISSECTOR] = "flow_dissector", + [BPF_PROG_TYPE_CGROUP_SYSCTL] = "cgroup_sysctl", }; static const char * const map_type_strings[] = { diff --git a/src/cc/compat/linux/virtual_bpf.h b/src/cc/compat/linux/virtual_bpf.h index c01ae0c23..ece4398c6 100644 --- a/src/cc/compat/linux/virtual_bpf.h +++ b/src/cc/compat/linux/virtual_bpf.h @@ -168,6 +168,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_LIRC_MODE2, BPF_PROG_TYPE_SK_REUSEPORT, BPF_PROG_TYPE_FLOW_DISSECTOR, + BPF_PROG_TYPE_CGROUP_SYSCTL, }; enum bpf_attach_type { @@ -189,6 +190,7 @@ enum bpf_attach_type { BPF_CGROUP_UDP6_SENDMSG, BPF_LIRC_MODE2, BPF_FLOW_DISSECTOR, + BPF_CGROUP_SYSCTL, __MAX_BPF_ATTACH_TYPE }; @@ -1524,6 
+1526,10 @@ union bpf_attr { * * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP **: * Use with ENCAP_L3 flags to further specify the tunnel type. * + * * **BPF_F_ADJ_ROOM_ENCAP_L2(len) **: + * Use with ENCAP_L3/L4 flags to further specify the tunnel + * type; **len** is the length of the inner MAC header. + * * A call to this helper is susceptible to change the underlaying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be @@ -2501,6 +2507,122 @@ union bpf_attr { * Return * 0 if iph and th are a valid SYN cookie ACK, or a negative error * otherwise. + * + * int bpf_sysctl_get_name(struct bpf_sysctl *ctx, char *buf, size_t buf_len, u64 flags) + * Description + * Get name of sysctl in /proc/sys/ and copy it into provided by + * program buffer *buf* of size *buf_len*. + * + * The buffer is always NUL terminated, unless it's zero-sized. + * + * If *flags* is zero, full name (e.g. "net/ipv4/tcp_mem") is + * copied. Use **BPF_F_SYSCTL_BASE_NAME** flag to copy base name + * only (e.g. "tcp_mem"). + * Return + * Number of character copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * truncated name in this case). + * + * int bpf_sysctl_get_current_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) + * Description + * Get current value of sysctl as it is presented in /proc/sys + * (incl. newline, etc), and copy it as a string into provided + * by program buffer *buf* of size *buf_len*. + * + * The whole value is copied, no matter what file position user + * space issued e.g. sys_read at. + * + * The buffer is always NUL terminated, unless it's zero-sized. + * Return + * Number of character copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * truncated name in this case). + * + * **-EINVAL** if current value was unavailable, e.g. 
because + * sysctl is uninitialized and read returns -EIO for it. + * + * int bpf_sysctl_get_new_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) + * Description + * Get new value being written by user space to sysctl (before + * the actual write happens) and copy it as a string into + * provided by program buffer *buf* of size *buf_len*. + * + * User space may write new value at file position > 0. + * + * The buffer is always NUL terminated, unless it's zero-sized. + * Return + * Number of character copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * truncated name in this case). + * + * **-EINVAL** if sysctl is being read. + * + * int bpf_sysctl_set_new_value(struct bpf_sysctl *ctx, const char *buf, size_t buf_len) + * Description + * Override new value being written by user space to sysctl with + * value provided by program in buffer *buf* of size *buf_len*. + * + * *buf* should contain a string in same form as provided by user + * space on sysctl write. + * + * User space may write new value at file position > 0. To override + * the whole sysctl value file position should be set to zero. + * Return + * 0 on success. + * + * **-E2BIG** if the *buf_len* is too big. + * + * **-EINVAL** if sysctl is being read. + * + * int bpf_strtol(const char *buf, size_t buf_len, u64 flags, long *res) + * Description + * Convert the initial part of the string from buffer *buf* of + * size *buf_len* to a long integer according to the given base + * and save the result in *res*. + * + * The string may begin with an arbitrary amount of white space + * (as determined by isspace(3)) followed by a single optional '-' + * sign. + * + * Five least significant bits of *flags* encode base, other bits + * are currently unused. + * + * Base must be either 8, 10, 16 or 0 to detect it automatically + * similar to user space strtol(3). + * Return + * Number of characters consumed on success. 
Must be positive but + * no more than buf_len. + * + * **-EINVAL** if no valid digits were found or unsupported base + * was provided. + * + * **-ERANGE** if resulting value was out of range. + * + * int bpf_strtoul(const char *buf, size_t buf_len, u64 flags, unsigned long *res) + * Description + * Convert the initial part of the string from buffer *buf* of + * size *buf_len* to an unsigned long integer according to the + * given base and save the result in *res*. + * + * The string may begin with an arbitrary amount of white space + * (as determined by isspace(3)). + * + * Five least significant bits of *flags* encode base, other bits + * are currently unused. + * + * Base must be either 8, 10, 16 or 0 to detect it automatically + * similar to user space strtoul(3). + * Return + * Number of characters consumed on success. Must be positive but + * no more than buf_len. + * + * **-EINVAL** if no valid digits were found or unsupported base + * was provided. + * + * **-ERANGE** if resulting value was out of range. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2603,7 +2725,13 @@ union bpf_attr { FN(skb_ecn_set_ce), \ FN(get_listener_sock), \ FN(skc_lookup_tcp), \ - FN(tcp_check_syncookie), + FN(tcp_check_syncookie), \ + FN(sysctl_get_name), \ + FN(sysctl_get_current_value), \ + FN(sysctl_get_new_value), \ + FN(sysctl_set_new_value), \ + FN(strtol), \ + FN(strtoul), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2665,10 +2793,19 @@ enum bpf_func_id { /* BPF_FUNC_skb_adjust_room flags. 
*/ #define BPF_F_ADJ_ROOM_FIXED_GSO (1ULL << 0) +#define BPF_ADJ_ROOM_ENCAP_L2_MASK 0xff +#define BPF_ADJ_ROOM_ENCAP_L2_SHIFT 56 + #define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 (1ULL << 1) #define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 (1ULL << 2) #define BPF_F_ADJ_ROOM_ENCAP_L4_GRE (1ULL << 3) #define BPF_F_ADJ_ROOM_ENCAP_L4_UDP (1ULL << 4) +#define BPF_F_ADJ_ROOM_ENCAP_L2(len) (((__u64)len & \ + BPF_ADJ_ROOM_ENCAP_L2_MASK) \ + << BPF_ADJ_ROOM_ENCAP_L2_SHIFT) + +/* BPF_FUNC_sysctl_get_name flags. */ +#define BPF_F_SYSCTL_BASE_NAME (1ULL << 0) /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { @@ -3299,5 +3436,15 @@ struct bpf_line_info { struct bpf_spin_lock { __u32 val; }; + +struct bpf_sysctl { + __u32 write; /* Sysctl is being read (= 0) or written (= 1). + * Allows 1,2,4-byte read, but no write. + */ + __u32 file_pos; /* Sysctl file position to read from, write to. + * Allows 1,2,4-byte read an 4-byte write. + */ +}; + #endif /* _UAPI__LINUX_BPF_H__ */ )********" diff --git a/src/cc/export/helpers.h b/src/cc/export/helpers.h index 95bbc47b6..9c39dfd0d 100644 --- a/src/cc/export/helpers.h +++ b/src/cc/export/helpers.h @@ -349,6 +349,18 @@ static int (*bpf_sk_redirect_map)(void *ctx, void *map, int key, int flags) = (void *) BPF_FUNC_sk_redirect_map; static int (*bpf_sock_map_update)(void *map, void *key, void *value, unsigned long long flags) = (void *) BPF_FUNC_sock_map_update; +static int (*bpf_strtol)(const char *buf, size_t buf_len, u64 flags, long *res) = + (void *) BPF_FUNC_strtol; +static int (*bpf_strtoul)(const char *buf, size_t buf_len, u64 flags, unsigned long *res) = + (void *) BPF_FUNC_strtoul; +static int (*bpf_sysctl_get_current_value)(struct bpf_sysctl *ctx, char *buf, size_t buf_len) = + (void *) BPF_FUNC_sysctl_get_current_value; +static int (*bpf_sysctl_get_name)(struct bpf_sysctl *ctx, char *buf, size_t buf_len, u64 flags) = + (void *) BPF_FUNC_sysctl_get_name; +static int (*bpf_sysctl_get_new_value)(struct bpf_sysctl *ctx, char *buf, 
size_t buf_len) = + (void *) BPF_FUNC_sysctl_get_new_value; +static int (*bpf_sysctl_set_new_value)(struct bpf_sysctl *ctx, const char *buf, size_t buf_len) = + (void *) BPF_FUNC_sysctl_set_new_value; static int (*bpf_tcp_check_syncookie)(struct bpf_sock *sk, void *ip, int ip_len, void *tcp, int tcp_len) = (void *) BPF_FUNC_tcp_check_syncookie; diff --git a/src/cc/libbpf b/src/cc/libbpf index 5844f6e4d..910c475f0 160000 --- a/src/cc/libbpf +++ b/src/cc/libbpf @@ -1 +1 @@ -Subproject commit 5844f6e4dd60d8c941417bcaafe0785c61415195 +Subproject commit 910c475f09e5c269f441d7496c27dace30dc2335 diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c index 6df244ce7..dc29eb05f 100644 --- a/src/cc/libbpf.c +++ b/src/cc/libbpf.c @@ -197,6 +197,12 @@ static struct bpf_helper helpers[] = { {"get_listener_sock", "5.1"}, {"skc_lookup_tcp", "5.2"}, {"tcp_check_syncookie", "5.2"}, + {"sysctl_get_name", "5.2"}, + {"sysctl_get_current_value", "5.2"}, + {"sysctl_get_new_value", "5.2"}, + {"sysctl_set_new_value", "5.2"}, + {"strtol", "5.2"}, + {"strtoul", "5.2"}, }; static uint64_t ptr_to_u64(void *ptr) From 21f24631a28cb308aaa76524b0df34e8f2acb097 Mon Sep 17 00:00:00 2001 From: swj <1186093704@qq.com> Date: Tue, 30 Apr 2019 20:34:45 +0800 Subject: [PATCH 128/135] type error --- tools/dbslower.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/dbslower.py b/tools/dbslower.py index da2180f84..2f1b6a8b8 100755 --- a/tools/dbslower.py +++ b/tools/dbslower.py @@ -208,7 +208,7 @@ def print_event(cpu, data, size): event = bpf["events"].event(data) print("%-14.6f %-6d %8.3f %s" % ( float(event.timestamp - start) / 1000000000, - event.pid, float(event.delta) / 1000000, event.query)) + event.pid, float(event.duration) / 1000000, event.query)) if mode.startswith("MYSQL"): print("Tracing database queries for application %s slower than %d ms..." 
% From 4d61a57b4ebd8b387abe3270609674e57e334148 Mon Sep 17 00:00:00 2001 From: Adam Jensen Date: Wed, 8 May 2019 07:15:14 -0400 Subject: [PATCH 129/135] Include linux/types.h --- src/cc/libbpf.c | 1 + src/cc/perf_reader.c | 1 + 2 files changed, 2 insertions(+) diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c index dc29eb05f..688dca1a3 100644 --- a/src/cc/libbpf.c +++ b/src/cc/libbpf.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/src/cc/perf_reader.c b/src/cc/perf_reader.c index 3cab01532..dedb11d2b 100644 --- a/src/cc/perf_reader.c +++ b/src/cc/perf_reader.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include "libbpf.h" From a2e71a9eb71a44dccbc0ffe27a2dfab8185b57a3 Mon Sep 17 00:00:00 2001 From: Wjie <1186093704@qq.com> Date: Sat, 11 May 2019 00:19:22 +0800 Subject: [PATCH 130/135] vfscount.py: add args time (#2344) add args time for vfscount.py --- man/man8/vfscount.8 | 4 ++++ tools/vfscount.py | 17 +++++++++++++++-- tools/vfscount_example.txt | 33 +++++++++++++++++++++++++++++++-- 3 files changed, 50 insertions(+), 4 deletions(-) diff --git a/man/man8/vfscount.8 b/man/man8/vfscount.8 index 44acffce0..fbf0e89ec 100644 --- a/man/man8/vfscount.8 +++ b/man/man8/vfscount.8 @@ -19,6 +19,10 @@ CONFIG_BPF and bcc. 
Count some VFS calls until Ctrl-C is hit: # .B vfscount +.TP +Count some VFS calls in ten seconds +# +.B vfscount 10 .SH FIELDS .TP ADDR diff --git a/tools/vfscount.py b/tools/vfscount.py index 10c6b1eb1..b7c18efd4 100755 --- a/tools/vfscount.py +++ b/tools/vfscount.py @@ -14,7 +14,19 @@ from __future__ import print_function from bcc import BPF from time import sleep - +from sys import argv +def usage(): + print("USAGE: %s [time]" % argv[0]) + exit() + +interval = 99999999 +if len(argv) > 1: + try: + interval = int(argv[1]) + if interval == 0: + raise + except: # also catches -h, --help + usage() # load BPF program b = BPF(text=""" #include @@ -39,9 +51,10 @@ # output try: - sleep(99999999) + sleep(interval) except KeyboardInterrupt: pass + exit() print("\n%-16s %-26s %8s" % ("ADDR", "FUNC", "COUNT")) counts = b.get_table("counts") diff --git a/tools/vfscount_example.txt b/tools/vfscount_example.txt index 1012bffb4..478db97e4 100644 --- a/tools/vfscount_example.txt +++ b/tools/vfscount_example.txt @@ -1,8 +1,8 @@ Demonstrations of vfscount, the Linux eBPF/bcc version. -This counts VFS calls, by tracing all kernel functions beginning with "vfs_": - +This counts VFS calls during time, by tracing all kernel functions beginning +with "vfs_", By defaults, the time is 99999999s # ./vfscount Tracing... Ctrl-C to end. ^C @@ -20,7 +20,36 @@ ffffffff811ec9f1 vfs_getattr_nosec 704 ffffffff811e80a1 vfs_write 1764 ffffffff811e7f71 vfs_read 2283 +Here we are using an output in 10 seconds, and printing 10 seconds summaries +# ./vfscount 10 +Tracing... Ctrl-C to end. 
+ +ADDR FUNC COUNT +ffffffffa1283671 vfs_rename 1 +ffffffffa129f471 vfs_setxattr 1 +ffffffffa12831c1 vfs_mkdir 1 +ffffffffa1282a51 vfs_rmdir 10 +ffffffffa1283f31 vfs_unlink 28 +ffffffffa1273e61 vfs_writev 53 +ffffffffa12ae061 vfs_statfs 55 +ffffffffa129e971 vfs_getxattr 138 +ffffffffa1288561 vfs_readlink 157 +ffffffffa12d6311 vfs_lock_file 223 +ffffffffa1274da1 vfs_write 537 +ffffffffa12798f1 vfs_statx_fd 2337 +ffffffffa1279971 vfs_statx 3064 +ffffffffa1271ba1 vfs_open 4334 +ffffffffa12798b1 vfs_getattr 4823 +ffffffffa1279821 vfs_getattr_nosec 4823 +ffffffffa1274af1 vfs_read 9060 + + This can be useful for workload characterization, to see what types of operations are in use. You can edit the script to customize what kernel functions are matched. + +Full usage: + +# ./vfsstat -h +USAGE: ./vfsstat [time] From e86e0643e1b826583ce78addc471e50d3e60ccd9 Mon Sep 17 00:00:00 2001 From: Lecopzer Date: Sat, 11 May 2019 00:38:37 +0800 Subject: [PATCH 131/135] cc: Fix compile warning (#2346) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add corresponding ifdef macro to get ride of the follow warning. 
/home/lecopzer/workspace/mybcc/src/cc/bcc_debug.cc: In member function ‘void ebpf::SourceDebugger::dump()’: /home/lecopzer/workspace/mybcc/src/cc/bcc_debug.cc:186:16: warning: unused variable ‘SectionID’ [-Wunused-variable] unsigned SectionID = get<2>(section.second); ^~~~~~~~~ --- src/cc/bcc_debug.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/cc/bcc_debug.cc b/src/cc/bcc_debug.cc index b856cd02d..759576c34 100644 --- a/src/cc/bcc_debug.cc +++ b/src/cc/bcc_debug.cc @@ -183,7 +183,9 @@ void SourceDebugger::dump() { uint64_t Size; uint8_t *FuncStart = get<0>(section.second); uint64_t FuncSize = get<1>(section.second); +#if LLVM_MAJOR_VERSION >= 9 unsigned SectionID = get<2>(section.second); +#endif ArrayRef Data(FuncStart, FuncSize); uint32_t CurrentSrcLine = 0; string func_name = section.first.substr(fn_prefix_.size()); From b0358b8c9146013f069b23539d53052389537ca4 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 16 May 2019 17:51:37 +0100 Subject: [PATCH 132/135] snapcraft: remove python path wrapper hack The python wrapper hack was not working for non-python scripts. Fix this by patching the python executable path in the python scripts once the source is fetched. Signed-off-by: Colin Ian King --- snapcraft/bcc-wrapper | 2 +- snapcraft/snapcraft.yaml | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/snapcraft/bcc-wrapper b/snapcraft/bcc-wrapper index 6e5c6f3d7..bbe5ec769 100755 --- a/snapcraft/bcc-wrapper +++ b/snapcraft/bcc-wrapper @@ -7,7 +7,7 @@ cmd="$1" if [ `id -u` = 0 ] ; then shift - stdbuf -oL $SNAP/usr/bin/python2.7 "$SNAP/usr/share/bcc/tools/$cmd" $@ + stdbuf -oL $SNAP/usr/share/bcc/tools/$cmd $@ else echo "Need to run $cmd as root (use sudo $@)" exit 1 diff --git a/snapcraft/snapcraft.yaml b/snapcraft/snapcraft.yaml index 5044b2a2f..ee815b957 100644 --- a/snapcraft/snapcraft.yaml +++ b/snapcraft/snapcraft.yaml @@ -31,6 +31,9 @@ base: core18 parts: bcc: plugin: cmake + override-pull: | + snapcraftctl pull + find . 
-type f -exec sed -i 's|^#\!/usr/bin/python|#\!/usr/bin/env python|' {} \; configflags: - '-DCMAKE_INSTALL_PREFIX=/usr' source: .. From 3bdeff3cec7ec2920d2ea9d972f506298dd8a134 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 17 May 2019 10:26:44 -0700 Subject: [PATCH 133/135] sync to latest libbpf sync to latest libbpf and update other parts of bcc as well. Signed-off-by: Yonghong Song --- docs/kernel-versions.md | 7 ++ introspection/bps.c | 2 + src/cc/compat/linux/virtual_bpf.h | 177 ++++++++++++++++++++---------- src/cc/export/helpers.h | 5 + src/cc/libbpf | 2 +- src/cc/libbpf.c | 2 + 6 files changed, 133 insertions(+), 62 deletions(-) diff --git a/docs/kernel-versions.md b/docs/kernel-versions.md index bf5828e55..ac1307e9f 100644 --- a/docs/kernel-versions.md +++ b/docs/kernel-versions.md @@ -73,6 +73,10 @@ AF_XDP | 4.18 | [`fbfc504a24f5`](https://git.kernel.org/cgit/linux/kernel/git/d bpfilter | 4.18 | [`d2ba09c17a06`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=d2ba09c17a0647f899d6c20a11bab9e6d3382f07) End.BPF action for seg6local LWT | 4.18 | [`004d4b274e2a`](https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/commit/?id=004d4b274e2a1a895a0e5dc66158b90a7d463d44) BPF attached to LIRC devices | 4.18 | [`f4364dcfc86d`](https://git.kernel.org/cgit/linux/kernel/git/bpf/bpf-next.git/commit/?id=f4364dcfc86df7c1ca47b256eaf6b6d0cdd0d936) +BPF socket reuseport | 4.19 | [`2dbb9b9e6df6`](https://github.com/torvalds/linux/commit/2dbb9b9e6df67d444fbe425c7f6014858d337adf) +BPF flow dissector | 4.20 | [`d58e468b1112`](https://github.com/torvalds/linux/commit/d58e468b1112dcd1d5193c0a89ff9f98b5a3e8b9) +BPF cgroup sysctl | 5.2 | [`7b146cebe30c`](https://github.com/torvalds/linux/commit/7b146cebe30cb481b0f70d85779da938da818637) +BPF raw tracepoint writable | 5.2 | [`9df1c28bb752`](https://github.com/torvalds/linux/commit/9df1c28bb75217b244257152ab7d788bb2a386d0) ## Tables (_a.k.a._ Maps) @@ -107,6 +111,7 @@ reuseport sockarray | 
4.19 | [`5dc4c4b7d4e8`](https://github.com/torvalds/linux/ precpu cgroup storage | 4.20 | [`b741f1630346`](https://github.com/torvalds/linux/commit/b741f1630346defcbc8cc60f1a2bdae8b3b0036f) queue | 4.20 | [`f1a2e44a3aec`](https://github.com/torvalds/linux/commit/f1a2e44a3aeccb3ff18d3ccc0b0203e70b95bd92) stack | 4.20 | [`f1a2e44a3aec`](https://github.com/torvalds/linux/commit/f1a2e44a3aeccb3ff18d3ccc0b0203e70b95bd92) +socket local storage | 5.2 | [`6ac99e8f23d4`](https://github.com/torvalds/linux/commit/6ac99e8f23d4b10258406ca0dd7bffca5f31da9d) ## XDP @@ -215,6 +220,8 @@ Helper | Kernel version | License | Commit | `BPF_FUNC_sk_redirect_map()` | 4.14 | | [`174a79ff9515`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=174a79ff9515f400b9a6115643dafd62a635b7e6) `BPF_FUNC_sk_release()` | 4.20 | | [`6acc9b432e67`](https://github.com/torvalds/linux/commit/6acc9b432e6714d72d7d77ec7c27f6f8358d0c71) `BPF_FUNC_sk_select_reuseport()` | 4.19 | | [`2dbb9b9e6df6`](https://github.com/torvalds/linux/commit/2dbb9b9e6df67d444fbe425c7f6014858d337adf) +`BPF_FUNC_sk_storage_delete()` | 5.2 | | [`6ac99e8f23d4`](https://github.com/torvalds/linux/commit/6ac99e8f23d4b10258406ca0dd7bffca5f31da9d) +`BPF_FUNC_sk_storage_get()` | 5.2 | | [`6ac99e8f23d4`](https://github.com/torvalds/linux/commit/6ac99e8f23d4b10258406ca0dd7bffca5f31da9d) `BPF_FUNC_skb_adjust_room()` | 4.13 | | [`2be7e212d541`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=2be7e212d5419a400d051c84ca9fdd083e5aacac) `BPF_FUNC_skb_ancestor_cgroup_id()` | 4.19 | | [`7723628101aa`](https://github.com/torvalds/linux/commit/7723628101aaeb1d723786747529b4ea65c5b5c5) `BPF_FUNC_skb_change_head()` | 4.10 | | [`3a0af8fd61f9`](https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=3a0af8fd61f90920f6fa04e4f1e9a6a73c1b4fd2) diff --git a/introspection/bps.c b/introspection/bps.c index d3295f85c..5ac80999b 100644 --- a/introspection/bps.c +++ b/introspection/bps.c @@ 
-40,6 +40,7 @@ static const char * const prog_type_strings[] = { [BPF_PROG_TYPE_SK_REUSEPORT] = "sk_reuseport", [BPF_PROG_TYPE_FLOW_DISSECTOR] = "flow_dissector", [BPF_PROG_TYPE_CGROUP_SYSCTL] = "cgroup_sysctl", + [BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE] = "raw_tracepoint_writable", }; static const char * const map_type_strings[] = { @@ -66,6 +67,7 @@ static const char * const map_type_strings[] = { [BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE] = "precpu_cgroup_storage", [BPF_MAP_TYPE_QUEUE] = "queue", [BPF_MAP_TYPE_STACK] = "stack", + [BPF_MAP_TYPE_SK_STORAGE] = "sk_storage", }; #define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) diff --git a/src/cc/compat/linux/virtual_bpf.h b/src/cc/compat/linux/virtual_bpf.h index ece4398c6..22ba4a738 100644 --- a/src/cc/compat/linux/virtual_bpf.h +++ b/src/cc/compat/linux/virtual_bpf.h @@ -134,6 +134,7 @@ enum bpf_map_type { BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, BPF_MAP_TYPE_QUEUE, BPF_MAP_TYPE_STACK, + BPF_MAP_TYPE_SK_STORAGE, }; /* Note that tracing related programs such as @@ -169,6 +170,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SK_REUSEPORT, BPF_PROG_TYPE_FLOW_DISSECTOR, BPF_PROG_TYPE_CGROUP_SYSCTL, + BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, }; enum bpf_attach_type { @@ -628,7 +630,7 @@ union bpf_attr { * **BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\ * **->swhash** and *skb*\ **->l4hash** to 0). * - * A call to this helper is susceptible to change the underlaying + * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be * performed again, if the helper is used in combination with @@ -653,7 +655,7 @@ union bpf_attr { * flexibility and can handle sizes larger than 2 or 4 for the * checksum to update. * - * A call to this helper is susceptible to change the underlaying + * A call to this helper is susceptible to change the underlying * packet buffer. 
Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be * performed again, if the helper is used in combination with @@ -685,7 +687,7 @@ union bpf_attr { * flexibility and can handle sizes larger than 2 or 4 for the * checksum to update. * - * A call to this helper is susceptible to change the underlaying + * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be * performed again, if the helper is used in combination with @@ -740,7 +742,7 @@ union bpf_attr { * efficient, but it is handled through an action code where the * redirection happens only after the eBPF program has returned. * - * A call to this helper is susceptible to change the underlaying + * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be * performed again, if the helper is used in combination with @@ -805,7 +807,7 @@ union bpf_attr { * **ETH_P_8021Q** and **ETH_P_8021AD**, it is considered to * be **ETH_P_8021Q**. * - * A call to this helper is susceptible to change the underlaying + * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be * performed again, if the helper is used in combination with @@ -817,7 +819,7 @@ union bpf_attr { * Description * Pop a VLAN header from the packet associated to *skb*. * - * A call to this helper is susceptible to change the underlaying + * A call to this helper is susceptible to change the underlying * packet buffer. 
Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be * performed again, if the helper is used in combination with @@ -1167,7 +1169,7 @@ union bpf_attr { * All values for *flags* are reserved for future usage, and must * be left at zero. * - * A call to this helper is susceptible to change the underlaying + * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be * performed again, if the helper is used in combination with @@ -1280,7 +1282,7 @@ union bpf_attr { * implicitly linearizes, unclones and drops offloads from the * *skb*. * - * A call to this helper is susceptible to change the underlaying + * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be * performed again, if the helper is used in combination with @@ -1316,7 +1318,7 @@ union bpf_attr { * **bpf_skb_pull_data()** to effectively unclone the *skb* from * the very beginning in case it is indeed cloned. * - * A call to this helper is susceptible to change the underlaying + * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be * performed again, if the helper is used in combination with @@ -1368,7 +1370,7 @@ union bpf_attr { * All values for *flags* are reserved for future usage, and must * be left at zero. * - * A call to this helper is susceptible to change the underlaying + * A call to this helper is susceptible to change the underlying * packet buffer. 
Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be * performed again, if the helper is used in combination with @@ -1383,7 +1385,7 @@ union bpf_attr { * can be used to prepare the packet for pushing or popping * headers. * - * A call to this helper is susceptible to change the underlaying + * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be * performed again, if the helper is used in combination with @@ -1517,20 +1519,20 @@ union bpf_attr { * * **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size. * Adjusting mss in this way is not allowed for datagrams. * - * * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 **: - * * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 **: + * * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4**, + * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6**: * Any new space is reserved to hold a tunnel header. * Configure skb offsets and other fields accordingly. * - * * **BPF_F_ADJ_ROOM_ENCAP_L4_GRE **: - * * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP **: + * * **BPF_F_ADJ_ROOM_ENCAP_L4_GRE**, + * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP**: * Use with ENCAP_L3 flags to further specify the tunnel type. * - * * **BPF_F_ADJ_ROOM_ENCAP_L2(len) **: + * * **BPF_F_ADJ_ROOM_ENCAP_L2**\ (*len*): * Use with ENCAP_L3/L4 flags to further specify the tunnel - * type; **len** is the length of the inner MAC header. + * type; *len* is the length of the inner MAC header. * - * A call to this helper is susceptible to change the underlaying + * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be * performed again, if the helper is used in combination with @@ -1609,7 +1611,7 @@ union bpf_attr { * more flexibility as the user is free to store whatever meta * data they need. 
* - * A call to this helper is susceptible to change the underlaying + * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be * performed again, if the helper is used in combination with @@ -1738,12 +1740,19 @@ union bpf_attr { * error if an eBPF program tries to set a callback that is not * supported in the current kernel. * - * The supported callback values that *argval* can combine are: + * *argval* is a flag array which can combine these flags: * * * **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out) * * **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission) * * **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change) * + * Therefore, this function can be used to clear a callback flag by + * setting the appropriate bit to zero. e.g. to disable the RTO + * callback: + * + * **bpf_sock_ops_cb_flags_set(bpf_sock,** + * **bpf_sock->bpf_sock_ops_cb_flags & ~BPF_SOCK_OPS_RTO_CB_FLAG)** + * * Here are some examples of where one could call such eBPF * program: * @@ -1844,7 +1853,7 @@ union bpf_attr { * copied if necessary (i.e. if data was not linear and if start * and end pointers do not point to the same chunk). * - * A call to this helper is susceptible to change the underlaying + * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be * performed again, if the helper is used in combination with @@ -1878,7 +1887,7 @@ union bpf_attr { * only possible to shrink the packet as of this writing, * therefore *delta* must be a negative integer. * - * A call to this helper is susceptible to change the underlaying + * A call to this helper is susceptible to change the underlying * packet buffer. 
Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be * performed again, if the helper is used in combination with @@ -2053,18 +2062,18 @@ union bpf_attr { * **BPF_LWT_ENCAP_IP** * IP encapsulation (GRE/GUE/IPIP/etc). The outer header * must be IPv4 or IPv6, followed by zero or more - * additional headers, up to LWT_BPF_MAX_HEADROOM total - * bytes in all prepended headers. Please note that - * if skb_is_gso(skb) is true, no more than two headers - * can be prepended, and the inner header, if present, - * should be either GRE or UDP/GUE. - * - * BPF_LWT_ENCAP_SEG6*** types can be called by bpf programs of - * type BPF_PROG_TYPE_LWT_IN; BPF_LWT_ENCAP_IP type can be called - * by bpf programs of types BPF_PROG_TYPE_LWT_IN and - * BPF_PROG_TYPE_LWT_XMIT. - * - * A call to this helper is susceptible to change the underlaying + * additional headers, up to **LWT_BPF_MAX_HEADROOM** + * total bytes in all prepended headers. Please note that + * if **skb_is_gso**\ (*skb*) is true, no more than two + * headers can be prepended, and the inner header, if + * present, should be either GRE or UDP/GUE. + * + * **BPF_LWT_ENCAP_SEG6**\ \* types can be called by BPF programs + * of type **BPF_PROG_TYPE_LWT_IN**; **BPF_LWT_ENCAP_IP** type can + * be called by bpf programs of types **BPF_PROG_TYPE_LWT_IN** and + * **BPF_PROG_TYPE_LWT_XMIT**. + * + * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be * performed again, if the helper is used in combination with @@ -2079,7 +2088,7 @@ union bpf_attr { * inside the outermost IPv6 Segment Routing Header can be * modified through this helper. * - * A call to this helper is susceptible to change the underlaying + * A call to this helper is susceptible to change the underlying * packet buffer. 
Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be * performed again, if the helper is used in combination with @@ -2095,7 +2104,7 @@ union bpf_attr { * after the segments are accepted. *delta* can be as well * positive (growing) as negative (shrinking). * - * A call to this helper is susceptible to change the underlaying + * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be * performed again, if the helper is used in combination with @@ -2118,13 +2127,13 @@ union bpf_attr { * Type of *param*: **int**. * **SEG6_LOCAL_ACTION_END_B6** * End.B6 action: Endpoint bound to an SRv6 policy. - * Type of param: **struct ipv6_sr_hdr**. + * Type of *param*: **struct ipv6_sr_hdr**. * **SEG6_LOCAL_ACTION_END_B6_ENCAP** * End.B6.Encap action: Endpoint bound to an SRv6 * encapsulation policy. - * Type of param: **struct ipv6_sr_hdr**. + * Type of *param*: **struct ipv6_sr_hdr**. * - * A call to this helper is susceptible to change the underlaying + * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be * performed again, if the helper is used in combination with @@ -2277,7 +2286,8 @@ union bpf_attr { * Return * Pointer to **struct bpf_sock**, or **NULL** in case of failure. * For sockets with reuseport option, the **struct bpf_sock** - * result is from **reuse->socks**\ [] using the hash of the tuple. + * result is from *reuse*\ **->socks**\ [] using the hash of the + * tuple. * * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) * Description @@ -2313,7 +2323,8 @@ union bpf_attr { * Return * Pointer to **struct bpf_sock**, or **NULL** in case of failure. 
* For sockets with reuseport option, the **struct bpf_sock** - * result is from **reuse->socks**\ [] using the hash of the tuple. + * result is from *reuse*\ **->socks**\ [] using the hash of the + * tuple. * * int bpf_sk_release(struct bpf_sock *sock) * Description @@ -2482,31 +2493,34 @@ union bpf_attr { * network namespace *netns*. The return value must be checked, * and if non-**NULL**, released via **bpf_sk_release**\ (). * - * This function is identical to bpf_sk_lookup_tcp, except that it - * also returns timewait or request sockets. Use bpf_sk_fullsock - * or bpf_tcp_socket to access the full structure. + * This function is identical to **bpf_sk_lookup_tcp**\ (), except + * that it also returns timewait or request sockets. Use + * **bpf_sk_fullsock**\ () or **bpf_tcp_sock**\ () to access the + * full structure. * * This helper is available only if the kernel was compiled with * **CONFIG_NET** configuration option. * Return * Pointer to **struct bpf_sock**, or **NULL** in case of failure. * For sockets with reuseport option, the **struct bpf_sock** - * result is from **reuse->socks**\ [] using the hash of the tuple. + * result is from *reuse*\ **->socks**\ [] using the hash of the + * tuple. * * int bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) * Description - * Check whether iph and th contain a valid SYN cookie ACK for - * the listening socket in sk. + * Check whether *iph* and *th* contain a valid SYN cookie ACK for + * the listening socket in *sk*. * - * iph points to the start of the IPv4 or IPv6 header, while - * iph_len contains sizeof(struct iphdr) or sizeof(struct ip6hdr). + * *iph* points to the start of the IPv4 or IPv6 header, while + * *iph_len* contains **sizeof**\ (**struct iphdr**) or + * **sizeof**\ (**struct ip6hdr**). * - * th points to the start of the TCP header, while th_len contains - * sizeof(struct tcphdr). 
+ * *th* points to the start of the TCP header, while *th_len* + * contains **sizeof**\ (**struct tcphdr**). * * Return - * 0 if iph and th are a valid SYN cookie ACK, or a negative error - * otherwise. + * 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative + * error otherwise. * * int bpf_sysctl_get_name(struct bpf_sysctl *ctx, char *buf, size_t buf_len, u64 flags) * Description @@ -2584,17 +2598,17 @@ union bpf_attr { * and save the result in *res*. * * The string may begin with an arbitrary amount of white space - * (as determined by isspace(3)) followed by a single optional '-' - * sign. + * (as determined by **isspace**\ (3)) followed by a single + * optional '**-**' sign. * * Five least significant bits of *flags* encode base, other bits * are currently unused. * * Base must be either 8, 10, 16 or 0 to detect it automatically - * similar to user space strtol(3). + * similar to user space **strtol**\ (3). * Return * Number of characters consumed on success. Must be positive but - * no more than buf_len. + * no more than *buf_len*. * * **-EINVAL** if no valid digits were found or unsupported base * was provided. @@ -2608,21 +2622,57 @@ union bpf_attr { * given base and save the result in *res*. * * The string may begin with an arbitrary amount of white space - * (as determined by isspace(3)). + * (as determined by **isspace**\ (3)). * * Five least significant bits of *flags* encode base, other bits * are currently unused. * * Base must be either 8, 10, 16 or 0 to detect it automatically - * similar to user space strtoul(3). + * similar to user space **strtoul**\ (3). * Return * Number of characters consumed on success. Must be positive but - * no more than buf_len. + * no more than *buf_len*. * * **-EINVAL** if no valid digits were found or unsupported base * was provided. * * **-ERANGE** if resulting value was out of range. 
+ * + * void *bpf_sk_storage_get(struct bpf_map *map, struct bpf_sock *sk, void *value, u64 flags) + * Description + * Get a bpf-local-storage from a *sk*. + * + * Logically, it could be thought of getting the value from + * a *map* with *sk* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *sk*) except this + * helper enforces the key must be a full socket and the map must + * be a **BPF_MAP_TYPE_SK_STORAGE** also. + * + * Underneath, the value is stored locally at *sk* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf-local-storages residing at *sk*. + * + * An optional *flags* (**BPF_SK_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf-local-storage will be + * created if one does not exist. *value* can be used + * together with **BPF_SK_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf-local-storage. If *value* is + * **NULL**, the new bpf-local-storage will be zero initialized. + * Return + * A bpf-local-storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf-local-storage. + * + * int bpf_sk_storage_delete(struct bpf_map *map, struct bpf_sock *sk) + * Description + * Delete a bpf-local-storage from a *sk*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf-local-storage cannot be found. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2731,7 +2781,9 @@ union bpf_attr { FN(sysctl_get_new_value), \ FN(sysctl_set_new_value), \ FN(strtol), \ - FN(strtoul), + FN(strtoul), \ + FN(sk_storage_get), \ + FN(sk_storage_delete), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2807,6 +2859,9 @@ enum bpf_func_id { /* BPF_FUNC_sysctl_get_name flags. 
*/ #define BPF_F_SYSCTL_BASE_NAME (1ULL << 0) +/* BPF_FUNC_sk_storage_get flags */ +#define BPF_SK_STORAGE_GET_F_CREATE (1ULL << 0) + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, diff --git a/src/cc/export/helpers.h b/src/cc/export/helpers.h index 9c39dfd0d..37ad3d75d 100644 --- a/src/cc/export/helpers.h +++ b/src/cc/export/helpers.h @@ -503,6 +503,11 @@ static int (*bpf_skb_ecn_set_ce)(void *ctx) = (void *) BPF_FUNC_skb_ecn_set_ce; static struct bpf_sock *(*bpf_get_listener_sock)(struct bpf_sock *sk) = (void *) BPF_FUNC_get_listener_sock; +static void *(*bpf_sk_storage_get)(void *map, struct bpf_sock *sk, + void *value, __u64 flags) = + (void *) BPF_FUNC_sk_storage_get; +static int (*bpf_sk_storage_delete)(void *map, struct bpf_sock *sk) = + (void *)BPF_FUNC_sk_storage_delete; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions diff --git a/src/cc/libbpf b/src/cc/libbpf index 910c475f0..5188b0ca5 160000 --- a/src/cc/libbpf +++ b/src/cc/libbpf @@ -1 +1 @@ -Subproject commit 910c475f09e5c269f441d7496c27dace30dc2335 +Subproject commit 5188b0ca5c16a0c4dabeebe9f83ebb2c7702ec15 diff --git a/src/cc/libbpf.c b/src/cc/libbpf.c index 688dca1a3..b1dc092ef 100644 --- a/src/cc/libbpf.c +++ b/src/cc/libbpf.c @@ -204,6 +204,8 @@ static struct bpf_helper helpers[] = { {"sysctl_set_new_value", "5.2"}, {"strtol", "5.2"}, {"strtoul", "5.2"}, + {"sk_storage_get", "5.2"}, + {"sk_storage_delete", "5.2"}, }; static uint64_t ptr_to_u64(void *ptr) From 6e7500b2c2aab47c23969204777d42625147f6f9 Mon Sep 17 00:00:00 2001 From: Delphix User Date: Thu, 16 May 2019 21:59:42 +0000 Subject: [PATCH 134/135] 2357 SEC macro redefined error I encountered the following error in a script attempting to trace zfs. 
In file included from /virtual/main.c:6: In file included from /export/home/delphix/zfs/include/sys/zio.h:35: In file included from /export/home/delphix/zfs/include/sys/zfs_context.h:38: In file included from /export/home/delphix/zfs/include/spl/sys/condvar.h:33: /export/home/delphix/zfs/include/spl/sys/time.h:41:9: warning: 'SEC' macro redefined [-Wmacro-redefined] #define SEC 1 ^ /virtual/include/bcc/helpers.h:54:9: note: previous definition is here ^ I added the BCC prefix to avoid similar naming conflicts. --- src/cc/compat/linux/virtual_bpf.h | 2 +- src/cc/export/footer.h | 2 +- src/cc/export/helpers.h | 20 ++++++++++---------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/cc/compat/linux/virtual_bpf.h b/src/cc/compat/linux/virtual_bpf.h index 22ba4a738..7797e913f 100644 --- a/src/cc/compat/linux/virtual_bpf.h +++ b/src/cc/compat/linux/virtual_bpf.h @@ -1409,7 +1409,7 @@ union bpf_attr { * * :: * - * SEC("kprobe/sys_open") + * BCC_SEC("kprobe/sys_open") * void bpf_sys_open(struct pt_regs *ctx) * { * char buf[PATHLEN]; // PATHLEN is defined to 256 diff --git a/src/cc/export/footer.h b/src/cc/export/footer.h index 4e20dd414..8aa77cd3c 100644 --- a/src/cc/export/footer.h +++ b/src/cc/export/footer.h @@ -23,6 +23,6 @@ R"********( #define ___LICENSE(s) #s #define __LICENSE(s) ___LICENSE(s) #define _LICENSE __LICENSE(BPF_LICENSE) -char _license[] SEC("license") = _LICENSE; +char _license[] BCC_SEC("license") = _LICENSE; )********" diff --git a/src/cc/export/helpers.h b/src/cc/export/helpers.h index 37ad3d75d..ddc57b979 100644 --- a/src/cc/export/helpers.h +++ b/src/cc/export/helpers.h @@ -51,7 +51,7 @@ R"********( * different sections in elf_bpf file.
Section names * are interpreted by elf_bpf loader */ -#define SEC(NAME) __attribute__((section(NAME), used)) +#define BCC_SEC(NAME) __attribute__((section(NAME), used)) // Associate map with its key/value types #define BPF_ANNOTATE_KV_PAIR(name, type_key, type_val) \ @@ -262,9 +262,9 @@ struct _name##_table_t _name = { .max_entries = (_max_entries) } ({ void *_tmp = _cursor; _cursor += _len; _tmp; }) #ifdef LINUX_VERSION_CODE_OVERRIDE -unsigned _version SEC("version") = LINUX_VERSION_CODE_OVERRIDE; +unsigned _version BCC_SEC("version") = LINUX_VERSION_CODE_OVERRIDE; #else -unsigned _version SEC("version") = LINUX_VERSION_CODE; +unsigned _version BCC_SEC("version") = LINUX_VERSION_CODE; #endif /* helper functions called from eBPF programs written in C */ @@ -629,7 +629,7 @@ unsigned int bpf_log2l(unsigned long v) struct bpf_context; static inline __attribute__((always_inline)) -SEC("helpers") +BCC_SEC("helpers") u64 bpf_dext_pkt(void *pkt, u64 off, u64 bofs, u64 bsz) { if (bofs == 0 && bsz == 8) { return load_byte(pkt, off); @@ -652,7 +652,7 @@ u64 bpf_dext_pkt(void *pkt, u64 off, u64 bofs, u64 bsz) { } static inline __attribute__((always_inline)) -SEC("helpers") +BCC_SEC("helpers") void bpf_dins_pkt(void *pkt, u64 off, u64 bofs, u64 bsz, u64 val) { // The load_xxx function does a bswap before returning the short/word/dword, // so the value in register will always be host endian. 
However, the bytes @@ -695,25 +695,25 @@ void bpf_dins_pkt(void *pkt, u64 off, u64 bofs, u64 bsz, u64 val) { } static inline __attribute__((always_inline)) -SEC("helpers") +BCC_SEC("helpers") void * bpf_map_lookup_elem_(uintptr_t map, void *key) { return bpf_map_lookup_elem((void *)map, key); } static inline __attribute__((always_inline)) -SEC("helpers") +BCC_SEC("helpers") int bpf_map_update_elem_(uintptr_t map, void *key, void *value, u64 flags) { return bpf_map_update_elem((void *)map, key, value, flags); } static inline __attribute__((always_inline)) -SEC("helpers") +BCC_SEC("helpers") int bpf_map_delete_elem_(uintptr_t map, void *key) { return bpf_map_delete_elem((void *)map, key); } static inline __attribute__((always_inline)) -SEC("helpers") +BCC_SEC("helpers") int bpf_l3_csum_replace_(void *ctx, u64 off, u64 from, u64 to, u64 flags) { switch (flags & 0xf) { case 2: @@ -729,7 +729,7 @@ int bpf_l3_csum_replace_(void *ctx, u64 off, u64 from, u64 to, u64 flags) { } static inline __attribute__((always_inline)) -SEC("helpers") +BCC_SEC("helpers") int bpf_l4_csum_replace_(void *ctx, u64 off, u64 from, u64 to, u64 flags) { switch (flags & 0xf) { case 2: From af2f106bba0068aa0be48597e5c052fda6bc66bb Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Sun, 19 May 2019 15:37:34 +0200 Subject: [PATCH 135/135] memleak: use BPF_HASH macro It's better for tools to use the appropriate bcc macros in case we ever want to change their definitions (e.g., new map.helper function specific to that map type). 
Signed-off-by: Paul Chaignon --- tools/memleak.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/memleak.py b/tools/memleak.py index 012ab9108..fd08bc4d8 100755 --- a/tools/memleak.py +++ b/tools/memleak.py @@ -141,10 +141,10 @@ def run_command_get_pid(command): }; BPF_HASH(sizes, u64); -BPF_TABLE("hash", u64, struct alloc_info_t, allocs, 1000000); +BPF_HASH(allocs, u64, struct alloc_info_t, 1000000); BPF_HASH(memptrs, u64, u64); BPF_STACK_TRACE(stack_traces, 10240); -BPF_TABLE("hash", u64, struct combined_alloc_info_t, combined_allocs, 10240); +BPF_HASH(combined_allocs, u64, struct combined_alloc_info_t, 10240); static inline void update_statistics_add(u64 stack_id, u64 sz) { struct combined_alloc_info_t *existing_cinfo;