From d4ddad4c307f2b34ee5e807e791072de29482bbe Mon Sep 17 00:00:00 2001 From: Barret Rhoden Date: Thu, 23 Mar 2023 02:26:23 -0700 Subject: [PATCH] ghost: add bpf_select_rq Examples for how to use it in biff.bpf.c and flux_api.bpf.c. Flux retains its existing behavior of wake-on-waker with TTWU_QUEUE. Biff does wake-on-wakee, without TTWU_QUEUE. PiperOrigin-RevId: 507578851 --- bpf/user/agent.c | 1 + bpf/user/agent.h | 2 ++ kernel/ghost_uapi.h | 2 +- kernel/vmlinux_ghost_5_11.h | 19 +++++++++++++++++++ schedulers/biff/biff_scheduler.cc | 7 ++++--- schedulers/flux/flux_scheduler.cc | 6 ++++-- third_party/bpf/biff.bpf.c | 24 ++++++++++++++++++++++++ third_party/bpf/flux_api.bpf.c | 6 ++++++ 8 files changed, 61 insertions(+), 6 deletions(-) diff --git a/bpf/user/agent.c b/bpf/user/agent.c index 5dec94f8..dfe0b11f 100644 --- a/bpf/user/agent.c +++ b/bpf/user/agent.c @@ -85,6 +85,7 @@ static int insert_prog(int ctl_fd, struct bpf_program *prog) switch (eat & 0xFFFF) { case BPF_GHOST_SCHED_PNT: case BPF_GHOST_MSG_SEND: + case BPF_GHOST_SELECT_RQ: ret = bpf_link_create(prog_fd, ctl_fd, eat, NULL); break; default: diff --git a/bpf/user/agent.h b/bpf/user/agent.h index 1ff8d3f4..b2d52bf6 100644 --- a/bpf/user/agent.h +++ b/bpf/user/agent.h @@ -33,9 +33,11 @@ extern "C" { enum { BPF_PROG_TYPE_GHOST_SCHED = 1000, BPF_PROG_TYPE_GHOST_MSG, + BPF_PROG_TYPE_GHOST_SELECT_RQ, BPF_GHOST_SCHED_PNT = 2000, BPF_GHOST_MSG_SEND, + BPF_GHOST_SELECT_RQ, __MAX_BPF_GHOST_ATTACH_TYPE }; diff --git a/kernel/ghost_uapi.h b/kernel/ghost_uapi.h index aedfc883..9df08a7b 100644 --- a/kernel/ghost_uapi.h +++ b/kernel/ghost_uapi.h @@ -25,7 +25,7 @@ * process are the same version as each other. Each successive version changes * values in this header file, assumptions about operations in the kernel, etc. */ -#define GHOST_VERSION 82 +#define GHOST_VERSION 83 /* * Define SCHED_GHOST via the ghost uapi unless it has already been defined diff --git a/kernel/vmlinux_ghost_5_11.h b/kernel/vmlinux_ghost_5_11.h index c734e22c..db35e911 100644 --- a/kernel/vmlinux_ghost_5_11.h +++ b/kernel/vmlinux_ghost_5_11.h @@ -8790,6 +8790,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SK_LOOKUP = 30, BPF_PROG_TYPE_GHOST_SCHED = 1000, BPF_PROG_TYPE_GHOST_MSG = 1001, + BPF_PROG_TYPE_GHOST_SELECT_RQ = 1002, }; enum bpf_attach_type { @@ -8834,6 +8835,13 @@ enum bpf_attach_type { __MAX_BPF_ATTACH_TYPE = 38, }; +enum { + BPF_GHOST_SCHED_PNT = 2000, + BPF_GHOST_MSG_SEND = 2001, + BPF_GHOST_SELECT_RQ = 2002, + __MAX_BPF_GHOST_ATTACH_TYPE = 2003, +}; + struct sock_filter { __u16 code; __u8 jt; @@ -33242,6 +33250,15 @@ struct bpf_ghost_sched { __u64 next_gtid; }; +struct bpf_ghost_select_rq { + __u64 gtid; + __u32 task_cpu; + __u32 waker_cpu; + __u32 sd_flag; + __u32 wake_flags; + __u8 skip_ttwu_queue; +}; + enum bpf_func_id { BPF_FUNC_unspec = 0, BPF_FUNC_map_lookup_elem = 1, @@ -42514,6 +42531,8 @@ struct bpf_ctx_convert { struct bpf_ghost_sched BPF_PROG_TYPE_GHOST_SCHED_kern; struct bpf_ghost_msg BPF_PROG_TYPE_GHOST_MSG_prog; struct bpf_ghost_msg BPF_PROG_TYPE_GHOST_MSG_kern; + struct bpf_ghost_select_rq BPF_PROG_TYPE_GHOST_SELECT_RQ_prog; + struct bpf_ghost_select_rq BPF_PROG_TYPE_GHOST_SELECT_RQ_kern; }; struct bpf_flow_keys { diff --git a/schedulers/biff/biff_scheduler.cc b/schedulers/biff/biff_scheduler.cc index 74e38b1e..e0fe5c55 100644 --- a/schedulers/biff/biff_scheduler.cc +++ b/schedulers/biff/biff_scheduler.cc @@ -24,6 +24,8 @@ BiffScheduler::BiffScheduler(Enclave* enclave, CpuList cpulist, BPF_PROG_TYPE_GHOST_SCHED, BPF_GHOST_SCHED_PNT); bpf_program__set_types(bpf_obj_->progs.biff_msg_send, BPF_PROG_TYPE_GHOST_MSG, BPF_GHOST_MSG_SEND); + bpf_program__set_types(bpf_obj_->progs.biff_select_rq, + BPF_PROG_TYPE_GHOST_SELECT_RQ, BPF_GHOST_SELECT_RQ); bpf_obj_->rodata->enable_bpf_printd = CapHas(CAP_PERFMON); SetBpfTopologyVars(bpf_obj_->rodata, MachineTopology()); @@ -34,6 +36,8 @@ BiffScheduler::BiffScheduler(Enclave* enclave, CpuList cpulist, 0); CHECK_EQ(agent_bpf_register(bpf_obj_->progs.biff_msg_send, BPF_GHOST_MSG_SEND), 0); + CHECK_EQ(agent_bpf_register(bpf_obj_->progs.biff_select_rq, + BPF_GHOST_SELECT_RQ), 0); bpf_cpu_data_ = static_cast( bpf_map__mmap(bpf_obj_->maps.cpu_data)); @@ -51,9 +55,6 @@ BiffScheduler::~BiffScheduler() { } void BiffScheduler::EnclaveReady() { - // Biff has no cpu locality, so the remote wakeup is never worth it. - enclave()->SetWakeOnWakerCpu(true); - enclave()->SetDeliverTicks(true); enclave()->SetDeliverCpuAvailability(true); WRITE_ONCE(bpf_obj_->bss->initialized, true); diff --git a/schedulers/flux/flux_scheduler.cc b/schedulers/flux/flux_scheduler.cc index efe3dfe6..7887e694 100644 --- a/schedulers/flux/flux_scheduler.cc +++ b/schedulers/flux/flux_scheduler.cc @@ -42,6 +42,8 @@ FluxScheduler::FluxScheduler(Enclave* enclave, CpuList cpulist, BPF_PROG_TYPE_GHOST_SCHED, BPF_GHOST_SCHED_PNT); bpf_program__set_types(bpf_obj_->progs.flux_msg_send, BPF_PROG_TYPE_GHOST_MSG, BPF_GHOST_MSG_SEND); + bpf_program__set_types(bpf_obj_->progs.flux_select_rq, + BPF_PROG_TYPE_GHOST_SELECT_RQ, BPF_GHOST_SELECT_RQ); bpf_obj_->rodata->enable_bpf_printd = CapHas(CAP_PERFMON); @@ -51,6 +53,8 @@ FluxScheduler::FluxScheduler(Enclave* enclave, CpuList cpulist, 0); CHECK_EQ(agent_bpf_register(bpf_obj_->progs.flux_msg_send, BPF_GHOST_MSG_SEND), 0); + CHECK_EQ(agent_bpf_register(bpf_obj_->progs.flux_select_rq, + BPF_GHOST_SELECT_RQ), 0); cpu_data_ = static_cast( bpf_map__mmap(bpf_obj_->maps.cpu_data)); @@ -80,8 +84,6 @@ FluxScheduler::~FluxScheduler() { } void FluxScheduler::EnclaveReady() { - enclave()->SetWakeOnWakerCpu(true); - enclave()->SetDeliverTicks(true); enclave()->SetDeliverCpuAvailability(true); // We learn about cpu availability via a message. Some cpus may currently be diff --git a/third_party/bpf/biff.bpf.c b/third_party/bpf/biff.bpf.c index cd40847e..8e57da22 100644 --- a/third_party/bpf/biff.bpf.c +++ b/third_party/bpf/biff.bpf.c @@ -656,4 +656,28 @@ int biff_msg_send(struct bpf_ghost_msg *msg) return 1; } +SEC("ghost_select_rq/select_rq") +int biff_select_rq(struct bpf_ghost_select_rq *ctx) +{ + u64 gtid = ctx->gtid; + /* Can't pass ctx->gtid to gtid_to_thread (swd) directly. (verifier) */ + struct biff_bpf_sw_data *t = gtid_to_swd(gtid); + + if (!t) { + bpf_printd("Got select_rq without a task!"); + return -1; + } + + /* + * POLICY + * + * Not necessarily a good policy. The combo of skip + picking the + * task_cpu will grab remote cpus RQ locks for remote wakeups. This is + * just an example of what you can do. + */ + ctx->skip_ttwu_queue = true; + + return ctx->task_cpu; +} + char LICENSE[] SEC("license") = "GPL"; diff --git a/third_party/bpf/flux_api.bpf.c b/third_party/bpf/flux_api.bpf.c index 30e8bb97..6d35ac4f 100644 --- a/third_party/bpf/flux_api.bpf.c +++ b/third_party/bpf/flux_api.bpf.c @@ -806,4 +806,10 @@ int flux_msg_send(struct bpf_ghost_msg *msg) return 1; } +SEC("ghost_select_rq/select_rq") +int flux_select_rq(struct bpf_ghost_select_rq *ctx) +{ + return ctx->waker_cpu; +} + char LICENSE[] SEC("license") = "GPL";