From 8c5b199668d32872385a9cf6c31a59b3b2098dce Mon Sep 17 00:00:00 2001 From: David Vernet Date: Wed, 10 Apr 2024 15:07:02 -0500 Subject: [PATCH 1/5] scx: Fix a few typos Update the copyright in a selftest, and make a comment for an exit_code field a bit more generic to reflect that exit_code can be defined when gracefully exiting from the main kernel, not just BPF. Finally, update a pr_err message to print the correct path to the sched_ext sysfs node. Signed-off-by: David Vernet --- kernel/sched/ext.c | 4 ++-- tools/testing/selftests/sched_ext/util.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 87ab3a8adeacd..bb2f6d099ac42 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -26,7 +26,7 @@ struct scx_exit_info { /* %SCX_EXIT_* - broad category of the exit reason */ enum scx_exit_kind kind; - /* exit code if gracefully exiting from BPF */ + /* exit code if gracefully exiting */ s64 exit_code; /* textual representation of the above */ @@ -6068,7 +6068,7 @@ static int __init scx_init(void) scx_kset = kset_create_and_add("sched_ext", &scx_uevent_ops, kernel_kobj); if (!scx_kset) { - pr_err("sched_ext: Failed to create /sys/sched_ext\n"); + pr_err("sched_ext: Failed to create /sys/kernel/sched_ext\n"); return -ENOMEM; } diff --git a/tools/testing/selftests/sched_ext/util.h b/tools/testing/selftests/sched_ext/util.h index 4cc65df1f9204..bc13dfec1267a 100644 --- a/tools/testing/selftests/sched_ext/util.h +++ b/tools/testing/selftests/sched_ext/util.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. - * Copyright (c) 2024 Tejun Heo + * Copyright (c) 2024 David Vernet */ #ifndef __SCX_TEST_UTIL_H__ From 8584503508001fb9ed9baf49e28b047f30ac7460 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Wed, 10 Apr 2024 15:27:07 -0500 Subject: [PATCH 2/5] scx: Add hotplug sequence number We currently have a possibly tricky race w.r.t. hotplug that schedulers don't have a good way to account for. Once a scheduler has inspected a host topology, if a hotplug event occurs before a scheduler is attached and loaded, then the scheduler will have no way of knowing that its view of the host topology is incorrect. Hotplug events _after_ this are fine, as we'll either pass the events to the scheduler, or evict the scheduler directly. But if a hotplug event happens between inspecting the host topology and attaching the scheduler, we have a problem. To address this, we can use a monotonically increasing hotplug sequence number that is incremented any time a hotplug event occurs, and expose it through a sysfs node in /sys/kernel/sched_ext/. Using this, a user space scheduler can look at the sequence number before loading, and then compare it to the sequence number during attach to see if a hotplug event occurred. If so, we can fail to attach, and return to user space. This patch adds the aforementioned sysfs node. A subsequent patch will update the struct sched_ext_ops and the attach path to check this value to ensure that a hotplug event hasn't occurred. Signed-off-by: David Vernet --- kernel/sched/ext.c | 42 ++++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index bb2f6d099ac42..4f190fb3d4332 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -796,6 +796,7 @@ static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE); static struct scx_exit_info *scx_exit_info; static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); +static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0); /* * The maximum amount of time in jiffies that a task may be runnable without @@ -2930,31 +2931,32 @@ void __scx_update_idle(struct rq *rq, bool idle) #endif } -static void hotplug_exit_sched(struct rq *rq, bool online) +static void handle_hotplug(struct rq *rq, bool online) { - scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, - "cpu %d going %s, exiting scheduler", - cpu_of(rq), online ? "online" : "offline"); + int cpu = cpu_of(rq); + + atomic_long_inc(&scx_hotplug_seq); + + if (online && SCX_HAS_OP(cpu_online)) + SCX_CALL_OP(SCX_KF_REST, cpu_online, cpu); + else if (!online && SCX_HAS_OP(cpu_offline)) + SCX_CALL_OP(SCX_KF_REST, cpu_offline, cpu); + else + scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, + "cpu %d going %s, exiting scheduler", cpu, + online ? "online" : "offline"); } static void rq_online_scx(struct rq *rq, enum rq_onoff_reason reason) { - if (reason == RQ_ONOFF_HOTPLUG) { - if (SCX_HAS_OP(cpu_online)) - SCX_CALL_OP(SCX_KF_REST, cpu_online, cpu_of(rq)); - else - hotplug_exit_sched(rq, true); - } + if (reason == RQ_ONOFF_HOTPLUG) + handle_hotplug(rq, true); } static void rq_offline_scx(struct rq *rq, enum rq_onoff_reason reason) { - if (reason == RQ_ONOFF_HOTPLUG) { - if (SCX_HAS_OP(cpu_offline)) - SCX_CALL_OP(SCX_KF_REST, cpu_offline, cpu_of(rq)); - else - hotplug_exit_sched(rq, false); - } + if (reason == RQ_ONOFF_HOTPLUG) + handle_hotplug(rq, false); } #else /* !CONFIG_SMP */ @@ -3811,10 +3813,18 @@ static ssize_t scx_attr_nr_rejected_show(struct kobject *kobj, } SCX_ATTR(nr_rejected); +static ssize_t scx_attr_hotplug_seq_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_hotplug_seq)); +} +SCX_ATTR(hotplug_seq); + static struct attribute *scx_global_attrs[] = { &scx_attr_state.attr, &scx_attr_switch_all.attr, &scx_attr_nr_rejected.attr, + &scx_attr_hotplug_seq.attr, NULL, }; From 89a0d6fe8b1d53f5572df708c59f6766ce68f2dd Mon Sep 17 00:00:00 2001 From: David Vernet Date: Wed, 10 Apr 2024 15:55:21 -0500 Subject: [PATCH 3/5] scx: Add hotplug seq to sched_ext_ops We'll need to have a hotplug sequence number in struct sched_ext_ops if we want to enable user space to deterministically detect a hotplug event between reading a host's topology, and attaching its scheduler. A prior change added a global hotplug sequence number and exported it through a sysfs file. This one connects the two by also adding logic to fail to attach if there is a mismatch between the two. A subsequent patch will add tests. Signed-off-by: David Vernet --- kernel/sched/ext.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 4f190fb3d4332..556daeda36909 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -561,6 +561,15 @@ struct sched_ext_ops { */ u32 exit_dump_len; + /** + * hotplug_seq - A sequence number that may be set by the scheduler to + * detect when a hotplug event has occurred during the loading process. + * If 0, no detection occurs. Otherwise, the scheduler will fail to + * load if the sequence number does not match @scx_hotplug_seq on the + * enable path. + */ + u64 hotplug_seq; + /** * name - BPF scheduler's name * @@ -4346,6 +4355,25 @@ static struct kthread_worker *scx_create_rt_helper(const char *name) return helper; } +static void check_hotplug_seq(const struct sched_ext_ops *ops) +{ + unsigned long long global_hotplug_seq; + + /* + * If a hotplug event has occurred between when a scheduler was + * initialized, and when we were able to attach, exit and notify user + * space about it. + */ + if (ops->hotplug_seq) { + global_hotplug_seq = atomic_long_read(&scx_hotplug_seq); + if (ops->hotplug_seq != global_hotplug_seq) { + scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, + "expected hotplug seq %llu did not match actual %llu", + ops->hotplug_seq, global_hotplug_seq); + } + } +} + static int validate_ops(const struct sched_ext_ops *ops) { /* @@ -4479,6 +4507,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops) cpus_read_lock(); scx_cgroup_lock(); + check_hotplug_seq(ops); + for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) if (((void (**)(void))ops)[i]) static_branch_enable_cpuslocked(&scx_has_op[i]); @@ -4806,6 +4836,9 @@ static int bpf_scx_init_member(const struct btf_type *t, ops->exit_dump_len = *(u32 *)(udata + moff) ?: SCX_EXIT_DUMP_DFL_LEN; return 1; + case offsetof(struct sched_ext_ops, hotplug_seq): + ops->hotplug_seq = *(u64 *)(udata + moff); + return 1; } return 0; From b276dddab29e155b44d4da356a0a9af5ef5027b7 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Wed, 10 Apr 2024 22:45:35 -0500 Subject: [PATCH 4/5] scx: Add new macros to compat.h Now that we have the hotplug sequence number, schedulers can set the sequence number when opening the skeleton to detect hotplug events. In order to provide backwards compatibility and avoid excess boilerplate, let's add a new SCX_OPS_OPEN() macro that encapsulates this for the caller. In addition, we add an SCX_HOTPLUG_SQN() macro that can be used to read the current global sequence number from /sys/kernel/sched_ext/hotplug_sqn. This is called by SCX_OPS_OPEN() when running on a kernel with hotplug sqn support. Signed-off-by: David Vernet --- tools/sched_ext/include/scx/compat.h | 48 ++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h index 77ec97a0b468b..e2608ad6ec5cf 100644 --- a/tools/sched_ext/include/scx/compat.h +++ b/tools/sched_ext/include/scx/compat.h @@ -8,6 +8,9 @@ #define __SCX_COMPAT_H #include +#include +#include +#include struct btf *__COMPAT_vmlinux_btf __attribute__((weak)); @@ -106,16 +109,55 @@ static inline bool __COMPAT_struct_has_field(const char *type, const char *field #define __COMPAT_SCX_OPS_SWITCH_PARTIAL \ __COMPAT_ENUM_OR_ZERO("scx_ops_flags", "SCX_OPS_SWITCH_PARTIAL") +static inline long scx_hotplug_seq(void) +{ + int fd; + char buf[32]; + ssize_t len; + long val; + + fd = open("/sys/kernel/sched_ext/hotplug_seq", O_RDONLY); + if (fd < 0) + return -ENOENT; + + len = read(fd, buf, sizeof(buf) - 1); + SCX_BUG_ON(len <= 0, "read failed (%ld)", len); + buf[len] = 0; + close(fd); + + val = strtoul(buf, NULL, 10); + SCX_BUG_ON(val < 0, "invalid num hotplug events: %lu", val); + + return val; +} + /* * struct sched_ext_ops can change over time. If compat.bpf.h::SCX_OPS_DEFINE() * is used to define ops and compat.h::SCX_OPS_LOAD/ATTACH() are used to load * and attach it, backward compatibility is automatically maintained where * reasonable. * - * - sched_ext_ops.exit_dump_len was added later. On kernels which don't support - * it, the value is ignored and a warning is triggered if the value is - * requested to be non-zero. + * The following values were added in newer kernels: + * + * - sched_ext_ops.exit_dump_len + * o If nonzero and running on an older kernel, the value is set to zero + * and a warning is emitted + * + * - sched_ext_ops.hotplug_sqn + * o If nonzero and running on an older kernel, the scheduler will fail to + * load */ +#define SCX_OPS_OPEN(__ops_name, __scx_name) ({ \ + struct __scx_name *__skel; \ + \ + __skel = __scx_name##__open(); \ + SCX_BUG_ON(!__skel, "Could not open " #__scx_name); \ + \ + if (__COMPAT_struct_has_field("sched_ext_ops", "hotplug_seq")) \ + __skel->struct_ops.__ops_name->hotplug_seq = scx_hotplug_seq(); \ + __skel; \ +}) + #define SCX_OPS_LOAD(__skel, __ops_name, __scx_name, __uei_name) ({ \ UEI_SET_SIZE(__skel, __ops_name, __uei_name); \ if (__COMPAT_struct_has_field("sched_ext_ops", "exit_dump_len") && \ From 9fabb99442d9770c42891cfebf7518bafa291157 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Wed, 10 Apr 2024 17:08:31 -0500 Subject: [PATCH 5/5] scx: Add selftests for validating hotplug seq checks Now that we have full hotplug sequence number support, as well as the necessary macros in compat.h, let's extend the hotplug selftest to also validate that the sequence number can be used to detect hotplug events. Signed-off-by: David Vernet --- tools/testing/selftests/sched_ext/hotplug.c | 59 ++++++++++++++++----- 1 file changed, 47 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/sched_ext/hotplug.c b/tools/testing/selftests/sched_ext/hotplug.c index 4b528504702ca..ce23d797a6bb1 100644 --- a/tools/testing/selftests/sched_ext/hotplug.c +++ b/tools/testing/selftests/sched_ext/hotplug.c @@ -15,8 +15,6 @@ #include "scx_test.h" #include "util.h" -struct hotplug *skel; - const char *online_path = "/sys/devices/system/cpu/cpu1/online"; static bool is_cpu_online(void) @@ -40,22 +38,20 @@ static enum scx_test_status setup(void **ctx) if (!is_cpu_online()) return SCX_TEST_SKIP; - skel = hotplug__open_and_load(); - if (!skel) { - SCX_ERR("Failed to open and load hotplug skel"); - return SCX_TEST_FAIL; - } - return SCX_TEST_PASS; } static enum scx_test_status test_hotplug(bool onlining, bool cbs_defined) { + struct hotplug *skel; struct bpf_link *link; long kind, code; SCX_ASSERT(is_cpu_online()); + skel = hotplug__open_and_load(); + SCX_ASSERT(skel); + /* Testing the offline -> online path, so go offline before starting */ if (onlining) toggle_online_status(0); @@ -78,6 +74,7 @@ static enum scx_test_status test_hotplug(bool onlining, bool cbs_defined) if (!link) { SCX_ERR("Failed to attach scheduler"); + hotplug__destroy(skel); return SCX_TEST_FAIL; } @@ -93,12 +90,51 @@ static enum scx_test_status test_hotplug(bool onlining, bool cbs_defined) toggle_online_status(1); bpf_link__destroy(link); - - UEI_RESET(skel, uei); + hotplug__destroy(skel); return SCX_TEST_PASS; } +static enum scx_test_status test_hotplug_attach(void) +{ + struct hotplug *skel; + struct bpf_link *link; + enum scx_test_status status = SCX_TEST_PASS; + long kind, code; + + SCX_ASSERT(is_cpu_online()); + SCX_ASSERT(scx_hotplug_seq() > 0); + + skel = SCX_OPS_OPEN(hotplug_nocb_ops, hotplug); + SCX_ASSERT(skel); + + SCX_OPS_LOAD(skel, hotplug_nocb_ops, hotplug, uei); + + /* + * Take the CPU offline to increment the global hotplug seq, which + * should cause attach to fail due to us setting the hotplug seq above + */ + toggle_online_status(0); + link = bpf_map__attach_struct_ops(skel->maps.hotplug_nocb_ops); + + toggle_online_status(1); + + SCX_ASSERT(link); + while (!UEI_EXITED(skel, uei)) + sched_yield(); + + kind = SCX_KIND_VAL(SCX_EXIT_UNREG_KERN); + code = SCX_ECODE_VAL(SCX_ECODE_ACT_RESTART) | + SCX_ECODE_VAL(SCX_ECODE_RSN_HOTPLUG); + SCX_EQ(UEI_KIND(skel, uei), kind); + SCX_EQ(UEI_ECODE(skel, uei), code); + + bpf_link__destroy(link); + hotplug__destroy(skel); + + return status; +} + static enum scx_test_status run(void *ctx) { @@ -114,12 +150,11 @@ static enum scx_test_status run(void *ctx) #undef HP_TEST - return SCX_TEST_PASS; + return test_hotplug_attach(); } static void cleanup(void *ctx) { - hotplug__destroy(skel); toggle_online_status(1); }