This repository has been archived by the owner on Jun 18, 2024. It is now read-only.

Commit

Merge pull request #133 from sched-ext/htejun
scx: Purge most schedulers from tools/sched_ext
Byte-Lab authored Feb 2, 2024
2 parents d0e0b1a + c5fcab9 commit efc946e
Showing 35 changed files with 27 additions and 9,055 deletions.
62 changes: 6 additions & 56 deletions tools/sched_ext/Makefile
@@ -181,11 +181,7 @@ $(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BP

SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR)

################
# C schedulers #
################
c-sched-targets = scx_simple scx_qmap scx_central scx_pair scx_flatcg \
scx_userland
c-sched-targets = scx_simple scx_qmap

$(addprefix $(BINDIR)/,$(c-sched-targets)): \
$(BINDIR)/%: \
@@ -195,39 +191,14 @@ $(addprefix $(BINDIR)/,$(c-sched-targets)): \
$(eval sched=$(notdir $@))
$(CC) $(CFLAGS) -c $(sched).c -o $(SCXOBJ_DIR)/$(sched).o
$(CC) -o $@ $(SCXOBJ_DIR)/$(sched).o $(HOST_BPFOBJ) $(LDFLAGS)
$(c-sched-targets): %: $(BINDIR)/%


###################
# Rust schedulers #
###################
rust-sched-targets := scx_rusty scx_layered

# Separate build target that is available for build systems to use to fetch
# dependencies in a separate step from building. This allows the scheduler
# to be compiled without network access.
#
# If the regular rust scheduler Make target (e.g. scx_rusty) is invoked without
# CARGO_OFFLINE=1 (e.g. if building locally), then cargo build will download
# all of the necessary dependencies, and the deps target can be skipped.
$(addsuffix _deps,$(rust-sched-targets)):
$(eval sched=$(@:_deps=))
$(Q)cargo fetch --manifest-path=$(sched)/Cargo.toml

$(rust-sched-targets): %: $(INCLUDE_DIR)/vmlinux.h $(SCX_COMMON_DEPS)
$(eval export RUSTFLAGS = -C link-args=-lzstd -C link-args=-lz -C link-args=-lelf -L $(BPFOBJ_DIR))
$(eval export BPF_CLANG = $(CLANG))
$(eval export BPF_CFLAGS = $(BPF_CFLAGS))
$(eval sched=$(notdir $@))
$(Q)cargo build --manifest-path=$(sched)/Cargo.toml $(CARGOFLAGS)
$(Q)cp $(OUTPUT_DIR)/release/$(sched) $(BINDIR)/$@
$(c-sched-targets): %: $(BINDIR)/%

install: all
$(Q)mkdir -p $(DESTDIR)/usr/local/bin/
$(Q)cp $(BINDIR)/* $(DESTDIR)/usr/local/bin/

clean:
$(foreach sched,$(rust-sched-targets),cargo clean --manifest-path=$(sched)/Cargo.toml;)
rm -rf $(OUTPUT_DIR) $(HOST_OUTPUT_DIR)
rm -f *.o *.bpf.o *.bpf.skel.h *.bpf.subskel.h
rm -f $(c-sched-targets)
@@ -240,7 +211,7 @@ help:
@echo ''
@echo 'Alternatively, you may compile individual schedulers:'
@echo ''
@printf ' %s\n' $(c-sched-targets) $(rust-sched-targets)
@printf ' %s\n' $(c-sched-targets)
@echo ''
@echo 'For any scheduler build target, you may specify an alternative'
@echo 'build output path with the O= environment variable. For example:'
@@ -251,26 +222,6 @@ help:
@echo '/tmp/sched_ext/build.'
@echo ''
@echo ''
@echo 'Rust scheduler targets'
@echo '======================'
@echo ''
@printf ' %s\n' $(rust-sched-targets)
@printf ' %s_deps\n' $(rust-sched-targets)
@echo ''
@echo 'For any rust schedulers built with cargo, you can specify'
@echo 'CARGO_OFFLINE=1 to ensure the build portion does not access the'
@echo 'network (e.g. if the scheduler is being packaged).'
@echo ''
@echo 'For such use cases, the build workflow will look something like this:'
@echo ''
@echo ' make scx_rusty_deps'
@echo ' CARGO_OFFLINE=1 make scx_rusty'
@echo ''
@echo 'If network access during build is allowed, you can just make scx_rusty'
@echo 'directly without CARGO_OFFLINE, and dependencies will be downloaded'
@echo 'during the build step.'
@echo ''
@echo ''
@echo 'Installing targets'
@echo '=================='
@echo ''
@@ -287,12 +238,11 @@ help:
@echo 'Cleaning targets'
@echo '================'
@echo ''
@echo ' clean - Remove all generated files, including intermediate'
@echo ' rust files for rust schedulers.'
@echo ' clean - Remove all generated files'

all_targets: $(c-sched-targets) $(rust-sched-targets)
all_targets: $(c-sched-targets)

.PHONY: all all_targets $(c-sched-targets) $(rust-sched-targets) clean help
.PHONY: all all_targets $(c-sched-targets) clean help

# delete failed targets
.DELETE_ON_ERROR:
138 changes: 2 additions & 136 deletions tools/sched_ext/README.md
@@ -156,8 +156,8 @@ $ make -j$(nproc)

# Schedulers

This section lists, in alphabetical order, all of the current example
schedulers.
This directory contains the following simple schedulers as examples. For
more, visit https://github.com/sched-ext/scx.

--------------------------------------------------------------------------------

@@ -204,140 +204,6 @@ No

--------------------------------------------------------------------------------

## scx_central

### Overview

A "central" scheduler where scheduling decisions are made from a single CPU.
This scheduler illustrates how scheduling decisions can be dispatched from a
single CPU, allowing other cores to run with infinite slices, without timer
ticks, and without having to incur the overhead of making scheduling decisions.

### Typical Use Case

This scheduler could theoretically be useful for any workload that benefits
from minimizing scheduling overhead and timer ticks. An example of where this
could be particularly useful is running VMs, where running with infinite slices
and no timer ticks allows the VM to avoid unnecessary expensive vmexits.

### Production Ready?

Not yet. While tasks are run with an infinite slice (SCX_SLICE_INF), they're
preempted every 20ms in a timer callback. The scheduler also puts the core
scheduling logic inside the central / scheduling CPU's ops.dispatch() path,
and does not yet have any kind of priority mechanism.

--------------------------------------------------------------------------------

## scx_pair

### Overview

A sibling scheduler which ensures that tasks will only ever be co-located on a
physical core if they're in the same cgroup. It illustrates how a scheduling
policy could be implemented to mitigate CPU bugs, such as L1TF, and also shows
how some useful kfuncs such as `scx_bpf_kick_cpu()` can be utilized.

### Typical Use Case

While this scheduler is only meant to be used to illustrate certain sched_ext
features, with a bit more work (e.g. by adding some form of priority handling
inside and across cgroups), it could have been used as a way to quickly
mitigate L1TF before core scheduling was implemented and rolled out.

### Production Ready?

No

--------------------------------------------------------------------------------

## scx_flatcg

### Overview

A flattened cgroup hierarchy scheduler. This scheduler implements hierarchical
weight-based cgroup CPU control by flattening the cgroup hierarchy into a
single layer, compounding the active weight share at each level. The effect
of this is a much more performant CPU controller, which does not need to
descend down cgroup trees in order to properly compute a cgroup's share.
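
As a hedged, made-up illustration of the compounding (the numbers are not from the scheduler): if a cgroup holds 25% of the active weight among its siblings, and its parent holds 50% of the active weight at its own level, the flattened hierarchy treats that cgroup as owning roughly 0.5 × 0.25 = 12.5% of the CPU, so no tree walk is needed at scheduling time.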

### Typical Use Case

This scheduler could be useful for any typical workload requiring a CPU
controller, but which cannot tolerate the higher overheads of the fair CPU
controller.

### Production Ready?

Yes, though the scheduler (currently) does not adequately accommodate
thundering herds of cgroups. If, for example, many cgroups which are nested
behind a low-priority cgroup were to wake up around the same time, they may be
able to consume more CPU cycles than they are entitled to.

--------------------------------------------------------------------------------

## scx_userland

### Overview

A simple weighted vtime scheduler where all scheduling decisions take place in
user space. This is in contrast to Rusty, where load balancing lives in user
space, but scheduling decisions are still made in the kernel.

### Typical Use Case

There are many advantages to writing schedulers in user space. For example, you
can use a debugger, you can write the scheduler in Rust, and you can use data
structures bundled with your favorite library.

On the other hand, user space scheduling can be hard to get right. You can
potentially deadlock due to not scheduling a task that's required for the
scheduler itself to make forward progress (though the sched_ext watchdog will
protect the system by unloading your scheduler after a timeout if that
happens). You also have to bootstrap some communication protocol between the
kernel and user space.

A more robust solution to this would be building a user space scheduling
framework that abstracts much of this complexity away from you.

### Production Ready?

No. This scheduler uses an ordered list for vtime scheduling, and is strictly
less performant than just using something like `scx_simple`. It is purely
meant to illustrate that it's possible to build a user space scheduler on
top of sched_ext.

--------------------------------------------------------------------------------

## scx_rusty

### Overview

A multi-domain, BPF / user space hybrid scheduler. The BPF portion of the
scheduler does a simple round robin in each domain, and the user space portion
(written in Rust) calculates the load factor of each domain, and informs BPF of
how tasks should be load balanced accordingly.

### Typical Use Case

Rusty is designed to be flexible, and accommodate different architectures and
workloads. Various load balancing thresholds (e.g. greediness, frequency, etc.),
as well as how Rusty should partition the system into scheduling domains, can
be tuned to achieve the optimal configuration for any given system or workload.

### Production Ready?

Yes. If tuned correctly, Rusty should be performant across various CPU
architectures and workloads. By default, Rusty creates a separate scheduling
domain per LLC, so even its default configuration may perform well.

That said, you may run into an issue with infeasible weights, where a task with
a very high weight may cause the scheduler to incorrectly leave cores idle
because it thinks they're necessary to accommodate the compute for a single
task. This can also happen in CFS, and should soon be addressed for Rusty.

--------------------------------------------------------------------------------

# Troubleshooting

There are a number of common issues that you may run into when building the
21 changes: 19 additions & 2 deletions tools/sched_ext/include/scx/user_exit_info.h
@@ -10,10 +10,17 @@
#ifndef __USER_EXIT_INFO_H
#define __USER_EXIT_INFO_H

enum uei_sizes {
UEI_REASON_SIZE = 128,
UEI_MSG_SIZE = 1024,
UEI_DUMP_SIZE = 32768,
};

struct user_exit_info {
int kind;
char reason[128];
char msg[1024];
char reason[UEI_REASON_SIZE];
char msg[UEI_MSG_SIZE];
char dump[UEI_DUMP_SIZE];
};

#ifdef __bpf__
@@ -26,12 +33,16 @@ static inline void uei_record(struct user_exit_info *uei,
{
bpf_probe_read_kernel_str(uei->reason, sizeof(uei->reason), ei->reason);
bpf_probe_read_kernel_str(uei->msg, sizeof(uei->msg), ei->msg);
bpf_probe_read_kernel_str(uei->dump, sizeof(uei->dump), ei->dump);
/* use __sync to force memory barrier */
__sync_val_compare_and_swap(&uei->kind, uei->kind, ei->kind);
}

#else /* !__bpf__ */

#include <stdio.h>
#include <stdbool.h>

static inline bool uei_exited(struct user_exit_info *uei)
{
/* use __sync to force memory barrier */
@@ -40,6 +51,12 @@ static inline bool uei_exited(struct user_exit_info *uei)

static inline void uei_print(const struct user_exit_info *uei)
{
if (uei->dump[0] != '\0') {
fputs("\nDEBUG DUMP\n", stderr);
fputs("================================================================================\n\n", stderr);
fputs(uei->dump, stderr);
fputs("\n================================================================================\n\n", stderr);
}
fprintf(stderr, "EXIT: %s", uei->reason);
if (uei->msg[0] != '\0')
fprintf(stderr, " (%s)", uei->msg);
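
To make the exit-info flow above concrete, here is a minimal, hedged sketch of how a scheduler's user space loader might consume these helpers. Only `struct user_exit_info`, `uei_exited()`, and `uei_print()` come from this header; the include path and the assumption that the BPF side fills the shared object from its exit callback via `uei_record()` are illustrative, not code from this commit.

```c
/*
 * Illustrative sketch only -- not code from this commit.
 * The BPF scheduler is assumed to fill a shared struct user_exit_info
 * from its exit callback via uei_record(); this user space side simply
 * polls for the exit and prints the reason, message, and debug dump.
 */
#include <unistd.h>

#include "user_exit_info.h"	/* path assumed; the header lives under include/scx/ */

static void wait_and_report(struct user_exit_info *uei)
{
	/* uei_exited() reads uei->kind behind a full memory barrier. */
	while (!uei_exited(uei))
		sleep(1);

	/* Prints the optional debug dump, then "EXIT: <reason> (<msg>)". */
	uei_print(uei);
}
```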