Merge pull request torvalds#113 from sched-ext/htejun
scx: Sync schedulers from SCX v0.1.5 (74923c6cdbc3)
htejun authored Jan 9, 2024
2 parents 8c7f9b2 + 88e7560 commit f4dc571
Showing 20 changed files with 392 additions and 144 deletions.
3 changes: 2 additions & 1 deletion tools/sched_ext/include/scx/common.bpf.h
@@ -10,7 +10,7 @@
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <linux/errno.h>
#include <asm-generic/errno.h>
#include "user_exit_info.h"

#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */
@@ -68,6 +68,7 @@ const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym;
const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym;
void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym;
void scx_bpf_destroy_dsq(u64 dsq_id) __ksym;
s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym;
bool scx_bpf_task_running(const struct task_struct *p) __ksym;
s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym;
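The newly declared `scx_bpf_select_cpu_dfl()` kfunc lets a BPF scheduler fall back to the kernel's default CPU selection and learn via `is_idle` whether an idle CPU was picked. Below is a minimal sketch of an `ops.select_cpu` callback built on it, mirroring the `fcg_select_cpu` change in scx_flatcg.bpf.c further down; the `example_` name is illustrative, the kfuncs and constants are the ones declared in common.bpf.h and the sched_ext headers:

```c
s32 BPF_STRUCT_OPS(example_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	bool is_idle = false;
	s32 cpu;

	/* Reuse the default idle-CPU picking logic. */
	cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);

	/* If an idle CPU was found, dispatch straight to its local DSQ. */
	if (is_idle)
		scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);

	return cpu;
}
```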
8 changes: 8 additions & 0 deletions tools/sched_ext/scx_central.bpf.c
@@ -161,6 +161,14 @@ static bool dispatch_to_cpu(s32 cpu)
__sync_fetch_and_add(&nr_mismatches, 1);
scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, 0);
bpf_task_release(p);
/*
* We might run out of dispatch buffer slots if we keep dispatching to
* the fallback DSQ without dispatching to the local DSQ of the target
* CPU. In that case, break out of the loop now, as the next dispatch
* operation would fail.
*/
if (!scx_bpf_dispatch_nr_slots())
break;
continue;
}

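The added check stops the batching loop before the per-invocation dispatch buffer runs dry, since a further `scx_bpf_dispatch()` would fail. A reduced sketch of the pattern; `pick_queued_task()` and `MAX_BATCH` are placeholders for how scx_central actually walks its queue, while the kfuncs and `FALLBACK_DSQ_ID` are the ones used above:

```c
for (u32 i = 0; i < MAX_BATCH; i++) {
	/* Placeholder helper returning an acquired task reference. */
	struct task_struct *p = pick_queued_task();

	if (!p)
		break;
	scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, 0);
	bpf_task_release(p);

	/* Stop once the dispatch buffer is exhausted; the next
	 * scx_bpf_dispatch() would fail. */
	if (!scx_bpf_dispatch_nr_slots())
		break;
}
```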
7 changes: 4 additions & 3 deletions tools/sched_ext/scx_central.c
@@ -8,6 +8,7 @@
#include <sched.h>
#include <stdio.h>
#include <unistd.h>
#include <inttypes.h>
#include <signal.h>
#include <libgen.h>
#include <bpf/bpf.h>
@@ -103,17 +104,17 @@ int main(int argc, char **argv)

while (!exit_req && !uei_exited(&skel->bss->uei)) {
printf("[SEQ %llu]\n", seq++);
printf("total :%10lu local:%10lu queued:%10lu lost:%10lu\n",
printf("total :%10" PRIu64 " local:%10" PRIu64 " queued:%10" PRIu64 " lost:%10" PRIu64 "\n",
skel->bss->nr_total,
skel->bss->nr_locals,
skel->bss->nr_queued,
skel->bss->nr_lost_pids);
printf("timer :%10lu dispatch:%10lu mismatch:%10lu retry:%10lu\n",
printf("timer :%10" PRIu64 " dispatch:%10" PRIu64 " mismatch:%10" PRIu64 " retry:%10" PRIu64 "\n",
skel->bss->nr_timers,
skel->bss->nr_dispatches,
skel->bss->nr_mismatches,
skel->bss->nr_retries);
printf("overflow:%10lu\n",
printf("overflow:%10" PRIu64 "\n",
skel->bss->nr_overflows);
fflush(stdout);
sleep(1);
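The counters exported in the skeleton's BSS are `u64`, so printing them with `%lu` is only correct on LP64 targets; the `PRIu64` macro from `<inttypes.h>` keeps the format string portable. A standalone sketch of the same idiom (the variable merely stands in for a skeleton field):

```c
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t nr_total = 12345;	/* stands in for skel->bss->nr_total */

	/* PRIu64 expands to the right length modifier for the target ABI. */
	printf("total :%10" PRIu64 "\n", nr_total);
	return 0;
}
```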
41 changes: 29 additions & 12 deletions tools/sched_ext/scx_flatcg.bpf.c
@@ -123,7 +123,7 @@ struct {
} task_ctx SEC(".maps");

/* gets inc'd on weight tree changes to expire the cached hweights */
unsigned long hweight_gen = 1;
u64 hweight_gen = 1;

static u64 div_round_up(u64 dividend, u64 divisor)
{
@@ -302,16 +302,18 @@ static void cgrp_enqueued(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc)
bpf_spin_unlock(&cgv_tree_lock);
}

void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags)
s32 BPF_STRUCT_OPS(fcg_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
struct fcg_task_ctx *taskc;
struct cgroup *cgrp;
struct fcg_cgrp_ctx *cgc;
bool is_idle = false;
s32 cpu;

cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);

taskc = bpf_task_storage_get(&task_ctx, p, 0, 0);
if (!taskc) {
scx_bpf_error("task_ctx lookup failed");
return;
return cpu;
}

/*
@@ -321,7 +323,7 @@ void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags)
* affinities so that we don't have to worry about per-cgroup dq's
* containing tasks that can't be executed from some CPUs.
*/
if ((enq_flags & SCX_ENQ_LOCAL) || p->nr_cpus_allowed != nr_cpus) {
if (is_idle || p->nr_cpus_allowed != nr_cpus) {
/*
* Tell fcg_stopping() that this bypassed the regular scheduling
* path and should be force charged to the cgroup. 0 is used to
@@ -338,14 +340,28 @@ void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags)
* implement per-cgroup fallback dq's instead so that we have
* more control over when tasks with custom cpumask get issued.
*/
if ((enq_flags & SCX_ENQ_LOCAL) ||
if (is_idle ||
(p->nr_cpus_allowed == 1 && (p->flags & PF_KTHREAD))) {
stat_inc(FCG_STAT_LOCAL);
scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
} else {
stat_inc(FCG_STAT_GLOBAL);
scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
}
}

return cpu;
}

void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags)
{
struct fcg_task_ctx *taskc;
struct cgroup *cgrp;
struct fcg_cgrp_ctx *cgc;

taskc = bpf_task_storage_get(&task_ctx, p, 0, 0);
if (!taskc) {
scx_bpf_error("task_ctx lookup failed");
return;
}

@@ -756,8 +772,8 @@ void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev)
}
}

s32 BPF_STRUCT_OPS(fcg_prep_enable, struct task_struct *p,
struct scx_enable_args *args)
s32 BPF_STRUCT_OPS(fcg_init_task, struct task_struct *p,
struct scx_init_task_args *args)
{
struct fcg_task_ctx *taskc;
struct fcg_cgrp_ctx *cgc;
Expand Down Expand Up @@ -893,13 +909,14 @@ void BPF_STRUCT_OPS(fcg_exit, struct scx_exit_info *ei)

SEC(".struct_ops.link")
struct sched_ext_ops flatcg_ops = {
.select_cpu = (void *)fcg_select_cpu,
.enqueue = (void *)fcg_enqueue,
.dispatch = (void *)fcg_dispatch,
.runnable = (void *)fcg_runnable,
.running = (void *)fcg_running,
.stopping = (void *)fcg_stopping,
.quiescent = (void *)fcg_quiescent,
.prep_enable = (void *)fcg_prep_enable,
.init_task = (void *)fcg_init_task,
.cgroup_set_weight = (void *)fcg_cgroup_set_weight,
.cgroup_init = (void *)fcg_cgroup_init,
.cgroup_exit = (void *)fcg_cgroup_exit,
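The `prep_enable` callback is renamed to `init_task` and now takes `struct scx_init_task_args`. A minimal sketch of per-task storage setup under the new name, assuming a task-local storage map called `task_ctx` with a `struct task_ctx` value as in the schedulers above (the `example_` name is illustrative):

```c
s32 BPF_STRUCT_OPS(example_init_task, struct task_struct *p,
		   struct scx_init_task_args *args)
{
	struct task_ctx *tctx;

	/* Create the per-task context on first sight of the task. */
	tctx = bpf_task_storage_get(&task_ctx, p, 0,
				    BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (!tctx)
		return -ENOMEM;	/* ENOMEM comes via asm-generic/errno.h */

	return 0;
}
```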
4 changes: 3 additions & 1 deletion tools/sched_ext/scx_flatcg.c
@@ -9,6 +9,7 @@
#include <unistd.h>
#include <libgen.h>
#include <limits.h>
#include <inttypes.h>
#include <fcntl.h>
#include <time.h>
#include <bpf/bpf.h>
@@ -183,7 +184,7 @@ int main(int argc, char **argv)

memcpy(last_stats, acc_stats, sizeof(acc_stats));

printf("\n[SEQ %6lu cpu=%5.1lf hweight_gen=%lu]\n",
printf("\n[SEQ %6lu cpu=%5.1lf hweight_gen=%" PRIu64 "]\n",
seq++, cpu_util * 100.0, skel->data->hweight_gen);
printf(" act:%6llu deact:%6llu local:%6llu global:%6llu\n",
stats[FCG_STAT_ACT],
@@ -210,6 +211,7 @@ int main(int argc, char **argv)
stats[FCG_STAT_PNC_GONE]);
printf("BAD remove:%6llu\n",
acc_stats[FCG_STAT_BAD_REMOVAL]);
fflush(stdout);

nanosleep(&intv_ts, NULL);
}
8 changes: 4 additions & 4 deletions tools/sched_ext/scx_layered/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "scx_layered"
version = "0.0.1"
version = "0.0.4"
authors = ["Tejun Heo <htejun@meta.com>", "Meta"]
edition = "2021"
description = "Userspace scheduling with BPF for Ads"
@@ -13,16 +13,16 @@ clap = { version = "4.1", features = ["derive", "env", "unicode", "wrap_help"] }
ctrlc = { version = "3.1", features = ["termination"] }
fb_procfs = "0.7"
lazy_static = "1.4"
libbpf-rs = "0.21"
libbpf-rs = "0.22"
libc = "0.2"
log = "0.4"
scx_utils = "0.3"
scx_utils = "0.5"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
simplelog = "0.12"

[build-dependencies]
scx_utils = "0.3"
scx_utils = "0.5"

[features]
enable_backtrace = []
37 changes: 37 additions & 0 deletions tools/sched_ext/scx_layered/README.md
@@ -0,0 +1,37 @@
# scx_layered

This is a single user-defined scheduler used within [sched_ext](https://github.com/sched-ext/scx/tree/main), a Linux kernel feature that enables implementing kernel thread schedulers in BPF and loading them dynamically. [Read more about sched_ext](https://github.com/sched-ext/scx/tree/main).

## Overview

A highly configurable multi-layer BPF / user space hybrid scheduler.

scx_layered allows the user to classify tasks into multiple layers and apply
different scheduling policies to those layers. For example, a layer could be
created containing all tasks in the `user.slice` cgroup, with a policy that
guarantees the layer at least 80% CPU utilization across some subset of CPUs
on the system.

## How To Install

Available as a [Rust crate](https://crates.io/crates/scx_layered): `cargo add scx_layered`

## Typical Use Case

scx_layered is designed to be highly customizable and can be targeted at
specific applications. For example, if a high-priority service needs priority
access to all but one physical core in order to keep p99 latencies acceptable,
you could define a layer that grants it exactly that. If the service does not
end up using all of those cores, other layers can use them until they are
needed.

## Production Ready?

Yes. If tuned correctly, scx_layered should be performant across various CPU
architectures and workloads.

That said, you may run into an issue with infeasible weights, where a task with
a very high weight may cause the scheduler to incorrectly leave cores idle
because it thinks they're necessary to accommodate the compute for a single
task. This can also happen in CFS, and should soon be addressed for
scx_layered.
19 changes: 6 additions & 13 deletions tools/sched_ext/scx_layered/src/bpf/main.bpf.c
@@ -745,8 +745,8 @@ void BPF_STRUCT_OPS(layered_set_cpumask, struct task_struct *p,
bpf_cpumask_subset((const struct cpumask *)all_cpumask, cpumask);
}

s32 BPF_STRUCT_OPS(layered_prep_enable, struct task_struct *p,
struct scx_enable_args *args)
s32 BPF_STRUCT_OPS(layered_init_task, struct task_struct *p,
struct scx_init_task_args *args)
{
struct task_ctx tctx_init = {
.pid = p->pid,
@@ -805,14 +805,8 @@ s32 BPF_STRUCT_OPS(layered_prep_enable, struct task_struct *p,
return 0;
}

void BPF_STRUCT_OPS(layered_cancel_enable, struct task_struct *p)
{
s32 pid = p->pid;

bpf_map_delete_elem(&task_ctxs, &pid);
}

void BPF_STRUCT_OPS(layered_disable, struct task_struct *p)
void BPF_STRUCT_OPS(layered_exit_task, struct task_struct *p,
struct scx_exit_task_args *args)
{
struct cpu_ctx *cctx;
struct task_ctx *tctx;
@@ -977,9 +971,8 @@ struct sched_ext_ops layered = {
.quiescent = (void *)layered_quiescent,
.set_weight = (void *)layered_set_weight,
.set_cpumask = (void *)layered_set_cpumask,
.prep_enable = (void *)layered_prep_enable,
.cancel_enable = (void *)layered_cancel_enable,
.disable = (void *)layered_disable,
.init_task = (void *)layered_init_task,
.exit_task = (void *)layered_exit_task,
.init = (void *)layered_init,
.exit = (void *)layered_exit,
.name = "layered",
20 changes: 10 additions & 10 deletions tools/sched_ext/scx_layered/src/main.rs
@@ -1122,10 +1122,10 @@ struct Scheduler<'a> {

impl<'a> Scheduler<'a> {
fn init_layers(skel: &mut OpenBpfSkel, specs: &Vec<LayerSpec>) -> Result<()> {
skel.rodata().nr_layers = specs.len() as u32;
skel.rodata_mut().nr_layers = specs.len() as u32;

for (spec_i, spec) in specs.iter().enumerate() {
let layer = &mut skel.bss().layers[spec_i];
let layer = &mut skel.bss_mut().layers[spec_i];

for (or_i, or) in spec.matches.iter().enumerate() {
for (and_i, and) in or.iter().enumerate() {
@@ -1176,12 +1176,12 @@ impl<'a> Scheduler<'a> {
let mut skel = skel_builder.open().context("Failed to open BPF program")?;

// Initialize skel according to @opts.
skel.rodata().debug = opts.verbose as u32;
skel.rodata().slice_ns = opts.slice_us * 1000;
skel.rodata().nr_possible_cpus = *NR_POSSIBLE_CPUS as u32;
skel.rodata().smt_enabled = cpu_pool.nr_cpus > cpu_pool.nr_cores;
skel.rodata_mut().debug = opts.verbose as u32;
skel.rodata_mut().slice_ns = opts.slice_us * 1000;
skel.rodata_mut().nr_possible_cpus = *NR_POSSIBLE_CPUS as u32;
skel.rodata_mut().smt_enabled = cpu_pool.nr_cpus > cpu_pool.nr_cores;
for cpu in cpu_pool.all_cpus.iter_ones() {
skel.rodata().all_cpus[cpu / 8] |= 1 << (cpu % 8);
skel.rodata_mut().all_cpus[cpu / 8] |= 1 << (cpu % 8);
}
Self::init_layers(&mut skel, &layer_specs)?;

@@ -1274,7 +1274,7 @@ impl<'a> Scheduler<'a> {
{
Self::update_bpf_layer_cpumask(
&self.layers[idx],
&mut self.skel.bss().layers[idx],
&mut self.skel.bss_mut().layers[idx],
);
updated = true;
}
@@ -1288,7 +1288,7 @@
let nr_available_cpus = available_cpus.count_ones();
for idx in 0..self.layers.len() {
let layer = &mut self.layers[idx];
let bpf_layer = &mut self.skel.bss().layers[idx];
let bpf_layer = &mut self.skel.bss_mut().layers[idx];
match &layer.kind {
LayerKind::Open { .. } => {
layer.cpus.copy_from_bitslice(&available_cpus);
@@ -1299,7 +1299,7 @@
}
}

self.skel.bss().fallback_cpu = self.cpu_pool.fallback_cpu as u32;
self.skel.bss_mut().fallback_cpu = self.cpu_pool.fallback_cpu as u32;

for (lidx, layer) in self.layers.iter().enumerate() {
self.nr_layer_cpus_min_max[lidx] = (
9 changes: 5 additions & 4 deletions tools/sched_ext/scx_pair.c
@@ -6,6 +6,7 @@
*/
#include <stdio.h>
#include <unistd.h>
#include <inttypes.h>
#include <signal.h>
#include <libgen.h>
#include <bpf/bpf.h>
@@ -142,18 +143,18 @@ int main(int argc, char **argv)

while (!exit_req && !uei_exited(&skel->bss->uei)) {
printf("[SEQ %llu]\n", seq++);
printf(" total:%10lu dispatch:%10lu missing:%10lu\n",
printf(" total:%10" PRIu64 " dispatch:%10" PRIu64 " missing:%10" PRIu64 "\n",
skel->bss->nr_total,
skel->bss->nr_dispatched,
skel->bss->nr_missing);
printf(" kicks:%10lu preemptions:%7lu\n",
printf(" kicks:%10" PRIu64 " preemptions:%7" PRIu64 "\n",
skel->bss->nr_kicks,
skel->bss->nr_preemptions);
printf(" exp:%10lu exp_wait:%10lu exp_empty:%10lu\n",
printf(" exp:%10" PRIu64 " exp_wait:%10" PRIu64 " exp_empty:%10" PRIu64 "\n",
skel->bss->nr_exps,
skel->bss->nr_exp_waits,
skel->bss->nr_exp_empty);
printf("cgnext:%10lu cgcoll:%10lu cgempty:%10lu\n",
printf("cgnext:%10" PRIu64 " cgcoll:%10" PRIu64 " cgempty:%10" PRIu64 "\n",
skel->bss->nr_cgrp_next,
skel->bss->nr_cgrp_coll,
skel->bss->nr_cgrp_empty);