Merge pull request torvalds#113 from sched-ext/htejun
scx: Sync schedulers from SCX v0.1.5 (74923c6cdbc3)
htejun authored Jan 9, 2024
2 parents 8c7f9b2 + 88e7560 commit f4dc571
Showing 20 changed files with 392 additions and 144 deletions.
3 changes: 2 additions & 1 deletion tools/sched_ext/include/scx/common.bpf.h
@@ -10,7 +10,7 @@
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <linux/errno.h>
#include <asm-generic/errno.h>
#include "user_exit_info.h"

#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */
@@ -68,6 +68,7 @@ const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym;
const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym;
void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym;
void scx_bpf_destroy_dsq(u64 dsq_id) __ksym;
s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym;
bool scx_bpf_task_running(const struct task_struct *p) __ksym;
s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym;
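The newly declared `scx_bpf_select_cpu_dfl()` kfunc lets a BPF scheduler fall back to the kernel's default CPU selection and learn via `is_idle` whether an idle CPU was picked. Below is a minimal sketch of an `ops.select_cpu` callback built on it, mirroring the `fcg_select_cpu` change in scx_flatcg.bpf.c further down; the `example_` name is illustrative, the kfuncs and constants are the ones declared in common.bpf.h and the sched_ext headers:

```c
s32 BPF_STRUCT_OPS(example_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	bool is_idle = false;
	s32 cpu;

	/* Reuse the default idle-CPU picking logic. */
	cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);

	/* If an idle CPU was found, dispatch straight to its local DSQ. */
	if (is_idle)
		scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);

	return cpu;
}
```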
8 changes: 8 additions & 0 deletions tools/sched_ext/scx_central.bpf.c
@@ -161,6 +161,14 @@ static bool dispatch_to_cpu(s32 cpu)
__sync_fetch_and_add(&nr_mismatches, 1);
scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, 0);
bpf_task_release(p);
/*
* We might run out of dispatch buffer slots if we keep dispatching to
* the fallback DSQ without dispatching to the local DSQ of the target
* CPU. In that case, break out of the loop now, as the next dispatch
* operation would fail.
*/
if (!scx_bpf_dispatch_nr_slots())
break;
continue;
}

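The added check stops the batching loop before the per-invocation dispatch buffer runs dry, since a further `scx_bpf_dispatch()` would fail. A reduced sketch of the pattern; `pick_queued_task()` and `MAX_BATCH` are placeholders for how scx_central actually walks its queue, while the kfuncs and `FALLBACK_DSQ_ID` are the ones used above:

```c
for (u32 i = 0; i < MAX_BATCH; i++) {
	/* Placeholder helper returning an acquired task reference. */
	struct task_struct *p = pick_queued_task();

	if (!p)
		break;
	scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, 0);
	bpf_task_release(p);

	/* Stop once the dispatch buffer is exhausted; the next
	 * scx_bpf_dispatch() would fail. */
	if (!scx_bpf_dispatch_nr_slots())
		break;
}
```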
7 changes: 4 additions & 3 deletions tools/sched_ext/scx_central.c
@@ -8,6 +8,7 @@
#include <sched.h>
#include <stdio.h>
#include <unistd.h>
#include <inttypes.h>
#include <signal.h>
#include <libgen.h>
#include <bpf/bpf.h>
@@ -103,17 +104,17 @@ int main(int argc, char **argv)

while (!exit_req && !uei_exited(&skel->bss->uei)) {
printf("[SEQ %llu]\n", seq++);
printf("total :%10lu local:%10lu queued:%10lu lost:%10lu\n",
printf("total :%10" PRIu64 " local:%10" PRIu64 " queued:%10" PRIu64 " lost:%10" PRIu64 "\n",
skel->bss->nr_total,
skel->bss->nr_locals,
skel->bss->nr_queued,
skel->bss->nr_lost_pids);
printf("timer :%10lu dispatch:%10lu mismatch:%10lu retry:%10lu\n",
printf("timer :%10" PRIu64 " dispatch:%10" PRIu64 " mismatch:%10" PRIu64 " retry:%10" PRIu64 "\n",
skel->bss->nr_timers,
skel->bss->nr_dispatches,
skel->bss->nr_mismatches,
skel->bss->nr_retries);
printf("overflow:%10lu\n",
printf("overflow:%10" PRIu64 "\n",
skel->bss->nr_overflows);
fflush(stdout);
sleep(1);
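The counters exported in the skeleton's BSS are `u64`, so printing them with `%lu` is only correct on LP64 targets; the `PRIu64` macro from `<inttypes.h>` keeps the format string portable. A standalone sketch of the same idiom (the variable merely stands in for a skeleton field):

```c
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t nr_total = 12345;	/* stands in for skel->bss->nr_total */

	/* PRIu64 expands to the right length modifier for the target ABI. */
	printf("total :%10" PRIu64 "\n", nr_total);
	return 0;
}
```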
41 changes: 29 additions & 12 deletions tools/sched_ext/scx_flatcg.bpf.c
@@ -123,7 +123,7 @@ struct {
} task_ctx SEC(".maps");

/* gets inc'd on weight tree changes to expire the cached hweights */
unsigned long hweight_gen = 1;
u64 hweight_gen = 1;

static u64 div_round_up(u64 dividend, u64 divisor)
{
@@ -302,16 +302,18 @@ static void cgrp_enqueued(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc)
bpf_spin_unlock(&cgv_tree_lock);
}

void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags)
s32 BPF_STRUCT_OPS(fcg_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
struct fcg_task_ctx *taskc;
struct cgroup *cgrp;
struct fcg_cgrp_ctx *cgc;
bool is_idle = false;
s32 cpu;

cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);

taskc = bpf_task_storage_get(&task_ctx, p, 0, 0);
if (!taskc) {
scx_bpf_error("task_ctx lookup failed");
return;
return cpu;
}

/*
@@ -321,7 +323,7 @@ void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags)
* affinities so that we don't have to worry about per-cgroup dq's
* containing tasks that can't be executed from some CPUs.
*/
if ((enq_flags & SCX_ENQ_LOCAL) || p->nr_cpus_allowed != nr_cpus) {
if (is_idle || p->nr_cpus_allowed != nr_cpus) {
/*
* Tell fcg_stopping() that this bypassed the regular scheduling
* path and should be force charged to the cgroup. 0 is used to
@@ -338,14 +340,28 @@ void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags)
* implement per-cgroup fallback dq's instead so that we have
* more control over when tasks with custom cpumask get issued.
*/
if ((enq_flags & SCX_ENQ_LOCAL) ||
if (is_idle ||
(p->nr_cpus_allowed == 1 && (p->flags & PF_KTHREAD))) {
stat_inc(FCG_STAT_LOCAL);
scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
} else {
stat_inc(FCG_STAT_GLOBAL);
scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
}
}

return cpu;
}

void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags)
{
struct fcg_task_ctx *taskc;
struct cgroup *cgrp;
struct fcg_cgrp_ctx *cgc;

taskc = bpf_task_storage_get(&task_ctx, p, 0, 0);
if (!taskc) {
scx_bpf_error("task_ctx lookup failed");
return;
}

@@ -756,8 +772,8 @@ void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev)
}
}

s32 BPF_STRUCT_OPS(fcg_prep_enable, struct task_struct *p,
struct scx_enable_args *args)
s32 BPF_STRUCT_OPS(fcg_init_task, struct task_struct *p,
struct scx_init_task_args *args)
{
struct fcg_task_ctx *taskc;
struct fcg_cgrp_ctx *cgc;
Expand Down Expand Up @@ -893,13 +909,14 @@ void BPF_STRUCT_OPS(fcg_exit, struct scx_exit_info *ei)

SEC(".struct_ops.link")
struct sched_ext_ops flatcg_ops = {
.select_cpu = (void *)fcg_select_cpu,
.enqueue = (void *)fcg_enqueue,
.dispatch = (void *)fcg_dispatch,
.runnable = (void *)fcg_runnable,
.running = (void *)fcg_running,
.stopping = (void *)fcg_stopping,
.quiescent = (void *)fcg_quiescent,
.prep_enable = (void *)fcg_prep_enable,
.init_task = (void *)fcg_init_task,
.cgroup_set_weight = (void *)fcg_cgroup_set_weight,
.cgroup_init = (void *)fcg_cgroup_init,
.cgroup_exit = (void *)fcg_cgroup_exit,
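The `prep_enable` callback is renamed to `init_task` and now takes `struct scx_init_task_args`. A minimal sketch of per-task storage setup under the new name, assuming a task-local storage map called `task_ctx` with a `struct task_ctx` value as in the schedulers above (the `example_` name is illustrative):

```c
s32 BPF_STRUCT_OPS(example_init_task, struct task_struct *p,
		   struct scx_init_task_args *args)
{
	struct task_ctx *tctx;

	/* Create the per-task context on first sight of the task. */
	tctx = bpf_task_storage_get(&task_ctx, p, 0,
				    BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (!tctx)
		return -ENOMEM;	/* ENOMEM comes via asm-generic/errno.h */

	return 0;
}
```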
4 changes: 3 additions & 1 deletion tools/sched_ext/scx_flatcg.c
@@ -9,6 +9,7 @@
#include <unistd.h>
#include <libgen.h>
#include <limits.h>
#include <inttypes.h>
#include <fcntl.h>
#include <time.h>
#include <bpf/bpf.h>
@@ -183,7 +184,7 @@ int main(int argc, char **argv)

memcpy(last_stats, acc_stats, sizeof(acc_stats));

printf("\n[SEQ %6lu cpu=%5.1lf hweight_gen=%lu]\n",
printf("\n[SEQ %6lu cpu=%5.1lf hweight_gen=%" PRIu64 "]\n",
seq++, cpu_util * 100.0, skel->data->hweight_gen);
printf(" act:%6llu deact:%6llu local:%6llu global:%6llu\n",
stats[FCG_STAT_ACT],
@@ -210,6 +211,7 @@ int main(int argc, char **argv)
stats[FCG_STAT_PNC_GONE]);
printf("BAD remove:%6llu\n",
acc_stats[FCG_STAT_BAD_REMOVAL]);
fflush(stdout);

nanosleep(&intv_ts, NULL);
}
8 changes: 4 additions & 4 deletions tools/sched_ext/scx_layered/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "scx_layered"
version = "0.0.1"
version = "0.0.4"
authors = ["Tejun Heo <htejun@meta.com>", "Meta"]
edition = "2021"
description = "Userspace scheduling with BPF for Ads"
@@ -13,16 +13,16 @@ clap = { version = "4.1", features = ["derive", "env", "unicode", "wrap_help"] }
ctrlc = { version = "3.1", features = ["termination"] }
fb_procfs = "0.7"
lazy_static = "1.4"
libbpf-rs = "0.21"
libbpf-rs = "0.22"
libc = "0.2"
log = "0.4"
scx_utils = "0.3"
scx_utils = "0.5"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
simplelog = "0.12"

[build-dependencies]
scx_utils = "0.3"
scx_utils = "0.5"

[features]
enable_backtrace = []
37 changes: 37 additions & 0 deletions tools/sched_ext/scx_layered/README.md
@@ -0,0 +1,37 @@
# scx_layered

This is a single user-defined scheduler used within [sched_ext](https://github.com/sched-ext/scx/tree/main), a Linux kernel feature that enables implementing kernel thread schedulers in BPF and loading them dynamically. [Read more about sched_ext](https://github.com/sched-ext/scx/tree/main).

## Overview

A highly configurable multi-layer BPF / user space hybrid scheduler.

scx_layered allows the user to classify tasks into multiple layers and apply
different scheduling policies to those layers. For example, a layer could be
created containing all tasks in the `user.slice` cgroup, with a policy that
guarantees the layer at least 80% CPU utilization across some subset of CPUs
on the system.

## How To Install

Available as a [Rust crate](https://crates.io/crates/scx_layered): `cargo add scx_layered`

## Typical Use Case

scx_layered is designed to be highly customizable and can be targeted at
specific applications. For example, if a high-priority service needs priority
access to all but one physical core in order to keep p99 latencies acceptable,
you could define a layer that grants it exactly that. If the service does not
end up using all of those cores, other layers can use them until they are
needed.

## Production Ready?

Yes. If tuned correctly, scx_layered should be performant across various CPU
architectures and workloads.

That said, you may run into an issue with infeasible weights, where a task with
a very high weight may cause the scheduler to incorrectly leave cores idle
because it thinks they're necessary to accommodate the compute for a single
task. This can also happen in CFS, and should soon be addressed for
scx_layered.
19 changes: 6 additions & 13 deletions tools/sched_ext/scx_layered/src/bpf/main.bpf.c
@@ -745,8 +745,8 @@ void BPF_STRUCT_OPS(layered_set_cpumask, struct task_struct *p,
bpf_cpumask_subset((const struct cpumask *)all_cpumask, cpumask);
}

s32 BPF_STRUCT_OPS(layered_prep_enable, struct task_struct *p,
struct scx_enable_args *args)
s32 BPF_STRUCT_OPS(layered_init_task, struct task_struct *p,
struct scx_init_task_args *args)
{
struct task_ctx tctx_init = {
.pid = p->pid,
@@ -805,14 +805,8 @@ s32 BPF_STRUCT_OPS(layered_prep_enable, struct task_struct *p,
return 0;
}

void BPF_STRUCT_OPS(layered_cancel_enable, struct task_struct *p)
{
s32 pid = p->pid;

bpf_map_delete_elem(&task_ctxs, &pid);
}

void BPF_STRUCT_OPS(layered_disable, struct task_struct *p)
void BPF_STRUCT_OPS(layered_exit_task, struct task_struct *p,
struct scx_exit_task_args *args)
{
struct cpu_ctx *cctx;
struct task_ctx *tctx;
@@ -977,9 +971,8 @@ struct sched_ext_ops layered = {
.quiescent = (void *)layered_quiescent,
.set_weight = (void *)layered_set_weight,
.set_cpumask = (void *)layered_set_cpumask,
.prep_enable = (void *)layered_prep_enable,
.cancel_enable = (void *)layered_cancel_enable,
.disable = (void *)layered_disable,
.init_task = (void *)layered_init_task,
.exit_task = (void *)layered_exit_task,
.init = (void *)layered_init,
.exit = (void *)layered_exit,
.name = "layered",
20 changes: 10 additions & 10 deletions tools/sched_ext/scx_layered/src/main.rs
@@ -1122,10 +1122,10 @@ struct Scheduler<'a> {

impl<'a> Scheduler<'a> {
fn init_layers(skel: &mut OpenBpfSkel, specs: &Vec<LayerSpec>) -> Result<()> {
skel.rodata().nr_layers = specs.len() as u32;
skel.rodata_mut().nr_layers = specs.len() as u32;

for (spec_i, spec) in specs.iter().enumerate() {
let layer = &mut skel.bss().layers[spec_i];
let layer = &mut skel.bss_mut().layers[spec_i];

for (or_i, or) in spec.matches.iter().enumerate() {
for (and_i, and) in or.iter().enumerate() {
@@ -1176,12 +1176,12 @@ impl<'a> Scheduler<'a> {
let mut skel = skel_builder.open().context("Failed to open BPF program")?;

// Initialize skel according to @opts.
skel.rodata().debug = opts.verbose as u32;
skel.rodata().slice_ns = opts.slice_us * 1000;
skel.rodata().nr_possible_cpus = *NR_POSSIBLE_CPUS as u32;
skel.rodata().smt_enabled = cpu_pool.nr_cpus > cpu_pool.nr_cores;
skel.rodata_mut().debug = opts.verbose as u32;
skel.rodata_mut().slice_ns = opts.slice_us * 1000;
skel.rodata_mut().nr_possible_cpus = *NR_POSSIBLE_CPUS as u32;
skel.rodata_mut().smt_enabled = cpu_pool.nr_cpus > cpu_pool.nr_cores;
for cpu in cpu_pool.all_cpus.iter_ones() {
skel.rodata().all_cpus[cpu / 8] |= 1 << (cpu % 8);
skel.rodata_mut().all_cpus[cpu / 8] |= 1 << (cpu % 8);
}
Self::init_layers(&mut skel, &layer_specs)?;

@@ -1274,7 +1274,7 @@ impl<'a> Scheduler<'a> {
{
Self::update_bpf_layer_cpumask(
&self.layers[idx],
&mut self.skel.bss().layers[idx],
&mut self.skel.bss_mut().layers[idx],
);
updated = true;
}
@@ -1288,7 +1288,7 @@
let nr_available_cpus = available_cpus.count_ones();
for idx in 0..self.layers.len() {
let layer = &mut self.layers[idx];
let bpf_layer = &mut self.skel.bss().layers[idx];
let bpf_layer = &mut self.skel.bss_mut().layers[idx];
match &layer.kind {
LayerKind::Open { .. } => {
layer.cpus.copy_from_bitslice(&available_cpus);
@@ -1299,7 +1299,7 @@
}
}

self.skel.bss().fallback_cpu = self.cpu_pool.fallback_cpu as u32;
self.skel.bss_mut().fallback_cpu = self.cpu_pool.fallback_cpu as u32;

for (lidx, layer) in self.layers.iter().enumerate() {
self.nr_layer_cpus_min_max[lidx] = (
9 changes: 5 additions & 4 deletions tools/sched_ext/scx_pair.c
@@ -6,6 +6,7 @@
*/
#include <stdio.h>
#include <unistd.h>
#include <inttypes.h>
#include <signal.h>
#include <libgen.h>
#include <bpf/bpf.h>
@@ -142,18 +143,18 @@ int main(int argc, char **argv)

while (!exit_req && !uei_exited(&skel->bss->uei)) {
printf("[SEQ %llu]\n", seq++);
printf(" total:%10lu dispatch:%10lu missing:%10lu\n",
printf(" total:%10" PRIu64 " dispatch:%10" PRIu64 " missing:%10" PRIu64 "\n",
skel->bss->nr_total,
skel->bss->nr_dispatched,
skel->bss->nr_missing);
printf(" kicks:%10lu preemptions:%7lu\n",
printf(" kicks:%10" PRIu64 " preemptions:%7" PRIu64 "\n",
skel->bss->nr_kicks,
skel->bss->nr_preemptions);
printf(" exp:%10lu exp_wait:%10lu exp_empty:%10lu\n",
printf(" exp:%10" PRIu64 " exp_wait:%10" PRIu64 " exp_empty:%10" PRIu64 "\n",
skel->bss->nr_exps,
skel->bss->nr_exp_waits,
skel->bss->nr_exp_empty);
printf("cgnext:%10lu cgcoll:%10lu cgempty:%10lu\n",
printf("cgnext:%10" PRIu64 " cgcoll:%10" PRIu64 " cgempty:%10" PRIu64 "\n",
skel->bss->nr_cgrp_next,
skel->bss->nr_cgrp_coll,
skel->bss->nr_cgrp_empty);