diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index 5c503c23583685..f2336d357106e5 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -10,7 +10,7 @@ #include "vmlinux.h" #include #include -#include +#include #include "user_exit_info.h" #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ @@ -68,6 +68,7 @@ const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym; const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym; void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym; void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; +s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym; bool scx_bpf_task_running(const struct task_struct *p) __ksym; s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym; diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c index 4f398249fb2cca..51ddb0a14bc610 100644 --- a/tools/sched_ext/scx_central.bpf.c +++ b/tools/sched_ext/scx_central.bpf.c @@ -161,6 +161,14 @@ static bool dispatch_to_cpu(s32 cpu) __sync_fetch_and_add(&nr_mismatches, 1); scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, 0); bpf_task_release(p); + /* + * We might run out of dispatch buffer slots if we continue dispatching + * to the fallback DSQ, without dispatching to the local DSQ of the + * target CPU. In such a case, break the loop now, as the next + * dispatch operation would fail. + */ + if (!scx_bpf_dispatch_nr_slots()) + break; continue; } diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c index a3d22409e9ce53..501505001bf98b 100644 --- a/tools/sched_ext/scx_central.c +++ b/tools/sched_ext/scx_central.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -103,17 +104,17 @@ int main(int argc, char **argv) while (!exit_req && !uei_exited(&skel->bss->uei)) { printf("[SEQ %llu]\n", seq++); - printf("total :%10lu local:%10lu queued:%10lu lost:%10lu\n", + printf("total :%10" PRIu64 " local:%10" PRIu64 " queued:%10" PRIu64 " lost:%10" PRIu64 "\n", skel->bss->nr_total, skel->bss->nr_locals, skel->bss->nr_queued, skel->bss->nr_lost_pids); - printf("timer :%10lu dispatch:%10lu mismatch:%10lu retry:%10lu\n", + printf("timer :%10" PRIu64 " dispatch:%10" PRIu64 " mismatch:%10" PRIu64 " retry:%10" PRIu64 "\n", skel->bss->nr_timers, skel->bss->nr_dispatches, skel->bss->nr_mismatches, skel->bss->nr_retries); - printf("overflow:%10lu\n", + printf("overflow:%10" PRIu64 "\n", skel->bss->nr_overflows); fflush(stdout); sleep(1); diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c index 84a60d7e4024ba..869115805b2882 100644 --- a/tools/sched_ext/scx_flatcg.bpf.c +++ b/tools/sched_ext/scx_flatcg.bpf.c @@ -123,7 +123,7 @@ struct { } task_ctx SEC(".maps"); /* gets inc'd on weight tree changes to expire the cached hweights */ -unsigned long hweight_gen = 1; +u64 hweight_gen = 1; static u64 div_round_up(u64 dividend, u64 divisor) { @@ -302,16 +302,18 @@ static void cgrp_enqueued(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc) bpf_spin_unlock(&cgv_tree_lock); } -void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags) +s32 BPF_STRUCT_OPS(fcg_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) { struct fcg_task_ctx *taskc; - struct cgroup *cgrp; - struct fcg_cgrp_ctx *cgc; + bool is_idle = false; + s32 cpu; + + cpu = scx_bpf_select_cpu_dfl(p, 
prev_cpu, wake_flags, &is_idle); taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); if (!taskc) { scx_bpf_error("task_ctx lookup failed"); - return; + return cpu; } /* @@ -321,7 +323,7 @@ void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags) * affinities so that we don't have to worry about per-cgroup dq's * containing tasks that can't be executed from some CPUs. */ - if ((enq_flags & SCX_ENQ_LOCAL) || p->nr_cpus_allowed != nr_cpus) { + if (is_idle || p->nr_cpus_allowed != nr_cpus) { /* * Tell fcg_stopping() that this bypassed the regular scheduling * path and should be force charged to the cgroup. 0 is used to @@ -338,14 +340,28 @@ void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags) * implement per-cgroup fallback dq's instead so that we have * more control over when tasks with custom cpumask get issued. */ - if ((enq_flags & SCX_ENQ_LOCAL) || + if (is_idle || (p->nr_cpus_allowed == 1 && (p->flags & PF_KTHREAD))) { stat_inc(FCG_STAT_LOCAL); - scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); } else { stat_inc(FCG_STAT_GLOBAL); - scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); } + } + + return cpu; +} + +void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags) +{ + struct fcg_task_ctx *taskc; + struct cgroup *cgrp; + struct fcg_cgrp_ctx *cgc; + + taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); + if (!taskc) { + scx_bpf_error("task_ctx lookup failed"); return; } @@ -756,8 +772,8 @@ void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev) } } -s32 BPF_STRUCT_OPS(fcg_prep_enable, struct task_struct *p, - struct scx_enable_args *args) +s32 BPF_STRUCT_OPS(fcg_init_task, struct task_struct *p, + struct scx_init_task_args *args) { struct fcg_task_ctx *taskc; struct fcg_cgrp_ctx *cgc; @@ -893,13 +909,14 @@ void BPF_STRUCT_OPS(fcg_exit, struct scx_exit_info *ei) SEC(".struct_ops.link") struct sched_ext_ops flatcg_ops = { + .select_cpu = (void *)fcg_select_cpu, .enqueue = (void *)fcg_enqueue, .dispatch = (void *)fcg_dispatch, .runnable = (void *)fcg_runnable, .running = (void *)fcg_running, .stopping = (void *)fcg_stopping, .quiescent = (void *)fcg_quiescent, - .prep_enable = (void *)fcg_prep_enable, + .init_task = (void *)fcg_init_task, .cgroup_set_weight = (void *)fcg_cgroup_set_weight, .cgroup_init = (void *)fcg_cgroup_init, .cgroup_exit = (void *)fcg_cgroup_exit, diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c index 6a6e47c83ede7f..b326b2d3ec3501 100644 --- a/tools/sched_ext/scx_flatcg.c +++ b/tools/sched_ext/scx_flatcg.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -183,7 +184,7 @@ int main(int argc, char **argv) memcpy(last_stats, acc_stats, sizeof(acc_stats)); - printf("\n[SEQ %6lu cpu=%5.1lf hweight_gen=%lu]\n", + printf("\n[SEQ %6lu cpu=%5.1lf hweight_gen=%" PRIu64 "]\n", seq++, cpu_util * 100.0, skel->data->hweight_gen); printf(" act:%6llu deact:%6llu local:%6llu global:%6llu\n", stats[FCG_STAT_ACT], @@ -210,6 +211,7 @@ int main(int argc, char **argv) stats[FCG_STAT_PNC_GONE]); printf("BAD remove:%6llu\n", acc_stats[FCG_STAT_BAD_REMOVAL]); + fflush(stdout); nanosleep(&intv_ts, NULL); } diff --git a/tools/sched_ext/scx_layered/Cargo.toml b/tools/sched_ext/scx_layered/Cargo.toml index 19dd0243a9f2a6..37a811e3807e22 100644 --- a/tools/sched_ext/scx_layered/Cargo.toml +++ b/tools/sched_ext/scx_layered/Cargo.toml @@ -1,6 +1,6 @@ 
[package] name = "scx_layered" -version = "0.0.1" +version = "0.0.4" authors = ["Tejun Heo ", "Meta"] edition = "2021" description = "Userspace scheduling with BPF for Ads" @@ -13,16 +13,16 @@ clap = { version = "4.1", features = ["derive", "env", "unicode", "wrap_help"] } ctrlc = { version = "3.1", features = ["termination"] } fb_procfs = "0.7" lazy_static = "1.4" -libbpf-rs = "0.21" +libbpf-rs = "0.22" libc = "0.2" log = "0.4" -scx_utils = "0.3" +scx_utils = "0.5" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" simplelog = "0.12" [build-dependencies] -scx_utils = "0.3" +scx_utils = "0.5" [features] enable_backtrace = [] diff --git a/tools/sched_ext/scx_layered/README.md b/tools/sched_ext/scx_layered/README.md new file mode 100644 index 00000000000000..37c554b2354db7 --- /dev/null +++ b/tools/sched_ext/scx_layered/README.md @@ -0,0 +1,37 @@ +# scx_layered + +This is a single user-defined scheduler used within [sched_ext](https://github.com/sched-ext/scx/tree/main), which is a Linux kernel feature which enables implementing kernel thread schedulers in BPF and dynamically loading them. [Read more about sched_ext](https://github.com/sched-ext/scx/tree/main). + +## Overview + +A highly configurable multi-layer BPF / user space hybrid scheduler. + +scx_layered allows the user to classify tasks into multiple layers, and apply +different scheduling policies to those layers. For example, a layer could be +created of all tasks that are part of the `user.slice` cgroup slice, and a +policy could be specified that ensures that the layer is given at least 80% CPU +utilization for some subset of CPUs on the system. + +## How To Install + +Available as a [Rust crate](https://crates.io/crates/scx_layered): `cargo add scx_layered` + +## Typical Use Case + +scx_layered is designed to be highly customizable, and can be targeted for +specific applications. For example, if you had a high-priority service that +required priority access to all but 1 physical core to ensure acceptable p99 +latencies, you could specify that the service would get priority access to all +but 1 core on the system. If that service ends up not utilizing all of those +cores, they could be used by other layers until they're needed. + +## Production Ready? + +Yes. If tuned correctly, scx_layered should be performant across various CPU +architectures and workloads. + +That said, you may run into an issue with infeasible weights, where a task with +a very high weight may cause the scheduler to incorrectly leave cores idle +because it thinks they're necessary to accommodate the compute for a single +task. This can also happen in CFS, and should soon be addressed for +scx_layered. 
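
The printf format changes in scx_central.c, scx_flatcg.c, scx_pair.c and scx_qmap.c are needed because the counters these tools read from the BPF skeleton are now declared as u64: "%lu" only matches a 64-bit value on LP64 userlands, whereas the PRIu64 macro from <inttypes.h> always expands to the conversion specifier that matches uint64_t. A minimal, self-contained sketch of the pattern (illustrative only, not part of this patch):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Stand-in for a u64 counter read from a libbpf skeleton's .bss. */
	uint64_t nr_total = 12345;

	/* PRIu64 expands to "lu" on LP64 and "llu" on 32-bit/LLP64 targets,
	 * so the format specifier always matches the 64-bit argument. */
	printf("total :%10" PRIu64 "\n", nr_total);
	return 0;
}

The same reasoning is behind switching the BPF-side declarations from unsigned long to u64, so the shared fields have a fixed 64-bit width on both sides of the skeleton.
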
diff --git a/tools/sched_ext/scx_layered/src/bpf/main.bpf.c b/tools/sched_ext/scx_layered/src/bpf/main.bpf.c index 98d9418e1adf14..21dd0e4cd83953 100644 --- a/tools/sched_ext/scx_layered/src/bpf/main.bpf.c +++ b/tools/sched_ext/scx_layered/src/bpf/main.bpf.c @@ -745,8 +745,8 @@ void BPF_STRUCT_OPS(layered_set_cpumask, struct task_struct *p, bpf_cpumask_subset((const struct cpumask *)all_cpumask, cpumask); } -s32 BPF_STRUCT_OPS(layered_prep_enable, struct task_struct *p, - struct scx_enable_args *args) +s32 BPF_STRUCT_OPS(layered_init_task, struct task_struct *p, + struct scx_init_task_args *args) { struct task_ctx tctx_init = { .pid = p->pid, @@ -805,14 +805,8 @@ s32 BPF_STRUCT_OPS(layered_prep_enable, struct task_struct *p, return 0; } -void BPF_STRUCT_OPS(layered_cancel_enable, struct task_struct *p) -{ - s32 pid = p->pid; - - bpf_map_delete_elem(&task_ctxs, &pid); -} - -void BPF_STRUCT_OPS(layered_disable, struct task_struct *p) +void BPF_STRUCT_OPS(layered_exit_task, struct task_struct *p, + struct scx_exit_task_args *args) { struct cpu_ctx *cctx; struct task_ctx *tctx; @@ -977,9 +971,8 @@ struct sched_ext_ops layered = { .quiescent = (void *)layered_quiescent, .set_weight = (void *)layered_set_weight, .set_cpumask = (void *)layered_set_cpumask, - .prep_enable = (void *)layered_prep_enable, - .cancel_enable = (void *)layered_cancel_enable, - .disable = (void *)layered_disable, + .init_task = (void *)layered_init_task, + .exit_task = (void *)layered_exit_task, .init = (void *)layered_init, .exit = (void *)layered_exit, .name = "layered", diff --git a/tools/sched_ext/scx_layered/src/main.rs b/tools/sched_ext/scx_layered/src/main.rs index 8f4d77db04ea9e..5b5374226f49aa 100644 --- a/tools/sched_ext/scx_layered/src/main.rs +++ b/tools/sched_ext/scx_layered/src/main.rs @@ -1122,10 +1122,10 @@ struct Scheduler<'a> { impl<'a> Scheduler<'a> { fn init_layers(skel: &mut OpenBpfSkel, specs: &Vec) -> Result<()> { - skel.rodata().nr_layers = specs.len() as u32; + skel.rodata_mut().nr_layers = specs.len() as u32; for (spec_i, spec) in specs.iter().enumerate() { - let layer = &mut skel.bss().layers[spec_i]; + let layer = &mut skel.bss_mut().layers[spec_i]; for (or_i, or) in spec.matches.iter().enumerate() { for (and_i, and) in or.iter().enumerate() { @@ -1176,12 +1176,12 @@ impl<'a> Scheduler<'a> { let mut skel = skel_builder.open().context("Failed to open BPF program")?; // Initialize skel according to @opts. 
- skel.rodata().debug = opts.verbose as u32; - skel.rodata().slice_ns = opts.slice_us * 1000; - skel.rodata().nr_possible_cpus = *NR_POSSIBLE_CPUS as u32; - skel.rodata().smt_enabled = cpu_pool.nr_cpus > cpu_pool.nr_cores; + skel.rodata_mut().debug = opts.verbose as u32; + skel.rodata_mut().slice_ns = opts.slice_us * 1000; + skel.rodata_mut().nr_possible_cpus = *NR_POSSIBLE_CPUS as u32; + skel.rodata_mut().smt_enabled = cpu_pool.nr_cpus > cpu_pool.nr_cores; for cpu in cpu_pool.all_cpus.iter_ones() { - skel.rodata().all_cpus[cpu / 8] |= 1 << (cpu % 8); + skel.rodata_mut().all_cpus[cpu / 8] |= 1 << (cpu % 8); } Self::init_layers(&mut skel, &layer_specs)?; @@ -1274,7 +1274,7 @@ impl<'a> Scheduler<'a> { { Self::update_bpf_layer_cpumask( &self.layers[idx], - &mut self.skel.bss().layers[idx], + &mut self.skel.bss_mut().layers[idx], ); updated = true; } @@ -1288,7 +1288,7 @@ impl<'a> Scheduler<'a> { let nr_available_cpus = available_cpus.count_ones(); for idx in 0..self.layers.len() { let layer = &mut self.layers[idx]; - let bpf_layer = &mut self.skel.bss().layers[idx]; + let bpf_layer = &mut self.skel.bss_mut().layers[idx]; match &layer.kind { LayerKind::Open { .. } => { layer.cpus.copy_from_bitslice(&available_cpus); @@ -1299,7 +1299,7 @@ impl<'a> Scheduler<'a> { } } - self.skel.bss().fallback_cpu = self.cpu_pool.fallback_cpu as u32; + self.skel.bss_mut().fallback_cpu = self.cpu_pool.fallback_cpu as u32; for (lidx, layer) in self.layers.iter().enumerate() { self.nr_layer_cpus_min_max[lidx] = ( diff --git a/tools/sched_ext/scx_pair.c b/tools/sched_ext/scx_pair.c index 693f095b8c6602..1eb30efeb0ed53 100644 --- a/tools/sched_ext/scx_pair.c +++ b/tools/sched_ext/scx_pair.c @@ -6,6 +6,7 @@ */ #include #include +#include #include #include #include @@ -142,18 +143,18 @@ int main(int argc, char **argv) while (!exit_req && !uei_exited(&skel->bss->uei)) { printf("[SEQ %llu]\n", seq++); - printf(" total:%10lu dispatch:%10lu missing:%10lu\n", + printf(" total:%10" PRIu64 " dispatch:%10" PRIu64 " missing:%10" PRIu64 "\n", skel->bss->nr_total, skel->bss->nr_dispatched, skel->bss->nr_missing); - printf(" kicks:%10lu preemptions:%7lu\n", + printf(" kicks:%10" PRIu64 " preemptions:%7" PRIu64 "\n", skel->bss->nr_kicks, skel->bss->nr_preemptions); - printf(" exp:%10lu exp_wait:%10lu exp_empty:%10lu\n", + printf(" exp:%10" PRIu64 " exp_wait:%10" PRIu64 " exp_empty:%10" PRIu64 "\n", skel->bss->nr_exps, skel->bss->nr_exp_waits, skel->bss->nr_exp_empty); - printf("cgnext:%10lu cgcoll:%10lu cgempty:%10lu\n", + printf("cgnext:%10" PRIu64 " cgcoll:%10" PRIu64 " cgempty:%10" PRIu64 "\n", skel->bss->nr_cgrp_next, skel->bss->nr_cgrp_coll, skel->bss->nr_cgrp_empty); diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 831df3f644d5a8..2fb75543a1640e 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -95,8 +95,8 @@ struct { } dispatch_idx_cnt SEC(".maps"); /* Statistics */ -unsigned long nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued; -unsigned long nr_core_sched_execed; +u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued; +u64 nr_core_sched_execed; s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) @@ -354,8 +354,8 @@ void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args __sync_fetch_and_add(&nr_reenqueued, cnt); } -s32 BPF_STRUCT_OPS(qmap_prep_enable, struct task_struct *p, - struct scx_enable_args *args) +s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p, + struct 
scx_init_task_args *args) { if (p->tgid == disallow_tgid) p->scx.disallow = true; @@ -391,7 +391,7 @@ struct sched_ext_ops qmap_ops = { .dispatch = (void *)qmap_dispatch, .core_sched_before = (void *)qmap_core_sched_before, .cpu_release = (void *)qmap_cpu_release, - .prep_enable = (void *)qmap_prep_enable, + .init_task = (void *)qmap_init_task, .init = (void *)qmap_init, .exit = (void *)qmap_exit, .flags = SCX_OPS_ENQ_LAST, diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index d817115c0b0a84..7008b913864490 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -90,7 +91,7 @@ int main(int argc, char **argv) long nr_enqueued = skel->bss->nr_enqueued; long nr_dispatched = skel->bss->nr_dispatched; - printf("enq=%lu, dsp=%lu, delta=%ld, reenq=%lu, deq=%lu, core=%lu\n", + printf("enq=%lu, dsp=%lu, delta=%ld, reenq=%" PRIu64 ", deq=%" PRIu64 ", core=%" PRIu64 "\n", nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched, skel->bss->nr_reenqueued, skel->bss->nr_dequeued, skel->bss->nr_core_sched_execed); diff --git a/tools/sched_ext/scx_rusty/Cargo.toml b/tools/sched_ext/scx_rusty/Cargo.toml index 309643687d0c6f..a8b4231d1bde9d 100644 --- a/tools/sched_ext/scx_rusty/Cargo.toml +++ b/tools/sched_ext/scx_rusty/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "scx_rusty" -version = "0.5.0" +version = "0.5.3" authors = ["Dan Schatzberg ", "Meta"] edition = "2021" description = "Userspace scheduling with BPF" @@ -13,15 +13,15 @@ clap = { version = "4.1", features = ["derive", "env", "unicode", "wrap_help"] } ctrlc = { version = "3.1", features = ["termination"] } fb_procfs = "0.7.0" hex = "0.4.3" -libbpf-rs = "0.21.0" +libbpf-rs = "0.22.0" libc = "0.2.137" log = "0.4.17" ordered-float = "3.4.0" -scx_utils = "0.3" +scx_utils = "0.5" simplelog = "0.12.0" [build-dependencies] -scx_utils = "0.3" +scx_utils = "0.5" [features] enable_backtrace = [] diff --git a/tools/sched_ext/scx_rusty/README.md b/tools/sched_ext/scx_rusty/README.md new file mode 100644 index 00000000000000..990e51aaf43b3a --- /dev/null +++ b/tools/sched_ext/scx_rusty/README.md @@ -0,0 +1,36 @@ +# scx_rusty + +This is a single user-defined scheduler used within [sched_ext](https://github.com/sched-ext/scx/tree/main), which is a Linux kernel feature which enables implementing kernel thread schedulers in BPF and dynamically loading them. [Read more about sched_ext](https://github.com/sched-ext/scx/tree/main). + +## Overview + +A multi-domain, BPF / user space hybrid scheduler. The BPF portion of the +scheduler does a simple round robin in each domain, and the user space portion +(written in Rust) calculates the load factor of each domain, and informs BPF of +how tasks should be load balanced accordingly. + +## How To Install + +Available as a [Rust crate](https://crates.io/crates/scx_rusty): `cargo add scx_rusty` + +## Typical Use Case + +Rusty is designed to be flexible, and accommodate different architectures and +workloads. Various load balancing thresholds (e.g. greediness, frequency, etc.), +as well as how Rusty should partition the system into scheduling domains, can +be tuned to achieve the optimal configuration for any given system or workload. + +## Production Ready? + +Yes. If tuned correctly, rusty should be performant across various CPU +architectures and workloads. Rusty by default creates a separate scheduling +domain per-LLC, so its default configuration may be performant as well. 
Note +however that scx_rusty does not yet disambiguate between LLCs in different NUMA +nodes, so it may perform better on multi-CCX machines where all the LLCs share +the same socket, as opposed to multi-socket machines. + +Note as well that you may run into an issue with infeasible weights, where a +task with a very high weight may cause the scheduler to incorrectly leave cores +idle because it thinks they're necessary to accommodate the compute for a +single task. This can also happen in CFS, and should soon be addressed for +scx_rusty. diff --git a/tools/sched_ext/scx_rusty/src/bpf/main.bpf.c b/tools/sched_ext/scx_rusty/src/bpf/main.bpf.c index c85e95bf372a49..fe4de979f2a2df 100644 --- a/tools/sched_ext/scx_rusty/src/bpf/main.bpf.c +++ b/tools/sched_ext/scx_rusty/src/bpf/main.bpf.c @@ -425,10 +425,13 @@ s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu, if (!(taskc = lookup_task_ctx(p)) || !(p_cpumask = taskc->cpumask)) goto enoent; - if (kthreads_local && - (p->flags & PF_KTHREAD) && p->nr_cpus_allowed == 1) { + if (p->nr_cpus_allowed == 1) { cpu = prev_cpu; - stat_add(RUSTY_STAT_DIRECT_DISPATCH, 1); + if (kthreads_local && (p->flags & PF_KTHREAD)) { + stat_add(RUSTY_STAT_DIRECT_DISPATCH, 1); + } else { + stat_add(RUSTY_STAT_PINNED, 1); + } goto direct; } @@ -436,7 +439,7 @@ s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu, * If WAKE_SYNC and the machine isn't fully saturated, wake up @p to the * local dsq of the waker. */ - if (p->nr_cpus_allowed > 1 && (wake_flags & SCX_WAKE_SYNC)) { + if (wake_flags & SCX_WAKE_SYNC) { struct task_struct *current = (void *)bpf_get_current_task(); if (!(BPF_CORE_READ(current, flags) & PF_EXITING) && @@ -475,13 +478,6 @@ s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu, } } - /* If only one CPU is allowed, dispatch */ - if (p->nr_cpus_allowed == 1) { - stat_add(RUSTY_STAT_PINNED, 1); - cpu = prev_cpu; - goto direct; - } - has_idle_cores = !bpf_cpumask_empty(idle_smtmask); /* did @p get pulled out to a foreign domain by e.g. greedy execution? 
*/ @@ -956,8 +952,8 @@ void BPF_STRUCT_OPS(rusty_set_cpumask, struct task_struct *p, bpf_cpumask_subset((const struct cpumask *)all_cpumask, cpumask); } -s32 BPF_STRUCT_OPS(rusty_prep_enable, struct task_struct *p, - struct scx_enable_args *args) +s32 BPF_STRUCT_OPS(rusty_init_task, struct task_struct *p, + struct scx_init_task_args *args) { struct bpf_cpumask *cpumask; struct task_ctx taskc = { .dom_active_pids_gen = -1 }; @@ -1006,7 +1002,8 @@ s32 BPF_STRUCT_OPS(rusty_prep_enable, struct task_struct *p, return 0; } -void BPF_STRUCT_OPS(rusty_disable, struct task_struct *p) +void BPF_STRUCT_OPS(rusty_exit_task, struct task_struct *p, + struct scx_exit_task_args *args) { pid_t pid = p->pid; long ret; @@ -1159,8 +1156,8 @@ struct sched_ext_ops rusty = { .quiescent = (void *)rusty_quiescent, .set_weight = (void *)rusty_set_weight, .set_cpumask = (void *)rusty_set_cpumask, - .prep_enable = (void *)rusty_prep_enable, - .disable = (void *)rusty_disable, + .init_task = (void *)rusty_init_task, + .exit_task = (void *)rusty_exit_task, .init = (void *)rusty_init, .exit = (void *)rusty_exit, .name = "rusty", diff --git a/tools/sched_ext/scx_rusty/src/main.rs b/tools/sched_ext/scx_rusty/src/main.rs index ff7cc9d80a7eaa..3192ee049f9f2d 100644 --- a/tools/sched_ext/scx_rusty/src/main.rs +++ b/tools/sched_ext/scx_rusty/src/main.rs @@ -187,6 +187,15 @@ fn read_total_cpu(reader: &procfs::ProcReader) -> Result { .ok_or_else(|| anyhow!("Could not read total cpu stat in proc")) } +fn sub_or_zero(curr: &u64, prev: &u64) -> u64 +{ + if let Some(res) = curr.checked_sub(*prev) { + res + } else { + 0 + } +} + fn calc_util(curr: &procfs::CpuStat, prev: &procfs::CpuStat) -> Result { match (curr, prev) { ( @@ -213,14 +222,14 @@ fn calc_util(curr: &procfs::CpuStat, prev: &procfs::CpuStat) -> Result { .. }, ) => { - let idle_usec = curr_idle - prev_idle; - let iowait_usec = curr_iowait - prev_iowait; - let user_usec = curr_user - prev_user; - let system_usec = curr_system - prev_system; - let nice_usec = curr_nice - prev_nice; - let irq_usec = curr_irq - prev_irq; - let softirq_usec = curr_softirq - prev_softirq; - let stolen_usec = curr_stolen - prev_stolen; + let idle_usec = sub_or_zero(curr_idle, prev_idle); + let iowait_usec = sub_or_zero(curr_iowait, prev_iowait); + let user_usec = sub_or_zero(curr_user, prev_user); + let system_usec = sub_or_zero(curr_system, prev_system); + let nice_usec = sub_or_zero(curr_nice, prev_nice); + let irq_usec = sub_or_zero(curr_irq, prev_irq); + let softirq_usec = sub_or_zero(curr_softirq, prev_softirq); + let stolen_usec = sub_or_zero(curr_stolen, prev_stolen); let busy_usec = user_usec + system_usec + nice_usec + irq_usec + softirq_usec + stolen_usec; @@ -426,7 +435,7 @@ impl Tuner { .read_stat()? .cpus_map .ok_or_else(|| anyhow!("Expected cpus_map to exist"))?; - let ti = &mut skel.bss().tune_input; + let ti = &mut skel.bss_mut().tune_input; let mut dom_nr_cpus = vec![0; self.top.nr_doms]; let mut dom_util_sum = vec![0.0; self.top.nr_doms]; @@ -620,7 +629,7 @@ impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { // XXX - We can't read task_ctx inline because self.skel.bss() // borrows mutably and thus conflicts with self.skel.maps(). 
const MAX_PIDS: u64 = bpf_intf::consts_MAX_DOM_ACTIVE_PIDS as u64; - let active_pids = &mut self.skel.bss().dom_active_pids[dom as usize]; + let active_pids = &mut self.skel.bss_mut().dom_active_pids[dom as usize]; let mut pids = vec![]; let (mut ridx, widx) = (active_pids.read_idx, active_pids.write_idx); @@ -901,16 +910,16 @@ impl<'a> Scheduler<'a> { Topology::from_cache_level(opts.cache_level, nr_cpus)? }); - skel.rodata().nr_doms = top.nr_doms as u32; - skel.rodata().nr_cpus = top.nr_cpus as u32; + skel.rodata_mut().nr_doms = top.nr_doms as u32; + skel.rodata_mut().nr_cpus = top.nr_cpus as u32; for (cpu, dom) in top.cpu_dom.iter().enumerate() { - skel.rodata().cpu_dom_id_map[cpu] = dom.unwrap_or(0) as u32; + skel.rodata_mut().cpu_dom_id_map[cpu] = dom.unwrap_or(0) as u32; } for (dom, cpus) in top.dom_cpus.iter().enumerate() { let raw_cpus_slice = cpus.as_raw_slice(); - let dom_cpumask_slice = &mut skel.rodata().dom_cpumasks[dom]; + let dom_cpumask_slice = &mut skel.rodata_mut().dom_cpumasks[dom]; let (left, _) = dom_cpumask_slice.split_at_mut(raw_cpus_slice.len()); left.clone_from_slice(cpus.as_raw_slice()); info!( @@ -921,13 +930,13 @@ impl<'a> Scheduler<'a> { ); } - skel.rodata().slice_ns = opts.slice_us * 1000; - skel.rodata().load_half_life = (opts.load_half_life * 1000000000.0) as u32; - skel.rodata().kthreads_local = opts.kthreads_local; - skel.rodata().fifo_sched = opts.fifo_sched; - skel.rodata().switch_partial = opts.partial; - skel.rodata().greedy_threshold = opts.greedy_threshold; - skel.rodata().debug = opts.verbose as u32; + skel.rodata_mut().slice_ns = opts.slice_us * 1000; + skel.rodata_mut().load_half_life = (opts.load_half_life * 1000000000.0) as u32; + skel.rodata_mut().kthreads_local = opts.kthreads_local; + skel.rodata_mut().fifo_sched = opts.fifo_sched; + skel.rodata_mut().switch_partial = opts.partial; + skel.rodata_mut().greedy_threshold = opts.greedy_threshold; + skel.rodata_mut().debug = opts.verbose as u32; // Attach. 
let mut skel = skel.load().context("Failed to load BPF program")?; @@ -994,14 +1003,14 @@ impl<'a> Scheduler<'a> { guest_nice_usec: _, }, ) => { - let idle_usec = curr_idle - prev_idle; - let iowait_usec = curr_iowait - prev_iowait; - let user_usec = curr_user - prev_user; - let system_usec = curr_system - prev_system; - let nice_usec = curr_nice - prev_nice; - let irq_usec = curr_irq - prev_irq; - let softirq_usec = curr_softirq - prev_softirq; - let stolen_usec = curr_stolen - prev_stolen; + let idle_usec = sub_or_zero(curr_idle, prev_idle); + let iowait_usec = sub_or_zero(curr_iowait, prev_iowait); + let user_usec = sub_or_zero(curr_user, prev_user); + let system_usec = sub_or_zero(curr_system, prev_system); + let nice_usec = sub_or_zero(curr_nice, prev_nice); + let irq_usec = sub_or_zero(curr_irq, prev_irq); + let softirq_usec = sub_or_zero(curr_softirq, prev_softirq); + let stolen_usec = sub_or_zero(curr_stolen, prev_stolen); let busy_usec = user_usec + system_usec + nice_usec + irq_usec + softirq_usec + stolen_usec; diff --git a/tools/sched_ext/scx_simple.bpf.c b/tools/sched_ext/scx_simple.bpf.c index 7485acbc4f5092..95035aa29b10e6 100644 --- a/tools/sched_ext/scx_simple.bpf.c +++ b/tools/sched_ext/scx_simple.bpf.c @@ -51,19 +51,22 @@ static inline bool vtime_before(u64 a, u64 b) return (s64)(a - b) < 0; } -void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) +s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) { - /* - * If scx_select_cpu_dfl() is setting %SCX_ENQ_LOCAL, it indicates that - * running @p on its CPU directly shouldn't affect fairness. Just queue - * it on the local FIFO. - */ - if (enq_flags & SCX_ENQ_LOCAL) { + bool is_idle = false; + s32 cpu; + + cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle); + if (is_idle) { stat_inc(0); /* count local queueing */ - scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); - return; + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); } + return cpu; +} + +void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) +{ stat_inc(1); /* count global queueing */ if (fifo_sched) { @@ -120,8 +123,7 @@ void BPF_STRUCT_OPS(simple_stopping, struct task_struct *p, bool runnable) p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; } -void BPF_STRUCT_OPS(simple_enable, struct task_struct *p, - struct scx_enable_args *args) +void BPF_STRUCT_OPS(simple_enable, struct task_struct *p) { p->scx.dsq_vtime = vtime_now; } @@ -141,6 +143,7 @@ void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) SEC(".struct_ops.link") struct sched_ext_ops simple_ops = { + .select_cpu = (void *)simple_select_cpu, .enqueue = (void *)simple_enqueue, .dispatch = (void *)simple_dispatch, .running = (void *)simple_running, diff --git a/tools/sched_ext/scx_userland.bpf.c b/tools/sched_ext/scx_userland.bpf.c index f2791a6aecc8b8..4cdc3a6fb880a5 100644 --- a/tools/sched_ext/scx_userland.bpf.c +++ b/tools/sched_ext/scx_userland.bpf.c @@ -20,10 +20,14 @@ * Copyright (c) 2022 Tejun Heo * Copyright (c) 2022 David Vernet */ -#include #include #include "scx_userland.h" +/* + * Maximum amount of tasks enqueued/dispatched between kernel and user-space. + */ +#define MAX_ENQUEUED_TASKS 4096 + char _license[] SEC("license") = "GPL"; const volatile bool switch_partial; @@ -35,13 +39,24 @@ const volatile u32 num_possible_cpus = 64; /* Stats that are printed by user space. 
*/ u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues; -struct user_exit_info uei; +/* + * Number of tasks that are queued for scheduling. + * + * This number is incremented by the BPF component when a task is queued to the + * user-space scheduler and it must be decremented by the user-space scheduler + * when a task is consumed. + */ +volatile u64 nr_queued; /* - * Whether the user space scheduler needs to be scheduled due to a task being - * enqueued in user space. + * Number of tasks that are waiting for scheduling. + * + * This number must be updated by the user-space scheduler to keep track if + * there is still some scheduling work to do. */ -static bool usersched_needed; +volatile u64 nr_scheduled; + +struct user_exit_info uei; /* * The map containing tasks that are enqueued in user space from the kernel. @@ -50,7 +65,7 @@ static bool usersched_needed; */ struct { __uint(type, BPF_MAP_TYPE_QUEUE); - __uint(max_entries, USERLAND_MAX_TASKS); + __uint(max_entries, MAX_ENQUEUED_TASKS); __type(value, struct scx_userland_enqueued_task); } enqueued SEC(".maps"); @@ -61,7 +76,7 @@ struct { */ struct { __uint(type, BPF_MAP_TYPE_QUEUE); - __uint(max_entries, USERLAND_MAX_TASKS); + __uint(max_entries, MAX_ENQUEUED_TASKS); __type(value, s32); } dispatched SEC(".maps"); @@ -78,6 +93,29 @@ struct { __type(value, struct task_ctx); } task_ctx_stor SEC(".maps"); +/* + * Flag used to wake-up the user-space scheduler. + */ +static volatile u32 usersched_needed; + +/* + * Set user-space scheduler wake-up flag (equivalent to an atomic release + * operation). + */ +static void set_usersched_needed(void) +{ + __sync_fetch_and_or(&usersched_needed, 1); +} + +/* + * Check and clear user-space scheduler wake-up flag (equivalent to an atomic + * acquire operation). + */ +static bool test_and_clear_usersched_needed(void) +{ + return __sync_fetch_and_and(&usersched_needed, 0) == 1; +} + static bool is_usersched_task(const struct task_struct *p) { return p->pid == usersched_pid; @@ -136,7 +174,6 @@ static void dispatch_user_scheduler(void) { struct task_struct *p; - usersched_needed = false; p = usersched_task(); if (p) { scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); @@ -146,9 +183,8 @@ static void dispatch_user_scheduler(void) static void enqueue_task_in_user_space(struct task_struct *p, u64 enq_flags) { - struct scx_userland_enqueued_task task; + struct scx_userland_enqueued_task task = {}; - memset(&task, 0, sizeof(task)); task.pid = p->pid; task.sum_exec_runtime = p->se.sum_exec_runtime; task.weight = p->scx.weight; @@ -162,7 +198,7 @@ static void enqueue_task_in_user_space(struct task_struct *p, u64 enq_flags) scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); } else { __sync_fetch_and_add(&nr_user_enqueues, 1); - usersched_needed = true; + set_usersched_needed(); } } @@ -191,10 +227,10 @@ void BPF_STRUCT_OPS(userland_enqueue, struct task_struct *p, u64 enq_flags) void BPF_STRUCT_OPS(userland_dispatch, s32 cpu, struct task_struct *prev) { - if (usersched_needed) + if (test_and_clear_usersched_needed()) dispatch_user_scheduler(); - bpf_repeat(4096) { + bpf_repeat(MAX_ENQUEUED_TASKS) { s32 pid; struct task_struct *p; @@ -215,8 +251,57 @@ void BPF_STRUCT_OPS(userland_dispatch, s32 cpu, struct task_struct *prev) } } -s32 BPF_STRUCT_OPS(userland_prep_enable, struct task_struct *p, - struct scx_enable_args *args) +/* + * A CPU is about to change its idle state. 
If the CPU is going idle, ensure + * that the user-space scheduler has a chance to run if there is any remaining + * work to do. + */ +void BPF_STRUCT_OPS(userland_update_idle, s32 cpu, bool idle) +{ + /* + * Don't do anything if we exit from an idle state, as a CPU + * owner will be assigned in .running(). + */ + if (!idle) + return; + /* + * A CPU is now available: notify the user-space scheduler that tasks + * can be dispatched, if there is at least one task waiting to be + * scheduled, either queued (accounted in nr_queued) or scheduled + * (accounted in nr_scheduled). + * + * NOTE: nr_queued is incremented by the BPF component, more exactly in + * enqueue(), when a task is sent to the user-space scheduler, then + * the scheduler drains the queued tasks (updating nr_queued) and adds + * them to its internal data structures / state; at this point tasks + * become "scheduled" and the user-space scheduler will take care of + * updating nr_scheduled accordingly; lastly tasks will be dispatched + * and the user-space scheduler will update nr_scheduled again. + * + * Checking both counters allows us to determine if there is still + * some pending work to do for the scheduler: new tasks have been queued + * since the last check, or there are still tasks "queued" or "scheduled" + * since the previous user-space scheduler run. If the counters are + * both zero it is pointless to wake up the scheduler (even if a CPU + * becomes idle), because there is nothing to do. + * + * Keep in mind that update_idle() doesn't run concurrently with the + * user-space scheduler (that is single-threaded): this function is + * naturally serialized with the user-space scheduler code, therefore + * this check here is also safe from a concurrency perspective. + */ + if (nr_queued || nr_scheduled) { + /* + * Kick the CPU to make it immediately ready to accept + * dispatched tasks. + */ + set_usersched_needed(); + scx_bpf_kick_cpu(cpu, 0); + } +} + +s32 BPF_STRUCT_OPS(userland_init_task, struct task_struct *p, + struct scx_init_task_args *args) { if (bpf_task_storage_get(&task_ctx_stor, p, 0, BPF_LOCAL_STORAGE_GET_F_CREATE)) @@ -254,9 +339,11 @@ struct sched_ext_ops userland_ops = { .select_cpu = (void *)userland_select_cpu, .enqueue = (void *)userland_enqueue, .dispatch = (void *)userland_dispatch, - .prep_enable = (void *)userland_prep_enable, + .update_idle = (void *)userland_update_idle, + .init_task = (void *)userland_init_task, .init = (void *)userland_init, .exit = (void *)userland_exit, + .flags = SCX_OPS_ENQ_LAST | SCX_OPS_KEEP_BUILTIN_IDLE, .timeout_ms = 3000, .name = "userland", }; diff --git a/tools/sched_ext/scx_userland.c b/tools/sched_ext/scx_userland.c index fef028a1756e0b..368acd0b38bd9e 100644 --- a/tools/sched_ext/scx_userland.c +++ b/tools/sched_ext/scx_userland.c @@ -36,6 +36,8 @@ const char help_fmt[] = "\n" "See the top-level comment in .bpf.c for more details.\n" "\n" +"Try to reduce `sysctl kernel.pid_max` if this program triggers OOMs.\n" +"\n" "Usage: %s [-b BATCH] [-p]\n" "\n" " -b BATCH The number of tasks to batch when dispatching (default: 8)\n" @@ -55,7 +57,10 @@ static struct scx_userland *skel; static struct bpf_link *ops_link; /* Stats collected in user space. */ -static __u64 nr_vruntime_enqueues, nr_vruntime_dispatches; +static __u64 nr_vruntime_enqueues, nr_vruntime_dispatches, nr_vruntime_failed; + +/* Number of tasks currently enqueued. */ +static __u64 nr_curr_enqueued; /* The data structure containing tasks that are enqueued in user space. 
*/ struct enqueued_task { @@ -80,13 +85,15 @@ LIST_HEAD(listhead, enqueued_task); static struct listhead vruntime_head = LIST_HEAD_INITIALIZER(vruntime_head); /* - * The statically allocated array of tasks. We use a statically allocated list - * here to avoid having to allocate on the enqueue path, which could cause a + * The main array of tasks. The array is allocated all at once during + * initialization, based on /proc/sys/kernel/pid_max, to avoid having to + * dynamically allocate memory on the enqueue path, which could cause a * deadlock. A more substantive user space scheduler could e.g. provide a hook * for newly enabled tasks that are passed to the scheduler from the * .prep_enable() callback to allows the scheduler to allocate on safe paths. */ -struct enqueued_task tasks[USERLAND_MAX_TASKS]; +struct enqueued_task *tasks; +static int pid_max; static double min_vruntime; @@ -95,6 +102,41 @@ static void sigint_handler(int userland) exit_req = 1; } +static int get_pid_max(void) +{ + FILE *fp; + int pid_max; + + fp = fopen("/proc/sys/kernel/pid_max", "r"); + if (fp == NULL) { + fprintf(stderr, "Error opening /proc/sys/kernel/pid_max\n"); + return -1; + } + if (fscanf(fp, "%d", &pid_max) != 1) { + fprintf(stderr, "Error reading from /proc/sys/kernel/pid_max\n"); + fclose(fp); + return -1; + } + fclose(fp); + + return pid_max; +} + +static int init_tasks(void) +{ + pid_max = get_pid_max(); + if (pid_max < 0) + return pid_max; + + tasks = calloc(pid_max, sizeof(*tasks)); + if (!tasks) { + fprintf(stderr, "Error allocating tasks array\n"); + return -ENOMEM; + } + + return 0; +} + static __u32 task_pid(const struct enqueued_task *task) { return ((uintptr_t)task - (uintptr_t)tasks) / sizeof(*task); @@ -106,8 +148,7 @@ static int dispatch_task(__s32 pid) err = bpf_map_update_elem(dispatched_fd, NULL, &pid, 0); if (err) { - fprintf(stderr, "Failed to dispatch task %d\n", pid); - exit_req = 1; + nr_vruntime_failed++; } else { nr_vruntime_dispatches++; } @@ -117,7 +158,7 @@ static int dispatch_task(__s32 pid) static struct enqueued_task *get_enqueued_task(__s32 pid) { - if (pid >= USERLAND_MAX_TASKS) + if (pid >= pid_max) return NULL; return &tasks[pid]; @@ -153,6 +194,7 @@ static int vruntime_enqueue(const struct scx_userland_enqueued_task *bpf_task) update_enqueued(curr, bpf_task); nr_vruntime_enqueues++; + nr_curr_enqueued++; /* * Enqueue the task in a vruntime-sorted list. A more optimal data @@ -186,8 +228,11 @@ static void drain_enqueued_map(void) struct scx_userland_enqueued_task task; int err; - if (bpf_map_lookup_and_delete_elem(enqueued_fd, NULL, &task)) + if (bpf_map_lookup_and_delete_elem(enqueued_fd, NULL, &task)) { + skel->bss->nr_queued = 0; + skel->bss->nr_scheduled = nr_curr_enqueued; return; + } err = vruntime_enqueue(&task); if (err) { @@ -210,18 +255,24 @@ static void dispatch_batch(void) task = LIST_FIRST(&vruntime_head); if (!task) - return; + break; min_vruntime = task->vruntime; pid = task_pid(task); LIST_REMOVE(task, entries); err = dispatch_task(pid); if (err) { - fprintf(stderr, "Failed to dispatch task %d in %u\n", - pid, i); - return; + /* + * If we fail to dispatch, put the task back to the + * vruntime_head list and stop dispatching additional + * tasks in this batch. 
+ */ + LIST_INSERT_HEAD(&vruntime_head, task, entries); + break; } + nr_curr_enqueued--; } + skel->bss->nr_scheduled = nr_curr_enqueued; } static void *run_stats_printer(void *arg) @@ -248,8 +299,10 @@ static void *run_stats_printer(void *arg) printf("|-----------------------|\n"); printf("| enq: %10llu |\n", nr_vruntime_enqueues); printf("| disp: %10llu |\n", nr_vruntime_dispatches); + printf("| failed: %10llu |\n", nr_vruntime_failed); printf("o-----------------------o\n"); printf("\n\n"); + fflush(stdout); sleep(1); } @@ -272,6 +325,10 @@ static void bootstrap(int argc, char **argv) }; bool switch_partial = false; + err = init_tasks(); + if (err) + exit(err); + signal(SIGINT, sigint_handler); signal(SIGTERM, sigint_handler); libbpf_set_strict_mode(LIBBPF_STRICT_ALL); diff --git a/tools/sched_ext/scx_userland.h b/tools/sched_ext/scx_userland.h index 639c6809c5ffe2..684fb2dd5de960 100644 --- a/tools/sched_ext/scx_userland.h +++ b/tools/sched_ext/scx_userland.h @@ -4,8 +4,6 @@ #ifndef __SCX_USERLAND_COMMON_H #define __SCX_USERLAND_COMMON_H -#define USERLAND_MAX_TASKS 8192 - /* * An instance of a task that has been enqueued by the kernel for consumption * by a user space global scheduler thread.
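
For context, scx_userland.c keeps its runnable tasks in a vruntime-sorted list built on the sys/queue.h LIST macros (LIST_FIRST, LIST_REMOVE and LIST_INSERT_HEAD appear in the hunks above); the sorted insertion performed by vruntime_enqueue() is not itself visible in this diff. A self-contained sketch of such an insertion, with an assumed helper name (insert_sorted) and a trimmed-down task struct used purely for illustration:

#include <stdio.h>
#include <sys/queue.h>

/* Simplified stand-in for the userland scheduler's enqueued task. */
struct enqueued_task {
	LIST_ENTRY(enqueued_task) entries;
	double vruntime;
};

LIST_HEAD(listhead, enqueued_task);

/* Walk the list and insert @task before the first entry with a larger
 * vruntime, keeping the list sorted in ascending vruntime order. */
static void insert_sorted(struct listhead *head, struct enqueued_task *task)
{
	struct enqueued_task *cur, *prev = NULL;

	LIST_FOREACH(cur, head, entries) {
		if (task->vruntime < cur->vruntime)
			break;
		prev = cur;
	}
	if (prev)
		LIST_INSERT_AFTER(prev, task, entries);
	else
		LIST_INSERT_HEAD(head, task, entries);
}

int main(void)
{
	struct listhead head = LIST_HEAD_INITIALIZER(head);
	struct enqueued_task a = { .vruntime = 2.0 }, b = { .vruntime = 1.0 };
	struct enqueued_task *it;

	insert_sorted(&head, &a);
	insert_sorted(&head, &b);
	LIST_FOREACH(it, &head, entries)
		printf("%.1f\n", it->vruntime);	/* prints 1.0 then 2.0 */
	return 0;
}

A linked list keeps the enqueue path allocation-free at the cost of O(n) insertion; the in-tree comment already notes that a more optimal data structure could be used by a more substantive scheduler.
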