sched_ext: Add cgroup support
Add sched_ext_ops operations to init/exit cgroups, and track task migrations
and config changes. A BPF scheduler may not implement cgroup features at all,
or may implement only a subset of them. The implemented features can be
indicated using %SCX_OPS_HAS_CGROUP_* flags. If the cgroup configuration makes
use of features that are not implemented, a warning is triggered.
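
For illustration only (not part of this patch), a BPF scheduler that
implements just the weight feature might wire this up roughly as follows.
This is a sketch assuming the common scx BPF header from tools/sched_ext;
the sketch_* names are made up:

  #include <scx/common.bpf.h>

  char _license[] SEC("license") = "GPL";

  s32 BPF_STRUCT_OPS_SLEEPABLE(sketch_cgroup_init, struct cgroup *cgrp,
			       struct scx_cgroup_init_args *args)
  {
	  /* set up per-cgroup state; args->weight is the initial weight */
	  return 0;
  }

  void BPF_STRUCT_OPS(sketch_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
  {
	  /* cpu.weight changed; adjust this cgroup's share of CPU */
  }

  SEC(".struct_ops.link")
  struct sched_ext_ops sketch_ops = {
	  .cgroup_init		= (void *)sketch_cgroup_init,
	  .cgroup_set_weight	= (void *)sketch_cgroup_set_weight,
	  /* declare which cgroup features are implemented */
	  .flags		= SCX_OPS_HAS_CGROUP_WEIGHT,
	  .name			= "sketch",
  };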

While a BPF scheduler is being enabled or disabled, relevant cgroup
operations are locked out using scx_cgroup_rwsem. This avoids situations like
task prep taking place while the task is being moved across cgroups, making
things easier for BPF schedulers.
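
Conceptually, the bracketing looks like this (a simplified sketch of the
scheme described above, not the literal kernel code; scx_cgroup_rwsem is a
percpu_rwsem in kernel/sched/ext.c):

  /* enable/disable path: lock out all cgroup operations */
  percpu_down_write(&scx_cgroup_rwsem);
  /* ... walk existing cgroups, invoking ops.cgroup_init()/ops.cgroup_exit() ... */
  percpu_up_write(&scx_cgroup_rwsem);

  /* cgroup migration and config paths: held for read */
  percpu_down_read(&scx_cgroup_rwsem);
  /* ... ops.cgroup_prep_move(), ops.cgroup_move(), ops.cgroup_set_weight() ... */
  percpu_up_read(&scx_cgroup_rwsem);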

v7: - cgroup interface file visibility toggling is dropped in favor of just
      warning messages. Dynamically changing interface visibility caused
      more confusion than it helped.

v6: - Updated to reflect the removal of SCX_KF_SLEEPABLE.

    - Updated to use CONFIG_GROUP_SCHED_WEIGHT, with fixes for
      !CONFIG_FAIR_GROUP_SCHED && CONFIG_EXT_GROUP_SCHED.

v5: - Flipped the locking order between scx_cgroup_rwsem and
      cpus_read_lock() to avoid locking order conflict w/ cpuset. Better
      documentation around locking.

    - sched_move_task() takes an early exit if the source and destination
      are identical. This triggered the warning in scx_cgroup_can_attach()
      as it left p->scx.cgrp_moving_from uncleared. Updated the cgroup
      migration path to skip ops.cgroup_prep_move() for identity migrations
      so that its invocations always pair one-to-one with ops.cgroup_move()
      (see the pairing sketch below).
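
      The pairing contract on the BPF side looks like this (illustrative
      stubs only; the sketch_* names are hypothetical):

        s32 BPF_STRUCT_OPS(sketch_cgroup_prep_move, struct task_struct *p,
			   struct cgroup *from, struct cgroup *to)
        {
		/* may fail; runs before the migration is committed */
		return 0;
        }

        void BPF_STRUCT_OPS(sketch_cgroup_move, struct task_struct *p,
			    struct cgroup *from, struct cgroup *to)
        {
		/* always paired 1:1 with a successful prep_move */
        }

        void BPF_STRUCT_OPS(sketch_cgroup_cancel_move, struct task_struct *p,
			    struct cgroup *from, struct cgroup *to)
        {
		/* drop prep_move state when the migration is aborted */
        }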

v4: - Example schedulers moved into their own patches.

    - Fix build failure when !CONFIG_CGROUP_SCHED, reported by Andrea Righi.

v3: - Make scx_example_pair switch all tasks by default.

    - Convert to BPF inline iterators.

    - scx_bpf_task_cgroup() is added to determine the current cgroup from
      the CPU controller's POV. This allows BPF schedulers to accurately
      track CPU cgroup membership (see the usage sketch below).

    - scx_example_flatcg added. This demonstrates a flattened-hierarchy
      implementation of CPU cgroup control and shows a significant
      performance improvement when cgroups nested multiple levels deep are
      under competition.
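
      As an illustration of the scx_bpf_task_cgroup() usage mentioned above
      (a sketch along the lines of what the flatcg example does; the
      per-cgroup DSQ keyed by the cgroup ID is assumed to have been created
      in ops.cgroup_init()):

        void BPF_STRUCT_OPS(sketch_enqueue, struct task_struct *p, u64 enq_flags)
        {
		/* cgroup from the CPU controller's POV, with a ref acquired */
		struct cgroup *cgrp = scx_bpf_task_cgroup(p);

		scx_bpf_dispatch(p, cgrp->kn->id, SCX_SLICE_DFL, enq_flags);
		bpf_cgroup_release(cgrp);
        }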

v2: - Build fixes for different CONFIG combinations.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: David Vernet <dvernet@meta.com>
Acked-by: Josh Don <joshdon@google.com>
Acked-by: Hao Luo <haoluo@google.com>
Acked-by: Barret Rhoden <brho@google.com>
Reported-by: kernel test robot <lkp@intel.com>
Cc: Andrea Righi <andrea.righi@canonical.com>
htejun committed Sep 4, 2024
1 parent e179e80 commit 8195136
Showing 8 changed files with 636 additions and 19 deletions.
3 changes: 3 additions & 0 deletions include/linux/sched/ext.h
@@ -188,6 +188,9 @@ struct sched_ext_entity {
 	bool			disallow;	/* reject switching into SCX */

 	/* cold fields */
+#ifdef CONFIG_EXT_GROUP_SCHED
+	struct cgroup		*cgrp_moving_from;
+#endif
 	/* must be the last field, see init_scx_entity() */
 	struct list_head	tasks_node;
 };
6 changes: 6 additions & 0 deletions init/Kconfig
@@ -1055,6 +1055,12 @@ config RT_GROUP_SCHED
 	  realtime bandwidth for them.
 	  See Documentation/scheduler/sched-rt-group.rst for more information.

+config EXT_GROUP_SCHED
+	bool
+	depends on SCHED_CLASS_EXT && CGROUP_SCHED
+	select GROUP_SCHED_WEIGHT
+	default y
+
 endif #CGROUP_SCHED

 config SCHED_MM_CID
67 changes: 57 additions & 10 deletions kernel/sched/core.c
@@ -8364,6 +8364,9 @@ void __init sched_init(void)
 		root_task_group.shares = ROOT_TASK_GROUP_LOAD;
 		init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL);
 #endif /* CONFIG_FAIR_GROUP_SCHED */
+#ifdef CONFIG_EXT_GROUP_SCHED
+		root_task_group.scx_weight = CGROUP_WEIGHT_DFL;
+#endif /* CONFIG_EXT_GROUP_SCHED */
 #ifdef CONFIG_RT_GROUP_SCHED
 		root_task_group.rt_se = (struct sched_rt_entity **)ptr;
 		ptr += nr_cpu_ids * sizeof(void **);
@@ -8801,6 +8804,7 @@ struct task_group *sched_create_group(struct task_group *parent)
 	if (!alloc_rt_sched_group(tg, parent))
 		goto err;

+	scx_group_set_weight(tg, CGROUP_WEIGHT_DFL);
 	alloc_uclamp_sched_group(tg, parent);

 	return tg;
@@ -8928,6 +8932,7 @@ void sched_move_task(struct task_struct *tsk)
 		put_prev_task(rq, tsk);

 	sched_change_group(tsk, group);
+	scx_move_task(tsk);

 	if (queued)
 		enqueue_task(rq, tsk, queue_flags);
@@ -8965,6 +8970,11 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
 {
 	struct task_group *tg = css_tg(css);
 	struct task_group *parent = css_tg(css->parent);
+	int ret;
+
+	ret = scx_tg_online(tg);
+	if (ret)
+		return ret;

 	if (parent)
 		sched_online_group(tg, parent);
@@ -8979,6 +8989,13 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
 	return 0;
 }

+static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
+{
+	struct task_group *tg = css_tg(css);
+
+	scx_tg_offline(tg);
+}
+
 static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
 {
 	struct task_group *tg = css_tg(css);
@@ -8996,19 +9013,19 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
 	sched_unregister_group(tg);
 }

-#ifdef CONFIG_RT_GROUP_SCHED
 static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
 {
+#ifdef CONFIG_RT_GROUP_SCHED
 	struct task_struct *task;
 	struct cgroup_subsys_state *css;

 	cgroup_taskset_for_each(task, css, tset) {
 		if (!sched_rt_can_attach(css_tg(css), task))
 			return -EINVAL;
 	}
-	return 0;
-}
 #endif
+	return scx_cgroup_can_attach(tset);
+}

 static void cpu_cgroup_attach(struct cgroup_taskset *tset)
 {
@@ -9017,6 +9034,13 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)

 	cgroup_taskset_for_each(task, css, tset)
 		sched_move_task(task);
+
+	scx_cgroup_finish_attach();
+}
+
+static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset)
+{
+	scx_cgroup_cancel_attach(tset);
 }

 #ifdef CONFIG_UCLAMP_TASK_GROUP
@@ -9196,15 +9220,25 @@ static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
 #ifdef CONFIG_GROUP_SCHED_WEIGHT
 static unsigned long tg_weight(struct task_group *tg)
 {
+#ifdef CONFIG_FAIR_GROUP_SCHED
 	return scale_load_down(tg->shares);
+#else
+	return sched_weight_from_cgroup(tg->scx_weight);
+#endif
 }

 static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
 				struct cftype *cftype, u64 shareval)
 {
+	int ret;
+
 	if (shareval > scale_load_down(ULONG_MAX))
 		shareval = MAX_SHARES;
-	return sched_group_set_shares(css_tg(css), scale_load(shareval));
+	ret = sched_group_set_shares(css_tg(css), scale_load(shareval));
+	if (!ret)
+		scx_group_set_weight(css_tg(css),
+				     sched_weight_to_cgroup(shareval));
+	return ret;
 }

 static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
@@ -9595,7 +9629,12 @@ static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
 static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
 			      struct cftype *cft, s64 idle)
 {
-	return sched_group_set_idle(css_tg(css), idle);
+	int ret;
+
+	ret = sched_group_set_idle(css_tg(css), idle);
+	if (!ret)
+		scx_group_set_idle(css_tg(css), idle);
+	return ret;
 }
 #endif

@@ -9722,13 +9761,17 @@ static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
 				struct cftype *cft, u64 cgrp_weight)
 {
 	unsigned long weight;
+	int ret;

 	if (cgrp_weight < CGROUP_WEIGHT_MIN || cgrp_weight > CGROUP_WEIGHT_MAX)
 		return -ERANGE;

 	weight = sched_weight_from_cgroup(cgrp_weight);

-	return sched_group_set_shares(css_tg(css), scale_load(weight));
+	ret = sched_group_set_shares(css_tg(css), scale_load(weight));
+	if (!ret)
+		scx_group_set_weight(css_tg(css), cgrp_weight);
+	return ret;
 }

 static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
@@ -9753,7 +9796,7 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
 				     struct cftype *cft, s64 nice)
 {
 	unsigned long weight;
-	int idx;
+	int idx, ret;

 	if (nice < MIN_NICE || nice > MAX_NICE)
 		return -ERANGE;
@@ -9762,7 +9805,11 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
 	idx = array_index_nospec(idx, 40);
 	weight = sched_prio_to_weight[idx];

-	return sched_group_set_shares(css_tg(css), scale_load(weight));
+	ret = sched_group_set_shares(css_tg(css), scale_load(weight));
+	if (!ret)
+		scx_group_set_weight(css_tg(css),
+				     sched_weight_to_cgroup(weight));
+	return ret;
 }
 #endif /* CONFIG_GROUP_SCHED_WEIGHT */

@@ -9878,14 +9925,14 @@ static struct cftype cpu_files[] = {
 struct cgroup_subsys cpu_cgrp_subsys = {
 	.css_alloc	= cpu_cgroup_css_alloc,
 	.css_online	= cpu_cgroup_css_online,
+	.css_offline	= cpu_cgroup_css_offline,
 	.css_released	= cpu_cgroup_css_released,
 	.css_free	= cpu_cgroup_css_free,
 	.css_extra_stat_show	= cpu_extra_stat_show,
 	.css_local_stat_show	= cpu_local_stat_show,
-#ifdef CONFIG_RT_GROUP_SCHED
 	.can_attach	= cpu_cgroup_can_attach,
-#endif
 	.attach		= cpu_cgroup_attach,
+	.cancel_attach	= cpu_cgroup_cancel_attach,
 	.legacy_cftypes	= cpu_legacy_files,
 	.dfl_cftypes	= cpu_files,
 	.early_init	= true,
(Diffs for the remaining 5 changed files are not shown here.)
