
Commit 041cd64

cgroup: Implement cgroup2 basic CPU usage accounting
In cgroup1, while cpuacct isn't actually controlling any resources, it is a separate controller due to a combination of two factors: 1. enabling the cpu controller has significant side effects, and 2. we have to pick one of the hierarchies to account CPU usages on. The cpuacct controller is effectively used to designate a hierarchy to track CPU usages on.

cgroup2's unified hierarchy removes the second reason, so we can account basic CPU usages by default. While we could use cpuacct for this purpose, both its interface and implementation leave a lot to be desired: it collects and exposes two sources of truth which don't agree with each other, and some of the exposed statistics don't make much sense. Also, it propagates all the way up the hierarchy on each accounting event, which is unnecessary.

This patch adds a basic resource accounting mechanism to cgroup2's unified hierarchy and accounts CPU usages using it.

* All accounting is done per-cpu and doesn't propagate immediately. An update just bumps the per-cgroup per-cpu counters and links the cgroup to the parent's updated list if it isn't already on it.

* On a read, the per-cpu counters are collected into the global ones and then propagated upwards. Only the per-cpu counters which have changed since the last read are propagated.

* CPU usage stats are collected and shown in "cgroup.stat" with a "cpu." prefix. Total usage is collected from scheduling events. The user/sys breakdown is sourced from tick sampling and adjusted to the total usage using cputime_adjust().

This keeps the accounting-side hot path O(1) and per-cpu, and the read side O(nr_updated_since_last_read).

v2: Minor changes and documentation updates as suggested by Waiman and Roman.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Roman Gushchin <guro@fb.com>
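For illustration, with this patch a read of "cgroup.stat" on a non-root cgroup gains three "cpu."-prefixed keys next to the existing descendant counts. The values below are invented; note that user_usec and system_usec sum to usage_usec, since cputime_adjust() scales the tick-sampled split to match the precise total:

    nr_descendants 2
    nr_dying_descendants 0
    cpu.usage_usec 1234567
    cpu.user_usec 1000042
    cpu.system_usec 234525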
Parent: d2cc5ed

File tree: 7 files changed, +453 −3 lines (the seventh file, the new kernel/cgroup/stat.c, is not expanded below)

Documentation/cgroup-v2.txt

Lines changed: 9 additions & 0 deletions

@@ -886,6 +886,15 @@ All cgroup core files are prefixed with "cgroup."
 	  A dying cgroup can consume system resources not exceeding
 	  limits, which were active at the moment of cgroup deletion.
 
+	  cpu.usage_usec
+		CPU time consumed in the subtree.
+
+	  cpu.user_usec
+		User CPU time consumed in the subtree.
+
+	  cpu.system_usec
+		System CPU time consumed in the subtree.
+
 
 Controllers
 ===========

include/linux/cgroup-defs.h

Lines changed: 57 additions & 0 deletions

@@ -16,6 +16,7 @@
 #include <linux/refcount.h>
 #include <linux/percpu-refcount.h>
 #include <linux/percpu-rwsem.h>
+#include <linux/u64_stats_sync.h>
 #include <linux/workqueue.h>
 #include <linux/bpf-cgroup.h>
 
@@ -254,6 +255,57 @@ struct css_set {
 	struct rcu_head rcu_head;
 };
 
+/*
+ * cgroup basic resource usage statistics.  Accounting is done per-cpu in
+ * cgroup_cpu_stat which is then lazily propagated up the hierarchy on
+ * reads.
+ *
+ * When a stat gets updated, the cgroup_cpu_stat and its ancestors are
+ * linked into the updated tree.  On the following read, propagation only
+ * considers and consumes the updated tree.  This makes reading O(the
+ * number of descendants which have been active since last read) instead
+ * of O(the total number of descendants).
+ *
+ * This is important because there can be a lot of (draining) cgroups which
+ * aren't active and stat may be read frequently.  The combination can
+ * become very expensive.  By propagating selectively, increasing reading
+ * frequency decreases the cost of each read.
+ */
+struct cgroup_cpu_stat {
+	/*
+	 * ->sync protects all the current counters.  These are the only
+	 * fields which get updated in the hot path.
+	 */
+	struct u64_stats_sync sync;
+	struct task_cputime cputime;
+
+	/*
+	 * Snapshots at the last reading.  These are used to calculate the
+	 * deltas to propagate to the global counters.
+	 */
+	struct task_cputime last_cputime;
+
+	/*
+	 * Child cgroups with stat updates on this cpu since the last read
+	 * are linked on the parent's ->updated_children through
+	 * ->updated_next.
+	 *
+	 * In addition to being more compact, the singly-linked list
+	 * pointing to the cgroup makes it unnecessary for each per-cpu
+	 * struct to point back to the associated cgroup.
+	 *
+	 * Protected by per-cpu cgroup_cpu_stat_lock.
+	 */
+	struct cgroup *updated_children;	/* terminated by self cgroup */
+	struct cgroup *updated_next;		/* NULL iff not on the list */
+};
+
+struct cgroup_stat {
+	/* per-cpu statistics are collected into the following global counters */
+	struct task_cputime cputime;
+	struct prev_cputime prev_cputime;
+};
+
 struct cgroup {
 	/* self css with NULL ->ss, points back to this cgroup */
 	struct cgroup_subsys_state self;
@@ -353,6 +405,11 @@ struct cgroup {
 	 */
 	struct cgroup *dom_cgrp;
 
+	/* cgroup basic resource statistics */
+	struct cgroup_cpu_stat __percpu *cpu_stat;
+	struct cgroup_stat pending_stat;	/* pending from children */
+	struct cgroup_stat stat;
+
 	/*
 	 * list of pidlists, up to two for each namespace (one for procs, one
 	 * for tasks); created on demand.
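To make the updated-tree linking above concrete, here is a minimal user-space model: simplified stand-in types, one instance per cgroup rather than per cgroup per cpu, and no locking. The real logic lives in the new kernel/cgroup/stat.c, whose diff is not shown on this page.

/* user-space model of the "updated tree", not the kernel code */
#include <stdio.h>

struct cgroup {
	struct cgroup *parent;
	const char *name;
	struct cgroup *updated_children;  /* empty list: points to self */
	struct cgroup *updated_next;      /* NULL iff not on a list */
};

/* Link @cgrp and its ancestors onto their parents' updated lists,
 * stopping at the first ancestor which is already linked.  Repeated
 * updates hit the early break, keeping the hot path O(1). */
static void model_stat_updated(struct cgroup *cgrp)
{
	for (; cgrp->parent; cgrp = cgrp->parent) {
		if (cgrp->updated_next)   /* already on the list */
			break;
		cgrp->updated_next = cgrp->parent->updated_children;
		cgrp->parent->updated_children = cgrp;
	}
}

int main(void)
{
	struct cgroup root = { .name = "root" };
	struct cgroup a = { .parent = &root, .name = "a" };
	struct cgroup b = { .parent = &a, .name = "b" };

	root.updated_children = &root;    /* self-terminated empty lists */
	a.updated_children = &a;
	b.updated_children = &b;

	model_stat_updated(&b);   /* links b under a, then a under root */
	model_stat_updated(&b);   /* no-op: b is already linked */

	printf("%s\n", root.updated_children->name);   /* "a" */
	printf("%s\n", a.updated_children->name);      /* "b" */
	return 0;
}

A read-side flush starts from the root's updated_children, consumes each linked cgroup's delta, resets updated_next back to NULL, and never visits cgroups which stayed quiet since the last read.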

include/linux/cgroup.h

Lines changed: 22 additions & 0 deletions

@@ -703,17 +703,39 @@ static inline void cpuacct_account_field(struct task_struct *tsk, int index,
 					 u64 val) {}
 #endif
 
+void cgroup_stat_show_cputime(struct seq_file *seq, const char *prefix);
+
+void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec);
+void __cgroup_account_cputime_field(struct cgroup *cgrp,
+				    enum cpu_usage_stat index, u64 delta_exec);
+
 static inline void cgroup_account_cputime(struct task_struct *task,
 					  u64 delta_exec)
 {
+	struct cgroup *cgrp;
+
 	cpuacct_charge(task, delta_exec);
+
+	rcu_read_lock();
+	cgrp = task_dfl_cgroup(task);
+	if (cgroup_parent(cgrp))
+		__cgroup_account_cputime(cgrp, delta_exec);
+	rcu_read_unlock();
 }
 
 static inline void cgroup_account_cputime_field(struct task_struct *task,
 						enum cpu_usage_stat index,
 						u64 delta_exec)
 {
+	struct cgroup *cgrp;
+
 	cpuacct_account_field(task, index, delta_exec);
+
+	rcu_read_lock();
+	cgrp = task_dfl_cgroup(task);
+	if (cgroup_parent(cgrp))
+		__cgroup_account_cputime_field(cgrp, index, delta_exec);
+	rcu_read_unlock();
 }
 
 #else	/* CONFIG_CGROUPS */
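These inline wrappers predate this patch; what is new are the RCU-protected calls into __cgroup_account_cputime*() for the default hierarchy (the cgroup_parent() check skips the root, to which nothing is accounted). The bodies are defined in the new stat.c, not shown on this page. Given the ->sync field from cgroup-defs.h, a plausible kernel-style sketch of the hot path, with cgroup_cpu_stat_updated() as an assumed helper doing the updated-tree linking, is:

/* sketch only: the committed stat.c may differ in detail */
void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
	struct cgroup_cpu_stat *cstat = this_cpu_ptr(cgrp->cpu_stat);

	u64_stats_update_begin(&cstat->sync);
	cstat->cputime.sum_exec_runtime += delta_exec;
	u64_stats_update_end(&cstat->sync);

	/* assumed helper: links cgrp onto the per-cpu updated tree */
	cgroup_cpu_stat_updated(cgrp, smp_processor_id());
}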

kernel/cgroup/Makefile

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-obj-y := cgroup.o namespace.o cgroup-v1.o
+obj-y := cgroup.o stat.o namespace.o cgroup-v1.o
 
 obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
 obj-$(CONFIG_CGROUP_PIDS) += pids.o

kernel/cgroup/cgroup-internal.h

Lines changed: 8 additions & 0 deletions

@@ -199,6 +199,14 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
 
 int cgroup_task_count(const struct cgroup *cgrp);
 
+/*
+ * stat.c
+ */
+void cgroup_stat_flush(struct cgroup *cgrp);
+int cgroup_stat_init(struct cgroup *cgrp);
+void cgroup_stat_exit(struct cgroup *cgrp);
+void cgroup_stat_boot(void);
+
 /*
  * namespace.c
  */
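Only the declarations appear here; the definitions are in the new stat.c. As a hedged sketch of the setup side, cgroup_stat_init() presumably allocates the per-cpu array (the root's is allocated statically, see cgroup.c below) and primes the self-terminated updated_children lists, roughly:

/* sketch under the structures in cgroup-defs.h; details may differ */
int cgroup_stat_init(struct cgroup *cgrp)
{
	int cpu;

	/* the root cgroup's cpu_stat is preallocated statically */
	if (!cgrp->cpu_stat) {
		cgrp->cpu_stat = alloc_percpu(struct cgroup_cpu_stat);
		if (!cgrp->cpu_stat)
			return -ENOMEM;
	}

	for_each_possible_cpu(cpu) {
		struct cgroup_cpu_stat *cstat =
			per_cpu_ptr(cgrp->cpu_stat, cpu);

		u64_stats_init(&cstat->sync);
		/* an empty updated list is terminated by self */
		cstat->updated_children = cgrp;
	}

	prev_cputime_init(&cgrp->stat.prev_cputime);
	return 0;
}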

kernel/cgroup/cgroup.c

Lines changed: 22 additions & 2 deletions

@@ -142,12 +142,14 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
 };
 #undef SUBSYS
 
+static DEFINE_PER_CPU(struct cgroup_cpu_stat, cgrp_dfl_root_cpu_stat);
+
 /*
  * The default hierarchy, reserved for the subsystems that are otherwise
  * unattached - it never has more than a single cgroup, and all tasks are
  * part of that cgroup.
  */
-struct cgroup_root cgrp_dfl_root;
+struct cgroup_root cgrp_dfl_root = { .cgrp.cpu_stat = &cgrp_dfl_root_cpu_stat };
 EXPORT_SYMBOL_GPL(cgrp_dfl_root);
 
 /*
@@ -3301,6 +3303,8 @@ static int cgroup_stat_show(struct seq_file *seq, void *v)
 	seq_printf(seq, "nr_dying_descendants %d\n",
 		   cgroup->nr_dying_descendants);
 
+	cgroup_stat_show_cputime(seq, "cpu.");
+
 	return 0;
 }
 
@@ -4471,6 +4475,8 @@ static void css_free_work_fn(struct work_struct *work)
 		 */
 		cgroup_put(cgroup_parent(cgrp));
 		kernfs_put(cgrp->kn);
+		if (cgroup_on_dfl(cgrp))
+			cgroup_stat_exit(cgrp);
 		kfree(cgrp);
 	} else {
 		/*
@@ -4515,6 +4521,9 @@ static void css_release_work_fn(struct work_struct *work)
 		/* cgroup release path */
 		trace_cgroup_release(cgrp);
 
+		if (cgroup_on_dfl(cgrp))
+			cgroup_stat_flush(cgrp);
+
 		for (tcgrp = cgroup_parent(cgrp); tcgrp;
 		     tcgrp = cgroup_parent(tcgrp))
 			tcgrp->nr_dying_descendants--;
@@ -4698,14 +4707,20 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 	if (ret)
 		goto out_free_cgrp;
 
+	if (cgroup_on_dfl(parent)) {
+		ret = cgroup_stat_init(cgrp);
+		if (ret)
+			goto out_cancel_ref;
+	}
+
 	/*
 	 * Temporarily set the pointer to NULL, so idr_find() won't return
	 * a half-baked cgroup.
 	 */
 	cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
 	if (cgrp->id < 0) {
 		ret = -ENOMEM;
-		goto out_cancel_ref;
+		goto out_stat_exit;
 	}
 
 	init_cgroup_housekeeping(cgrp);
@@ -4754,6 +4769,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 
 	return cgrp;
 
+out_stat_exit:
+	if (cgroup_on_dfl(parent))
+		cgroup_stat_exit(cgrp);
 out_cancel_ref:
 	percpu_ref_exit(&cgrp->self.refcnt);
 out_free_cgrp:
@@ -5148,6 +5166,8 @@ int __init cgroup_init(void)
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
 
+	cgroup_stat_boot();
+
 	/*
 	 * The latency of the synchronize_sched() is too high for cgroups,
 	 * avoid it at the cost of forcing all readers into the slow path.
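cgroup_stat_show_cputime(), called from cgroup_stat_show() above, is likewise part of the unshown stat.c. Working from the commit message (total usage from scheduling events, user/sys split adjusted with cputime_adjust()), a plausible shape is:

/* sketch: flush pending per-cpu deltas, adjust the tick-sampled
 * user/sys split against the precise total, report in microseconds */
void cgroup_stat_show_cputime(struct seq_file *seq, const char *prefix)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;
	u64 usage, utime, stime;

	if (!cgroup_parent(cgrp))
		return;		/* nothing is accounted to the root */

	cgroup_stat_flush(cgrp);

	usage = cgrp->stat.cputime.sum_exec_runtime;
	cputime_adjust(&cgrp->stat.cputime, &cgrp->stat.prev_cputime,
		       &utime, &stime);

	do_div(usage, NSEC_PER_USEC);
	do_div(utime, NSEC_PER_USEC);
	do_div(stime, NSEC_PER_USEC);

	seq_printf(seq, "%susage_usec %llu\n", prefix, usage);
	seq_printf(seq, "%suser_usec %llu\n", prefix, utime);
	seq_printf(seq, "%ssystem_usec %llu\n", prefix, stime);
}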
