Skip to content

Commit ef60b8f

Browse files
namhyungacmel
authored andcommitted
perf trace: Support --summary-mode=cgroup
Add a new summary mode to collect stats for each cgroup. $ sudo ./perf trace -as --bpf-summary --summary-mode=cgroup -- sleep 1 Summary of events: cgroup /user.slice/user-657345.slice/user@657345.service/session.slice/org.gnome.Shell@x11.service, 535 events syscall calls errors total min avg max stddev (msec) (msec) (msec) (msec) (%) --------------- -------- ------ -------- --------- --------- --------- ------ ppoll 15 0 373.600 0.004 24.907 197.491 55.26% poll 15 0 1.325 0.001 0.088 0.369 38.76% close 66 0 0.567 0.007 0.009 0.026 3.55% write 150 0 0.471 0.001 0.003 0.010 3.29% recvmsg 94 83 0.290 0.000 0.003 0.037 16.39% ioctl 26 0 0.237 0.001 0.009 0.096 50.13% timerfd_create 66 0 0.236 0.003 0.004 0.024 8.92% timerfd_settime 70 0 0.160 0.001 0.002 0.012 7.66% writev 10 0 0.118 0.001 0.012 0.019 18.17% read 9 0 0.021 0.001 0.002 0.004 14.07% getpid 14 0 0.019 0.000 0.001 0.004 20.28% cgroup /system.slice/polkit.service, 94 events syscall calls errors total min avg max stddev (msec) (msec) (msec) (msec) (%) --------------- -------- ------ -------- --------- --------- --------- ------ ppoll 22 0 19.811 0.000 0.900 9.273 63.88% write 30 0 0.040 0.001 0.001 0.003 12.09% recvmsg 12 0 0.018 0.001 0.002 0.006 28.15% read 18 0 0.013 0.000 0.001 0.003 21.99% poll 12 0 0.006 0.000 0.001 0.001 4.48% cgroup /user.slice/user-657345.slice/user@657345.service/app.slice/app-org.gnome.Terminal.slice/gnome-terminal-server.service, 21 events syscall calls errors total min avg max stddev (msec) (msec) (msec) (msec) (%) --------------- -------- ------ -------- --------- --------- --------- ------ ppoll 4 0 17.476 0.003 4.369 13.298 69.65% recvmsg 15 12 0.068 0.002 0.005 0.014 26.53% writev 1 0 0.033 0.033 0.033 0.033 0.00% poll 1 0 0.005 0.005 0.005 0.005 0.00% ... It works only for --bpf-summary for now. 
Signed-off-by: Namhyung Kim <namhyung@kernel.org> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Howard Chu <howardchu95@gmail.com> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Jiri Olsa <jolsa@kernel.org> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Song Liu <song@kernel.org> Link: https://lore.kernel.org/r/20250501225337.928470-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
1 parent 39922dc commit ef60b8f

File tree

6 files changed

+170
-12
lines changed

6 files changed

+170
-12
lines changed

tools/perf/Documentation/perf-trace.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,8 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
152152

153153
--summary-mode=mode::
154154
To be used with -s or -S, to select how to show summary. By default it'll
155-
show the syscall summary by thread. Possible values are: thread, total.
155+
show the syscall summary by thread. Possible values are: thread, total,
156+
cgroup. The cgroup mode works only with --bpf-summary.
156157

157158
--tool_stats::
158159
Show tool stats such as number of times fd->pathname was discovered thru

tools/perf/builtin-trace.c

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5302,6 +5302,8 @@ static int trace__parse_summary_mode(const struct option *opt, const char *str,
53025302
trace->summary_mode = SUMMARY__BY_THREAD;
53035303
} else if (!strcmp(str, "total")) {
53045304
trace->summary_mode = SUMMARY__BY_TOTAL;
5305+
} else if (!strcmp(str, "cgroup")) {
5306+
trace->summary_mode = SUMMARY__BY_CGROUP;
53055307
} else {
53065308
pr_err("Unknown summary mode: %s\n", str);
53075309
return -1;
@@ -5461,7 +5463,7 @@ int cmd_trace(int argc, const char **argv)
54615463
OPT_BOOLEAN(0, "errno-summary", &trace.errno_summary,
54625464
"Show errno stats per syscall, use with -s or -S"),
54635465
OPT_CALLBACK(0, "summary-mode", &trace, "mode",
5464-
"How to show summary: select thread (default) or total",
5466+
"How to show summary: select thread (default), total or cgroup",
54655467
trace__parse_summary_mode),
54665468
OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
54675469
"Trace pagefaults", parse_pagefaults, "maj"),
@@ -5775,6 +5777,12 @@ int cmd_trace(int argc, const char **argv)
57755777
symbol_conf.keep_exited_threads = true;
57765778
if (trace.summary_mode == SUMMARY__NONE)
57775779
trace.summary_mode = SUMMARY__BY_THREAD;
5780+
5781+
if (!trace.summary_bpf && trace.summary_mode == SUMMARY__BY_CGROUP) {
5782+
pr_err("Error: --summary-mode=cgroup only works with --bpf-summary\n");
5783+
err = -EINVAL;
5784+
goto out;
5785+
}
57785786
}
57795787

57805788
if (output_name != NULL) {

tools/perf/util/bpf-trace-summary.c

Lines changed: 117 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,12 @@
66

77
#include "dwarf-regs.h" /* for EM_HOST */
88
#include "syscalltbl.h"
9+
#include "util/cgroup.h"
910
#include "util/hashmap.h"
1011
#include "util/trace.h"
1112
#include "util/util.h"
1213
#include <bpf/bpf.h>
14+
#include <linux/rbtree.h>
1315
#include <linux/time64.h>
1416
#include <tools/libc_compat.h> /* reallocarray */
1517

@@ -18,6 +20,7 @@
1820

1921

2022
static struct syscall_summary_bpf *skel;
23+
static struct rb_root cgroups = RB_ROOT;
2124

2225
int trace_prepare_bpf_summary(enum trace_summary_mode mode)
2326
{
@@ -29,9 +32,14 @@ int trace_prepare_bpf_summary(enum trace_summary_mode mode)
2932

3033
if (mode == SUMMARY__BY_THREAD)
3134
skel->rodata->aggr_mode = SYSCALL_AGGR_THREAD;
35+
else if (mode == SUMMARY__BY_CGROUP)
36+
skel->rodata->aggr_mode = SYSCALL_AGGR_CGROUP;
3237
else
3338
skel->rodata->aggr_mode = SYSCALL_AGGR_CPU;
3439

40+
if (cgroup_is_v2("perf_event") > 0)
41+
skel->rodata->use_cgroup_v2 = 1;
42+
3543
if (syscall_summary_bpf__load(skel) < 0) {
3644
fprintf(stderr, "failed to load syscall summary bpf skeleton\n");
3745
return -1;
@@ -42,6 +50,9 @@ int trace_prepare_bpf_summary(enum trace_summary_mode mode)
4250
return -1;
4351
}
4452

53+
if (mode == SUMMARY__BY_CGROUP)
54+
read_all_cgroups(&cgroups);
55+
4556
return 0;
4657
}
4758

@@ -88,9 +99,13 @@ static double rel_stddev(struct syscall_stats *stat)
8899
* per-cpu analysis so it's keyed by the syscall number to combine stats
89100
* from different CPUs. And syscall_data always has a syscall_node so
90101
* it can effectively work as flat hierarchy.
102+
*
103+
* For per-cgroup stats, it uses a two-level data structure like the
104+
* per-thread case: syscall_data is keyed by the cgroup id and has an
105+
* array of nodes, each of which represents one syscall for the cgroup.
91106
*/
92107
struct syscall_data {
93-
int key; /* tid if AGGR_THREAD, syscall-nr if AGGR_CPU */
108+
u64 key; /* tid if AGGR_THREAD, syscall-nr if AGGR_CPU, cgroup if AGGR_CGROUP */
94109
int nr_events;
95110
int nr_nodes;
96111
u64 total_time;
@@ -191,7 +206,7 @@ static int print_thread_stat(struct syscall_data *data, FILE *fp)
191206

192207
qsort(data->nodes, data->nr_nodes, sizeof(*data->nodes), nodecmp);
193208

194-
printed += fprintf(fp, " thread (%d), ", data->key);
209+
printed += fprintf(fp, " thread (%d), ", (int)data->key);
195210
printed += fprintf(fp, "%d events\n\n", data->nr_events);
196211

197212
printed += fprintf(fp, " syscall calls errors total min avg max stddev\n");
@@ -283,6 +298,75 @@ static int print_total_stats(struct syscall_data **data, int nr_data, FILE *fp)
283298
return printed;
284299
}
285300

301+
static int update_cgroup_stats(struct hashmap *hash, struct syscall_key *map_key,
302+
struct syscall_stats *map_data)
303+
{
304+
struct syscall_data *data;
305+
struct syscall_node *nodes;
306+
307+
if (!hashmap__find(hash, map_key->cgroup, &data)) {
308+
data = zalloc(sizeof(*data));
309+
if (data == NULL)
310+
return -ENOMEM;
311+
312+
data->key = map_key->cgroup;
313+
if (hashmap__add(hash, data->key, data) < 0) {
314+
free(data);
315+
return -ENOMEM;
316+
}
317+
}
318+
319+
/* update thread total stats */
320+
data->nr_events += map_data->count;
321+
data->total_time += map_data->total_time;
322+
323+
nodes = reallocarray(data->nodes, data->nr_nodes + 1, sizeof(*nodes));
324+
if (nodes == NULL)
325+
return -ENOMEM;
326+
327+
data->nodes = nodes;
328+
nodes = &data->nodes[data->nr_nodes++];
329+
nodes->syscall_nr = map_key->nr;
330+
331+
/* each thread has an entry for each syscall, just use the stat */
332+
memcpy(&nodes->stats, map_data, sizeof(*map_data));
333+
return 0;
334+
}
335+
336+
static int print_cgroup_stat(struct syscall_data *data, FILE *fp)
337+
{
338+
int printed = 0;
339+
struct cgroup *cgrp = __cgroup__find(&cgroups, data->key);
340+
341+
qsort(data->nodes, data->nr_nodes, sizeof(*data->nodes), nodecmp);
342+
343+
if (cgrp)
344+
printed += fprintf(fp, " cgroup %s,", cgrp->name);
345+
else
346+
printed += fprintf(fp, " cgroup id:%lu,", (unsigned long)data->key);
347+
348+
printed += fprintf(fp, " %d events\n\n", data->nr_events);
349+
350+
printed += fprintf(fp, " syscall calls errors total min avg max stddev\n");
351+
printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n");
352+
printed += fprintf(fp, " --------------- -------- ------ -------- --------- --------- --------- ------\n");
353+
354+
printed += print_common_stats(data, fp);
355+
printed += fprintf(fp, "\n\n");
356+
357+
return printed;
358+
}
359+
360+
/*
 * Print the summary of every cgroup in @data, in array order.
 *
 * Returns the accumulated character count reported by print_cgroup_stat().
 */
static int print_cgroup_stats(struct syscall_data **data, int nr_data, FILE *fp)
{
	int total = 0;
	int idx = 0;

	while (idx < nr_data) {
		total += print_cgroup_stat(data[idx], fp);
		idx++;
	}

	return total;
}
369+
286370
int trace_print_bpf_summary(FILE *fp)
287371
{
288372
struct bpf_map *map = skel->maps.syscall_stats_map;
@@ -305,10 +389,19 @@ int trace_print_bpf_summary(FILE *fp)
305389
struct syscall_stats stat;
306390

307391
if (!bpf_map__lookup_elem(map, &key, sizeof(key), &stat, sizeof(stat), 0)) {
308-
if (skel->rodata->aggr_mode == SYSCALL_AGGR_THREAD)
392+
switch (skel->rodata->aggr_mode) {
393+
case SYSCALL_AGGR_THREAD:
309394
update_thread_stats(&schash, &key, &stat);
310-
else
395+
break;
396+
case SYSCALL_AGGR_CPU:
311397
update_total_stats(&schash, &key, &stat);
398+
break;
399+
case SYSCALL_AGGR_CGROUP:
400+
update_cgroup_stats(&schash, &key, &stat);
401+
break;
402+
default:
403+
break;
404+
}
312405
}
313406

314407
prev_key = &key;
@@ -325,10 +418,19 @@ int trace_print_bpf_summary(FILE *fp)
325418

326419
qsort(data, nr_data, sizeof(*data), datacmp);
327420

328-
if (skel->rodata->aggr_mode == SYSCALL_AGGR_THREAD)
421+
switch (skel->rodata->aggr_mode) {
422+
case SYSCALL_AGGR_THREAD:
329423
printed += print_thread_stats(data, nr_data, fp);
330-
else
424+
break;
425+
case SYSCALL_AGGR_CPU:
331426
printed += print_total_stats(data, nr_data, fp);
427+
break;
428+
case SYSCALL_AGGR_CGROUP:
429+
printed += print_cgroup_stats(data, nr_data, fp);
430+
break;
431+
default:
432+
break;
433+
}
332434

333435
for (i = 0; i < nr_data && data; i++) {
334436
free(data[i]->nodes);
@@ -343,5 +445,14 @@ int trace_print_bpf_summary(FILE *fp)
343445

344446
void trace_cleanup_bpf_summary(void)
345447
{
448+
if (!RB_EMPTY_ROOT(&cgroups)) {
449+
struct cgroup *cgrp, *tmp;
450+
451+
rbtree_postorder_for_each_entry_safe(cgrp, tmp, &cgroups, node)
452+
cgroup__put(cgrp);
453+
454+
cgroups = RB_ROOT;
455+
}
456+
346457
syscall_summary_bpf__destroy(skel);
347458
}

tools/perf/util/bpf_skel/syscall_summary.bpf.c

Lines changed: 39 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
#include <bpf/bpf_helpers.h>
1010
#include <bpf/bpf_tracing.h>
11+
#include <bpf/bpf_core_read.h>
1112

1213
/* This is to calculate a delta between sys-enter and sys-exit for each thread */
1314
struct syscall_trace {
@@ -35,10 +36,41 @@ struct syscall_stats_map {
3536
int enabled; /* controlled from userspace */
3637

3738
const volatile enum syscall_aggr_mode aggr_mode;
39+
const volatile int use_cgroup_v2;
3840

39-
static void update_stats(int cpu_or_tid, int nr, s64 duration, long ret)
41+
int perf_subsys_id = -1;
42+
43+
static inline __u64 get_current_cgroup_id(void)
44+
{
45+
struct task_struct *task;
46+
struct cgroup *cgrp;
47+
48+
if (use_cgroup_v2)
49+
return bpf_get_current_cgroup_id();
50+
51+
task = bpf_get_current_task_btf();
52+
53+
if (perf_subsys_id == -1) {
54+
#if __has_builtin(__builtin_preserve_enum_value)
55+
perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
56+
perf_event_cgrp_id);
57+
#else
58+
perf_subsys_id = perf_event_cgrp_id;
59+
#endif
60+
}
61+
62+
cgrp = BPF_CORE_READ(task, cgroups, subsys[perf_subsys_id], cgroup);
63+
return BPF_CORE_READ(cgrp, kn, id);
64+
}
65+
66+
static void update_stats(int cpu_or_tid, u64 cgroup_id, int nr, s64 duration,
67+
long ret)
4068
{
41-
struct syscall_key key = { .cpu_or_tid = cpu_or_tid, .nr = nr, };
69+
struct syscall_key key = {
70+
.cpu_or_tid = cpu_or_tid,
71+
.cgroup = cgroup_id,
72+
.nr = nr,
73+
};
4274
struct syscall_stats *stats;
4375

4476
stats = bpf_map_lookup_elem(&syscall_stats_map, &key);
@@ -90,7 +122,8 @@ SEC("tp_btf/sys_exit")
90122
int sys_exit(u64 *ctx)
91123
{
92124
int tid;
93-
int key;
125+
int key = 0;
126+
u64 cgroup = 0;
94127
long ret = ctx[1]; /* return value of the syscall */
95128
struct syscall_trace *st;
96129
s64 delta;
@@ -105,11 +138,13 @@ int sys_exit(u64 *ctx)
105138

106139
if (aggr_mode == SYSCALL_AGGR_THREAD)
107140
key = tid;
141+
else if (aggr_mode == SYSCALL_AGGR_CGROUP)
142+
cgroup = get_current_cgroup_id();
108143
else
109144
key = bpf_get_smp_processor_id();
110145

111146
delta = bpf_ktime_get_ns() - st->timestamp;
112-
update_stats(key, st->nr, delta, ret);
147+
update_stats(key, cgroup, st->nr, delta, ret);
113148

114149
bpf_map_delete_elem(&syscall_trace_map, &tid);
115150
return 0;

tools/perf/util/bpf_skel/syscall_summary.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,11 @@
66
/* How the BPF program groups syscall stats; chosen by userspace before load. */
enum syscall_aggr_mode {
	SYSCALL_AGGR_THREAD,	/* per thread: the key carries the tid */
	SYSCALL_AGGR_CPU,	/* per CPU: the key carries the processor id */
	SYSCALL_AGGR_CGROUP,	/* per cgroup: the key carries the cgroup id */
};
1011

1112
/* Key of the syscall stats map, filled in by the BPF program on sys_exit. */
struct syscall_key {
	u64 cgroup;	/* cgroup id in cgroup mode, 0 otherwise */
	int cpu_or_tid;	/* tid in thread mode, CPU in cpu mode, 0 in cgroup mode */
	int nr;		/* syscall number */
};

tools/perf/util/trace.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ enum trace_summary_mode {
88
SUMMARY__NONE = 0,
99
SUMMARY__BY_TOTAL,
1010
SUMMARY__BY_THREAD,
11+
SUMMARY__BY_CGROUP,
1112
};
1213

1314
#ifdef HAVE_BPF_SKEL

0 commit comments

Comments
 (0)