Commit d3500c6

port txg dtrace script to ebpf
closes #30
1 parent 3f6c4b1 commit d3500c6

File tree

1 file changed: +377, -0 lines changed

bpf/standalone/txg.py

Lines changed: 377 additions & 0 deletions
@@ -0,0 +1,377 @@
#!/usr/bin/env python3
#
# Copyright 2019 Delphix. All rights reserved.
#
# SPDX-License-Identifier: GPL-2.0-or-later
#
#
# This script prints out information about every spa_sync() for the domain0
# zpool. This allows us to find a point in the past where I/O performance
# degraded and some high-level symptoms about why it's slower now than it was,
# which may be enough to correlate the change to a configuration /
# environmental change.
#

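#
# A hypothetical invocation (run as root, assuming bcc and drgn are
# installed; the argparse usage string below suggests the script is normally
# wrapped by an "estat" command):
#
#   ./txg.py -p domain0
#
# One line is printed per completed txg, with a column legend repeated
# every 30 events.
#
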
from bcc import BPF, PerfType, PerfSWConfig
import argparse
import os
import drgn
import re
from datetime import datetime

# define BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/bpf_common.h>
#include <uapi/linux/bpf.h>
#include <uapi/linux/bpf_perf_event.h>
#include <sys/spa_impl.h>
#include <linux/sched.h>
"""
parser = argparse.ArgumentParser(
    description='Collect spa_sync statistics for each txg.',
    usage='estat txg [options]')
parser.add_argument('-c', '--coll', type=int, action='store',
                    dest='collection_sec',
                    help='The collection interval in seconds')
parser.add_argument('-p', '--pool', type=str, action='store',
                    dest='pool',
                    help='The pool to monitor (default: domain0)')
args = parser.parse_args()

if (args.pool):
    bpf_text += '#define POOL "' + str(args.pool) + '"'
else:
    bpf_text += '#define POOL "domain0"'

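# The selected pool name is compiled into the BPF program text as the POOL
# macro, so the name comparison in pool_set() below happens in kernel context.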
bpf_text += """
typedef struct {
    spa_t *spa;
    u64 start;
    u64 stop;
    u64 time_since_last_sync;
    u64 pool_sync_enter;
    u64 elapsed_pass1;
    u64 delay_entry_time;
    u64 txg;
    u64 throttle;
    u64 delay_max;
    u64 delay_sum;
    u64 delay_count;
} spa_sync_info;

typedef struct {
    u64 txg;
    u64 time_since_last_sync;
    u64 sync_time;
    u64 elapsed_pass1;
    u64 dirty_b;
    u64 throttle;
    u64 delay_max;
    u64 delay_sum;
    u64 delay_count;
} spa_sync_data;

typedef struct {
    u64 *dirty_addr;
    u64 dirty_max;
} spa_sync_dirty;

#define SPA_SYNC_DIRTY_INDEX 0;

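/*
 * Maps used below: per-thread spa_sync state keyed by thread id, a one-slot
 * array tracking the peak dp_dirty_total observed, and a perf buffer used to
 * push one record per completed txg up to user space.
 */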
BPF_HASH(sync_info_map, u32, spa_sync_info);
BPF_ARRAY(sync_dirty_map, spa_sync_dirty, 1);
BPF_PERF_OUTPUT(sync_events);

/*
 * Store the spa_t address if the pool name matches POOL,
 * "domain0" by default. Should only have to do the string
 * comparison once.
 */

static inline void pool_set(spa_sync_info *info, spa_t *spa)
{
    char comparand[sizeof(POOL)];
    bpf_probe_read(&comparand, sizeof(comparand), spa->spa_name);
    char compare[] = POOL;
    for (int i = 0; i < sizeof(comparand); ++i)
        if (compare[i] != comparand[i])
            return;
    info->spa = spa;
}

/*
 * Get the domain0 spa object and store "start" time and
 * "time since last sync".
 */
int spa_sync_entry(struct pt_regs *ctx, spa_t *spa, uint64_t txg)
{
    u32 tid = bpf_get_current_pid_tgid();
    spa_sync_info *info = sync_info_map.lookup(&tid);
    if (info == NULL) {
        spa_sync_info spa_info = {};
        sync_info_map.insert(&tid, &spa_info);
        info = &spa_info;

        int dirty_index = SPA_SYNC_DIRTY_INDEX;
        spa_sync_dirty *spa_dirty;
        spa_dirty = sync_dirty_map.lookup(&dirty_index);
        if (spa_dirty != NULL) {
            spa_dirty->dirty_addr = &spa->spa_dsl_pool->dp_dirty_total;
            spa_dirty->dirty_max = spa->spa_dsl_pool->dp_dirty_total;
        }
    }

    if (info->spa == 0)
        pool_set(info, spa);

    if (info->spa != spa)
        return 0;

    info->start = bpf_ktime_get_ns();
    info->txg = txg;
    /*
     * We may not have a "stop" time yet, so just print a (kind of bogus)
     * zero value for "time since last sync" if that's the case. This only
     * affects the first txg we see -- after that "stop" will be set by the
     * previous txg.
     */
    info->time_since_last_sync = info->stop != 0 ?
        info->start - info->stop : 0;
    info->pool_sync_enter = 0;
    info->delay_entry_time = 0;
    info->delay_max = 0;
    return 0;
}

/*
 * Collect data required to know the percentage of syncing time spent
 * in pass 1.
 */
int dsl_pool_sync_entry(struct pt_regs *ctx, dsl_pool_t *dp)
{
    u32 tid = bpf_get_current_pid_tgid();
    spa_sync_info *info = sync_info_map.lookup(&tid);
    if (info == NULL || info->spa != dp->dp_spa ||
        info->spa->spa_sync_pass != 1) {
        return 0;
    }

    info->pool_sync_enter = bpf_ktime_get_ns();
    return 0;
}

int dsl_pool_sync_return(struct pt_regs *ctx)
{
    u32 tid = bpf_get_current_pid_tgid();
    spa_sync_info *info = sync_info_map.lookup(&tid);
    if (info == NULL || info->pool_sync_enter == 0) {
        return 0;
    }

    info->elapsed_pass1 = bpf_ktime_get_ns() - info->pool_sync_enter;
    return 0;
}

/*
 * Collect data to know how much we're being throttled / delayed. If we're
 * throttled on every tx we could hit these probes a lot (burning CPU), so we
 * avoid using syntactic sugar in this section.
 */
int dmu_tx_delay_entry(struct pt_regs *ctx, dmu_tx_t *tx, uint64_t dirty)
{
    u32 tid = bpf_get_current_pid_tgid();
    spa_sync_info *info = sync_info_map.lookup(&tid);
    if (info == NULL || info->spa != tx->tx_pool->dp_spa) {
        return 0;
    }

    info->delay_entry_time = bpf_ktime_get_ns();
    return 0;
}

int dmu_tx_delay_mintime(struct pt_regs *ctx, dmu_tx_t *tx, uint64_t dirty,
    uint64_t min_tx_time)
{
    u32 tid = bpf_get_current_pid_tgid();
    spa_sync_info *info = sync_info_map.lookup(&tid);
    if (info == NULL || info->delay_entry_time == 0) {
        return 0;
    }

    if (min_tx_time > info->throttle) {
        info->throttle = min_tx_time;
    }
    return 0;
}

int dmu_tx_delay_return(struct pt_regs *ctx)
{
    u32 tid = bpf_get_current_pid_tgid();
    spa_sync_info *info = sync_info_map.lookup(&tid);
    if (info == NULL || info->delay_entry_time == 0) {
        return 0;
    }

    u64 elapsed = bpf_ktime_get_ns() - info->delay_entry_time;
    if (elapsed > info->delay_max) {
        info->delay_max = elapsed;
    }

    info->delay_count = info->delay_count + 1;
    info->delay_sum = info->delay_sum + elapsed;
    return 0;
}

/*
 * Submit an event containing the collected stats for each
 * completed spa sync and reset counters.
 */
int spa_sync_return(struct pt_regs *ctx)
{
    u32 tid = bpf_get_current_pid_tgid();
    spa_sync_info *info = sync_info_map.lookup(&tid);
    if (info == NULL || info->pool_sync_enter == 0) {
        return 0;
    }
    info->stop = bpf_ktime_get_ns();

    spa_sync_data data = {};
    data.txg = info->txg;
    data.time_since_last_sync = info->time_since_last_sync;
    data.sync_time = info->stop - info->start;
    data.elapsed_pass1 = info->elapsed_pass1;

    data.throttle = info->throttle;
    data.delay_max = info->delay_max;
    data.delay_sum = info->delay_sum;
    data.delay_count = info->delay_count;

    int dirty_index = SPA_SYNC_DIRTY_INDEX;
    spa_sync_dirty *dirty = sync_dirty_map.lookup(&dirty_index);
    if (dirty == NULL) {
        data.dirty_b = info->spa->spa_dsl_pool->dp_dirty_total;
    } else {
        data.dirty_b = dirty->dirty_max;
        dirty->dirty_max = 0;
    }

    sync_events.perf_submit(ctx, &data, sizeof(data));
    info->throttle = 0;
    info->start = 0;
    info->spa = 0;

    return 0;
}

/*
 * Record how much dirty data we've collected so far. The value doesn't need to
 * be exact, so we just check this periodically.
 */
int get_spa_dirty(struct bpf_perf_event_data *ctx) {
    u32 tid = bpf_get_current_pid_tgid();
    int dirty_index = SPA_SYNC_DIRTY_INDEX;
    spa_sync_dirty *dirty = sync_dirty_map.lookup(&dirty_index);
    if (dirty == NULL) {
        return 0;
    }

    if (*dirty->dirty_addr > dirty->dirty_max) {
        dirty->dirty_max = *dirty->dirty_addr;
    }

    return 0;
}
"""


def read_zfs_dirty_max_data():
    """ Use the drgn program object to read the
        zfs_dirty_data_max kernel variable.
    """
    global prog
    variable = prog['zfs_dirty_data_max']
    return int(variable.value_())


def print_event(cpu, data, size):
    """ Print the raw txg data and a legend at the start and
        after every 30 events.
    """
    event_format = ('{:<24} {:>10} {:>5}ms {:>5}ms ({:>2} pass 1)'
                    '{:>4}MB ({:>2}) {:>4}us {:>5}ms {:>4}ms')
    global print_count
    if (print_count == 30):
        print(" date txg time"
              " since last sync")
        print(" | | |"
              " sync time")
        print(" | | |"
              " | (%% pass 1)")
        print(" | | |"
              " | | highest dirty (%%) ")
        print(" | | |"
              " | | | highest throttle delay")
        print(" | | |"
              " | | | | | avg delay")
        print(" v v v"
              " v v v v v v")
        #
        # Here is an example line:
        # "Fri Jan 17 21:21:16 2020 1827162 0ms 1342ms (60% p1) \
        # 208MB (10%) 0us 57ms 10ms"
        #
        print_count = 0
    print_count = print_count + 1

    zfs_dirty_max_data = read_zfs_dirty_max_data()
    event = b["sync_events"].event(data)
    date = datetime.now()
    average_delay = 0
    if (event.delay_count > 1):
        average_delay = event.delay_sum / event.delay_count
    print(event_format.format(date.ctime(),
                              event.txg,
                              int(event.time_since_last_sync / 1000 / 1000),
                              int(event.sync_time / 1000 / 1000),
                              int(event.elapsed_pass1 * 100 / event.sync_time),
                              int(event.dirty_b / 1024 / 1024),
                              int(event.dirty_b * 100 / zfs_dirty_max_data),
                              int((event.throttle + 999) / 1000),
                              event.delay_max,
                              average_delay))


# load BPF program
KVER = os.popen('uname -r').read().rstrip()
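# The cflags below point the BPF compiler at the ZFS headers installed under
# /usr/src for the running kernel, so that <sys/spa_impl.h> and the other ZFS
# includes in bpf_text resolve.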
b = BPF(text=bpf_text,
        cflags=["-include",
                "/usr/src/zfs-" + KVER + "/zfs_config.h",
                "-I/usr/src/zfs-" + KVER + "/include/",
                "-I/usr/src/zfs-" + KVER + "/include/spl",
                "-I/usr/src/zfs-" + KVER + "/include/",
                "-I/usr/src/zfs-" + KVER + "/include/linux"])

b.attach_kprobe(event="spa_sync", fn_name="spa_sync_entry")
b.attach_kretprobe(event="spa_sync", fn_name="spa_sync_return")
b.attach_kprobe(event="dsl_pool_sync", fn_name="dsl_pool_sync_entry")
b.attach_kretprobe(event="dsl_pool_sync", fn_name="dsl_pool_sync_return")
b.attach_kprobe(event="dmu_tx_delay", fn_name="dmu_tx_delay_entry")
b.attach_kprobe(event="trace_zfs_delay__mintime",
                fn_name="dmu_tx_delay_mintime")
b.attach_perf_event(ev_type=PerfType.SOFTWARE,
                    ev_config=PerfSWConfig.CPU_CLOCK, fn_name="get_spa_dirty",
                    sample_freq=10)

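# Start print_count at the legend threshold so the column legend is printed
# before the very first event; print_event resets the counter to zero each
# time it emits the legend.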
print_count = 30

# initialize drgn program object to read zfs_dirty_data_max
prog = drgn.program_from_kernel()

# loop with callback to print_event
b["sync_events"].open_perf_buffer(print_event)
while 1:
    try:
        b.perf_buffer_poll()
    except KeyboardInterrupt:
        exit()
