Skip to content

Commit e9fb7ad

Browse files
Add GIL metric to dashboard (#7646)
* Add GIL metric to dashboard Adds a GIL contention metric to the Event Loop page when `distributed.admin.system-monitor.gil-contention` is enabled and `gilknocker` is installed. Otherwise, the chart will not show anything GIL-related, not even placeholders. * Reduce horizontal white space between groups * Update distributed/dashboard/components/scheduler.py Co-authored-by: Jacob Tomlinson <jacobtomlinson@users.noreply.github.com> * Rename EventLoop -> Contention, and page name --------- Co-authored-by: Jacob Tomlinson <jacobtomlinson@users.noreply.github.com>
1 parent 8ae6c9f commit e9fb7ad

File tree

3 files changed

+74
-32
lines changed

3 files changed

+74
-32
lines changed

distributed/dashboard/components/scheduler.py

Lines changed: 70 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -3375,57 +3375,99 @@ def update(self):
33753375
)
33763376

33773377

3378-
class EventLoop(DashboardComponent):
3379-
"""Event Loop Health"""
3378+
class Contention(DashboardComponent):
3379+
"""
3380+
Event Loop Health (and GIL Contention, if configured)
3381+
"""
33803382

33813383
@log_errors
33823384
def __init__(self, scheduler, **kwargs):
33833385
self.scheduler = scheduler
3384-
self.source = ColumnDataSource(
3385-
{
3386-
"names": ["Scheduler", "Workers"],
3387-
"values": [0, 0],
3388-
"text": ["0", "0"],
3389-
}
3386+
self.data = dict(
3387+
names=[
3388+
("Scheduler", "Event Loop"),
3389+
("Scheduler", "GIL Contention"),
3390+
("Workers", "Event Loop"),
3391+
("Workers", "GIL Contention"),
3392+
],
3393+
values=[0, 0, 0, 0],
3394+
text=["0s", "0%", "0s", "0%"],
33903395
)
3396+
title = "Event Loop & GIL Contention"
3397+
3398+
# Remove GIL related names/values if not monitoring GIL
3399+
if not self.scheduler.monitor.monitor_gil_contention:
3400+
title = "Event Loop"
3401+
for key in self.data:
3402+
self.data[key] = self.data[key][::2]
33913403

3404+
self.source = ColumnDataSource(data=self.data)
33923405
self.root = figure(
3393-
title="Event Loop Health",
3394-
x_range=["Scheduler", "Workers"],
3395-
y_range=[
3396-
0,
3397-
parse_timedelta(dask.config.get("distributed.admin.tick.interval"))
3398-
* 25,
3399-
],
3406+
title=title,
3407+
x_range=FactorRange(*self.data["names"]),
3408+
y_range=(0, 1),
34003409
tools="",
34013410
toolbar_location="above",
34023411
**kwargs,
34033412
)
3404-
self.root.vbar(x="names", top="values", width=0.9, source=self.source)
3413+
self.root.vbar(
3414+
x="names",
3415+
top="values",
3416+
width=0.9,
3417+
line_color="white",
3418+
source=self.source,
3419+
fill_color=factor_cmap(
3420+
field_name="names",
3421+
palette=["#b8e0ce", "#81aae4"],
3422+
factors=["Event Loop", "GIL Contention"],
3423+
start=1,
3424+
end=2,
3425+
),
3426+
)
34053427

3428+
self.root.x_range.group_padding = 0.25
34063429
self.root.xaxis.minor_tick_line_alpha = 0
34073430
self.root.ygrid.visible = True
34083431
self.root.xgrid.visible = False
34093432

3410-
hover = HoverTool(tooltips=[("Interval", "@text s")], mode="vline")
3433+
hover = HoverTool(
3434+
tooltips=[("Name", "@names"), ("Value", "@text")], mode="vline"
3435+
)
34113436
self.root.add_tools(hover)
34123437

34133438
@without_property_validation
34143439
@log_errors
34153440
def update(self):
34163441
s = self.scheduler
3442+
monitor_gil = s.monitor.monitor_gil_contention
3443+
3444+
self.data["values"] = [
3445+
s._tick_interval_observed,
3446+
self.gil_contention_scheduler,
3447+
sum(w.metrics["event_loop_interval"] for w in s.workers.values())
3448+
/ (len(s.workers) or 1),
3449+
self.gil_contention_workers,
3450+
][:: 1 if monitor_gil else 2]
3451+
3452+
# Format event loop as time and GIL (if configured) as %
3453+
self.data["text"] = [
3454+
f"{x * 100:.1f}%" if i % 2 and monitor_gil else format_time(x)
3455+
for i, x in enumerate(self.data["values"])
3456+
]
3457+
update(self.source, self.data)
34173458

3418-
data = {
3419-
"names": ["Scheduler", "Workers"],
3420-
"values": [
3421-
s._tick_interval_observed,
3422-
sum(w.metrics["event_loop_interval"] for w in s.workers.values())
3423-
/ (len(s.workers) or 1),
3424-
],
3425-
}
3426-
data["text"] = [format_time(x) for x in data["values"]]
3459+
@property
3460+
def gil_contention_workers(self) -> float:
3461+
workers = self.scheduler.workers
3462+
if workers:
3463+
return sum(
3464+
w.metrics.get("gil_contention", 0) for w in workers.values()
3465+
) / len(workers)
3466+
return float("NaN")
34273467

3428-
update(self.source, data)
3468+
@property
3469+
def gil_contention_scheduler(self) -> float:
3470+
return self.scheduler.monitor.recent().get("gil_contention", float("NaN"))
34293471

34303472

34313473
class ExceptionsTable(DashboardComponent):
@@ -4071,7 +4113,7 @@ def shuffling_doc(scheduler, extra, doc):
40714113
timeseries = SystemTimeseries(
40724114
scheduler, width=1600, height=200, follow_interval=3000
40734115
)
4074-
event_loop = EventLoop(scheduler, width=200, height=400)
4116+
event_loop = Contention(scheduler, width=200, height=400)
40754117

40764118
add_periodic_callback(doc, shuffling, 200)
40774119
add_periodic_callback(doc, workers_memory, 200)

distributed/dashboard/scheduler.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,8 @@
1717
BandwidthWorkers,
1818
ClusterMemory,
1919
ComputePerKey,
20+
Contention,
2021
CurrentLoad,
21-
EventLoop,
2222
ExceptionsTable,
2323
MemoryByKey,
2424
Occupancy,
@@ -110,7 +110,7 @@
110110
"/individual-compute-time-per-key": individual_doc(ComputePerKey, 500),
111111
"/individual-aggregate-time-per-action": individual_doc(AggregateAction, 500),
112112
"/individual-scheduler-system": individual_doc(SystemMonitor, 500),
113-
"/individual-event-loop": individual_doc(EventLoop, 500),
113+
"/individual-contention": individual_doc(Contention, 500),
114114
"/individual-profile": individual_profile_doc,
115115
"/individual-profile-server": individual_profile_server_doc,
116116
"/individual-gpu-memory": gpu_memory_doc,

distributed/dashboard/tests/test_scheduler_bokeh.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,8 @@
2525
AggregateAction,
2626
ClusterMemory,
2727
ComputePerKey,
28+
Contention,
2829
CurrentLoad,
29-
EventLoop,
3030
Events,
3131
Hardware,
3232
MemoryByKey,
@@ -108,7 +108,7 @@ async def test_basic(c, s, a, b):
108108
SystemMonitor,
109109
Occupancy,
110110
StealingTimeSeries,
111-
EventLoop,
111+
Contention,
112112
]:
113113
ss = component(s)
114114

0 commit comments

Comments
 (0)