1
1
from __future__ import annotations
2
2
3
+ import uuid
4
+ import weakref
3
5
from collections import defaultdict
4
6
from collections .abc import Iterable , Iterator
5
7
from contextlib import contextmanager
6
- from typing import TYPE_CHECKING
8
+ from typing import TYPE_CHECKING , Any
7
9
8
10
import dask .config
9
11
15
17
16
18
17
19
@contextmanager
18
- def span (* tags : str ) -> Iterator [None ]:
20
+ def span (* tags : str ) -> Iterator [str ]:
19
21
"""Tag group of tasks to be part of a certain group, called a span.
20
22
21
- This context manager can be nested, thus creating sub-spans.
22
- Every cluster defines a global "default" span when no span has been defined by the client.
23
+ This context manager can be nested, thus creating sub-spans. If you close and
24
+ re-open a span context manager with the same tag, you'll end up with two separate
25
+ spans.
26
+
27
+ Every cluster defines a global "default" span when no span has been defined by the
28
+ client; the default span is automatically closed and reopened when all tasks
29
+ associated to it have been completed; in other words the cluster is idle save for
30
+ tasks that are explicitly annotated by a span. Note that, in some edge cases, you
31
+ may end up with overlapping default spans, e.g. if a worker crashes and all unique
32
+ tasks that were in memory on it need to be recomputed.
23
33
24
34
Examples
25
35
--------
26
36
>>> import dask.array as da
27
37
>>> import distributed
28
38
>>> client = distributed.Client()
29
- >>> with span("my_workflow "):
39
+ >>> with span("my workflow "):
30
40
... with span("phase 1"):
31
41
... a = da.random.random(10)
32
42
... b = a + 1
@@ -36,39 +46,59 @@ def span(*tags: str) -> Iterator[None]:
36
46
>>> d.compute()
37
47
38
48
In the above example,
39
- - Tasks of collections a and b will be annotated on the scheduler and workers with
40
- ``{'span': ('my_workflow' , 'phase 1')}``
41
- - Tasks of collection c (that aren't already part of a or b) will be annotated with
42
- ``{'span': ('my_workflow' , 'phase 2')}``
43
- - Tasks of collection d (that aren't already part of a, b, or c) will *not* be
44
- annotated but will nonetheless be attached to span ``('default', )``
49
+ - Tasks of collections a and b are annotated to belong to span
50
+ ``('my workflow' , 'phase 1')``;
51
+ - Tasks of collection c (that aren't already part of a or b) are annotated to belong
52
+ to span ``('my workflow' , 'phase 2')``;
53
+ - Tasks of collection d (that aren't already part of a, b, or c) are *not*
54
+ annotated but will nonetheless be attached to span ``('default', )``.
45
55
46
56
You may also set more than one tag at once; e.g.
47
57
>>> with span("workflow1", "version1"):
48
58
... ...
49
59
60
+ Finally, you may capture the ID of a span on the client to match it with the
61
+ :class:`Span` objects on the scheduler:
62
+ >>> cluster = distributed.LocalCluster()
63
+ >>> client = distributed.Client(cluster)
64
+ >>> with span("my workflow") as span_id:
65
+ ... client.submit(lambda: "Hello world!").result()
66
+ >>> span = client.cluster.scheduler.extensions["spans"].spans[span_id]
50
67
51
- Note
52
- ----
68
+ Notes
69
+ -----
53
70
Spans are based on annotations, and just like annotations they can be lost during
54
- optimization. Set config ``optimizatione .fuse.active: false`` to prevent this issue.
71
+ optimization. Set config ``optimization .fuse.active: false`` to prevent this issue.
55
72
"""
56
- prev_id = dask .config .get ("annotations.span" , ())
57
- with dask .config .set ({"annotations.span" : prev_id + tags }):
58
- yield
73
+ if not tags :
74
+ raise ValueError ("Must specify at least one span tag" )
75
+
76
+ prev_tags = dask .config .get ("annotations.span.name" , ())
77
+ # You must specify the full history of IDs, not just the parent, because
78
+ # otherwise you would not be able to uniquely identify grandparents when
79
+ # they have no tasks of their own.
80
+ prev_ids = dask .config .get ("annotations.span.ids" , ())
81
+ ids = tuple (str (uuid .uuid4 ()) for _ in tags )
82
+ with dask .annotate (span = {"name" : prev_tags + tags , "ids" : prev_ids + ids }):
83
+ yield ids [- 1 ]
59
84
60
85
61
86
class Span :
62
87
#: (<tag>, <tag>, ...)
63
- #: Matches ``TaskState.annotations["span"]``, both on the scheduler and the worker,
64
- #: as well as ``TaskGroup.span``.
65
- #: Tasks with no 'span' annotation will be attached to Span ``("default", )``.
66
- id : tuple [str , ...]
88
+ #: Matches ``TaskState.annotations["span"]["name"]``, both on the scheduler and the
89
+ #: worker.
90
+ name : tuple [str , ...]
91
+
92
+ #: <uuid>
93
+ #: Taken from ``TaskState.annotations["span"]["ids"][-1]``.
94
+ #: Matches ``distributed.scheduler.TaskState.group.span_id``
95
+ #: and ``distributed.worker_state_machine.TaskState.span_id``.
96
+ id : str
67
97
68
- #: Direct children of this span tree
69
- #: Note: you can get the parent through
70
- #: ``distributed.extensions["spans"].spans[self.id[:-1]]``
71
- children : set [Span ]
98
+ _parent : weakref . ref [ Span ] | None
99
+
100
+ #: Direct children of this span, sorted by creation time
101
+ children : list [Span ]
72
102
73
103
#: Task groups *directly* belonging to this span.
74
104
#:
@@ -94,25 +124,40 @@ class Span:
94
124
#: stop
95
125
enqueued : float
96
126
127
+ # Support for weakrefs to a class with __slots__
128
+ __weakref__ : Any
129
+
97
130
__slots__ = tuple (__annotations__ )
98
131
99
- def __init__ (self , span_id : tuple [str , ...], enqueued : float ):
100
- self .id = span_id
101
- self .enqueued = enqueued
102
- self .children = set ()
132
+ def __init__ (self , name : tuple [str , ...], id_ : str , parent : Span | None ):
133
+ self .name = name
134
+ self .id = id_
135
+ self ._parent = weakref .ref (parent ) if parent is not None else None
136
+ self .enqueued = time ()
137
+ self .children = []
103
138
self .groups = set ()
104
139
105
140
def __repr__ (self ) -> str :
106
- return f"Span{ self .id } "
141
+ return f"Span<name={ self .name } , id={ self .id } >"
142
+
143
+ @property
144
+ def parent (self ) -> Span | None :
145
+ if self ._parent :
146
+ out = self ._parent ()
147
+ assert out
148
+ return out
149
+ return None
107
150
108
151
def traverse_spans (self ) -> Iterator [Span ]:
109
- """Top-down recursion of all spans belonging to this span tree, including self"""
152
+ """Top-down recursion of all spans belonging to this branch of the span tree,
153
+ including self
154
+ """
110
155
yield self
111
156
for child in self .children :
112
157
yield from child .traverse_spans ()
113
158
114
159
def traverse_groups (self ) -> Iterator [TaskGroup ]:
115
- """All TaskGroups belonging to this span tree"""
160
+ """All TaskGroups belonging to this branch of span tree"""
116
161
for span in self .traverse_spans ():
117
162
yield from span .groups
118
163
@@ -161,10 +206,26 @@ def states(self) -> defaultdict[TaskStateState, int]:
161
206
"""
162
207
out : defaultdict [TaskStateState , int ] = defaultdict (int )
163
208
for tg in self .traverse_groups ():
164
- for state , cnt in tg .states .items ():
165
- out [state ] += cnt
209
+ for state , count in tg .states .items ():
210
+ out [state ] += count
166
211
return out
167
212
213
+ @property
214
+ def done (self ) -> bool :
215
+ """Return True if all tasks in this span tree are completed; False otherwise.
216
+
217
+ Notes
218
+ -----
219
+ This property may transition from True to False, e.g. when a new sub-span is
220
+ added or when a worker that contained the only replica of a task in memory
221
+ crashes and the task need to be recomputed.
222
+
223
+ See also
224
+ --------
225
+ distributed.scheduler.TaskGroup.done
226
+ """
227
+ return all (tg .done for tg in self .traverse_groups ())
228
+
168
229
@property
169
230
def all_durations (self ) -> defaultdict [str , float ]:
170
231
"""Cumulative duration of all completed actions in this span tree, by action
@@ -205,72 +266,92 @@ def nbytes_total(self) -> int:
205
266
class SpansExtension :
206
267
"""Scheduler extension for spans support"""
207
268
208
- #: All Span objects by span_id
209
- spans : dict [tuple [ str , ...] , Span ]
269
+ #: All Span objects by id
270
+ spans : dict [str , Span ]
210
271
211
- #: Only the spans that don't have any parents {client_id: Span} .
272
+ #: Only the spans that don't have any parents, sorted by creation time .
212
273
#: This is a convenience helper structure to speed up searches.
213
- root_spans : dict [ str , Span ]
274
+ root_spans : list [ Span ]
214
275
215
- #: All spans, keyed by the individual tags that make up their span_id .
276
+ #: All spans, keyed by their full name and sorted by creation time .
216
277
#: This is a convenience helper structure to speed up searches.
217
- spans_search_by_tag : defaultdict [str , set [Span ]]
278
+ spans_search_by_name : defaultdict [tuple [str , ...], list [Span ]]
279
+
280
+ #: All spans, keyed by the individual tags that make up their name and sorted by
281
+ #: creation time.
282
+ #: This is a convenience helper structure to speed up searches.
283
+ spans_search_by_tag : defaultdict [str , list [Span ]]
218
284
219
285
def __init__ (self , scheduler : Scheduler ):
220
286
self .spans = {}
221
- self .root_spans = {}
222
- self .spans_search_by_tag = defaultdict (set )
287
+ self .root_spans = []
288
+ self .spans_search_by_name = defaultdict (list )
289
+ self .spans_search_by_tag = defaultdict (list )
223
290
224
- def new_tasks (self , tss : Iterable [TaskState ]) -> dict [ str , tuple [ str , ...]] :
291
+ def new_tasks (self , tss : Iterable [TaskState ]) -> None :
225
292
"""Acknowledge the creation of new tasks on the scheduler.
226
293
Attach tasks to either the desired span or to ("default", ).
227
- Update TaskState.annotations["span"] and TaskGroup.span.
228
-
229
- Returns
230
- -------
231
- {task key: span id}, only for tasks that explicitly define a span
294
+ Update TaskGroup.span_id and wipe TaskState.annotations["span"].
232
295
"""
233
- out = {}
296
+ default_span = None
297
+
234
298
for ts in tss :
235
299
# You may have different tasks belonging to the same TaskGroup but to
236
300
# different spans. If that happens, arbitrarily force everything onto the
237
301
# span of the earliest encountered TaskGroup.
238
302
tg = ts .group
239
- if tg .span :
240
- span_id = tg .span
241
- else :
242
- span_id = ts .annotations .get ("span" , ("default" ,))
243
- assert isinstance (span_id , tuple )
244
- tg .span = span_id
245
- span = self ._ensure_span (span_id )
303
+ if not tg .span_id :
304
+ ann = ts .annotations .get ("span" )
305
+ if ann :
306
+ span = self ._ensure_span (ann ["name" ], ann ["ids" ])
307
+ else :
308
+ if not default_span :
309
+ default_span = self ._ensure_default_span ()
310
+ span = default_span
311
+
312
+ tg .span_id = span .id
246
313
span .groups .add (tg )
247
314
248
- # Override ts.annotations["span"] with span_id from task group
249
- if span_id == ("default" ,):
250
- ts .annotations .pop ("span" , None )
251
- else :
252
- ts .annotations ["span" ] = out [ts .key ] = span_id
253
-
254
- return out
315
+ # The span may be completely different from the one referenced by the
316
+ # annotation, due to the TaskGroup collision issue explained above.
317
+ # Remove the annotation to avoid confusion, and instead rely on
318
+ # distributed.scheduler.TaskState.group.span_id and
319
+ # distributed.worker_state_machine.TaskState.span_id.
320
+ ts .annotations .pop ("span" , None )
321
+
322
+ def _ensure_default_span (self ) -> Span :
323
+ """Return the currently active default span, or create one if the previous one
324
+ terminated. In other words, do not reuse the previous default span if all tasks
325
+ that were not explicitly annotated with :func:`span` on the client side are
326
+ finished.
327
+ """
328
+ defaults = self .spans_search_by_name ["default" ,]
329
+ if defaults and not defaults [- 1 ].done :
330
+ return defaults [- 1 ]
331
+ return self ._ensure_span (("default" ,), (str (uuid .uuid4 ()),))
255
332
256
- def _ensure_span (self , span_id : tuple [str , ...], enqueued : float = 0.0 ) -> Span :
333
+ def _ensure_span (self , name : tuple [str , ...], ids : tuple [ str , ...] ) -> Span :
257
334
"""Create Span if it doesn't exist and return it"""
258
335
try :
259
- return self .spans [span_id ]
336
+ return self .spans [ids [ - 1 ] ]
260
337
except KeyError :
261
338
pass
262
339
263
- # When recursively creating parent spans, make sure that parents are not newer
264
- # than the children
265
- enqueued = enqueued or time ()
340
+ assert len (name ) == len (ids )
341
+ assert len (name ) > 0
342
+
343
+ parent = None
344
+ for i in range (1 , len (name )):
345
+ parent = self ._ensure_span (name [:i ], ids [:i ])
266
346
267
- span = self .spans [span_id ] = Span (span_id , enqueued )
268
- for tag in span_id :
269
- self .spans_search_by_tag [tag ].add (span )
270
- if len (span_id ) > 1 :
271
- parent = self ._ensure_span (span_id [:- 1 ], enqueued )
272
- parent .children .add (span )
347
+ span = Span (name = name , id_ = ids [- 1 ], parent = parent )
348
+ self .spans [span .id ] = span
349
+ self .spans_search_by_name [name ].append (span )
350
+ for tag in name :
351
+ self .spans_search_by_tag [tag ].append (span )
352
+ if parent :
353
+ parent .children .append (span )
273
354
else :
274
- self .root_spans [ span_id [ 0 ]] = span
355
+ self .root_spans . append ( span )
275
356
276
357
return span
0 commit comments