@@ -25,13 +25,20 @@ import (
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/prometheus/client_golang/prometheus/promauto"
 	"github.com/prometheus/common/model"
+	"go.opentelemetry.io/otel"
+	"go.opentelemetry.io/otel/attribute"
+	"go.opentelemetry.io/otel/codes"
+	"go.opentelemetry.io/otel/trace"
 
 	"github.com/prometheus/alertmanager/notify"
 	"github.com/prometheus/alertmanager/provider"
 	"github.com/prometheus/alertmanager/store"
 	"github.com/prometheus/alertmanager/types"
 )
 
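+// tracer is the named tracer used for all spans created in this package.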
+var tracer = otel.Tracer("github.com/prometheus/alertmanager/dispatch")
+
 // DispatcherMetrics represents metrics associated to a dispatcher.
 type DispatcherMetrics struct {
 	aggrGroups prometheus.Gauge
@@ -162,20 +169,12 @@ func (d *Dispatcher) run(it provider.AlertIterator) {
 				return
 			}
 
-			d.logger.Debug("Received alert", "alert", alert)
-
 			// Log errors but keep trying.
 			if err := it.Err(); err != nil {
 				d.logger.Error("Error on alert update", "err", err)
 				continue
 			}
-
-			now := time.Now()
-			for _, r := range d.route.Match(alert.Labels) {
-				d.processAlert(alert, r)
-			}
-			d.metrics.processingDuration.Observe(time.Since(now).Seconds())
-
+			d.dispatch(alert)
 		case <-maintenance.C:
 			d.doMaintenance()
 		case <-d.ctx.Done():
@@ -184,6 +183,45 @@ func (d *Dispatcher) run(it provider.AlertIterator) {
 	}
 }
 
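+// dispatch fans an alert out to all matching routes inside a new span,
+// linking back to the alert's originating trace context when present.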
+func (d *Dispatcher) dispatch(alert *types.Alert) {
+	d.logger.Debug("Received alert", "alert", alert)
+
+	attrs := []attribute.KeyValue{
+		attribute.String("alert.name", alert.Name()),
+		attribute.String("alert.fingerprint", alert.Fingerprint().String()),
+	}
+
+	// Build span start options.
+	spanOpts := []trace.SpanStartOption{
+		trace.WithAttributes(attrs...),
+		// We use the producer span kind here since the alert is not
+		// processed synchronously.
+		trace.WithSpanKind(trace.SpanKindProducer),
+	}
+
+	// Check if the alert has a valid trace context and link to it.
+	alertSpanCtx := alert.TraceSpanContext()
+	if alertSpanCtx.IsValid() {
+		spanOpts = append(spanOpts, trace.WithLinks(trace.Link{
+			SpanContext: alertSpanCtx,
+		}))
+	}
+
+	traceCtx, span := tracer.Start(d.ctx, "dispatch.Dispatcher.dispatch", spanOpts...)
+	defer span.End()
+
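+	// Fan the alert out to every matching route, passing a link to this
+	// span so the asynchronous notify span can reference it.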
+	now := time.Now()
+	for _, r := range d.route.Match(alert.Labels) {
+		d.processAlert(trace.LinkFromContext(traceCtx), alert, r)
+		span.SetAttributes(attribute.String("receiver", r.RouteOpts.Receiver))
+	}
+	d.metrics.processingDuration.Observe(time.Since(now).Seconds())
+}
+
 func (d *Dispatcher) doMaintenance() {
 	d.mtx.Lock()
 	defer d.mtx.Unlock()
@@ -310,7 +348,7 @@ type notifyFunc func(context.Context, ...*types.Alert) bool
 
 // processAlert determines in which aggregation group the alert falls
 // and inserts it.
-func (d *Dispatcher) processAlert(alert *types.Alert, route *Route) {
+func (d *Dispatcher) processAlert(dispatchLink trace.Link, alert *types.Alert, route *Route) {
 	groupLabels := getGroupLabels(alert, route)
 
 	fp := groupLabels.Fingerprint()
@@ -348,15 +386,28 @@ func (d *Dispatcher) processAlert(alert *types.Alert, route *Route) {
 	ag.insert(alert)
 
 	go ag.run(func(ctx context.Context, alerts ...*types.Alert) bool {
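+		// The aggregation group flushes asynchronously, so start a consumer
+		// span linked back to the producer span that dispatched the alert.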
+		ctx, span := tracer.Start(ctx, "dispatch.Dispatcher.notify",
+			trace.WithAttributes(attribute.Int("alerts.count", len(alerts))),
+			trace.WithLinks(dispatchLink),
+			trace.WithSpanKind(trace.SpanKindConsumer),
+		)
+		defer span.End()
+
 		_, _, err := d.stage.Exec(ctx, d.logger, alerts...)
 		if err != nil {
 			logger := d.logger.With("aggrGroup", ag.GroupKey(), "num_alerts", len(alerts), "err", err)
 			if errors.Is(ctx.Err(), context.Canceled) {
 				// It is expected for the context to be canceled on
 				// configuration reload or shutdown. In this case, the
 				// message should only be logged at the debug level.
+				span.RecordError(fmt.Errorf("notify for alerts failed: %w", err))
+				span.SetStatus(codes.Error, err.Error())
 				logger.Debug("Notify for alerts failed")
 			} else {
+				span.RecordError(fmt.Errorf("notify for alerts failed: %w", err))
+				span.SetStatus(codes.Error, err.Error())
 				logger.Error("Notify for alerts failed")
 			}
 		}