Add new histogram to generator - messaging system latency (grafana#3453)
* introduce new service-graph metric for messaging-system latency

* added tests for new histogram values

* fix linting

* make new metric optional via config

* fix typo

* fix failing tests

* add feature to changelog

* negative times diff consistency - return 0 instead of negative

* update docs

* Update docs/sources/tempo/metrics-generator/service_graphs/estimate-cardinality.md

use present when possible

Co-authored-by: Kim Nylander <104772500+knylander-grafana@users.noreply.github.com>

* change 1e9 to time const

* added a reference to the "wait" config of the processor

* fixed indentations and formatting stuff from rebasing

* removed mistaken println found by linter

---------

Co-authored-by: Kim Nylander <104772500+knylander-grafana@users.noreply.github.com>
2 people authored and joe-elliott committed May 7, 2024
1 parent 90e7fe8 commit a87924b
Showing 20 changed files with 240 additions and 122 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -20,6 +20,7 @@
* [CHANGE] Return a less confusing error message to the client when refusing spans due to ingestion rates. [#3485](https://github.com/grafana/tempo/pull/3485) (@ie-pham)
* [CHANGE] Clean Metrics Generator's Prometheus wal before creating instance [#3548](https://github.com/grafana/tempo/pull/3548) (@ie-pham)
* [CHANGE] Update docker examples for permissions, deprecations, and clean-up [#3603](https://github.com/grafana/tempo/pull/3603) (@zalegrala)
* [FEATURE] Add messaging-system latency histogram to service-graph [#3453](https://github.com/grafana/tempo/pull/3453) (@adirmatzkin)
* [ENHANCEMENT] Add string interning to TraceQL queries [#3411](https://github.com/grafana/tempo/pull/3411) (@mapno)
* [ENHANCEMENT] Add new (unsafe) query hints for metrics queries [#3396](https://github.com/grafana/tempo/pull/3396) (@mdisibio)
* [ENHANCEMENT] Add nestedSetLeft/Right/Parent instrinsics to TraceQL. [#3497](https://github.com/grafana/tempo/pull/3497) (@joe-elliott)
6 changes: 6 additions & 0 deletions docs/sources/tempo/configuration/_index.md
@@ -321,6 +321,11 @@ metrics_generator:
# per additional dimension instead of one.
[enable_client_server_prefix: <bool> | default = false]
# If enabled, an additional histogram is produced for interactions over messaging-system middleware.
# If these interactions span long time ranges (high latencies), consider increasing the
# `wait` value for this processor.
[enable_messaging_system_latency_histogram: <bool> | default = false]

# Attribute Key to multiply span metrics
[span_multiplier_key: <string> | default = ""]

@@ -1361,6 +1366,7 @@
[dimensions: <list of string>]
[peer_attributes: <list of string>]
[enable_client_server_prefix: <bool>]
[enable_messaging_system_latency_histogram: <bool>]

# Configuration for the span-metrics processor
span_metrics:
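To make the new option concrete, here is a minimal configuration sketch that enables the histogram and raises the processor's `wait` value. The nesting under `processor.service_graphs` and the `30s` value are illustrative assumptions; check the full configuration reference for your version.

```yaml
metrics_generator:
  processor:
    service_graphs:
      # Emit traces_service_graph_request_messaging_system_seconds.
      enable_messaging_system_latency_histogram: true
      # Messaging-system edges can take longer to pair; keep them around longer before expiring.
      wait: 30s
```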
1 change: 1 addition & 0 deletions docs/sources/tempo/configuration/manifest.md
@@ -486,6 +486,7 @@ metrics_generator:
- 12.8
dimensions: []
enable_client_server_prefix: false
enable_messaging_system_latency_histogram: false
peer_attributes:
- peer.service
- db.name
17 changes: 9 additions & 8 deletions docs/sources/tempo/metrics-generator/service_graphs/_index.md
@@ -60,14 +60,15 @@ Virtual nodes can be detected in two different ways:

The following metrics are exported:

| Metric | Type | Labels | Description |
|---------------------------------------------|-----------|---------------------------------|--------------------------------------------------------------|
| traces_service_graph_request_total | Counter | client, server, connection_type | Total count of requests between two nodes |
| traces_service_graph_request_failed_total | Counter | client, server, connection_type | Total count of failed requests between two nodes |
| traces_service_graph_request_server_seconds | Histogram | client, server, connection_type | Time for a request between two nodes as seen from the server |
| traces_service_graph_request_client_seconds | Histogram | client, server, connection_type | Time for a request between two nodes as seen from the client |
| traces_service_graph_unpaired_spans_total | Counter | client, server, connection_type | Total count of unpaired spans |
| traces_service_graph_dropped_spans_total | Counter | client, server, connection_type | Total count of dropped spans |
| Metric | Type | Labels | Description |
| ----------------------------------------------------- | --------- | ------------------------------- | ---------------------------------------------------------------------------------------------------------- |
| traces_service_graph_request_total | Counter | client, server, connection_type | Total count of requests between two nodes |
| traces_service_graph_request_failed_total | Counter | client, server, connection_type | Total count of failed requests between two nodes |
| traces_service_graph_request_server_seconds | Histogram | client, server, connection_type | Time for a request between two nodes as seen from the server |
| traces_service_graph_request_client_seconds | Histogram | client, server, connection_type | Time for a request between two nodes as seen from the client |
| traces_service_graph_request_messaging_system_seconds | Histogram | client, server, connection_type | (Off by default) Time between publisher and consumer for services communicating through a messaging system |
| traces_service_graph_unpaired_spans_total | Counter | client, server, connection_type | Total count of unpaired spans |
| traces_service_graph_dropped_spans_total | Counter | client, server, connection_type | Total count of dropped spans |

Duration is measured both from the client and the server sides.

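As a quick sanity check of the new series, here is a sketch of the average messaging-system latency per edge; it assumes the standard `_sum`/`_count` series that accompany the histogram:

```promql
  sum(rate(traces_service_graph_request_messaging_system_seconds_sum[5m])) by (client, server)
/
  sum(rate(traces_service_graph_request_messaging_system_seconds_count[5m])) by (client, server)
```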
docs/sources/tempo/metrics-generator/service_graphs/estimate-cardinality.md
@@ -19,30 +19,46 @@ The amount of edges depends on the number of nodes in the system and the direction of the edges.
Let’s call this amount hops. Every hop will be a unique combination of client + server labels.

For example:

- A system with 3 nodes `(A, B, C)` of which A only calls B and B only calls C will have 2 hops `(A → B, B → C)`
- A system with 3 nodes `(A, B, C)` that all call each other (i.e., every link is bidirectional) will have 6 hops `(A → B, B → A, B → C, C → B, A → C, C → A)`

We can’t calculate the amount of hops automatically based upon the nodes,
but it should be a value between `#services - 1` and `#services!`.

If we know the amount of hops in a system, we can calculate the cardinality of the generated
[service graphs]({{< relref "../service_graphs" >}}):
[service graphs]({{< relref "../service_graphs" >}}) (assuming `#hb` is the number of histogram buckets):

```
traces_service_graph_request_total: #hops
traces_service_graph_request_failed_total: #hops
traces_service_graph_request_server_seconds: 3 buckets * #hops
traces_service_graph_request_client_seconds: 3 buckets * #hops
traces_service_graph_request_server_seconds: #hb * #hops
traces_service_graph_request_client_seconds: #hb * #hops
traces_service_graph_unpaired_spans_total: #services (absolute worst case)
traces_service_graph_dropped_spans_total: #services (absolute worst case)
```

Finally, we get the following cardinality estimation:

```
Sum: 8 * #hops + 2 * #services
Sum: [([2 * #hb] + 2) * #hops] + [2 * #services]
```

{{% admonition type="note" %}}
If the `enable_messaging_system_latency_histogram` configuration option is set to `true`, an additional histogram is produced:

```
traces_service_graph_request_messaging_system_seconds: #hb * #hops
```

In that case, the estimation formula would be:

```
Sum: [([3 * #hb] + 2) * #hops] + [2 * #services]
```

{{% /admonition %}}
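
For intuition, a worked example with assumed numbers (4 services in a simple chain, so 3 hops, and 10 histogram buckets):

```
#services = 4, #hops = 3, #hb = 10

without the messaging-system histogram:
  ([2 * 10] + 2) * 3 + [2 * 4] = 66 + 8 = 74 series

with enable_messaging_system_latency_histogram = true:
  ([3 * 10] + 2) * 3 + [2 * 4] = 96 + 8 = 104 series
```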

{{% admonition type="note" %}}
To estimate the number of metrics, refer to the [Dry run metrics generator]({{< relref "../cardinality" >}}) documentation.
{{% /admonition %}}
@@ -71,3 +71,9 @@ These queries will give us latency quantiles for the above rate. If we were inte
```promql
histogram_quantile(.9, sum(rate(traces_service_graph_request_server_seconds_bucket{client="foo"}[5m])) by (server, le))
```

Use the optional messaging-system latency metric to surface potential middleware latency:

```promql
histogram_quantile(.9, sum(rate(traces_service_graph_request_messaging_system_seconds_bucket{}[5m])) by (client, server, le))
```
@@ -62,6 +62,7 @@ metrics_generator:
[dimensions: <list of string>]
[peer_attributes: <list of string>]
[enable_client_server_prefix: <bool>]
[enable_messaging_system_latency_histogram: <bool>]

span_metrics:
[histogram_buckets: <list of float>]
2 changes: 2 additions & 0 deletions modules/generator/config.go
@@ -123,6 +123,8 @@ func (cfg *ProcessorConfig) copyWithOverrides(o metricsGeneratorOverrides, userI

copyCfg.ServiceGraphs.EnableClientServerPrefix = o.MetricsGeneratorProcessorServiceGraphsEnableClientServerPrefix(userID)

copyCfg.ServiceGraphs.EnableMessagingSystemLatencyHistogram = o.MetricsGeneratorProcessorServiceGraphsEnableMessagingSystemLatencyHistogram(userID)

copyCfg.ServiceGraphs.EnableVirtualNodeLabel = o.MetricsGeneratorProcessorServiceGraphsEnableVirtualNodeLabel(userID)

return copyCfg, nil
1 change: 1 addition & 0 deletions modules/generator/overrides.go
@@ -33,6 +33,7 @@ type metricsGeneratorOverrides interface {
MetricsGeneratorProcessorSpanMetricsDimensionMappings(userID string) []sharedconfig.DimensionMappings
MetricsGeneratorProcessorSpanMetricsEnableTargetInfo(userID string) bool
MetricsGeneratorProcessorServiceGraphsEnableClientServerPrefix(userID string) bool
MetricsGeneratorProcessorServiceGraphsEnableMessagingSystemLatencyHistogram(userID string) bool
MetricsGeneratorProcessorServiceGraphsEnableVirtualNodeLabel(userID string) bool
MetricsGeneratorProcessorSpanMetricsTargetInfoExcludedDimensions(userID string) []string
DedicatedColumns(userID string) backend.DedicatedColumns
49 changes: 27 additions & 22 deletions modules/generator/overrides_test.go
@@ -9,28 +9,29 @@ import (
)

type mockOverrides struct {
processors map[string]struct{}
serviceGraphsHistogramBuckets []float64
serviceGraphsDimensions []string
serviceGraphsPeerAttributes []string
serviceGraphsEnableClientServerPrefix bool
serviceGraphsEnableVirtualNodeLabel bool
spanMetricsHistogramBuckets []float64
spanMetricsDimensions []string
spanMetricsIntrinsicDimensions map[string]bool
spanMetricsFilterPolicies []filterconfig.FilterPolicy
spanMetricsDimensionMappings []sharedconfig.DimensionMappings
spanMetricsEnableTargetInfo bool
spanMetricsTargetInfoExcludedDimensions []string
localBlocksMaxLiveTraces uint64
localBlocksMaxBlockDuration time.Duration
localBlocksMaxBlockBytes uint64
localBlocksFlushCheckPeriod time.Duration
localBlocksTraceIdlePeriod time.Duration
localBlocksCompleteBlockTimeout time.Duration
dedicatedColumns backend.DedicatedColumns
maxBytesPerTrace int
unsafeQueryHints bool
processors map[string]struct{}
serviceGraphsHistogramBuckets []float64
serviceGraphsDimensions []string
serviceGraphsPeerAttributes []string
serviceGraphsEnableClientServerPrefix bool
serviceGraphsEnableMessagingSystemLatencyHistogram bool
serviceGraphsEnableVirtualNodeLabel bool
spanMetricsHistogramBuckets []float64
spanMetricsDimensions []string
spanMetricsIntrinsicDimensions map[string]bool
spanMetricsFilterPolicies []filterconfig.FilterPolicy
spanMetricsDimensionMappings []sharedconfig.DimensionMappings
spanMetricsEnableTargetInfo bool
spanMetricsTargetInfoExcludedDimensions []string
localBlocksMaxLiveTraces uint64
localBlocksMaxBlockDuration time.Duration
localBlocksMaxBlockBytes uint64
localBlocksFlushCheckPeriod time.Duration
localBlocksTraceIdlePeriod time.Duration
localBlocksCompleteBlockTimeout time.Duration
dedicatedColumns backend.DedicatedColumns
maxBytesPerTrace int
unsafeQueryHints bool
}

var _ metricsGeneratorOverrides = (*mockOverrides)(nil)
@@ -129,6 +130,10 @@ func (m *mockOverrides) MetricsGeneratorProcessorServiceGraphsEnableClientServer
return m.serviceGraphsEnableClientServerPrefix
}

func (m *mockOverrides) MetricsGeneratorProcessorServiceGraphsEnableMessagingSystemLatencyHistogram(string) bool {
return m.serviceGraphsEnableMessagingSystemLatencyHistogram
}

func (m *mockOverrides) MetricsGeneratorProcessorServiceGraphsEnableVirtualNodeLabel(string) bool {
return m.serviceGraphsEnableVirtualNodeLabel
}
5 changes: 5 additions & 0 deletions modules/generator/processor/servicegraphs/config.go
@@ -33,6 +33,9 @@ type Config struct {
// per dimension.
EnableClientServerPrefix bool `yaml:"enable_client_server_prefix"`

// If enabled, an additional histogram is produced for interactions over messaging-system middleware.
EnableMessagingSystemLatencyHistogram bool `yaml:"enable_messaging_system_latency_histogram"`

// PeerAttributes are attributes that will be used to create a peer edge
// Attributes are searched in the order they are provided
PeerAttributes []string `yaml:"peer_attributes"`
@@ -56,4 +59,6 @@ func (cfg *Config) RegisterFlagsAndApplyDefaults(string, *flag.FlagSet) {
peerAttr = append(peerAttr, string(attr))
}
cfg.PeerAttributes = peerAttr

cfg.EnableMessagingSystemLatencyHistogram = false
}
49 changes: 36 additions & 13 deletions modules/generator/processor/servicegraphs/servicegraphs.go
@@ -46,10 +46,11 @@
)

const (
metricRequestTotal = "traces_service_graph_request_total"
metricRequestFailedTotal = "traces_service_graph_request_failed_total"
metricRequestServerSeconds = "traces_service_graph_request_server_seconds"
metricRequestClientSeconds = "traces_service_graph_request_client_seconds"
metricRequestTotal = "traces_service_graph_request_total"
metricRequestFailedTotal = "traces_service_graph_request_failed_total"
metricRequestServerSeconds = "traces_service_graph_request_server_seconds"
metricRequestClientSeconds = "traces_service_graph_request_client_seconds"
metricRequestMessagingSystemSeconds = "traces_service_graph_request_messaging_system_seconds"
)

const virtualNodeLabel = "virtual_node"
@@ -75,10 +76,11 @@ type Processor struct {

closeCh chan struct{}

serviceGraphRequestTotal registry.Counter
serviceGraphRequestFailedTotal registry.Counter
serviceGraphRequestServerSecondsHistogram registry.Histogram
serviceGraphRequestClientSecondsHistogram registry.Histogram
serviceGraphRequestTotal registry.Counter
serviceGraphRequestFailedTotal registry.Counter
serviceGraphRequestServerSecondsHistogram registry.Histogram
serviceGraphRequestClientSecondsHistogram registry.Histogram
serviceGraphRequestMessagingSystemSecondsHistogram registry.Histogram

metricDroppedSpans prometheus.Counter
metricTotalEdges prometheus.Counter
@@ -114,10 +116,11 @@ func New(cfg Config, tenant string, registry registry.Registry, logger log.Logge
labels: labels,
closeCh: make(chan struct{}, 1),

serviceGraphRequestTotal: registry.NewCounter(metricRequestTotal),
serviceGraphRequestFailedTotal: registry.NewCounter(metricRequestFailedTotal),
serviceGraphRequestServerSecondsHistogram: registry.NewHistogram(metricRequestServerSeconds, cfg.HistogramBuckets),
serviceGraphRequestClientSecondsHistogram: registry.NewHistogram(metricRequestClientSeconds, cfg.HistogramBuckets),
serviceGraphRequestTotal: registry.NewCounter(metricRequestTotal),
serviceGraphRequestFailedTotal: registry.NewCounter(metricRequestFailedTotal),
serviceGraphRequestServerSecondsHistogram: registry.NewHistogram(metricRequestServerSeconds, cfg.HistogramBuckets),
serviceGraphRequestClientSecondsHistogram: registry.NewHistogram(metricRequestClientSeconds, cfg.HistogramBuckets),
serviceGraphRequestMessagingSystemSecondsHistogram: registry.NewHistogram(metricRequestMessagingSystemSeconds, cfg.HistogramBuckets),

metricDroppedSpans: metricDroppedSpans.WithLabelValues(tenant),
metricTotalEdges: metricTotalEdges.WithLabelValues(tenant),
@@ -192,6 +195,7 @@ func (p *Processor) consume(resourceSpans []*v1_trace.ResourceSpans) (err error)
e.ConnectionType = connectionType
e.ClientService = svcName
e.ClientLatencySec = spanDurationSec(span)
e.ClientEndTimeUnixNano = span.EndTimeUnixNano
e.Failed = e.Failed || p.spanFailed(span)
p.upsertDimensions("client_", e.Dimensions, rs.Resource.Attributes, span.Attributes)
e.SpanMultiplier = spanMultiplier
@@ -217,6 +221,7 @@ func (p *Processor) consume(resourceSpans []*v1_trace.ResourceSpans) (err error)
e.ConnectionType = connectionType
e.ServerService = svcName
e.ServerLatencySec = spanDurationSec(span)
e.ServerStartTimeUnixNano = span.StartTimeUnixNano
e.Failed = e.Failed || p.spanFailed(span)
p.upsertDimensions("server_", e.Dimensions, rs.Resource.Attributes, span.Attributes)
e.SpanMultiplier = spanMultiplier
@@ -309,6 +314,15 @@ func (p *Processor) onComplete(e *store.Edge) {

p.serviceGraphRequestServerSecondsHistogram.ObserveWithExemplar(registryLabelValues, e.ServerLatencySec, e.TraceID, e.SpanMultiplier)
p.serviceGraphRequestClientSecondsHistogram.ObserveWithExemplar(registryLabelValues, e.ClientLatencySec, e.TraceID, e.SpanMultiplier)

if p.Cfg.EnableMessagingSystemLatencyHistogram && e.ConnectionType == store.MessagingSystem {
messagingSystemLatencySec := unixNanosDiffSec(e.ClientEndTimeUnixNano, e.ServerStartTimeUnixNano)
if messagingSystemLatencySec == 0 {
level.Warn(p.logger).Log("msg", "producerSpanEndTime must be smaller than consumerSpanStartTime. maybe the peers clocks are not synced", "messagingSystemLatencySec", messagingSystemLatencySec, "traceID", e.TraceID)
} else {
p.serviceGraphRequestMessagingSystemSecondsHistogram.ObserveWithExemplar(registryLabelValues, messagingSystemLatencySec, e.TraceID, e.SpanMultiplier)
}
}
}

func (p *Processor) onExpire(e *store.Edge) {
@@ -348,8 +362,17 @@ func (p *Processor) spanFailed(span *v1_trace.Span) bool {
return span.GetStatus().GetCode() == v1_trace.Status_STATUS_CODE_ERROR
}

func unixNanosDiffSec(unixNanoStart uint64, unixNanoEnd uint64) float64 {
if unixNanoStart > unixNanoEnd {
// To prevent underflow, return 0.
return 0
}
// Safe subtraction.
return float64(unixNanoEnd-unixNanoStart) / float64(time.Second)
}

func spanDurationSec(span *v1_trace.Span) float64 {
return float64(span.EndTimeUnixNano-span.StartTimeUnixNano) / float64(time.Second.Nanoseconds())
return unixNanosDiffSec(span.StartTimeUnixNano, span.EndTimeUnixNano)
}

func buildKey(k1, k2 string) string {
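As a standalone illustration (not part of the commit), a minimal Go sketch of the new helper's clamping behavior: a negative difference, for example from unsynced clocks between producer and consumer, is reported as 0.

```go
package main

import (
	"fmt"
	"time"
)

// unixNanosDiffSec mirrors the helper above: it returns the difference in
// seconds between two unix-nanosecond timestamps, clamping to 0 when the
// end timestamp precedes the start timestamp.
func unixNanosDiffSec(unixNanoStart, unixNanoEnd uint64) float64 {
	if unixNanoStart > unixNanoEnd {
		return 0
	}
	return float64(unixNanoEnd-unixNanoStart) / float64(time.Second)
}

func main() {
	fmt.Println(unixNanosDiffSec(1_000_000_000, 3_500_000_000)) // 2.5
	fmt.Println(unixNanosDiffSec(3_500_000_000, 1_000_000_000)) // 0 (clock skew)
}
```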