[processor/tailsamplingprocessor] Improve accuracy of `count_traces_s…

…ampled` metric (open-telemetry#26724) **Description:** Previously, the calculation of `count_traces_sampled` was incorrect, resulting in all per-policy metric counts being the same. ``` count_traces_sampled{policy="policy-a", sampled="false"} 50 count_traces_sampled{policy="policy-a", sampled="true"} 50 count_traces_sampled{policy="policy-b", sampled="false"} 50 count_traces_sampled{policy="policy-b", sampled="true"} 50 ``` This issue stemmed from the metrics being incremented based on the final decision of trace sampling, rather than being attributed to the policies causing the sampling. With the fix implemented, the total sampling count can no longer be inferred from the metrics alone. To address this, a new metric, `global_count_traces_sampled`, was introduced to accurately capture the global count of sampled traces. ``` count_traces_sampled{policy="policy-a", sampled="false"} 50 count_traces_sampled{policy="policy-a", sampled="true"} 50 count_traces_sampled{policy="policy-b", sampled="false"} 24 count_traces_sampled{policy="policy-b", sampled="true"} 76 global_count_traces_sampled{sampled="false"} 24 global_count_traces_sampled{sampled="true"} 76 ``` Reusing the `count_traces_sampled` policy metric for this purpose would have either meant: 1. Leaving the policy field empty, which didn't feel very explicit. 2. Setting a global placeholder policy name (such as `final-decision`), which could then potentially collide with existing user-configured policy names without validation. **Link to tracking Issue:** Fixes open-telemetry#25882 **Testing:** Tested with various combinations of `probabilistic`, `latency`, `status_code` and attribute policies (including combinations of `and` and `composite` policies). No unit tests were added, open to suggestions for adding tests for metrics but I couldn't find any examples of it being done elsewhere. **Documentation:** No documentation changes.
jmsnll · Sep 27, 2023 · 66cbd50 · 66cbd50
1 parent 85eb91d
commit 66cbd50
Show file tree

Hide file tree

Showing 3 changed files with 55 additions and 3 deletions.
diff --git a/.chloggen/fix_tailsampling-metric-incorrect-count.yaml b/.chloggen/fix_tailsampling-metric-incorrect-count.yaml
@@ -0,0 +1,27 @@
+# Use this changelog template to create an entry for release notes.
+
+# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
+change_type: 'breaking'
+
+# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
+component: bug_fix
+
+# A brief description of the change.  Surround your text with quotes ("") if it needs to start with a backtick (`).
+note: Improve counting for the `count_traces_sampled` metric
+
+# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
+issues: [25882]
+
+# (Optional) One or more lines of additional information to render under the primary note.
+# These lines will be padded with 2 spaces and then inserted directly into the document.
+# Use pipe (|) for multiline entries.
+subtext:
+
+# If your change doesn't affect end users or the exported elements of any package,
+# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
+# Optional: The change log or logs in which this entry should be included.
+# e.g. '[user]' or '[user, api]'
+# Include 'user' if the change is relevant to end users.
+# Include 'api' if there is a change to a library API.
+# Default: '[user]'
+change_logs: [user]
diff --git a/processor/tailsamplingprocessor/metrics.go b/processor/tailsamplingprocessor/metrics.go
@@ -27,7 +27,8 @@ var (
 
 	statPolicyEvaluationErrorCount = stats.Int64("sampling_policy_evaluation_error", "Count of sampling policy evaluation errors", stats.UnitDimensionless)
 
-	statCountTracesSampled = stats.Int64("count_traces_sampled", "Count of traces that were sampled or not", stats.UnitDimensionless)
+	statCountTracesSampled       = stats.Int64("count_traces_sampled", "Count of traces that were sampled or not per sampling policy", stats.UnitDimensionless)
+	statCountGlobalTracesSampled = stats.Int64("global_count_traces_sampled", "Global count of traces that were sampled or not by at least one policy", stats.UnitDimensionless)
 
 	statDroppedTooEarlyCount    = stats.Int64("sampling_trace_dropped_too_early", "Count of traces that needed to be dropped the configured wait time", stats.UnitDimensionless)
 	statNewTraceIDReceivedCount = stats.Int64("new_trace_id_received", "Counts the arrival of new traces", stats.UnitDimensionless)
@@ -88,6 +89,14 @@ func samplingProcessorMetricViews(level configtelemetry.Level) []*view.View {
 		Aggregation: view.Sum(),
 	}
 
+	countGlobalTracesSampledView := &view.View{
+		Name:        obsreport.BuildProcessorCustomMetricName(metadata.Type, statCountGlobalTracesSampled.Name()),
+		Measure:     statCountGlobalTracesSampled,
+		Description: statCountGlobalTracesSampled.Description(),
+		TagKeys:     []tag.Key{tagSampledKey},
+		Aggregation: view.Sum(),
+	}
+
 	countTraceDroppedTooEarlyView := &view.View{
 		Name:        processorhelper.BuildCustomMetricName(metadata.Type, statDroppedTooEarlyCount.Name()),
 		Measure:     statDroppedTooEarlyCount,
@@ -117,6 +126,7 @@ func samplingProcessorMetricViews(level configtelemetry.Level) []*view.View {
 		countPolicyEvaluationErrorView,
 
 		countTracesSampledView,
+		countGlobalTracesSampledView,
 
 		countTraceDroppedTooEarlyView,
 		countTraceIDArrivalView,

diff --git a/processor/tailsamplingprocessor/processor.go b/processor/tailsamplingprocessor/processor.go
@@ -279,8 +279,8 @@ func (tsp *tailSamplingSpanProcessor) makeDecision(id pcommon.TraceID, trace *sa
 		finalDecision = sampling.Sampled
 	}
 
-	for _, p := range tsp.policies {
-		switch finalDecision {
+	for i, p := range tsp.policies {
+		switch trace.Decisions[i] {
 		case sampling.Sampled:
 			// any single policy that decides to sample will cause the decision to be sampled
 			// the nextConsumer will get the context from the first matching policy
@@ -305,6 +305,21 @@ func (tsp *tailSamplingSpanProcessor) makeDecision(id pcommon.TraceID, trace *sa
 		}
 	}
 
+	switch finalDecision {
+	case sampling.Sampled:
+		_ = stats.RecordWithTags(
+			tsp.ctx,
+			[]tag.Mutator{tag.Upsert(tagSampledKey, "true")},
+			statCountGlobalTracesSampled.M(int64(1)),
+		)
+	case sampling.NotSampled:
+		_ = stats.RecordWithTags(
+			tsp.ctx,
+			[]tag.Mutator{tag.Upsert(tagSampledKey, "false")},
+			statCountGlobalTracesSampled.M(int64(1)),
+		)
+	}
+
 	return finalDecision, matchingPolicy
 }