Skip to content

Commit 17a7fe3

Browse files
committed
feat: add ability to include reason in count metrics
Adds a configuration flag to enable including the `reason` for a TaskRun or PipelineRun status on their count metrics. This allows for more fine-grained monitoring and alerting of run failures. Signed-off-by: Marcus Noble <github@marcusnoble.co.uk>
1 parent 445734d commit 17a7fe3

File tree

9 files changed

+222
-19
lines changed

9 files changed

+222
-19
lines changed

config/config-observability.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,3 +58,4 @@ data:
5858
metrics.taskrun.duration-type: "histogram"
5959
metrics.pipelinerun.level: "pipeline"
6060
metrics.pipelinerun.duration-type: "histogram"
61+
metrics.count.reason: "false"

docs/metrics.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@ We expose several kinds of exporters, including Prometheus, Google Stackdriver,
1515
| ---------- | ----------- | ----------- | ----------- |
1616
| `tekton_pipelines_controller_pipelinerun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `*pipeline`=&lt;pipeline_name&gt; <br> `*pipelinerun`=&lt;pipelinerun_name&gt; <br> `status`=&lt;status&gt; <br> `namespace`=&lt;pipelinerun-namespace&gt; | experimental |
1717
| `tekton_pipelines_controller_pipelinerun_taskrun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `*pipeline`=&lt;pipeline_name&gt; <br> `*pipelinerun`=&lt;pipelinerun_name&gt; <br> `status`=&lt;status&gt; <br> `*task`=&lt;task_name&gt; <br> `*taskrun`=&lt;taskrun_name&gt;<br> `namespace`=&lt;pipelineruns-taskruns-namespace&gt;| experimental |
18-
| `tekton_pipelines_controller_pipelinerun_count` | Counter | `status`=&lt;status&gt; | experimental |
18+
| `tekton_pipelines_controller_pipelinerun_count` | Counter | `status`=&lt;status&gt; <br> `*reason`=&lt;reason&gt; | experimental |
1919
| `tekton_pipelines_controller_running_pipelineruns_count` | Gauge | | experimental |
2020
| `tekton_pipelines_controller_taskrun_duration_seconds_[bucket, sum, count]` | Histogram/LastValue(Gauge) | `status`=&lt;status&gt; <br> `*task`=&lt;task_name&gt; <br> `*taskrun`=&lt;taskrun_name&gt;<br> `namespace`=&lt;pipelineruns-taskruns-namespace&gt; | experimental |
21-
| `tekton_pipelines_controller_taskrun_count` | Counter | `status`=&lt;status&gt; | experimental |
21+
| `tekton_pipelines_controller_taskrun_count` | Counter | `status`=&lt;status&gt; <br> `*reason`=&lt;reason&gt; | experimental |
2222
| `tekton_pipelines_controller_running_taskruns_count` | Gauge | | experimental |
2323
| `tekton_pipelines_controller_running_taskruns_throttled_by_quota_count` | Gauge | | experimental |
2424
| `tekton_pipelines_controller_running_taskruns_throttled_by_node_count` | Gauge | | experimental |
@@ -37,6 +37,7 @@ A sample config-map has been provided as [config-observability](./../config/conf
3737
metrics.taskrun.duration-type: "histogram"
3838
metrics.pipelinerun.level: "pipeline"
3939
metrics.pipelinerun.duration-type: "histogram"
40+
metrics.count.reason: "false"
4041
```
4142
4243
The following values are available in the configmap:
@@ -53,6 +54,7 @@ Following values are available in the configmap:
5354
| metrics.taskrun.duration-type | `lastvalue` | `tekton_pipelines_controller_pipelinerun_taskrun_duration_seconds` and `tekton_pipelines_controller_taskrun_duration_seconds` are of type gauge or lastvalue |
5455
| metrics.pipelinerun.duration-type | `histogram` | `tekton_pipelines_controller_pipelinerun_duration_seconds` is of type histogram |
5556
| metrics.pipelinerun.duration-type | `lastvalue` | `tekton_pipelines_controller_pipelinerun_duration_seconds` is of type gauge or lastvalue |
57+
| metrics.count.reason | `false` | Controls whether the `reason` label is included on the count metrics; the label is omitted when this is set to `false`, and any other value enables it |
5658

5759
Histogram values aren't available when pipelinerun- or taskrun-level labels are selected; a LastValue (gauge) is provided instead. A histogram would serve no purpose in that case because it would contain only a single bar. TaskRun- and PipelineRun-level metrics aren't recommended because they lead to unbounded cardinality, which degrades the observability database.
5860

pkg/apis/config/metrics.go

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,9 @@ const (
3636
// metrics to use for aggregating duration for pipelinerun
3737
metricsDurationPipelinerunType = "metrics.pipelinerun.duration-type"
3838

39+
// countWithReasonKey is the configmap key that controls whether the reason label is included on count metrics
40+
countWithReasonKey = "metrics.count.reason"
41+
3942
// DefaultTaskrunLevel determines to what level to aggregate metrics
4043
// when it isn't specified in configmap
4144
DefaultTaskrunLevel = TaskrunLevelAtTask
@@ -92,6 +95,7 @@ type Metrics struct {
9295
PipelinerunLevel string
9396
DurationTaskrunType string
9497
DurationPipelinerunType string
98+
CountWithReason bool
9599
}
96100

97101
// GetMetricsConfigName returns the name of the configmap containing all
@@ -113,7 +117,8 @@ func (cfg *Metrics) Equals(other *Metrics) bool {
113117
return other.TaskrunLevel == cfg.TaskrunLevel &&
114118
other.PipelinerunLevel == cfg.PipelinerunLevel &&
115119
other.DurationTaskrunType == cfg.DurationTaskrunType &&
116-
other.DurationPipelinerunType == cfg.DurationPipelinerunType
120+
other.DurationPipelinerunType == cfg.DurationPipelinerunType &&
121+
other.CountWithReason == cfg.CountWithReason
117122
}
118123

119124
// newMetricsFromMap returns a Config given a map corresponding to a ConfigMap
@@ -123,6 +128,7 @@ func newMetricsFromMap(cfgMap map[string]string) (*Metrics, error) {
123128
PipelinerunLevel: DefaultPipelinerunLevel,
124129
DurationTaskrunType: DefaultDurationTaskrunType,
125130
DurationPipelinerunType: DefaultDurationPipelinerunType,
131+
CountWithReason: false,
126132
}
127133

128134
if taskrunLevel, ok := cfgMap[metricsTaskrunLevelKey]; ok {
@@ -138,6 +144,11 @@ func newMetricsFromMap(cfgMap map[string]string) (*Metrics, error) {
138144
if durationPipelinerun, ok := cfgMap[metricsDurationPipelinerunType]; ok {
139145
tc.DurationPipelinerunType = durationPipelinerun
140146
}
147+
148+
if countWithReason, ok := cfgMap[countWithReasonKey]; ok && countWithReason != "false" {
149+
tc.CountWithReason = true
150+
}
151+
141152
return &tc, nil
142153
}
143154

pkg/apis/config/metrics_test.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ func TestNewMetricsFromConfigMap(t *testing.T) {
3838
PipelinerunLevel: config.PipelinerunLevelAtPipelinerun,
3939
DurationTaskrunType: config.DurationPipelinerunTypeHistogram,
4040
DurationPipelinerunType: config.DurationPipelinerunTypeHistogram,
41+
CountWithReason: false,
4142
},
4243
fileName: config.GetMetricsConfigName(),
4344
},
@@ -47,9 +48,20 @@ func TestNewMetricsFromConfigMap(t *testing.T) {
4748
PipelinerunLevel: config.PipelinerunLevelAtNS,
4849
DurationTaskrunType: config.DurationTaskrunTypeHistogram,
4950
DurationPipelinerunType: config.DurationPipelinerunTypeLastValue,
51+
CountWithReason: false,
5052
},
5153
fileName: "config-observability-namespacelevel",
5254
},
55+
{
56+
expectedConfig: &config.Metrics{
57+
TaskrunLevel: config.TaskrunLevelAtNS,
58+
PipelinerunLevel: config.PipelinerunLevelAtNS,
59+
DurationTaskrunType: config.DurationTaskrunTypeHistogram,
60+
DurationPipelinerunType: config.DurationPipelinerunTypeLastValue,
61+
CountWithReason: true,
62+
},
63+
fileName: "config-observability-reason",
64+
},
5365
}
5466

5567
for _, tc := range testCases {
@@ -64,6 +76,7 @@ func TestNewMetricsFromEmptyConfigMap(t *testing.T) {
6476
PipelinerunLevel: config.PipelinerunLevelAtPipeline,
6577
DurationTaskrunType: config.DurationPipelinerunTypeHistogram,
6678
DurationPipelinerunType: config.DurationPipelinerunTypeHistogram,
79+
CountWithReason: false,
6780
}
6881
verifyConfigFileWithExpectedMetricsConfig(t, MetricsConfigEmptyName, expectedConfig)
6982
}
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# Copyright 2019 The Tekton Authors
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# https://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
apiVersion: v1
16+
kind: ConfigMap
17+
metadata:
18+
name: config-observability
19+
namespace: tekton-pipelines
20+
labels:
21+
app.kubernetes.io/instance: default
22+
app.kubernetes.io/part-of: tekton-pipelines
23+
data:
24+
metrics.backend-destination: prometheus
25+
metrics.stackdriver-project-id: "<your stackdriver project id>"
26+
metrics.allow-stackdriver-custom-metrics: "false"
27+
metrics.taskrun.level: "namespace"
28+
metrics.taskrun.duration-type: "histogram"
29+
metrics.pipelinerun.level: "namespace"
30+
metrics.pipelinerun.duration-type: "lastvalue"
31+
metrics.count.reason: "true"

pkg/pipelinerunmetrics/metrics.go

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ var (
4343
pipelineTag = tag.MustNewKey("pipeline")
4444
namespaceTag = tag.MustNewKey("namespace")
4545
statusTag = tag.MustNewKey("status")
46+
reasonTag = tag.MustNewKey("reason")
4647

4748
prDuration = stats.Float64(
4849
"pipelinerun_duration_seconds",
@@ -149,11 +150,15 @@ func viewRegister(cfg *config.Metrics) error {
149150
TagKeys: append([]tag.Key{statusTag, namespaceTag}, prunTag...),
150151
}
151152

153+
prCountViewTags := []tag.Key{statusTag}
154+
if cfg.CountWithReason {
155+
prCountViewTags = append(prCountViewTags, reasonTag)
156+
}
152157
prCountView = &view.View{
153158
Description: prCount.Description(),
154159
Measure: prCount,
155160
Aggregation: view.Count(),
156-
TagKeys: []tag.Key{statusTag},
161+
TagKeys: prCountViewTags,
157162
}
158163
runningPRsCountView = &view.View{
159164
Description: runningPRsCount.Description(),
@@ -230,13 +235,15 @@ func (r *Recorder) DurationAndCount(pr *v1.PipelineRun, beforeCondition *apis.Co
230235
}
231236
}
232237

238+
cond := pr.Status.GetCondition(apis.ConditionSucceeded)
233239
status := "success"
234-
if cond := pr.Status.GetCondition(apis.ConditionSucceeded); cond.Status == corev1.ConditionFalse {
240+
if cond.Status == corev1.ConditionFalse {
235241
status = "failed"
236242
if cond.Reason == ReasonCancelled {
237243
status = "cancelled"
238244
}
239245
}
246+
reason := cond.Reason
240247

241248
pipelineName := "anonymous"
242249
if pr.Spec.PipelineRef != nil && pr.Spec.PipelineRef.Name != "" {
@@ -245,7 +252,7 @@ func (r *Recorder) DurationAndCount(pr *v1.PipelineRun, beforeCondition *apis.Co
245252
ctx, err := tag.New(
246253
context.Background(),
247254
append([]tag.Mutator{tag.Insert(namespaceTag, pr.Namespace),
248-
tag.Insert(statusTag, status)}, r.insertTag(pipelineName, pr.Name)...)...)
255+
tag.Insert(statusTag, status), tag.Insert(reasonTag, reason)}, r.insertTag(pipelineName, pr.Name)...)...)
249256
if err != nil {
250257
return err
251258
}

pkg/pipelinerunmetrics/metrics_test.go

Lines changed: 82 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,14 +42,15 @@ var (
4242
completionTime = metav1.NewTime(startTime.Time.Add(time.Minute))
4343
)
4444

45-
func getConfigContext() context.Context {
45+
func getConfigContext(countWithReason bool) context.Context {
4646
ctx := context.Background()
4747
cfg := &config.Config{
4848
Metrics: &config.Metrics{
4949
TaskrunLevel: config.TaskrunLevelAtTaskrun,
5050
PipelinerunLevel: config.PipelinerunLevelAtPipelinerun,
5151
DurationTaskrunType: config.DefaultDurationTaskrunType,
5252
DurationPipelinerunType: config.DefaultDurationPipelinerunType,
53+
CountWithReason: countWithReason,
5354
},
5455
}
5556
return config.ToContext(ctx, cfg)
@@ -71,7 +72,7 @@ func TestMetricsOnStore(t *testing.T) {
7172
defer log.Sync()
7273
logger := log.Sugar()
7374

74-
ctx := getConfigContext()
75+
ctx := getConfigContext(false)
7576
metrics, err := NewRecorder(ctx)
7677
if err != nil {
7778
t.Fatalf("NewRecorder: %v", err)
@@ -117,6 +118,7 @@ func TestRecordPipelineRunDurationCount(t *testing.T) {
117118
expectedDuration float64
118119
expectedCount int64
119120
beforeCondition *apis.Condition
121+
countWithReason bool
120122
}{{
121123
name: "for succeeded pipeline",
122124
pipelineRun: &v1.PipelineRun{
@@ -149,6 +151,7 @@ func TestRecordPipelineRunDurationCount(t *testing.T) {
149151
expectedDuration: 60,
150152
expectedCount: 1,
151153
beforeCondition: nil,
154+
countWithReason: false,
152155
}, {
153156
name: "for succeeded pipeline different condition",
154157
pipelineRun: &v1.PipelineRun{
@@ -184,6 +187,7 @@ func TestRecordPipelineRunDurationCount(t *testing.T) {
184187
Type: apis.ConditionReady,
185188
Status: corev1.ConditionUnknown,
186189
},
190+
countWithReason: false,
187191
}, {
188192
name: "for succeeded pipeline recount",
189193
pipelineRun: &v1.PipelineRun{
@@ -212,6 +216,7 @@ func TestRecordPipelineRunDurationCount(t *testing.T) {
212216
Type: apis.ConditionSucceeded,
213217
Status: corev1.ConditionTrue,
214218
},
219+
countWithReason: false,
215220
}, {
216221
name: "for cancelled pipeline",
217222
pipelineRun: &v1.PipelineRun{
@@ -245,6 +250,7 @@ func TestRecordPipelineRunDurationCount(t *testing.T) {
245250
expectedDuration: 60,
246251
expectedCount: 1,
247252
beforeCondition: nil,
253+
countWithReason: false,
248254
}, {
249255
name: "for failed pipeline",
250256
pipelineRun: &v1.PipelineRun{
@@ -277,6 +283,7 @@ func TestRecordPipelineRunDurationCount(t *testing.T) {
277283
expectedDuration: 60,
278284
expectedCount: 1,
279285
beforeCondition: nil,
286+
countWithReason: false,
280287
}, {
281288
name: "for pipeline without start or completion time",
282289
pipelineRun: &v1.PipelineRun{
@@ -306,11 +313,82 @@ func TestRecordPipelineRunDurationCount(t *testing.T) {
306313
expectedDuration: 0,
307314
expectedCount: 1,
308315
beforeCondition: nil,
316+
countWithReason: false,
317+
}, {
318+
name: "for failed pipeline with reason",
319+
pipelineRun: &v1.PipelineRun{
320+
ObjectMeta: metav1.ObjectMeta{Name: "pipelinerun-1", Namespace: "ns"},
321+
Spec: v1.PipelineRunSpec{
322+
PipelineRef: &v1.PipelineRef{Name: "pipeline-1"},
323+
},
324+
Status: v1.PipelineRunStatus{
325+
Status: duckv1.Status{
326+
Conditions: duckv1.Conditions{{
327+
Type: apis.ConditionSucceeded,
328+
Status: corev1.ConditionFalse,
329+
Reason: "Failed",
330+
}},
331+
},
332+
PipelineRunStatusFields: v1.PipelineRunStatusFields{
333+
StartTime: &startTime,
334+
CompletionTime: &completionTime,
335+
},
336+
},
337+
},
338+
expectedDurationTags: map[string]string{
339+
"pipeline": "pipeline-1",
340+
"pipelinerun": "pipelinerun-1",
341+
"namespace": "ns",
342+
"status": "failed",
343+
},
344+
expectedCountTags: map[string]string{
345+
"status": "failed",
346+
"reason": "Failed",
347+
},
348+
expectedDuration: 60,
349+
expectedCount: 1,
350+
beforeCondition: nil,
351+
countWithReason: true,
352+
}, {
353+
name: "for cancelled pipeline with reason",
354+
pipelineRun: &v1.PipelineRun{
355+
ObjectMeta: metav1.ObjectMeta{Name: "pipelinerun-1", Namespace: "ns"},
356+
Spec: v1.PipelineRunSpec{
357+
PipelineRef: &v1.PipelineRef{Name: "pipeline-1"},
358+
},
359+
Status: v1.PipelineRunStatus{
360+
Status: duckv1.Status{
361+
Conditions: duckv1.Conditions{{
362+
Type: apis.ConditionSucceeded,
363+
Status: corev1.ConditionFalse,
364+
Reason: ReasonCancelled,
365+
}},
366+
},
367+
PipelineRunStatusFields: v1.PipelineRunStatusFields{
368+
StartTime: &startTime,
369+
CompletionTime: &completionTime,
370+
},
371+
},
372+
},
373+
expectedDurationTags: map[string]string{
374+
"pipeline": "pipeline-1",
375+
"pipelinerun": "pipelinerun-1",
376+
"namespace": "ns",
377+
"status": "cancelled",
378+
},
379+
expectedCountTags: map[string]string{
380+
"status": "cancelled",
381+
"reason": ReasonCancelled,
382+
},
383+
expectedDuration: 60,
384+
expectedCount: 1,
385+
beforeCondition: nil,
386+
countWithReason: true,
309387
}} {
310388
t.Run(test.name, func(t *testing.T) {
311389
unregisterMetrics()
312390

313-
ctx := getConfigContext()
391+
ctx := getConfigContext(test.countWithReason)
314392
metrics, err := NewRecorder(ctx)
315393
if err != nil {
316394
t.Fatalf("NewRecorder: %v", err)
@@ -363,7 +441,7 @@ func TestRecordRunningPipelineRunsCount(t *testing.T) {
363441
}
364442
}
365443

366-
ctx = getConfigContext()
444+
ctx = getConfigContext(false)
367445
metrics, err := NewRecorder(ctx)
368446
if err != nil {
369447
t.Fatalf("NewRecorder: %v", err)

0 commit comments

Comments
 (0)