Commit 36d88c0

Adding user label to metrics to be able to track these metrics at a user level (#5312)
Signed-off-by: Anand Rajagopal <anrajag@amazon.com>
Parent: 1b6968f · Commit: 36d88c0

File tree: 3 files changed, +126 −14 lines


CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -1,6 +1,7 @@
 # Changelog
 
 ## master / unreleased
+* [CHANGE] Ruler: Added user label to `cortex_ruler_write_requests_total`, `cortex_ruler_write_requests_failed_total`, `cortex_ruler_queries_total`, and `cortex_ruler_queries_failed_total` metrics. #5312
 * [CHANGE] Alertmanager: Validating new fields on the PagerDuty AM config. #5290
 * [CHANGE] Ingester: Creating label `native-histogram-sample` on the `cortex_discarded_samples_total` to keep track of discarded native histogram samples. #5289
 * [FEATURE] Store Gateway: Add `max_downloaded_bytes_per_request` to limit max bytes to download per store gateway request.
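With the new `user` label, these four ruler counters can be sliced per tenant; for instance, an operator could chart per-tenant failure rates with a PromQL expression along the lines of `sum by (user) (rate(cortex_ruler_queries_failed_total[5m]))` (an illustrative query, not part of this change).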

integration/ruler_test.go

Lines changed: 112 additions & 6 deletions
@@ -17,6 +17,8 @@ import (
 	"testing"
 	"time"
 
+	"github.com/cortexproject/cortex/pkg/storage/tsdb"
+
 	"github.com/prometheus/common/model"
 	"github.com/prometheus/prometheus/model/labels"
 	"github.com/prometheus/prometheus/model/rulefmt"
@@ -576,8 +578,8 @@ func TestRulerMetricsForInvalidQueries(t *testing.T) {
 		require.Equal(t, 200, res.StatusCode)
 	}
 
-	totalQueries, err := ruler.SumMetrics([]string{"cortex_ruler_queries_total"})
-	require.NoError(t, err)
+	matcher := labels.MustNewMatcher(labels.MatchEqual, "user", user)
+	var totalQueries = []float64{0}
 
 	// Verify that user-failures don't increase cortex_ruler_queries_failed_total
 	for groupName, expression := range map[string]string{
@@ -601,7 +603,7 @@ func TestRulerMetricsForInvalidQueries(t *testing.T) {
 		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(1), []string{"cortex_prometheus_rule_evaluation_failures_total"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
 
 		// But these failures were not reported as "failed queries"
-		sum, err := ruler.SumMetrics([]string{"cortex_ruler_queries_failed_total"})
+		sum, err := ruler.SumMetrics([]string{"cortex_ruler_queries_failed_total"}, e2e.WithLabelMatchers(matcher))
 		require.NoError(t, err)
 		require.Equal(t, float64(0), sum[0])
 
@@ -612,7 +614,7 @@ func TestRulerMetricsForInvalidQueries(t *testing.T) {
 		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(0), []string{"cortex_prometheus_rule_group_rules"}, e2e.SkipMissingMetrics))
 
 		// Check that cortex_ruler_queries_total went up since last test.
-		newTotalQueries, err := ruler.SumMetrics([]string{"cortex_ruler_queries_total"})
+		newTotalQueries, err := ruler.SumMetrics([]string{"cortex_ruler_queries_total"}, e2e.WithLabelMatchers(matcher))
 		require.NoError(t, err)
 		require.Greater(t, newTotalQueries[0], totalQueries[0])
 
@@ -637,15 +639,119 @@ func TestRulerMetricsForInvalidQueries(t *testing.T) {
 		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(0), []string{"cortex_prometheus_rule_evaluation_failures_total"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
 
 		// Still no failures.
-		sum, err := ruler.SumMetrics([]string{"cortex_ruler_queries_failed_total"})
+		sum, err := ruler.SumMetrics([]string{"cortex_ruler_queries_failed_total"}, e2e.WithLabelMatchers(matcher))
 		require.NoError(t, err)
 		require.Equal(t, float64(0), sum[0])
 
 		// Now let's stop ingester, and recheck metrics. This should increase cortex_ruler_queries_failed_total failures.
 		require.NoError(t, s.Stop(ingester))
 
 		// We should start getting "real" failures now.
-		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(1), []string{"cortex_ruler_queries_failed_total"}))
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(1), []string{"cortex_ruler_queries_failed_total"}, e2e.WithLabelMatchers(matcher)))
+	})
+}
+
+func TestRulerMetricsWhenIngesterFails(t *testing.T) {
+	s, err := e2e.NewScenario(networkName)
+	require.NoError(t, err)
+	defer s.Close()
+
+	// Start dependencies.
+	consul := e2edb.NewConsul()
+	minio := e2edb.NewMinio(9000, bucketName, rulestoreBucketName)
+	require.NoError(t, s.StartAndWaitReady(consul, minio))
+
+	const blockRangePeriod = 2 * time.Second
+	// Configure the ruler.
+	flags := mergeFlags(
+		BlocksStorageFlags(),
+		RulerFlags(),
+		map[string]string{
+			"-blocks-storage.tsdb.block-ranges-period":         blockRangePeriod.String(),
+			"-blocks-storage.tsdb.ship-interval":               "1s",
+			"-blocks-storage.bucket-store.sync-interval":       "1s",
+			"-blocks-storage.bucket-store.index-cache.backend": tsdb.IndexCacheBackendInMemory,
+			"-blocks-storage.tsdb.retention-period":            ((blockRangePeriod * 2) - 1).String(),
+
+			// Enable the bucket index so we can skip the initial bucket scan.
+			"-blocks-storage.bucket-store.bucket-index.enabled": "false",
+			// Evaluate rules often, so that we don't need to wait for metrics to show up.
+			"-ruler.evaluation-interval": "2s",
+			"-ruler.poll-interval":       "2s",
+			// No delay
+			"-ruler.evaluation-delay-duration": "0",
+
+			// We run single ingester only, no replication.
+			"-distributor.replication-factor": "1",
+
+			// Very low limit so that ruler hits it.
+			"-querier.max-fetched-chunks-per-query": "15",
+			"-querier.query-store-after":            (1 * time.Second).String(),
+			"-querier.query-ingesters-within":       (2 * time.Second).String(),
+		},
+	)
+
+	const namespace = "test"
+	const user = "user"
+
+	storeGateway := e2ecortex.NewStoreGateway("store-gateway-1", e2ecortex.RingStoreConsul, consul.NetworkHTTPEndpoint(), flags, "")
+
+	flags = mergeFlags(flags, map[string]string{
+		"-querier.store-gateway-addresses": storeGateway.NetworkGRPCEndpoint(),
+	})
+
+	distributor := e2ecortex.NewDistributor("distributor", e2ecortex.RingStoreConsul, consul.NetworkHTTPEndpoint(), flags, "")
+	ruler := e2ecortex.NewRuler("ruler", consul.NetworkHTTPEndpoint(), flags, "")
+	ingester := e2ecortex.NewIngester("ingester", e2ecortex.RingStoreConsul, consul.NetworkHTTPEndpoint(), flags, "")
+	require.NoError(t, s.StartAndWaitReady(distributor, ingester, ruler, storeGateway))
+
+	// Wait until both the distributor and ruler have updated the ring. The querier will also watch
+	// the store-gateway ring if blocks sharding is enabled.
+	require.NoError(t, distributor.WaitSumMetrics(e2e.Equals(512), "cortex_ring_tokens_total"))
+	require.NoError(t, ruler.WaitSumMetrics(e2e.Equals(1024), "cortex_ring_tokens_total"))
+
+	c, err := e2ecortex.NewClient(distributor.HTTPEndpoint(), "", "", ruler.HTTPEndpoint(), user)
+	require.NoError(t, err)
+
+	matcher := labels.MustNewMatcher(labels.MatchEqual, "user", user)
+	expression := "absent(sum_over_time(metric{}[2s] offset 1h))"
+
+	// Now let's upload a non-failing rule, and make sure that it works.
+	t.Run("real_error", func(t *testing.T) {
+		const groupName = "good_rule"
+
+		var ruleEvalCount float64
+		ruleGroup := ruleGroupWithRule(groupName, "rule", expression)
+		ruleGroup.Interval = 2
+		require.NoError(t, c.SetRuleGroup(ruleGroup, namespace))
+		m := ruleGroupMatcher(user, namespace, groupName)
+
+		// Wait until ruler has loaded the group.
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(1), []string{"cortex_prometheus_rule_group_rules"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
+
+		// Wait until rule group has tried to evaluate the rule, and succeeded.
+		ruleEvalCount++
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(ruleEvalCount), []string{"cortex_prometheus_rule_evaluations_total"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(0), []string{"cortex_prometheus_rule_evaluation_failures_total"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
+
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(1), []string{"cortex_ruler_write_requests_total"}, e2e.WithLabelMatchers(matcher), e2e.WaitMissingMetrics))
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(0), []string{"cortex_ruler_write_requests_failed_total"}, e2e.WithLabelMatchers(matcher), e2e.WaitMissingMetrics))
+
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(0), []string{"cortex_ruler_queries_failed_total"}, e2e.WithLabelMatchers(matcher), e2e.WaitMissingMetrics))
+
+		// Wait until the TSDB head is compacted and shipped to the storage.
+		// The shipped block contains the 1st series, while the 2nd series is in the head.
+		require.NoError(t, ingester.WaitSumMetrics(e2e.Equals(1), "cortex_ingester_shipper_uploads_total"))
+
+		// Now let's stop ingester, and recheck metrics. This should increase cortex_ruler_write_requests_failed_total failures.
+		require.NoError(t, s.Stop(ingester))
+		ruleEvalCount++
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(ruleEvalCount), []string{"cortex_prometheus_rule_evaluations_total"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
+
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(0), []string{"cortex_ruler_queries_failed_total"}, e2e.WithLabelMatchers(matcher), e2e.WaitMissingMetrics))
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(2), []string{"cortex_ruler_write_requests_total"}, e2e.WithLabelMatchers(matcher), e2e.WaitMissingMetrics))
+
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(1), []string{"cortex_ruler_write_requests_failed_total"}, e2e.WithLabelMatchers(matcher), e2e.WaitMissingMetrics))
 	})
 }
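The test now scopes every metric assertion to the tenant by passing a `user` label matcher to the e2e helpers. As a rough illustration of what that filtering amounts to, here is a minimal sketch that parses Prometheus text exposition output and sums only the series matching one tenant; `sumCounterForUser` and the sample exposition text are made up for this example and are not the cortex e2e framework's actual implementation.

```go
package main

import (
	"fmt"
	"strings"

	"github.com/prometheus/common/expfmt"
)

// sumCounterForUser parses Prometheus text exposition output and sums the
// named counter, keeping only series whose "user" label matches the tenant.
func sumCounterForUser(metricsText, metricName, user string) (float64, error) {
	var parser expfmt.TextParser
	families, err := parser.TextToMetricFamilies(strings.NewReader(metricsText))
	if err != nil {
		return 0, err
	}
	mf, ok := families[metricName]
	if !ok {
		return 0, nil // metric not exposed (yet)
	}
	sum := 0.0
	for _, m := range mf.GetMetric() {
		for _, lp := range m.GetLabel() {
			if lp.GetName() == "user" && lp.GetValue() == user {
				sum += m.GetCounter().GetValue()
			}
		}
	}
	return sum, nil
}

func main() {
	// Hypothetical scrape output: two tenants sharing the same counter name.
	exposition := `# TYPE cortex_ruler_queries_total counter
cortex_ruler_queries_total{user="user"} 7
cortex_ruler_queries_total{user="other-tenant"} 3
`
	v, err := sumCounterForUser(exposition, "cortex_ruler_queries_total", "user")
	fmt.Println(v, err) // 7 <nil>
}
```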

pkg/ruler/compat.go

Lines changed: 13 additions & 8 deletions
@@ -251,23 +251,23 @@ type RulesManager interface {
 type ManagerFactory func(ctx context.Context, userID string, notifier *notifier.Manager, logger log.Logger, reg prometheus.Registerer) RulesManager
 
 func DefaultTenantManagerFactory(cfg Config, p Pusher, q storage.Queryable, engine v1.QueryEngine, overrides RulesLimits, reg prometheus.Registerer) ManagerFactory {
-	totalWrites := promauto.With(reg).NewCounter(prometheus.CounterOpts{
+	totalWritesVec := promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
 		Name: "cortex_ruler_write_requests_total",
 		Help: "Number of write requests to ingesters.",
-	})
-	failedWrites := promauto.With(reg).NewCounter(prometheus.CounterOpts{
+	}, []string{"user"})
+	failedWritesVec := promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
 		Name: "cortex_ruler_write_requests_failed_total",
 		Help: "Number of failed write requests to ingesters.",
-	})
+	}, []string{"user"})
 
-	totalQueries := promauto.With(reg).NewCounter(prometheus.CounterOpts{
+	totalQueriesVec := promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
 		Name: "cortex_ruler_queries_total",
 		Help: "Number of queries executed by ruler.",
-	})
-	failedQueries := promauto.With(reg).NewCounter(prometheus.CounterOpts{
+	}, []string{"user"})
+	failedQueriesVec := promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
 		Name: "cortex_ruler_queries_failed_total",
 		Help: "Number of failed queries by ruler.",
-	})
+	}, []string{"user"})
 	var rulerQuerySeconds *prometheus.CounterVec
 	if cfg.EnableQueryStats {
 		rulerQuerySeconds = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
@@ -287,6 +287,11 @@ func DefaultTenantManagerFactory(cfg Config, p Pusher, q storage.Queryable, engi
 			queryTime = rulerQuerySeconds.WithLabelValues(userID)
 		}
 
+		failedQueries := failedQueriesVec.WithLabelValues(userID)
+		totalQueries := totalQueriesVec.WithLabelValues(userID)
+		totalWrites := totalWritesVec.WithLabelValues(userID)
+		failedWrites := failedWritesVec.WithLabelValues(userID)
+
 		return rules.NewManager(&rules.ManagerOptions{
 			Appendable: NewPusherAppendable(p, userID, overrides, totalWrites, failedWrites),
 			Queryable:  q,