Skip to content

Commit 5dc489a

Browse files
committed
Add cortex_ruler_rule_groups_in_store metric
Signed-off-by: Emmanuel Lodovice <lodovice@amazon.com>
1 parent 00ffb3c commit 5dc489a

File tree

5 files changed

+99
-0
lines changed

5 files changed

+99
-0
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
* [ENHANCEMENT] Ruler: Improve GetRules response time by refactoring mutexes and introducing a temporary rules cache in `ruler/manager.go`. #5805
3737
* [ENHANCEMENT] Querier: Add context error check when merging slices from ingesters for GetLabel operations. #5837
3838
* [ENHANCEMENT] Ring: Add experimental `-ingester.tokens-generator-strategy=minimize-spread` flag to enable the new minimize spread token generator strategy. #5855
39+
* [ENHANCEMENT] Ruler: Add new ruler metric `cortex_ruler_rule_groups_in_store`. #5869
3940
* [BUGFIX] Distributor: Do not use label with empty values for sharding #5717
4041
* [BUGFIX] Query Frontend: queries with negative offset should check whether it is cacheable or not. #5719
4142
* [BUGFIX] Redis Cache: pass `cache_size` config correctly. #5734

integration/ruler_test.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -382,6 +382,9 @@ func TestRulerSharding(t *testing.T) {
382382
// between the two rulers.
383383
require.NoError(t, ruler1.WaitSumMetrics(e2e.Less(numRulesGroups), "cortex_prometheus_rule_group_rules"))
384384
require.NoError(t, ruler2.WaitSumMetrics(e2e.Less(numRulesGroups), "cortex_prometheus_rule_group_rules"))
385+
// Even with rules sharded, we expect rulers to have the same cortex_ruler_rule_groups_in_store metric values
386+
require.NoError(t, ruler1.WaitSumMetrics(e2e.Equals(numRulesGroups), "cortex_ruler_rule_groups_in_store"))
387+
require.NoError(t, ruler2.WaitSumMetrics(e2e.Equals(numRulesGroups), "cortex_ruler_rule_groups_in_store"))
385388

386389
// Fetch the rules and ensure they match the configured ones.
387390
actualGroups, err := c.GetPrometheusRules(e2ecortex.DefaultFilter)

pkg/ruler/manager_metrics.go

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -271,3 +271,34 @@ func (m *RuleEvalMetrics) deletePerUserMetrics(userID string) {
271271
m.RulerQuerySeconds.DeleteLabelValues(userID)
272272
}
273273
}
274+
275+
type RuleGroupMetrics struct {
276+
RuleGroupsInStore *prometheus.GaugeVec
277+
tenants map[string]struct{}
278+
}
279+
280+
func NewRuleGroupMetrics(reg prometheus.Registerer) *RuleGroupMetrics {
281+
m := &RuleGroupMetrics{
282+
RuleGroupsInStore: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
283+
Name: "cortex_ruler_rule_groups_in_store",
284+
Help: "The number of rule groups a tenant has in store.",
285+
}, []string{"user"}),
286+
}
287+
return m
288+
}
289+
290+
// UpdateRuleGroupsInStore updates the cortex_ruler_rule_groups_in_store metric with the provided number of rule
291+
// groups per tenant and removing the metrics for tenants that are not present anymore
292+
func (r *RuleGroupMetrics) UpdateRuleGroupsInStore(ruleGroupsCount map[string]int) {
293+
tenants := make(map[string]struct{}, len(ruleGroupsCount))
294+
for userID, count := range ruleGroupsCount {
295+
tenants[userID] = struct{}{}
296+
r.RuleGroupsInStore.WithLabelValues(userID).Set(float64(count))
297+
}
298+
for userID := range r.tenants {
299+
if _, ok := tenants[userID]; !ok {
300+
r.RuleGroupsInStore.DeleteLabelValues(userID)
301+
}
302+
}
303+
r.tenants = tenants
304+
}

pkg/ruler/manager_metrics_test.go

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"github.com/prometheus/client_golang/prometheus/promauto"
99
"github.com/prometheus/client_golang/prometheus/testutil"
1010
dto "github.com/prometheus/client_model/go"
11+
io_prometheus_client "github.com/prometheus/client_model/go"
1112
"github.com/stretchr/testify/assert"
1213
"github.com/stretchr/testify/require"
1314

@@ -595,3 +596,49 @@ func TestRuleEvalMetricsDeletePerUserMetrics(t *testing.T) {
595596
require.Contains(t, mfm[name].String(), "value:\"fake2\"")
596597
}
597598
}
599+
600+
func TestRuleGroupMetrics(t *testing.T) {
601+
reg := prometheus.NewPedanticRegistry()
602+
m := NewRuleGroupMetrics(reg)
603+
m.UpdateRuleGroupsInStore(map[string]int{
604+
"fake1": 10,
605+
"fake2": 20,
606+
})
607+
gm, err := reg.Gather()
608+
require.NoError(t, err)
609+
mfm, err := util.NewMetricFamilyMap(gm)
610+
require.NoError(t, err)
611+
require.Equal(t, 2, len(mfm["cortex_ruler_rule_groups_in_store"].Metric))
612+
requireMetricEqual(t, mfm["cortex_ruler_rule_groups_in_store"].Metric[0], map[string]string{
613+
"user": "fake1",
614+
}, float64(10))
615+
requireMetricEqual(t, mfm["cortex_ruler_rule_groups_in_store"].Metric[1], map[string]string{
616+
"user": "fake2",
617+
}, float64(20))
618+
m.UpdateRuleGroupsInStore(map[string]int{
619+
"fake2": 30,
620+
})
621+
gm, err = reg.Gather()
622+
require.NoError(t, err)
623+
mfm, err = util.NewMetricFamilyMap(gm)
624+
require.NoError(t, err)
625+
require.Equal(t, 1, len(mfm["cortex_ruler_rule_groups_in_store"].Metric))
626+
requireMetricEqual(t, mfm["cortex_ruler_rule_groups_in_store"].Metric[0], map[string]string{
627+
"user": "fake2",
628+
}, float64(30))
629+
m.UpdateRuleGroupsInStore(make(map[string]int))
630+
gm, err = reg.Gather()
631+
require.NoError(t, err)
632+
mfm, err = util.NewMetricFamilyMap(gm)
633+
require.NoError(t, err)
634+
require.Nil(t, mfm["cortex_ruler_rule_groups_in_store"])
635+
}
636+
637+
func requireMetricEqual(t *testing.T, m *io_prometheus_client.Metric, labels map[string]string, value float64) {
638+
l := m.GetLabel()
639+
require.Equal(t, len(labels), len(l))
640+
for _, pair := range l {
641+
require.Equal(t, labels[*pair.Name], *pair.Value)
642+
}
643+
require.Equal(t, value, *m.Gauge.Value)
644+
}

pkg/ruler/ruler.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,7 @@ type Ruler struct {
270270
rulerSync *prometheus.CounterVec
271271
ruleGroupStoreLoadDuration prometheus.Gauge
272272
ruleGroupSyncDuration prometheus.Gauge
273+
ruleGroupMetrics *RuleGroupMetrics
273274

274275
allowedTenants *util.AllowedTenants
275276

@@ -312,6 +313,8 @@ func newRuler(cfg Config, manager MultiTenantManager, reg prometheus.Registerer,
312313
Name: "cortex_ruler_rule_group_sync_duration_seconds",
313314
Help: "The duration in seconds required to sync and load rule groups from storage.",
314315
}),
316+
317+
ruleGroupMetrics: NewRuleGroupMetrics(reg),
315318
}
316319

317320
if len(cfg.EnabledTenants) > 0 {
@@ -606,7 +609,9 @@ func (r *Ruler) listRulesNoSharding(ctx context.Context) (map[string]rulespb.Rul
606609
if err != nil {
607610
return nil, err
608611
}
612+
ruleGroupCounts := make(map[string]int, len(allRuleGroups))
609613
for userID, groups := range allRuleGroups {
614+
ruleGroupCounts[userID] = len(groups)
610615
disabledRuleGroupsForUser := r.limits.DisabledRuleGroups(userID)
611616
if len(disabledRuleGroupsForUser) == 0 {
612617
continue
@@ -621,6 +626,7 @@ func (r *Ruler) listRulesNoSharding(ctx context.Context) (map[string]rulespb.Rul
621626
}
622627
allRuleGroups[userID] = filteredGroupsForUser
623628
}
629+
r.ruleGroupMetrics.UpdateRuleGroupsInStore(ruleGroupCounts)
624630
return allRuleGroups, nil
625631
}
626632

@@ -630,13 +636,16 @@ func (r *Ruler) listRulesShardingDefault(ctx context.Context) (map[string]rulesp
630636
return nil, err
631637
}
632638

639+
ruleGroupCounts := make(map[string]int, len(configs))
633640
filteredConfigs := make(map[string]rulespb.RuleGroupList)
634641
for userID, groups := range configs {
642+
ruleGroupCounts[userID] = len(groups)
635643
filtered := filterRuleGroups(userID, groups, r.limits.DisabledRuleGroups(userID), r.ring, r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors)
636644
if len(filtered) > 0 {
637645
filteredConfigs[userID] = filtered
638646
}
639647
}
648+
r.ruleGroupMetrics.UpdateRuleGroupsInStore(ruleGroupCounts)
640649
return filteredConfigs, nil
641650
}
642651

@@ -664,6 +673,7 @@ func (r *Ruler) listRulesShuffleSharding(ctx context.Context) (map[string]rulesp
664673
}
665674

666675
if len(userRings) == 0 {
676+
r.ruleGroupMetrics.UpdateRuleGroupsInStore(make(map[string]int))
667677
return nil, nil
668678
}
669679

@@ -675,6 +685,8 @@ func (r *Ruler) listRulesShuffleSharding(ctx context.Context) (map[string]rulesp
675685

676686
mu := sync.Mutex{}
677687
result := map[string]rulespb.RuleGroupList{}
688+
gLock := sync.Mutex{}
689+
ruleGroupCounts := make(map[string]int, len(userRings))
678690

679691
concurrency := loadRulesConcurrency
680692
if len(userRings) < concurrency {
@@ -690,6 +702,10 @@ func (r *Ruler) listRulesShuffleSharding(ctx context.Context) (map[string]rulesp
690702
return errors.Wrapf(err, "failed to fetch rule groups for user %s", userID)
691703
}
692704

705+
gLock.Lock()
706+
ruleGroupCounts[userID] = len(groups)
707+
gLock.Unlock()
708+
693709
filtered := filterRuleGroups(userID, groups, r.limits.DisabledRuleGroups(userID), userRings[userID], r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors)
694710
if len(filtered) == 0 {
695711
continue
@@ -704,6 +720,7 @@ func (r *Ruler) listRulesShuffleSharding(ctx context.Context) (map[string]rulesp
704720
}
705721

706722
err = g.Wait()
723+
r.ruleGroupMetrics.UpdateRuleGroupsInStore(ruleGroupCounts)
707724
return result, err
708725
}
709726

0 commit comments

Comments
 (0)