Skip to content

Commit 840cd27

Browse files
committed
Add cortex_ruler_rule_groups_in_store metric
Signed-off-by: Emmanuel Lodovice <lodovice@amazon.com>
1 parent fe105a9 commit 840cd27

File tree

5 files changed

+98
-0
lines changed

5 files changed

+98
-0
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
* [ENHANCEMENT] Ruler: Improve GetRules response time by refactoring mutexes and introducing a temporary rules cache in `ruler/manager.go`. #5805
3939
* [ENHANCEMENT] Querier: Add context error check when merging slices from ingesters for GetLabel operations. #5837
4040
* [ENHANCEMENT] Ring: Add experimental `-ingester.tokens-generator-strategy=minimize-spread` flag to enable the new minimize spread token generator strategy. #5855
41+
* [ENHANCEMENT] Ruler: Add new ruler metric `cortex_ruler_rule_groups_in_store`. #5869
4142
* [BUGFIX] Distributor: Do not use label with empty values for sharding #5717
4243
* [BUGFIX] Query Frontend: queries with negative offset should check whether it is cacheable or not. #5719
4344
* [BUGFIX] Redis Cache: pass `cache_size` config correctly. #5734

integration/ruler_test.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -382,6 +382,9 @@ func TestRulerSharding(t *testing.T) {
382382
// between the two rulers.
383383
require.NoError(t, ruler1.WaitSumMetrics(e2e.Less(numRulesGroups), "cortex_prometheus_rule_group_rules"))
384384
require.NoError(t, ruler2.WaitSumMetrics(e2e.Less(numRulesGroups), "cortex_prometheus_rule_group_rules"))
385+
// Even with rules sharded, we expect rulers to have the same cortex_ruler_rule_groups_in_store metric values
386+
require.NoError(t, ruler1.WaitSumMetrics(e2e.Equals(numRulesGroups), "cortex_ruler_rule_groups_in_store"))
387+
require.NoError(t, ruler2.WaitSumMetrics(e2e.Equals(numRulesGroups), "cortex_ruler_rule_groups_in_store"))
385388

386389
// Fetch the rules and ensure they match the configured ones.
387390
actualGroups, err := c.GetPrometheusRules(e2ecortex.DefaultFilter)

pkg/ruler/manager_metrics.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
package ruler
22

33
import (
4+
"sync"
5+
46
"github.com/prometheus/client_golang/prometheus"
57
"github.com/prometheus/client_golang/prometheus/promauto"
68

@@ -271,3 +273,42 @@ func (m *RuleEvalMetrics) deletePerUserMetrics(userID string) {
271273
m.RulerQuerySeconds.DeleteLabelValues(userID)
272274
}
273275
}
276+
277+
type RuleGroupMetrics struct {
278+
mtx sync.Mutex
279+
RuleGroupsInStore *prometheus.GaugeVec
280+
tenants map[string]struct{}
281+
allowedTenants *util.AllowedTenants
282+
}
283+
284+
func NewRuleGroupMetrics(reg prometheus.Registerer, allowedTenants *util.AllowedTenants) *RuleGroupMetrics {
285+
m := &RuleGroupMetrics{
286+
RuleGroupsInStore: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
287+
Name: "cortex_ruler_rule_groups_in_store",
288+
Help: "The number of rule groups a tenant has in store.",
289+
}, []string{"user"}),
290+
allowedTenants: allowedTenants,
291+
}
292+
return m
293+
}
294+
295+
// UpdateRuleGroupsInStore updates the cortex_ruler_rule_groups_in_store metric with the provided number of rule
296+
// groups per tenant and removing the metrics for tenants that are not present anymore
297+
func (r *RuleGroupMetrics) UpdateRuleGroupsInStore(ruleGroupsCount map[string]int) {
298+
r.mtx.Lock()
299+
defer r.mtx.Unlock()
300+
tenants := make(map[string]struct{}, len(ruleGroupsCount))
301+
for userID, count := range ruleGroupsCount {
302+
if !r.allowedTenants.IsAllowed(userID) { // if the tenant is disabled just ignore its rule groups
303+
continue
304+
}
305+
tenants[userID] = struct{}{}
306+
r.RuleGroupsInStore.WithLabelValues(userID).Set(float64(count))
307+
}
308+
for userID := range r.tenants {
309+
if _, ok := tenants[userID]; !ok {
310+
r.RuleGroupsInStore.DeleteLabelValues(userID)
311+
}
312+
}
313+
r.tenants = tenants
314+
}

pkg/ruler/manager_metrics_test.go

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -595,3 +595,41 @@ func TestRuleEvalMetricsDeletePerUserMetrics(t *testing.T) {
595595
require.Contains(t, mfm[name].String(), "value:\"fake2\"")
596596
}
597597
}
598+
599+
func TestRuleGroupMetrics(t *testing.T) {
600+
reg := prometheus.NewPedanticRegistry()
601+
m := NewRuleGroupMetrics(reg, util.NewAllowedTenants(nil, []string{"fake3"}))
602+
m.UpdateRuleGroupsInStore(map[string]int{
603+
"fake1": 10,
604+
"fake2": 20,
605+
"fake3": 30,
606+
})
607+
gm, err := reg.Gather()
608+
require.NoError(t, err)
609+
mfm, err := util.NewMetricFamilyMap(gm)
610+
require.NoError(t, err)
611+
require.Equal(t, 2, len(mfm["cortex_ruler_rule_groups_in_store"].Metric))
612+
requireMetricEqual(t, mfm["cortex_ruler_rule_groups_in_store"].Metric[0], map[string]string{
613+
"user": "fake1",
614+
}, float64(10))
615+
requireMetricEqual(t, mfm["cortex_ruler_rule_groups_in_store"].Metric[1], map[string]string{
616+
"user": "fake2",
617+
}, float64(20))
618+
m.UpdateRuleGroupsInStore(map[string]int{
619+
"fake2": 30,
620+
})
621+
gm, err = reg.Gather()
622+
require.NoError(t, err)
623+
mfm, err = util.NewMetricFamilyMap(gm)
624+
require.NoError(t, err)
625+
require.Equal(t, 1, len(mfm["cortex_ruler_rule_groups_in_store"].Metric))
626+
requireMetricEqual(t, mfm["cortex_ruler_rule_groups_in_store"].Metric[0], map[string]string{
627+
"user": "fake2",
628+
}, float64(30))
629+
m.UpdateRuleGroupsInStore(make(map[string]int))
630+
gm, err = reg.Gather()
631+
require.NoError(t, err)
632+
mfm, err = util.NewMetricFamilyMap(gm)
633+
require.NoError(t, err)
634+
require.Nil(t, mfm["cortex_ruler_rule_groups_in_store"])
635+
}

pkg/ruler/ruler.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,7 @@ type Ruler struct {
280280
ruleGroupStoreLoadDuration prometheus.Gauge
281281
ruleGroupSyncDuration prometheus.Gauge
282282
rulerGetRulesFailures *prometheus.CounterVec
283+
ruleGroupMetrics *RuleGroupMetrics
283284

284285
allowedTenants *util.AllowedTenants
285286

@@ -328,6 +329,7 @@ func newRuler(cfg Config, manager MultiTenantManager, reg prometheus.Registerer,
328329
Help: "The total number of failed rules request sent to rulers in getShardedRules.",
329330
}, []string{"ruler"}),
330331
}
332+
ruler.ruleGroupMetrics = NewRuleGroupMetrics(reg, ruler.allowedTenants)
331333

332334
if len(cfg.EnabledTenants) > 0 {
333335
level.Info(ruler.logger).Log("msg", "ruler using enabled users", "enabled", strings.Join(cfg.EnabledTenants, ", "))
@@ -652,7 +654,9 @@ func (r *Ruler) listRulesNoSharding(ctx context.Context) (map[string]rulespb.Rul
652654
if err != nil {
653655
return nil, nil, err
654656
}
657+
ruleGroupCounts := make(map[string]int, len(allRuleGroups))
655658
for userID, groups := range allRuleGroups {
659+
ruleGroupCounts[userID] = len(groups)
656660
disabledRuleGroupsForUser := r.limits.DisabledRuleGroups(userID)
657661
if len(disabledRuleGroupsForUser) == 0 {
658662
continue
@@ -667,6 +671,7 @@ func (r *Ruler) listRulesNoSharding(ctx context.Context) (map[string]rulespb.Rul
667671
}
668672
allRuleGroups[userID] = filteredGroupsForUser
669673
}
674+
r.ruleGroupMetrics.UpdateRuleGroupsInStore(ruleGroupCounts)
670675
return allRuleGroups, nil, nil
671676
}
672677

@@ -676,9 +681,11 @@ func (r *Ruler) listRulesShardingDefault(ctx context.Context) (map[string]rulesp
676681
return nil, nil, err
677682
}
678683

684+
ruleGroupCounts := make(map[string]int, len(configs))
679685
ownedConfigs := make(map[string]rulespb.RuleGroupList)
680686
backedUpConfigs := make(map[string]rulespb.RuleGroupList)
681687
for userID, groups := range configs {
688+
ruleGroupCounts[userID] = len(groups)
682689
owned := filterRuleGroups(userID, groups, r.limits.DisabledRuleGroups(userID), r.ring, r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors)
683690
if len(owned) > 0 {
684691
ownedConfigs[userID] = owned
@@ -690,6 +697,7 @@ func (r *Ruler) listRulesShardingDefault(ctx context.Context) (map[string]rulesp
690697
}
691698
}
692699
}
700+
r.ruleGroupMetrics.UpdateRuleGroupsInStore(ruleGroupCounts)
693701
return ownedConfigs, backedUpConfigs, nil
694702
}
695703

@@ -717,6 +725,7 @@ func (r *Ruler) listRulesShuffleSharding(ctx context.Context) (map[string]rulesp
717725
}
718726

719727
if len(userRings) == 0 {
728+
r.ruleGroupMetrics.UpdateRuleGroupsInStore(make(map[string]int))
720729
return nil, nil, nil
721730
}
722731

@@ -729,6 +738,8 @@ func (r *Ruler) listRulesShuffleSharding(ctx context.Context) (map[string]rulesp
729738
mu := sync.Mutex{}
730739
owned := map[string]rulespb.RuleGroupList{}
731740
backedUp := map[string]rulespb.RuleGroupList{}
741+
gLock := sync.Mutex{}
742+
ruleGroupCounts := make(map[string]int, len(userRings))
732743

733744
concurrency := loadRulesConcurrency
734745
if len(userRings) < concurrency {
@@ -743,6 +754,9 @@ func (r *Ruler) listRulesShuffleSharding(ctx context.Context) (map[string]rulesp
743754
if err != nil {
744755
return errors.Wrapf(err, "failed to fetch rule groups for user %s", userID)
745756
}
757+
gLock.Lock()
758+
ruleGroupCounts[userID] = len(groups)
759+
gLock.Unlock()
746760

747761
filterOwned := filterRuleGroups(userID, groups, r.limits.DisabledRuleGroups(userID), userRings[userID], r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors)
748762
var filterBackup []*rulespb.RuleGroupDesc
@@ -766,6 +780,7 @@ func (r *Ruler) listRulesShuffleSharding(ctx context.Context) (map[string]rulesp
766780
}
767781

768782
err = g.Wait()
783+
r.ruleGroupMetrics.UpdateRuleGroupsInStore(ruleGroupCounts)
769784
return owned, backedUp, err
770785
}
771786

0 commit comments

Comments
 (0)