Skip to content

Commit be3d241

Browse files
disable rule groups
Signed-off-by: Anand Rajagopal <anrajag@amazon.com>
1 parent 526a6d9 commit be3d241

File tree

6 files changed

+359
-15
lines changed

6 files changed

+359
-15
lines changed

docs/configuration/config-file-reference.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3084,6 +3084,9 @@ The `limits_config` configures default and per-tenant limits imposed by Cortex s
30843084
# alerts will fail with a log message and metric increment. 0 = no limit.
30853085
# CLI flag: -alertmanager.max-alerts-size-bytes
30863086
[alertmanager_max_alerts_size_bytes: <int> | default = 0]
3087+
3088+
# List of rule groups to disable. Each entry identifies a rule group by its namespace, name and user.
3089+
[disabled_rule_groups: <list of rule groups to disable> | default = []]
30873090
```
30883091
30893092
### `memberlist_config`

pkg/ruler/compat.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ import (
55
"errors"
66
"time"
77

8+
"github.com/cortexproject/cortex/pkg/util/validation"
9+
810
"github.com/go-kit/log"
911
"github.com/go-kit/log/level"
1012
"github.com/prometheus/client_golang/prometheus"
@@ -142,6 +144,7 @@ type RulesLimits interface {
142144
RulerTenantShardSize(userID string) int
143145
RulerMaxRuleGroupsPerTenant(userID string) int
144146
RulerMaxRulesPerRuleGroup(userID string) int
147+
DisabledRuleGroups(userID string) validation.DisabledRuleGroups
145148
}
146149

147150
// EngineQueryFunc returns a new engine query function by passing an altered timestamp.

pkg/ruler/ruler.go

Lines changed: 63 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,14 @@ const (
7171
recordingRuleFilter string = "record"
7272
)
7373

74+
// DisabledRuleGroupErr is the error returned when a rule group is skipped
// because it is listed as disabled.
type DisabledRuleGroupErr struct {
	// Message is the human-readable description returned by Error.
	Message string
}

// Compile-time check that *DisabledRuleGroupErr satisfies error; callers
// match on the pointer type, so the pointer receiver must keep implementing it.
var _ error = (*DisabledRuleGroupErr)(nil)

// Error implements the error interface by returning Message unchanged.
func (e *DisabledRuleGroupErr) Error() string {
	return e.Message
}
81+
7482
// Config is the configuration for the recording rules server.
7583
type Config struct {
7684
// This is used for template expansion in alerts; must be a valid URL.
@@ -415,9 +423,19 @@ func tokenForGroup(g *rulespb.RuleGroupDesc) uint32 {
415423
return ringHasher.Sum32()
416424
}
417425

418-
func instanceOwnsRuleGroup(r ring.ReadRing, g *rulespb.RuleGroupDesc, instanceAddr string) (bool, error) {
426+
func instanceOwnsRuleGroup(r ring.ReadRing, g *rulespb.RuleGroupDesc, disabledRuleGroups validation.DisabledRuleGroups, instanceAddr string) (bool, error) {
419427
hash := tokenForGroup(g)
420428

429+
for _, disabledGroup := range disabledRuleGroups {
430+
431+
if hash == tokenForGroup(&rulespb.RuleGroupDesc{
432+
Name: disabledGroup.Name,
433+
Namespace: disabledGroup.Namespace,
434+
User: disabledGroup.User,
435+
}) {
436+
return false, &DisabledRuleGroupErr{Message: fmt.Sprintf("rule group %s, namespace %s, user %s is disabled", g.Name, g.Namespace, g.User)}
437+
}
438+
}
421439
rlrs, err := r.Get(hash, RingOp, nil, nil, nil)
422440
if err != nil {
423441
return false, errors.Wrap(err, "error reading ring to verify rule group ownership")
@@ -533,7 +551,37 @@ func (r *Ruler) listRules(ctx context.Context) (result map[string]rulespb.RuleGr
533551
}
534552

535553
func (r *Ruler) listRulesNoSharding(ctx context.Context) (map[string]rulespb.RuleGroupList, error) {
536-
return r.store.ListAllRuleGroups(ctx)
554+
allRuleGroups, err := r.store.ListAllRuleGroups(ctx)
555+
if err != nil {
556+
return nil, err
557+
}
558+
for userID, groups := range allRuleGroups {
559+
disabledRuleGroupsForUser := r.limits.DisabledRuleGroups(userID)
560+
if len(disabledRuleGroupsForUser) == 0 {
561+
continue
562+
}
563+
filteredGroupsForUser := rulespb.RuleGroupList{}
564+
for _, group := range groups {
565+
if !ruleGroupDisabled(group, disabledRuleGroupsForUser) {
566+
filteredGroupsForUser = append(filteredGroupsForUser, group)
567+
} else {
568+
level.Info(r.logger).Log("msg", "rule group disabled", "rule group name", group.Name, "namespace", group.Namespace, "user", group.User)
569+
}
570+
}
571+
allRuleGroups[userID] = filteredGroupsForUser
572+
}
573+
return allRuleGroups, nil
574+
}
575+
576+
func ruleGroupDisabled(ruleGroup *rulespb.RuleGroupDesc, disabledRuleGroupsForUser validation.DisabledRuleGroups) bool {
577+
for _, disabledRuleGroupForUser := range disabledRuleGroupsForUser {
578+
if ruleGroup.Namespace == disabledRuleGroupForUser.Namespace &&
579+
ruleGroup.Name == disabledRuleGroupForUser.Name &&
580+
ruleGroup.User == disabledRuleGroupForUser.User {
581+
return true
582+
}
583+
}
584+
return false
537585
}
538586

539587
func (r *Ruler) listRulesShardingDefault(ctx context.Context) (map[string]rulespb.RuleGroupList, error) {
@@ -544,7 +592,7 @@ func (r *Ruler) listRulesShardingDefault(ctx context.Context) (map[string]rulesp
544592

545593
filteredConfigs := make(map[string]rulespb.RuleGroupList)
546594
for userID, groups := range configs {
547-
filtered := filterRuleGroups(userID, groups, r.ring, r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors)
595+
filtered := filterRuleGroups(userID, groups, r.limits.DisabledRuleGroups(userID), r.ring, r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors)
548596
if len(filtered) > 0 {
549597
filteredConfigs[userID] = filtered
550598
}
@@ -602,7 +650,7 @@ func (r *Ruler) listRulesShuffleSharding(ctx context.Context) (map[string]rulesp
602650
return errors.Wrapf(err, "failed to fetch rule groups for user %s", userID)
603651
}
604652

605-
filtered := filterRuleGroups(userID, groups, userRings[userID], r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors)
653+
filtered := filterRuleGroups(userID, groups, r.limits.DisabledRuleGroups(userID), userRings[userID], r.lifecycler.GetInstanceAddr(), r.logger, r.ringCheckErrors)
606654
if len(filtered) == 0 {
607655
continue
608656
}
@@ -624,15 +672,21 @@ func (r *Ruler) listRulesShuffleSharding(ctx context.Context) (map[string]rulesp
624672
//
625673
// Reason why this function is not a method on Ruler is to make sure we don't accidentally use r.ring,
626674
// but only ring passed as parameter.
627-
func filterRuleGroups(userID string, ruleGroups []*rulespb.RuleGroupDesc, ring ring.ReadRing, instanceAddr string, log log.Logger, ringCheckErrors prometheus.Counter) []*rulespb.RuleGroupDesc {
675+
func filterRuleGroups(userID string, ruleGroups []*rulespb.RuleGroupDesc, disabledRuleGroups validation.DisabledRuleGroups, ring ring.ReadRing, instanceAddr string, log log.Logger, ringCheckErrors prometheus.Counter) []*rulespb.RuleGroupDesc {
628676
// Prune the rule group to only contain rules that this ruler is responsible for, based on ring.
629677
var result []*rulespb.RuleGroupDesc
630678
for _, g := range ruleGroups {
631-
owned, err := instanceOwnsRuleGroup(ring, g, instanceAddr)
679+
owned, err := instanceOwnsRuleGroup(ring, g, disabledRuleGroups, instanceAddr)
632680
if err != nil {
633-
ringCheckErrors.Inc()
634-
level.Error(log).Log("msg", "failed to check if the ruler replica owns the rule group", "user", userID, "namespace", g.Namespace, "group", g.Name, "err", err)
635-
continue
681+
switch e := err.(type) {
682+
case *DisabledRuleGroupErr:
683+
level.Info(log).Log("msg", e.Message)
684+
continue
685+
default:
686+
ringCheckErrors.Inc()
687+
level.Error(log).Log("msg", "failed to check if the ruler replica owns the rule group", "user", userID, "namespace", g.Namespace, "group", g.Name, "err", err)
688+
continue
689+
}
636690
}
637691

638692
if owned {

0 commit comments

Comments
 (0)