Skip to content

Commit 38ca8ec

Browse files
committed
Remove user specific evaluation metrics when rule manager is removed
Signed-off-by: Emmanuel Lodovice <lodovice@amazon.com>
1 parent 0a19a7d commit 38ca8ec

File tree

8 files changed

+165
-53
lines changed

8 files changed

+165
-53
lines changed

integration/ruler_test.go

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -741,7 +741,6 @@ func TestRulerMetricsForInvalidQueries(t *testing.T) {
741741
}
742742

743743
matcher := labels.MustNewMatcher(labels.MatchEqual, "user", user)
744-
var totalQueries = []float64{0}
745744

746745
// Verify that user-failures don't increase cortex_ruler_queries_failed_total
747746
for groupName, expression := range map[string]string{
@@ -769,19 +768,20 @@ func TestRulerMetricsForInvalidQueries(t *testing.T) {
769768
require.NoError(t, err)
770769
require.Equal(t, float64(0), sum[0])
771770

771+
// Check that cortex_ruler_queries_total went up
772+
totalQueries, err := ruler.SumMetrics([]string{"cortex_ruler_queries_total"}, e2e.WithLabelMatchers(matcher))
773+
require.NoError(t, err)
774+
require.Greater(t, totalQueries[0], float64(0))
775+
772776
// Delete rule before checkin "cortex_ruler_queries_total", as we want to reuse value for next test.
773777
require.NoError(t, c.DeleteRuleGroup(namespace, groupName))
774778

775779
// Wait until ruler has unloaded the group. We don't use any matcher, so there should be no groups (in fact, metric disappears).
776780
require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(0), []string{"cortex_prometheus_rule_group_rules"}, e2e.SkipMissingMetrics))
777781

778-
// Check that cortex_ruler_queries_total went up since last test.
779-
newTotalQueries, err := ruler.SumMetrics([]string{"cortex_ruler_queries_total"}, e2e.WithLabelMatchers(matcher))
780-
require.NoError(t, err)
781-
require.Greater(t, newTotalQueries[0], totalQueries[0])
782-
783-
// Remember totalQueries for next test.
784-
totalQueries = newTotalQueries
782+
// Deleting the rule group should clean up the cortex_ruler_queries_total metrics
783+
_, err = ruler.SumMetrics([]string{"cortex_ruler_queries_total"}, e2e.WithLabelMatchers(matcher))
784+
require.EqualError(t, err, "metric=cortex_ruler_queries_total service=ruler: metric not found")
785785
})
786786
}
787787

pkg/cortex/modules.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -577,15 +577,15 @@ func (t *Cortex) initRuler() (serv services.Service, err error) {
577577
queryEngine = promql.NewEngine(opts)
578578
}
579579

580-
managerFactory := ruler.DefaultTenantManagerFactory(t.Cfg.Ruler, t.Cfg.ExternalPusher, t.Cfg.ExternalQueryable, queryEngine, t.Overrides, prometheus.DefaultRegisterer)
581-
manager, err = ruler.NewDefaultMultiTenantManager(t.Cfg.Ruler, managerFactory, prometheus.DefaultRegisterer, util_log.Logger)
580+
managerFactory, metrics := ruler.DefaultTenantManagerFactory(t.Cfg.Ruler, t.Cfg.ExternalPusher, t.Cfg.ExternalQueryable, queryEngine, t.Overrides, prometheus.DefaultRegisterer)
581+
manager, err = ruler.NewDefaultMultiTenantManager(t.Cfg.Ruler, managerFactory, metrics, prometheus.DefaultRegisterer, util_log.Logger)
582582
} else {
583583
rulerRegisterer := prometheus.WrapRegistererWith(prometheus.Labels{"engine": "ruler"}, prometheus.DefaultRegisterer)
584584
// TODO: Consider wrapping logger to differentiate from querier module logger
585585
queryable, _, engine := querier.New(t.Cfg.Querier, t.Overrides, t.Distributor, t.StoreQueryables, rulerRegisterer, util_log.Logger)
586586

587-
managerFactory := ruler.DefaultTenantManagerFactory(t.Cfg.Ruler, t.Distributor, queryable, engine, t.Overrides, prometheus.DefaultRegisterer)
588-
manager, err = ruler.NewDefaultMultiTenantManager(t.Cfg.Ruler, managerFactory, prometheus.DefaultRegisterer, util_log.Logger)
587+
managerFactory, metrics := ruler.DefaultTenantManagerFactory(t.Cfg.Ruler, t.Distributor, queryable, engine, t.Overrides, prometheus.DefaultRegisterer)
588+
manager, err = ruler.NewDefaultMultiTenantManager(t.Cfg.Ruler, managerFactory, metrics, prometheus.DefaultRegisterer, util_log.Logger)
589589
}
590590

591591
if err != nil {

pkg/ruler/compat.go

Lines changed: 9 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ import (
88
"github.com/go-kit/log"
99
"github.com/go-kit/log/level"
1010
"github.com/prometheus/client_golang/prometheus"
11-
"github.com/prometheus/client_golang/prometheus/promauto"
1211
"github.com/prometheus/prometheus/model/exemplar"
1312
"github.com/prometheus/prometheus/model/histogram"
1413
"github.com/prometheus/prometheus/model/labels"
@@ -279,31 +278,8 @@ type RulesManager interface {
279278
// ManagerFactory is a function that creates new RulesManager for given user and notifier.Manager.
280279
type ManagerFactory func(ctx context.Context, userID string, notifier *notifier.Manager, logger log.Logger, reg prometheus.Registerer) RulesManager
281280

282-
func DefaultTenantManagerFactory(cfg Config, p Pusher, q storage.Queryable, engine v1.QueryEngine, overrides RulesLimits, reg prometheus.Registerer) ManagerFactory {
283-
totalWritesVec := promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
284-
Name: "cortex_ruler_write_requests_total",
285-
Help: "Number of write requests to ingesters.",
286-
}, []string{"user"})
287-
failedWritesVec := promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
288-
Name: "cortex_ruler_write_requests_failed_total",
289-
Help: "Number of failed write requests to ingesters.",
290-
}, []string{"user"})
291-
292-
totalQueriesVec := promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
293-
Name: "cortex_ruler_queries_total",
294-
Help: "Number of queries executed by ruler.",
295-
}, []string{"user"})
296-
failedQueriesVec := promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
297-
Name: "cortex_ruler_queries_failed_total",
298-
Help: "Number of failed queries by ruler.",
299-
}, []string{"user"})
300-
var rulerQuerySeconds *prometheus.CounterVec
301-
if cfg.EnableQueryStats {
302-
rulerQuerySeconds = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
303-
Name: "cortex_ruler_query_seconds_total",
304-
Help: "Total amount of wall clock time spent processing queries by the ruler.",
305-
}, []string{"user"})
306-
}
281+
func DefaultTenantManagerFactory(cfg Config, p Pusher, q storage.Queryable, engine v1.QueryEngine, overrides RulesLimits, reg prometheus.Registerer) (ManagerFactory, *RuleEvalMetrics) {
282+
metrics := NewRuleEvalMetrics(cfg, reg)
307283

308284
// Wrap errors returned by Queryable to our wrapper, so that we can distinguish between those errors
309285
// and errors returned by PromQL engine. Errors from Queryable can be either caused by user (limits) or internal errors.
@@ -312,14 +288,14 @@ func DefaultTenantManagerFactory(cfg Config, p Pusher, q storage.Queryable, engi
312288

313289
return func(ctx context.Context, userID string, notifier *notifier.Manager, logger log.Logger, reg prometheus.Registerer) RulesManager {
314290
var queryTime prometheus.Counter
315-
if rulerQuerySeconds != nil {
316-
queryTime = rulerQuerySeconds.WithLabelValues(userID)
291+
if metrics.RulerQuerySeconds != nil {
292+
queryTime = metrics.RulerQuerySeconds.WithLabelValues(userID)
317293
}
318294

319-
failedQueries := failedQueriesVec.WithLabelValues(userID)
320-
totalQueries := totalQueriesVec.WithLabelValues(userID)
321-
totalWrites := totalWritesVec.WithLabelValues(userID)
322-
failedWrites := failedWritesVec.WithLabelValues(userID)
295+
failedQueries := metrics.FailedQueriesVec.WithLabelValues(userID)
296+
totalQueries := metrics.TotalQueriesVec.WithLabelValues(userID)
297+
totalWrites := metrics.TotalWritesVec.WithLabelValues(userID)
298+
failedWrites := metrics.FailedWritesVec.WithLabelValues(userID)
323299

324300
engineQueryFunc := EngineQueryFunc(engine, q, overrides, userID)
325301
metricsQueryFunc := MetricsQueryFunc(engineQueryFunc, totalQueries, failedQueries)
@@ -339,7 +315,7 @@ func DefaultTenantManagerFactory(cfg Config, p Pusher, q storage.Queryable, engi
339315
ConcurrentEvalsEnabled: cfg.ConcurrentEvalsEnabled,
340316
MaxConcurrentEvals: cfg.MaxConcurrentEvals,
341317
})
342-
}
318+
}, metrics
343319
}
344320

345321
type QueryableError struct {

pkg/ruler/manager.go

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,10 @@ import (
2626
)
2727

2828
type DefaultMultiTenantManager struct {
29-
cfg Config
30-
notifierCfg *config.Config
31-
managerFactory ManagerFactory
29+
cfg Config
30+
notifierCfg *config.Config
31+
managerFactory ManagerFactory
32+
ruleEvalMetrics *RuleEvalMetrics
3233

3334
mapper *mapper
3435

@@ -51,7 +52,7 @@ type DefaultMultiTenantManager struct {
5152
logger log.Logger
5253
}
5354

54-
func NewDefaultMultiTenantManager(cfg Config, managerFactory ManagerFactory, reg prometheus.Registerer, logger log.Logger) (*DefaultMultiTenantManager, error) {
55+
func NewDefaultMultiTenantManager(cfg Config, managerFactory ManagerFactory, evalMetrics *RuleEvalMetrics, reg prometheus.Registerer, logger log.Logger) (*DefaultMultiTenantManager, error) {
5556
ncfg, err := buildNotifierConfig(&cfg)
5657
if err != nil {
5758
return nil, err
@@ -78,6 +79,7 @@ func NewDefaultMultiTenantManager(cfg Config, managerFactory ManagerFactory, reg
7879
cfg: cfg,
7980
notifierCfg: ncfg,
8081
managerFactory: managerFactory,
82+
ruleEvalMetrics: evalMetrics,
8183
notifiers: map[string]*rulerNotifier{},
8284
notifiersDiscoveryMetrics: notifiersDiscoveryMetrics,
8385
mapper: newMapper(cfg.RulePath, logger),
@@ -130,6 +132,9 @@ func (r *DefaultMultiTenantManager) SyncRuleGroups(ctx context.Context, ruleGrou
130132
r.lastReloadSuccessfulTimestamp.DeleteLabelValues(userID)
131133
r.configUpdatesTotal.DeleteLabelValues(userID)
132134
r.userManagerMetrics.RemoveUserRegistry(userID)
135+
if r.ruleEvalMetrics != nil {
136+
r.ruleEvalMetrics.deletePerUserMetrics(userID)
137+
}
133138
level.Info(r.logger).Log("msg", "deleted rule manager and local rule files", "user", userID)
134139
}
135140
}

pkg/ruler/manager_metrics.go

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package ruler
22

33
import (
44
"github.com/prometheus/client_golang/prometheus"
5+
"github.com/prometheus/client_golang/prometheus/promauto"
56

67
"github.com/cortexproject/cortex/pkg/util"
78
)
@@ -222,3 +223,51 @@ func (m *ManagerMetrics) Collect(out chan<- prometheus.Metric) {
222223
data.SendSumOfGaugesPerUser(out, m.NotificationQueueCapacity, "prometheus_notifications_queue_capacity")
223224
data.SendSumOfGaugesPerUser(out, m.AlertmanagersDiscovered, "prometheus_notifications_alertmanagers_discovered")
224225
}
226+
227+
type RuleEvalMetrics struct {
228+
TotalWritesVec *prometheus.CounterVec
229+
FailedWritesVec *prometheus.CounterVec
230+
TotalQueriesVec *prometheus.CounterVec
231+
FailedQueriesVec *prometheus.CounterVec
232+
RulerQuerySeconds *prometheus.CounterVec
233+
}
234+
235+
func NewRuleEvalMetrics(cfg Config, reg prometheus.Registerer) *RuleEvalMetrics {
236+
m := &RuleEvalMetrics{
237+
TotalWritesVec: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
238+
Name: "cortex_ruler_write_requests_total",
239+
Help: "Number of write requests to ingesters.",
240+
}, []string{"user"}),
241+
FailedWritesVec: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
242+
Name: "cortex_ruler_write_requests_failed_total",
243+
Help: "Number of failed write requests to ingesters.",
244+
}, []string{"user"}),
245+
TotalQueriesVec: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
246+
Name: "cortex_ruler_queries_total",
247+
Help: "Number of queries executed by ruler.",
248+
}, []string{"user"}),
249+
FailedQueriesVec: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
250+
Name: "cortex_ruler_queries_failed_total",
251+
Help: "Number of failed queries by ruler.",
252+
}, []string{"user"}),
253+
}
254+
if cfg.EnableQueryStats {
255+
m.RulerQuerySeconds = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
256+
Name: "cortex_ruler_query_seconds_total",
257+
Help: "Total amount of wall clock time spent processing queries by the ruler.",
258+
}, []string{"user"})
259+
}
260+
261+
return m
262+
}
263+
264+
func (m *RuleEvalMetrics) deletePerUserMetrics(userID string) {
265+
m.TotalWritesVec.DeleteLabelValues(userID)
266+
m.FailedWritesVec.DeleteLabelValues(userID)
267+
m.TotalQueriesVec.DeleteLabelValues(userID)
268+
m.FailedQueriesVec.DeleteLabelValues(userID)
269+
270+
if m.RulerQuerySeconds != nil {
271+
m.RulerQuerySeconds.DeleteLabelValues(userID)
272+
}
273+
}

pkg/ruler/manager_metrics_test.go

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ import (
1010
dto "github.com/prometheus/client_model/go"
1111
"github.com/stretchr/testify/assert"
1212
"github.com/stretchr/testify/require"
13+
14+
"github.com/cortexproject/cortex/pkg/util"
1315
)
1416

1517
func TestManagerMetricsWithRuleGroupLabel(t *testing.T) {
@@ -556,3 +558,40 @@ func TestMetricsArePerUser(t *testing.T) {
556558
assert.True(t, foundUserLabel, "user label not found for metric %s", desc.String())
557559
}
558560
}
561+
562+
func TestRuleEvalMetricsDeletePerUserMetrics(t *testing.T) {
563+
dir := t.TempDir()
564+
reg := prometheus.NewPedanticRegistry()
565+
566+
m := NewRuleEvalMetrics(Config{RulePath: dir, EnableQueryStats: true}, reg)
567+
m.TotalWritesVec.WithLabelValues("fake1").Add(10)
568+
m.TotalWritesVec.WithLabelValues("fake2").Add(10)
569+
m.FailedWritesVec.WithLabelValues("fake1").Add(10)
570+
m.FailedWritesVec.WithLabelValues("fake2").Add(10)
571+
m.TotalQueriesVec.WithLabelValues("fake1").Add(10)
572+
m.TotalQueriesVec.WithLabelValues("fake2").Add(10)
573+
m.FailedQueriesVec.WithLabelValues("fake1").Add(10)
574+
m.FailedQueriesVec.WithLabelValues("fake2").Add(10)
575+
m.RulerQuerySeconds.WithLabelValues("fake1").Add(10)
576+
m.RulerQuerySeconds.WithLabelValues("fake2").Add(10)
577+
578+
metricNames := []string{"cortex_ruler_write_requests_total", "cortex_ruler_write_requests_failed_total", "cortex_ruler_queries_total", "cortex_ruler_queries_failed_total", "cortex_ruler_query_seconds_total"}
579+
gm, err := reg.Gather()
580+
require.NoError(t, err)
581+
mfm, err := util.NewMetricFamilyMap(gm)
582+
require.NoError(t, err)
583+
for _, name := range metricNames {
584+
require.Contains(t, mfm[name].String(), "value:\"fake1\"")
585+
require.Contains(t, mfm[name].String(), "value:\"fake2\"")
586+
}
587+
588+
m.deletePerUserMetrics("fake1")
589+
gm, err = reg.Gather()
590+
require.NoError(t, err)
591+
mfm, err = util.NewMetricFamilyMap(gm)
592+
require.NoError(t, err)
593+
for _, name := range metricNames {
594+
require.NotContains(t, mfm[name].String(), "value:\"fake1\"")
595+
require.Contains(t, mfm[name].String(), "value:\"fake2\"")
596+
}
597+
}

pkg/ruler/manager_test.go

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,14 @@ import (
1414
"go.uber.org/atomic"
1515

1616
"github.com/cortexproject/cortex/pkg/ruler/rulespb"
17+
"github.com/cortexproject/cortex/pkg/util"
1718
"github.com/cortexproject/cortex/pkg/util/test"
1819
)
1920

2021
func TestSyncRuleGroups(t *testing.T) {
2122
dir := t.TempDir()
2223

23-
m, err := NewDefaultMultiTenantManager(Config{RulePath: dir}, factory, nil, log.NewNopLogger())
24+
m, err := NewDefaultMultiTenantManager(Config{RulePath: dir}, factory, nil, nil, log.NewNopLogger())
2425
require.NoError(t, err)
2526

2627
const user = "testUser"
@@ -96,6 +97,47 @@ func TestSyncRuleGroups(t *testing.T) {
9697
})
9798
}
9899

100+
func TestSyncRuleGroupsCleanUpPerUserMetrics(t *testing.T) {
101+
dir := t.TempDir()
102+
reg := prometheus.NewPedanticRegistry()
103+
evalMetrics := NewRuleEvalMetrics(Config{RulePath: dir, EnableQueryStats: true}, reg)
104+
m, err := NewDefaultMultiTenantManager(Config{RulePath: dir}, factory, evalMetrics, reg, log.NewNopLogger())
105+
require.NoError(t, err)
106+
107+
const user = "testUser"
108+
109+
evalMetrics.TotalWritesVec.WithLabelValues(user).Add(10)
110+
111+
userRules := map[string]rulespb.RuleGroupList{
112+
user: {
113+
&rulespb.RuleGroupDesc{
114+
Name: "group1",
115+
Namespace: "ns",
116+
Interval: 1 * time.Minute,
117+
User: user,
118+
},
119+
},
120+
}
121+
m.SyncRuleGroups(context.Background(), userRules)
122+
gm, err := reg.Gather()
123+
require.NoError(t, err)
124+
mfm, err := util.NewMetricFamilyMap(gm)
125+
require.NoError(t, err)
126+
require.Contains(t, mfm["cortex_ruler_write_requests_total"].String(), "value:\""+user+"\"")
127+
require.Contains(t, mfm["cortex_ruler_config_last_reload_successful"].String(), "value:\""+user+"\"")
128+
129+
// Passing empty map / nil stops all managers.
130+
m.SyncRuleGroups(context.Background(), nil)
131+
require.Nil(t, getManager(m, user))
132+
133+
gm, err = reg.Gather()
134+
require.NoError(t, err)
135+
mfm, err = util.NewMetricFamilyMap(gm)
136+
require.NoError(t, err)
137+
require.NotContains(t, mfm["cortex_ruler_write_requests_total"].String(), "value:\""+user+"\"")
138+
require.NotContains(t, mfm["cortex_ruler_config_last_reload_successful"].String(), "value:\""+user+"\"")
139+
}
140+
99141
func getManager(m *DefaultMultiTenantManager, user string) RulesManager {
100142
m.userManagerMtx.Lock()
101143
defer m.userManagerMtx.Unlock()

pkg/ruler/ruler_test.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,8 @@ func testSetup(t *testing.T, querierTestConfig *querier.TestConfig) (*promql.Eng
175175

176176
func newManager(t *testing.T, cfg Config) *DefaultMultiTenantManager {
177177
engine, queryable, pusher, logger, overrides, reg := testSetup(t, nil)
178-
manager, err := NewDefaultMultiTenantManager(cfg, DefaultTenantManagerFactory(cfg, pusher, queryable, engine, overrides, nil), reg, logger)
178+
managerFactory, metrics := DefaultTenantManagerFactory(cfg, pusher, queryable, engine, overrides, nil)
179+
manager, err := NewDefaultMultiTenantManager(cfg, managerFactory, metrics, reg, logger)
179180
require.NoError(t, err)
180181

181182
return manager
@@ -222,8 +223,8 @@ func newMockClientsPool(cfg Config, logger log.Logger, reg prometheus.Registerer
222223
func buildRuler(t *testing.T, rulerConfig Config, querierTestConfig *querier.TestConfig, store rulestore.RuleStore, rulerAddrMap map[string]*Ruler) (*Ruler, *DefaultMultiTenantManager) {
223224
engine, queryable, pusher, logger, overrides, reg := testSetup(t, querierTestConfig)
224225

225-
managerFactory := DefaultTenantManagerFactory(rulerConfig, pusher, queryable, engine, overrides, reg)
226-
manager, err := NewDefaultMultiTenantManager(rulerConfig, managerFactory, reg, log.NewNopLogger())
226+
managerFactory, metrics := DefaultTenantManagerFactory(rulerConfig, pusher, queryable, engine, overrides, reg)
227+
manager, err := NewDefaultMultiTenantManager(rulerConfig, managerFactory, metrics, reg, log.NewNopLogger())
227228
require.NoError(t, err)
228229

229230
ruler, err := newRuler(

0 commit comments

Comments
 (0)