@@ -3,7 +3,9 @@ package ruler
3
3
import (
4
4
"flag"
5
5
"fmt"
6
+ "net/http"
6
7
"net/url"
8
+ "sync"
7
9
"time"
8
10
9
11
"github.com/prometheus/client_golang/prometheus"
@@ -14,6 +16,7 @@ import (
14
16
"github.com/prometheus/prometheus/promql"
15
17
"github.com/prometheus/prometheus/rules"
16
18
"golang.org/x/net/context"
19
+ "golang.org/x/net/context/ctxhttp"
17
20
18
21
"github.com/weaveworks/common/user"
19
22
"github.com/weaveworks/cortex/chunk"
@@ -87,30 +90,30 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
87
90
88
91
// Ruler evaluates rules.
89
92
type Ruler struct {
90
- engine * promql.Engine
91
- pusher Pusher
92
- alertURL * url.URL
93
- notifier * notifier.Notifier
93
+ engine * promql.Engine
94
+ pusher Pusher
95
+ alertURL * url.URL
96
+ notifierCfg * config.Config
97
+ queueCapacity int
98
+
99
+ // Per-user notifiers with separate queues.
100
+ notifiersMtx sync.Mutex
101
+ notifiers map [string ]* notifier.Notifier
94
102
}
95
103
96
104
// NewRuler creates a new ruler from a distributor and chunk store.
97
105
func NewRuler (cfg Config , d * distributor.Distributor , c * chunk.Store ) (* Ruler , error ) {
98
- n := notifier .New (& notifier.Options {
99
- QueueCapacity : cfg .NotificationQueueCapacity ,
100
- })
101
106
ncfg , err := buildNotifierConfig (& cfg )
102
107
if err != nil {
103
108
return nil , err
104
109
}
105
- if err = n .ApplyConfig (ncfg ); err != nil {
106
- return nil , err
107
- }
108
- go n .Run ()
109
110
return & Ruler {
110
- engine : querier .NewEngine (d , c ),
111
- pusher : d ,
112
- alertURL : cfg .ExternalURL .URL ,
113
- notifier : n ,
111
+ engine : querier .NewEngine (d , c ),
112
+ pusher : d ,
113
+ alertURL : cfg .ExternalURL .URL ,
114
+ notifierCfg : ncfg ,
115
+ queueCapacity : cfg .NotificationQueueCapacity ,
116
+ notifiers : map [string ]* notifier.Notifier {},
114
117
}, nil
115
118
}
116
119
@@ -163,25 +166,65 @@ func buildNotifierConfig(rulerConfig *Config) (*config.Config, error) {
163
166
return promConfig , nil
164
167
}
165
168
166
- func (r * Ruler ) newGroup (ctx context.Context , rs []rules.Rule ) * rules.Group {
169
+ func (r * Ruler ) newGroup (ctx context.Context , rs []rules.Rule ) ( * rules.Group , error ) {
167
170
appender := appenderAdapter {pusher : r .pusher , ctx : ctx }
171
+ userID , err := user .GetID (ctx )
172
+ if err != nil {
173
+ return nil , err
174
+ }
175
+ notifier , err := r .getOrCreateNotifier (userID )
176
+ if err != nil {
177
+ return nil , err
178
+ }
168
179
opts := & rules.ManagerOptions {
169
180
SampleAppender : appender ,
170
181
QueryEngine : r .engine ,
171
182
Context : ctx ,
172
183
ExternalURL : r .alertURL ,
173
- Notifier : r . notifier ,
184
+ Notifier : notifier ,
174
185
}
175
186
delay := 0 * time .Second // Unused, so 0 value is fine.
176
- return rules .NewGroup ("default" , delay , rs , opts )
187
+ return rules .NewGroup ("default" , delay , rs , opts ), nil
188
+ }
189
+
190
+ func (r * Ruler ) getOrCreateNotifier (userID string ) (* notifier.Notifier , error ) {
191
+ r .notifiersMtx .Lock ()
192
+ defer r .notifiersMtx .Unlock ()
193
+
194
+ n , ok := r .notifiers [userID ]
195
+ if ok {
196
+ return n , nil
197
+ }
198
+
199
+ n = notifier .New (& notifier.Options {
200
+ QueueCapacity : r .queueCapacity ,
201
+ Do : func (ctx context.Context , client * http.Client , req * http.Request ) (* http.Response , error ) {
202
+ req .Header .Set (user .OrgIDHeaderName , userID )
203
+ return ctxhttp .Do (ctx , client , req )
204
+ },
205
+ })
206
+
207
+ // This should never fail, unless there's a programming mistake.
208
+ if err := n .ApplyConfig (r .notifierCfg ); err != nil {
209
+ return nil , err
210
+ }
211
+ go n .Run ()
212
+
213
+ // TODO: Remove notifiers for stale users. Right now this is a slow leak.
214
+ r .notifiers [userID ] = n
215
+ return n , nil
177
216
}
178
217
179
218
// Evaluate a list of rules in the given context.
180
219
func (r * Ruler ) Evaluate (ctx context.Context , rs []rules.Rule ) {
181
220
log .Debugf ("Evaluating %d rules..." , len (rs ))
182
221
start := time .Now ()
183
- g := r .newGroup (ctx , rs )
184
- g .Eval ()
222
+ g , err := r .newGroup (ctx , rs )
223
+ if err != nil {
224
+ log .Errorf ("Failed to create rule group: %v" , err )
225
+ } else {
226
+ g .Eval ()
227
+ }
185
228
// The prometheus routines we're calling have their own instrumentation
186
229
// but, a) it's rule-based, not group-based, b) it's a summary, not a
187
230
// histogram, so we can't reliably aggregate.
@@ -191,7 +234,12 @@ func (r *Ruler) Evaluate(ctx context.Context, rs []rules.Rule) {
191
234
192
235
// Stop stops the Ruler.
193
236
func (r * Ruler ) Stop () {
194
- r .notifier .Stop ()
237
+ r .notifiersMtx .Lock ()
238
+ defer r .notifiersMtx .Unlock ()
239
+
240
+ for _ , n := range r .notifiers {
241
+ n .Stop ()
242
+ }
195
243
}
196
244
197
245
// Server is a rules server.
0 commit comments