Skip to content

Commit d916e8c

Browse files
authored
Fix Alertmanager status page if clustering via gossip is disabled or sharding is enabled (#4184)
Signed-off-by: Xiaochao Dong (@damnever) <the.xcdong@gmail.com>
1 parent 2ba3fdd commit d916e8c

6 files changed

Lines changed: 201 additions & 88 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
* [ENHANCEMENT] Added `tenant_ids` tag to tracing spans #4147
2929
* [BUGFIX] Purger: fix `Invalid null value in condition for column range` caused by `nil` value in range for WriteBatch query. #4128
3030
* [BUGFIX] Ingester: fixed infrequent panic caused by a race condition between TSDB mmap-ed head chunks truncation and queries. #4176
31+
* [BUGFIX] Alertmanager: fix Alertmanager status page if clustering via gossip is disabled or sharding is enabled. #4184
3132
* [BUGFIX] Ruler: fix `/ruler/rule_groups` endpoint not working when used with object store. #4182
3233
* [BUGFIX] Ruler: Honor the evaluation delay for the `ALERTS` and `ALERTS_FOR_STATE` series. #4227
3334
* [BUGFIX] Fixed cache fetch error on Redis Cluster. #4056

integration/alertmanager_test.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -450,6 +450,14 @@ func TestAlertmanagerSharding(t *testing.T) {
450450
}
451451
}
452452

453+
// Endpoint: GET /multitenant_alertmanager/status
454+
{
455+
for _, c := range clients {
456+
_, err := c.GetAlertmanagerStatusPage(context.Background())
457+
assert.NoError(t, err)
458+
}
459+
}
460+
453461
// Endpoint: GET /status
454462
{
455463
for _, c := range clients {

integration/e2ecortex/client.go

Lines changed: 44 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -27,20 +27,19 @@ import (
2727
"github.com/cortexproject/cortex/pkg/ruler"
2828
)
2929

30-
var (
31-
ErrNotFound = errors.New("not found")
32-
)
30+
var ErrNotFound = errors.New("not found")
3331

3432
// Client is a client used to interact with Cortex in integration tests
3533
type Client struct {
36-
alertmanagerClient promapi.Client
37-
querierAddress string
38-
rulerAddress string
39-
distributorAddress string
40-
timeout time.Duration
41-
httpClient *http.Client
42-
querierClient promv1.API
43-
orgID string
34+
alertmanagerClient promapi.Client
35+
querierAddress string
36+
alertmanagerAddress string
37+
rulerAddress string
38+
distributorAddress string
39+
timeout time.Duration
40+
httpClient *http.Client
41+
querierClient promv1.API
42+
orgID string
4443
}
4544

4645
// NewClient makes a new Cortex client
@@ -61,13 +60,14 @@ func NewClient(
6160
}
6261

6362
c := &Client{
64-
distributorAddress: distributorAddress,
65-
querierAddress: querierAddress,
66-
rulerAddress: rulerAddress,
67-
timeout: 5 * time.Second,
68-
httpClient: &http.Client{},
69-
querierClient: promv1.NewAPI(querierAPIClient),
70-
orgID: orgID,
63+
distributorAddress: distributorAddress,
64+
querierAddress: querierAddress,
65+
alertmanagerAddress: alertmanagerAddress,
66+
rulerAddress: rulerAddress,
67+
timeout: 5 * time.Second,
68+
httpClient: &http.Client{},
69+
querierClient: promv1.NewAPI(querierAPIClient),
70+
orgID: orgID,
7171
}
7272

7373
if alertmanagerAddress != "" {
@@ -391,6 +391,32 @@ type userConfig struct {
391391
AlertmanagerConfig string `yaml:"alertmanager_config"`
392392
}
393393

394+
// GetAlertmanagerStatusPage gets the status page of alertmanager.
395+
func (c *Client) GetAlertmanagerStatusPage(ctx context.Context) ([]byte, error) {
396+
return c.getRawPage(ctx, "http://"+c.alertmanagerAddress+"/multitenant_alertmanager/status")
397+
}
398+
399+
func (c *Client) getRawPage(ctx context.Context, url string) ([]byte, error) {
400+
req, err := http.NewRequest(http.MethodGet, url, nil)
401+
if err != nil {
402+
return nil, err
403+
}
404+
resp, err := c.httpClient.Do(req.WithContext(ctx))
405+
if err != nil {
406+
return nil, err
407+
}
408+
defer resp.Body.Close()
409+
410+
content, err := ioutil.ReadAll(resp.Body)
411+
if err != nil {
412+
return nil, err
413+
}
414+
if resp.StatusCode/100 != 2 {
415+
return nil, fmt.Errorf("fetching page failed with status %d and content %v", resp.StatusCode, string(content))
416+
}
417+
return content, nil
418+
}
419+
394420
// GetAlertmanagerConfig gets the status of an alertmanager instance
395421
func (c *Client) GetAlertmanagerConfig(ctx context.Context) (*alertConfig.Config, error) {
396422
u := c.alertmanagerClient.URL("/api/prom/api/v1/status", nil)
@@ -433,7 +459,6 @@ func (c *Client) SetAlertmanagerConfig(ctx context.Context, amConfig string, tem
433459
AlertmanagerConfig: amConfig,
434460
TemplateFiles: templates,
435461
})
436-
437462
if err != nil {
438463
return err
439464
}

pkg/alertmanager/alertmanager_http.go

Lines changed: 65 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ import (
1111
)
1212

1313
var (
14-
statusPageTemplate = template.Must(template.New("main").Parse(`
14+
ringStatusPageTemplate = template.Must(template.New("ringStatusPage").Parse(`
1515
<!DOCTYPE html>
1616
<html>
1717
<head>
@@ -23,31 +23,90 @@ var (
2323
<p>{{ .Message }}</p>
2424
</body>
2525
</html>`))
26+
27+
statusTemplate = template.Must(template.New("statusPage").Parse(`
28+
<!doctype html>
29+
<html>
30+
<head><title>Cortex Alertmanager Status</title></head>
31+
<body>
32+
<h1>Cortex Alertmanager Status</h1>
33+
{{ if not .ClusterInfo }}
34+
<p>Alertmanager gossip-based clustering is disabled.</p>
35+
{{ else }}
36+
<h2>Node</h2>
37+
<dl>
38+
<dt>Name</dt><dd>{{.ClusterInfo.self.Name}}</dd>
39+
<dt>Addr</dt><dd>{{.ClusterInfo.self.Addr}}</dd>
40+
<dt>Port</dt><dd>{{.ClusterInfo.self.Port}}</dd>
41+
</dl>
42+
<h3>Members</h3>
43+
{{ with .ClusterInfo.members }}
44+
<table>
45+
<tr><th>Name</th><th>Addr</th></tr>
46+
{{ range . }}
47+
<tr><td>{{ .Name }}</td><td>{{ .Addr }}</td></tr>
48+
{{ end }}
49+
</table>
50+
{{ else }}
51+
<p>No peers</p>
52+
{{ end }}
53+
{{ end }}
54+
</body>
55+
</html>`))
2656
)
2757

28-
func writeMessage(w http.ResponseWriter, message string) {
58+
func writeRingStatusMessage(w http.ResponseWriter, message string) {
2959
w.WriteHeader(http.StatusOK)
30-
err := statusPageTemplate.Execute(w, struct {
60+
err := ringStatusPageTemplate.Execute(w, struct {
3161
Message string
3262
}{Message: message})
33-
3463
if err != nil {
3564
level.Error(util_log.Logger).Log("msg", "unable to serve alertmanager ring page", "err", err)
3665
}
3766
}
3867

3968
func (am *MultitenantAlertmanager) RingHandler(w http.ResponseWriter, req *http.Request) {
4069
if !am.cfg.ShardingEnabled {
41-
writeMessage(w, "Alertmanager has no ring because sharding is disabled.")
70+
writeRingStatusMessage(w, "Alertmanager has no ring because sharding is disabled.")
4271
return
4372
}
4473

4574
if am.State() != services.Running {
4675
// we cannot read the ring before the alertmanager is in Running state,
4776
// because that would lead to race condition.
48-
writeMessage(w, "Alertmanager is not running yet.")
77+
writeRingStatusMessage(w, "Alertmanager is not running yet.")
4978
return
5079
}
5180

5281
am.ring.ServeHTTP(w, req)
5382
}
83+
84+
// GetStatusHandler returns the status handler for this multi-tenant
85+
// alertmanager.
86+
func (am *MultitenantAlertmanager) GetStatusHandler() StatusHandler {
87+
return StatusHandler{
88+
am: am,
89+
}
90+
}
91+
92+
// StatusHandler shows the status of the alertmanager.
93+
type StatusHandler struct {
94+
am *MultitenantAlertmanager
95+
}
96+
97+
// ServeHTTP serves the status of the alertmanager.
98+
func (s StatusHandler) ServeHTTP(w http.ResponseWriter, _ *http.Request) {
99+
var clusterInfo map[string]interface{}
100+
if s.am.peer != nil {
101+
clusterInfo = s.am.peer.Info()
102+
}
103+
err := statusTemplate.Execute(w, struct {
104+
ClusterInfo map[string]interface{}
105+
}{
106+
ClusterInfo: clusterInfo,
107+
})
108+
if err != nil {
109+
level.Error(util_log.Logger).Log("msg", "unable to serve alertmanager status page", "err", err)
110+
http.Error(w, err.Error(), http.StatusInternalServerError)
111+
}
112+
}
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
package alertmanager
2+
3+
import (
4+
"context"
5+
"io/ioutil"
6+
"net/http/httptest"
7+
"testing"
8+
"time"
9+
10+
"github.com/go-kit/kit/log"
11+
"github.com/prometheus/alertmanager/cluster"
12+
"github.com/prometheus/client_golang/prometheus"
13+
"github.com/stretchr/testify/require"
14+
)
15+
16+
func TestMultitenantAlertmanager_GetStatusHandler(t *testing.T) {
17+
ctx, cancel := context.WithCancel(context.Background())
18+
defer cancel()
19+
var peer *cluster.Peer
20+
{
21+
logger := log.NewNopLogger()
22+
createPeer := func(peers []string) (*cluster.Peer, error) {
23+
return cluster.Create(
24+
logger,
25+
prometheus.NewRegistry(),
26+
"127.0.0.1:0",
27+
"",
28+
peers,
29+
true,
30+
cluster.DefaultPushPullInterval,
31+
cluster.DefaultGossipInterval,
32+
cluster.DefaultTcpTimeout,
33+
cluster.DefaultProbeTimeout,
34+
cluster.DefaultProbeInterval,
35+
)
36+
}
37+
38+
peer1, err := createPeer(nil)
39+
require.NoError(t, err)
40+
require.NotNil(t, peer1)
41+
err = peer1.Join(cluster.DefaultReconnectInterval, cluster.DefaultReconnectTimeout)
42+
require.NoError(t, err)
43+
go peer1.Settle(ctx, 0*time.Second)
44+
require.NoError(t, peer1.WaitReady(ctx))
45+
require.Equal(t, peer1.Status(), "ready")
46+
47+
peer2, err := createPeer([]string{peer1.Self().Address()})
48+
require.NoError(t, err)
49+
require.NotNil(t, peer2)
50+
err = peer2.Join(cluster.DefaultReconnectInterval, cluster.DefaultReconnectTimeout)
51+
require.NoError(t, err)
52+
go peer2.Settle(ctx, 0*time.Second)
53+
peer = peer2
54+
}
55+
56+
for _, tt := range []struct {
57+
am *MultitenantAlertmanager
58+
content string
59+
nocontent string
60+
}{
61+
{
62+
am: &MultitenantAlertmanager{peer: nil},
63+
content: "Alertmanager gossip-based clustering is disabled.",
64+
nocontent: "Node",
65+
},
66+
{
67+
am: &MultitenantAlertmanager{peer: peer},
68+
content: "Members",
69+
nocontent: "No peers",
70+
},
71+
} {
72+
req := httptest.NewRequest("GET", "http://alertmanager.cortex/status", nil)
73+
w := httptest.NewRecorder()
74+
tt.am.GetStatusHandler().ServeHTTP(w, req)
75+
76+
resp := w.Result()
77+
require.Equal(t, 200, w.Code)
78+
body, _ := ioutil.ReadAll(resp.Body)
79+
content := string(body)
80+
require.Contains(t, content, tt.content)
81+
require.NotContains(t, content, tt.nocontent)
82+
}
83+
}

0 commit comments

Comments
 (0)