Skip to content

Commit cad2653

Browse files
ymao1kibanamachine
andauthored
[Alerting] Improving health status check (#93282)
* wip * Moving catchError so observable stream does not complete. Adding retry on failure * Using retryWhen. Updating unit tests * PR fixes Co-authored-by: Kibana Machine <42973632+kibanamachine@users.noreply.github.com>
1 parent 51b416b commit cad2653

File tree

3 files changed

+220
-80
lines changed

3 files changed

+220
-80
lines changed

x-pack/plugins/alerts/server/health/get_state.test.ts

Lines changed: 166 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -5,73 +5,182 @@
55
* 2.0.
66
*/
77

8+
import { ServiceStatusLevels } from '../../../../../src/core/server';
89
import { taskManagerMock } from '../../../task_manager/server/mocks';
9-
import { getHealthStatusStream } from '.';
10-
import { TaskStatus } from '../../../task_manager/server';
10+
import {
11+
getHealthStatusStream,
12+
getHealthServiceStatusWithRetryAndErrorHandling,
13+
MAX_RETRY_ATTEMPTS,
14+
} from './get_state';
15+
import { ConcreteTaskInstance, TaskStatus } from '../../../task_manager/server';
1116
import { HealthStatus } from '../types';
1217

13-
describe('getHealthStatusStream()', () => {
14-
const mockTaskManager = taskManagerMock.createStart();
15-
16-
it('should return an object with the "unavailable" level and proper summary of "Alerting framework is unhealthy"', async () => {
17-
mockTaskManager.get.mockReturnValue(
18-
new Promise((_resolve, _reject) => {
19-
return {
20-
id: 'test',
21-
attempts: 0,
22-
status: TaskStatus.Running,
23-
version: '123',
24-
runAt: new Date(),
25-
scheduledAt: new Date(),
26-
startedAt: new Date(),
27-
retryAt: new Date(Date.now() + 5 * 60 * 1000),
28-
state: {
29-
runs: 1,
30-
health_status: HealthStatus.Warning,
31-
},
32-
taskType: 'alerting:alerting_health_check',
33-
params: {
34-
alertId: '1',
35-
},
36-
ownerId: null,
37-
};
18+
const tick = () => new Promise((resolve) => setImmediate(resolve));
19+
20+
const getHealthCheckTask = (overrides = {}): ConcreteTaskInstance => ({
21+
id: 'test',
22+
attempts: 0,
23+
status: TaskStatus.Running,
24+
version: '123',
25+
runAt: new Date(),
26+
scheduledAt: new Date(),
27+
startedAt: new Date(),
28+
retryAt: new Date(Date.now() + 5 * 60 * 1000),
29+
state: {
30+
runs: 1,
31+
health_status: HealthStatus.OK,
32+
},
33+
taskType: 'alerting:alerting_health_check',
34+
params: {
35+
alertId: '1',
36+
},
37+
ownerId: null,
38+
...overrides,
39+
});
40+
41+
describe('getHealthServiceStatusWithRetryAndErrorHandling', () => {
42+
beforeEach(() => jest.useFakeTimers());
43+
44+
it('should get status at each interval', async () => {
45+
const mockTaskManager = taskManagerMock.createStart();
46+
mockTaskManager.get.mockResolvedValue(getHealthCheckTask());
47+
const pollInterval = 100;
48+
const halfInterval = Math.floor(pollInterval / 2);
49+
50+
getHealthStatusStream(mockTaskManager, pollInterval).subscribe();
51+
52+
// shouldn't fire before poll interval passes
53+
// should fire once each poll interval
54+
jest.advanceTimersByTime(halfInterval);
55+
expect(mockTaskManager.get).toHaveBeenCalledTimes(0);
56+
jest.advanceTimersByTime(halfInterval);
57+
expect(mockTaskManager.get).toHaveBeenCalledTimes(1);
58+
jest.advanceTimersByTime(pollInterval);
59+
expect(mockTaskManager.get).toHaveBeenCalledTimes(2);
60+
jest.advanceTimersByTime(pollInterval);
61+
expect(mockTaskManager.get).toHaveBeenCalledTimes(3);
62+
});
63+
64+
it('should retry on error', async () => {
65+
const mockTaskManager = taskManagerMock.createStart();
66+
mockTaskManager.get.mockRejectedValue(new Error('Failure'));
67+
const retryDelay = 10;
68+
const pollInterval = 100;
69+
const halfInterval = Math.floor(pollInterval / 2);
70+
71+
getHealthStatusStream(mockTaskManager, pollInterval, retryDelay).subscribe();
72+
73+
jest.advanceTimersByTime(halfInterval);
74+
expect(mockTaskManager.get).toHaveBeenCalledTimes(0);
75+
jest.advanceTimersByTime(halfInterval);
76+
expect(mockTaskManager.get).toHaveBeenCalledTimes(1);
77+
78+
// Retry on failure
79+
let numTimesCalled = 1;
80+
for (let i = 0; i < MAX_RETRY_ATTEMPTS; i++) {
81+
await tick();
82+
jest.advanceTimersByTime(retryDelay);
83+
expect(mockTaskManager.get).toHaveBeenCalledTimes(numTimesCalled++ + 1);
84+
}
85+
86+
// Once we've exceeded max retries, should not try again
87+
await tick();
88+
jest.advanceTimersByTime(retryDelay);
89+
expect(mockTaskManager.get).toHaveBeenCalledTimes(numTimesCalled);
90+
91+
// Once another poll interval passes, should call fn again
92+
await tick();
93+
jest.advanceTimersByTime(pollInterval - MAX_RETRY_ATTEMPTS * retryDelay);
94+
expect(mockTaskManager.get).toHaveBeenCalledTimes(numTimesCalled + 1);
95+
});
96+
97+
it('should return healthy status when health status is "ok"', async () => {
98+
const mockTaskManager = taskManagerMock.createStart();
99+
mockTaskManager.get.mockResolvedValue(getHealthCheckTask());
100+
101+
const status = await getHealthServiceStatusWithRetryAndErrorHandling(
102+
mockTaskManager
103+
).toPromise();
104+
105+
expect(status.level).toEqual(ServiceStatusLevels.available);
106+
expect(status.summary).toEqual('Alerting framework is available');
107+
});
108+
109+
it('should return degraded status when health status is "warn"', async () => {
110+
const mockTaskManager = taskManagerMock.createStart();
111+
mockTaskManager.get.mockResolvedValue(
112+
getHealthCheckTask({
113+
state: {
114+
runs: 1,
115+
health_status: HealthStatus.Warning,
116+
},
38117
})
39118
);
40-
getHealthStatusStream(mockTaskManager).subscribe(
41-
(val: { level: Readonly<unknown>; summary: string }) => {
42-
expect(val.level).toBe(false);
43-
}
44-
);
119+
120+
const status = await getHealthServiceStatusWithRetryAndErrorHandling(
121+
mockTaskManager
122+
).toPromise();
123+
124+
expect(status.level).toEqual(ServiceStatusLevels.degraded);
125+
expect(status.summary).toEqual('Alerting framework is degraded');
45126
});
46127

47-
it('should return an object with the "available" level and proper summary of "Alerting framework is healthy"', async () => {
48-
mockTaskManager.get.mockReturnValue(
49-
new Promise((_resolve, _reject) => {
50-
return {
51-
id: 'test',
52-
attempts: 0,
53-
status: TaskStatus.Running,
54-
version: '123',
55-
runAt: new Date(),
56-
scheduledAt: new Date(),
57-
startedAt: new Date(),
58-
retryAt: new Date(Date.now() + 5 * 60 * 1000),
59-
state: {
60-
runs: 1,
61-
health_status: HealthStatus.OK,
62-
},
63-
taskType: 'alerting:alerting_health_check',
64-
params: {
65-
alertId: '1',
66-
},
67-
ownerId: null,
68-
};
128+
it('should return unavailable status when health status is "error"', async () => {
129+
const mockTaskManager = taskManagerMock.createStart();
130+
mockTaskManager.get.mockResolvedValue(
131+
getHealthCheckTask({
132+
state: {
133+
runs: 1,
134+
health_status: HealthStatus.Error,
135+
},
69136
})
70137
);
71-
getHealthStatusStream(mockTaskManager).subscribe(
72-
(val: { level: Readonly<unknown>; summary: string }) => {
73-
expect(val.level).toBe(true);
138+
139+
const status = await getHealthServiceStatusWithRetryAndErrorHandling(
140+
mockTaskManager
141+
).toPromise();
142+
143+
expect(status.level).toEqual(ServiceStatusLevels.unavailable);
144+
expect(status.summary).toEqual('Alerting framework is unavailable');
145+
expect(status.meta).toBeUndefined();
146+
});
147+
148+
it('should retry on error and return healthy status if retry succeeds', async () => {
149+
const retryDelay = 10;
150+
const mockTaskManager = taskManagerMock.createStart();
151+
mockTaskManager.get
152+
.mockRejectedValueOnce(new Error('Failure'))
153+
.mockResolvedValue(getHealthCheckTask());
154+
155+
getHealthServiceStatusWithRetryAndErrorHandling(mockTaskManager, retryDelay).subscribe(
156+
(status) => {
157+
expect(status.level).toEqual(ServiceStatusLevels.available);
158+
expect(status.summary).toEqual('Alerting framework is available');
159+
}
160+
);
161+
162+
await tick();
163+
jest.advanceTimersByTime(retryDelay * 2);
164+
});
165+
166+
it('should retry on error and return unavailable status if retry fails', async () => {
167+
const retryDelay = 10;
168+
const err = new Error('Failure');
169+
const mockTaskManager = taskManagerMock.createStart();
170+
mockTaskManager.get.mockRejectedValue(err);
171+
172+
getHealthServiceStatusWithRetryAndErrorHandling(mockTaskManager, retryDelay).subscribe(
173+
(status) => {
174+
expect(status.level).toEqual(ServiceStatusLevels.unavailable);
175+
expect(status.summary).toEqual('Alerting framework is unavailable');
176+
expect(status.meta).toEqual({ error: err });
74177
}
75178
);
179+
180+
for (let i = 0; i < MAX_RETRY_ATTEMPTS + 1; i++) {
181+
await tick();
182+
jest.advanceTimersByTime(retryDelay);
183+
}
184+
expect(mockTaskManager.get).toHaveBeenCalledTimes(MAX_RETRY_ATTEMPTS + 1);
76185
});
77186
});

x-pack/plugins/alerts/server/health/get_state.ts

Lines changed: 51 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,17 @@
66
*/
77

88
import { i18n } from '@kbn/i18n';
9-
import { interval, Observable } from 'rxjs';
10-
import { catchError, switchMap } from 'rxjs/operators';
9+
import { defer, of, interval, Observable, throwError, timer } from 'rxjs';
10+
import { catchError, mergeMap, retryWhen, switchMap } from 'rxjs/operators';
1111
import { ServiceStatus, ServiceStatusLevels } from '../../../../../src/core/server';
1212
import { TaskManagerStartContract } from '../../../task_manager/server';
1313
import { HEALTH_TASK_ID } from './task';
1414
import { HealthStatus } from '../types';
1515

16+
export const MAX_RETRY_ATTEMPTS = 3;
17+
const HEALTH_STATUS_INTERVAL = 60000 * 5; // Five minutes
18+
const RETRY_DELAY = 5000; // Wait 5 seconds before retrying on errors
19+
1620
async function getLatestTaskState(taskManager: TaskManagerStartContract) {
1721
try {
1822
const result = await taskManager.get(HEALTH_TASK_ID);
@@ -48,27 +52,53 @@ const LEVEL_SUMMARY = {
4852
),
4953
};
5054

51-
export const getHealthStatusStream = (
55+
const getHealthServiceStatus = async (
5256
taskManager: TaskManagerStartContract
57+
): Promise<ServiceStatus<unknown>> => {
58+
const doc = await getLatestTaskState(taskManager);
59+
const level =
60+
doc?.state?.health_status === HealthStatus.OK
61+
? ServiceStatusLevels.available
62+
: doc?.state?.health_status === HealthStatus.Warning
63+
? ServiceStatusLevels.degraded
64+
: ServiceStatusLevels.unavailable;
65+
return {
66+
level,
67+
summary: LEVEL_SUMMARY[level.toString()],
68+
};
69+
};
70+
71+
export const getHealthServiceStatusWithRetryAndErrorHandling = (
72+
taskManager: TaskManagerStartContract,
73+
retryDelay?: number
5374
): Observable<ServiceStatus<unknown>> => {
54-
return interval(60000 * 5).pipe(
55-
switchMap(async () => {
56-
const doc = await getLatestTaskState(taskManager);
57-
const level =
58-
doc?.state?.health_status === HealthStatus.OK
59-
? ServiceStatusLevels.available
60-
: doc?.state?.health_status === HealthStatus.Warning
61-
? ServiceStatusLevels.degraded
62-
: ServiceStatusLevels.unavailable;
63-
return {
64-
level,
65-
summary: LEVEL_SUMMARY[level.toString()],
66-
};
75+
return defer(() => getHealthServiceStatus(taskManager)).pipe(
76+
retryWhen((errors) => {
77+
return errors.pipe(
78+
mergeMap((error, i) => {
79+
const retryAttempt = i + 1;
80+
if (retryAttempt > MAX_RETRY_ATTEMPTS) {
81+
return throwError(error);
82+
}
83+
return timer(retryDelay ?? RETRY_DELAY);
84+
})
85+
);
6786
}),
68-
catchError(async (error) => ({
69-
level: ServiceStatusLevels.unavailable,
70-
summary: LEVEL_SUMMARY[ServiceStatusLevels.unavailable.toString()],
71-
meta: { error },
72-
}))
87+
catchError((error) => {
88+
return of({
89+
level: ServiceStatusLevels.unavailable,
90+
summary: LEVEL_SUMMARY[ServiceStatusLevels.unavailable.toString()],
91+
meta: { error },
92+
});
93+
})
7394
);
7495
};
96+
97+
export const getHealthStatusStream = (
98+
taskManager: TaskManagerStartContract,
99+
healthStatusInterval?: number,
100+
retryDelay?: number
101+
): Observable<ServiceStatus<unknown>> =>
102+
interval(healthStatusInterval ?? HEALTH_STATUS_INTERVAL).pipe(
103+
switchMap(() => getHealthServiceStatusWithRetryAndErrorHandling(taskManager, retryDelay))
104+
);

x-pack/plugins/alerts/server/plugin.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
*/
77

88
import type { PublicMethodsOf } from '@kbn/utility-types';
9-
import { first, map } from 'rxjs/operators';
9+
import { first, map, share } from 'rxjs/operators';
1010
import { Observable } from 'rxjs';
1111
import { UsageCollectionSetup } from 'src/plugins/usage_collection/server';
1212
import { combineLatest } from 'rxjs';
@@ -251,7 +251,8 @@ export class AlertingPlugin {
251251
} else {
252252
return derivedStatus;
253253
}
254-
})
254+
}),
255+
share()
255256
)
256257
);
257258
});

0 commit comments

Comments
 (0)