Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: metrics alerting support #145

Merged
merged 5 commits into from
Dec 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/light-pumas-obey.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
'@hyperdx/api': minor
---

feat: metrics alerting support
10 changes: 8 additions & 2 deletions packages/api/src/clickhouse/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -705,7 +705,7 @@ export const getMetricsChart = async ({
aggFn: AggFn;
dataType: MetricsDataType;
endTime: number; // unix in ms,
granularity: Granularity;
granularity: Granularity | string;
groupBy?: string;
name: string;
q: string;
Expand Down Expand Up @@ -868,7 +868,13 @@ ORDER BY _timestamp_sort_key ASC
query,
format: 'JSON',
});
const result = await rows.json<ResponseJSON<Record<string, unknown>>>();
const result = await rows.json<
ResponseJSON<{
data: number;
group: string;
ts_bucket: number;
}>
>();
logger.info({
message: 'getMetricsChart',
query,
Expand Down
146 changes: 145 additions & 1 deletion packages/api/src/tasks/__tests__/checkAlerts.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@ describe('checkAlerts', () => {
);
});

it('CHART alert', async () => {
it('CHART alert (logs table series)', async () => {
jest
.spyOn(slack, 'postMessageToWebhook')
.mockResolvedValueOnce(null as any);
Expand Down Expand Up @@ -388,5 +388,149 @@ describe('checkAlerts', () => {
},
);
});

// Verifies the CHART alert flow for a chart whose series reads from the
// metrics table: the first mocked getMetricsChart result (value 11) exceeds
// the threshold (10) and moves the alert to ALERT and posts to the Slack
// webhook; the second mocked result is empty and moves the alert back to OK.
it('CHART alert (metrics table series)', async () => {
// stub the Slack call so no real webhook request is made
jest
.spyOn(slack, 'postMessageToWebhook')
.mockResolvedValueOnce(null as any);
// mocked responses are consumed in order, one per getMetricsChart call
jest
.spyOn(clickhouse, 'getMetricsChart')
.mockResolvedValueOnce({
rows: 1,
data: [
{
data: 11,
group: 'HyperDX',
ts_bucket: 1700172600,
},
],
} as any)
// no metrics found in the next window
.mockResolvedValueOnce({
rows: 0,
data: [],
} as any);

const team = await createTeam({ name: 'My Team' });
const webhook = await new Webhook({
team: team._id,
service: 'slack',
url: 'https://hooks.slack.com/services/123',
name: 'My Webhook',
}).save();
const dashboard = await new Dashboard({
name: 'My Dashboard',
team: team._id,
charts: [
// chart targeted by the alert below (chartId '198hki')
{
id: '198hki',
name: 'Redis Memory',
x: 0,
y: 0,
w: 6,
h: 3,
series: [
{
table: 'metrics',
type: 'time',
aggFn: 'max',
// "<metric name> - <data type>"; the assertion below expects this to be
// passed to getMetricsChart as name 'redis.memory.rss' / dataType 'Gauge'
field: 'redis.memory.rss - Gauge',
where: 'cloud.provider:"aws"',
groupBy: ['host'],
},
],
},
// second chart (logs table) is not referenced by the alert
{
id: 'obil1',
name: 'Min Duratioin',
x: 6,
y: 0,
w: 6,
h: 3,
series: [
{
table: 'logs',
type: 'time',
aggFn: 'min',
field: 'duration',
where: '',
groupBy: [],
},
],
},
],
}).save();
const alert = await createAlert({
source: 'CHART',
channel: {
type: 'webhook',
webhookId: webhook._id.toString(),
},
interval: '5m',
type: 'presence',
threshold: 10,
dashboardId: dashboard._id.toString(),
chartId: '198hki',
});

const now = new Date('2023-11-16T22:12:00.000Z');

// should fetch 5m of metrics
await processAlert(now, alert);
// check alert history
const alertHistories = await AlertHistory.find({
alertId: alert._id,
});
expect(alertHistories.length).toBe(1);
expect(alertHistories[0].counts).toBe(1);
expect(alertHistories[0].createdAt).toEqual(
new Date('2023-11-16T22:10:00.000Z'),
);
expect(alert.state).toBe('ALERT');

// skip since time diff is less than 1 window size
const later = new Date('2023-11-16T22:14:00.000Z');
await processAlert(later, alert);
// alert should still be in alert state
expect(alert.state).toBe('ALERT');

// next 5m window; consumes the second (empty) mocked result
const nextWindow = new Date('2023-11-16T22:16:00.000Z');
await processAlert(nextWindow, alert);
// alert should be in ok state
expect(alert.state).toBe('OK');

// check if getMetricsChart query + webhook were triggered
// startTime/endTime below are 2023-11-16T22:05:00Z and 22:10:00Z in unix ms
expect(clickhouse.getMetricsChart).toHaveBeenNthCalledWith(1, {
aggFn: 'max',
dataType: 'Gauge',
endTime: 1700172600000,
granularity: '5 minute',
groupBy: 'host',
name: 'redis.memory.rss',
q: 'cloud.provider:"aws"',
startTime: 1700172300000,
teamId: team._id.toString(),
});
expect(slack.postMessageToWebhook).toHaveBeenNthCalledWith(
1,
'https://hooks.slack.com/services/123',
{
text: 'Alert for "Redis Memory" in "My Dashboard" - 11 exceeds 10',
blocks: [
{
text: {
text: [
`*<http://localhost:9090/dashboards/${dashboard._id}?from=1700170500000&granularity=5+minute&to=1700175000000 | Alert for "Redis Memory" in "My Dashboard">*`,
'Group: "HyperDX"',
'11 exceeds 10',
].join('\n'),
type: 'mrkdwn',
},
type: 'section',
},
],
},
);
});
});
});
31 changes: 29 additions & 2 deletions packages/api/src/tasks/checkAlerts.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@ import { URLSearchParams } from 'url';
import * as fns from 'date-fns';
import * as fnsTz from 'date-fns-tz';
import ms from 'ms';
import { isString } from 'lodash';
import { serializeError } from 'serialize-error';
import { z } from 'zod';

import * as clickhouse from '@/clickhouse';
import * as config from '@/config';
Expand Down Expand Up @@ -336,6 +338,7 @@ export const processAlert = async (now: Date, alert: AlertDocument) => {
let checksData:
| Awaited<ReturnType<typeof clickhouse.checkAlert>>
| Awaited<ReturnType<typeof clickhouse.getLogsChart>>
| Awaited<ReturnType<typeof clickhouse.getMetricsChart>>
| null = null;
let logView: Awaited<ReturnType<typeof getLogViewEnhanced>> | null = null;
let targetDashboard: EnhancedDashboard | null = null;
Expand Down Expand Up @@ -412,8 +415,30 @@ export const processAlert = async (now: Date, alert: AlertDocument) => {
tableVersion: dashboard.team.logStreamTableVersion,
teamId: dashboard.team._id.toString(),
});
} else if (
series.type === 'time' &&
series.table === 'metrics' &&
series.field
) {
targetDashboard = dashboard;
const startTimeMs = fns.getTime(checkStartTime);
const endTimeMs = fns.getTime(checkEndTime);
const [metricName, rawMetricDataType] = series.field.split(' - ');
const metricDataType = z
.nativeEnum(clickhouse.MetricsDataType)
.parse(rawMetricDataType);
checksData = await clickhouse.getMetricsChart({
aggFn: series.aggFn,
dataType: metricDataType,
endTime: endTimeMs,
granularity: `${windowSizeInMins} minute`,
groupBy: series.groupBy[0],
name: metricName,
q: series.where,
startTime: startTimeMs,
teamId: dashboard.team._id.toString(),
});
}
// TODO: support metrics table
}

logger.info({
Expand All @@ -439,7 +464,9 @@ export const processAlert = async (now: Date, alert: AlertDocument) => {
let alertState = AlertState.OK;
if (checksData?.rows && checksData?.rows > 0) {
for (const checkData of checksData.data) {
const totalCount = parseInt(checkData.data);
const totalCount = isString(checkData.data)
? parseInt(checkData.data)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: Should always provide a radix argument for parseInt

: checkData.data;
if (doesExceedThreshold(alert, totalCount)) {
alertState = AlertState.ALERT;
logger.info({
Expand Down
Loading