Skip to content

Commit

Permalink
[ResponseOps][MW] Add telemetry for the maintenance window (#192483)
Browse files Browse the repository at this point in the history
Resolve: #184088

This PR adds telemetry collection for the following metrics:

- total number of MW in deployments
- number of active MW with "repeat" toggle on (time based)
- number of active MW with "filter alerts" toggle on (KQL based)

## Testing

Create several maintenance windows (MW) with different settings (toggles on and off).
To see the changes reflected in the telemetry object,
modify this file: `x-pack/plugins/alerting/server/usage/task.ts`

With:

```
async function scheduleTasks(logger: Logger, taskManager: TaskManagerStartContract) {
  try {
    await taskManager.ensureScheduled({
      id: TASK_ID,
      taskType: TELEMETRY_TASK_TYPE,
      state: emptyState,
      params: {},
      schedule: SCHEDULE,
    });
  } catch (e) {
    logger.error(`Error scheduling ${TASK_ID}, received ${e.message}`);
  }
  await taskManager.runSoon(TASK_ID);
}
```

This will cause the telemetry to be sent as soon as the server is
restarted.

**Run Telemetry usage payload API in your browser console to verify
telemetry object:**

https://docs.elastic.dev/telemetry/collection/snapshot-telemetry#telemetry-usage-payload-api
P.S.: Add space at the beginning of URL


### Checklist

- [x] [Unit or functional
tests](https://www.elastic.co/guide/en/kibana/master/development-tests.html)
were updated or added to match the most common scenarios

---------

Co-authored-by: Elastic Machine <elasticmachine@users.noreply.github.com>
  • Loading branch information
guskovaue and elasticmachine authored Sep 19, 2024
1 parent 210f552 commit eabb102
Show file tree
Hide file tree
Showing 8 changed files with 364 additions and 13 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,9 @@ export function createAlertingUsageCollector(
count_rules_with_tags: 0,
count_rules_snoozed: 0,
count_rules_muted: 0,
count_mw_total: 0,
count_mw_with_repeat_toggle_on: 0,
count_mw_with_filter_alert_toggle_on: 0,
count_rules_with_muted_alerts: 0,
count_connector_types_by_consumers: {},
count_rules_by_execution_status_per_day: {},
Expand Down Expand Up @@ -289,6 +292,9 @@ export function createAlertingUsageCollector(
count_rules_by_notify_when: byNotifyWhenSchema,
count_rules_snoozed: { type: 'long' },
count_rules_muted: { type: 'long' },
count_mw_total: { type: 'long' },
count_mw_with_repeat_toggle_on: { type: 'long' },
count_mw_with_filter_alert_toggle_on: { type: 'long' },
count_rules_with_muted_alerts: { type: 'long' },
count_connector_types_by_consumers: { DYNAMIC_KEY: { DYNAMIC_KEY: { type: 'long' } } },
count_rules_by_execution_status_per_day: byStatusPerDaySchema,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,97 @@
*/

import { elasticsearchServiceMock, loggingSystemMock } from '@kbn/core/server/mocks';
import { getTotalCountAggregations, getTotalCountInUse } from './get_telemetry_from_kibana';
import {
getTotalCountAggregations,
getTotalCountInUse,
getMWTelemetry,
} from './get_telemetry_from_kibana';
import { savedObjectsClientMock } from '@kbn/core/server/mocks';
import { MAINTENANCE_WINDOW_SAVED_OBJECT_TYPE } from '../../../common';
import { ISavedObjectsRepository } from '@kbn/core/server';

// Mocks shared by the tests below.
const elasticsearch = elasticsearchServiceMock.createStart();
const esClient = elasticsearch.client.asInternalUser;
const logger: ReturnType<typeof loggingSystemMock.createLogger> = loggingSystemMock.createLogger();
// getMWTelemetry's options are typed with ISavedObjectsRepository, so the generic
// saved-objects client mock is cast to that interface.
const savedObjectsClient = savedObjectsClientMock.create() as unknown as ISavedObjectsRepository;
// Error raised by the mocked finder in the failure-path test.
const thrownError = new Error('Fail');

// One page of maintenance window saved objects, as yielded by the mocked point-in-time finder.
// getMWTelemetry counts "repeat" from the presence of `rRule.interval` and "filter alerts"
// from a truthy `scopedQuery`, so this fixture produces: total 3, repeat 2, filter 1.
const mockedResponse = {
saved_objects: [
// MW 1: no `interval` in rRule and `scopedQuery` is null -> neither toggle counted.
{
id: '1',
type: MAINTENANCE_WINDOW_SAVED_OBJECT_TYPE,
attributes: {
title: 'test_rule_1',
enabled: true,
duration: 1800000,
expirationDate: '2025-09-09T13:13:07.824Z',
events: [],
rRule: {
dtstart: '2024-09-09T13:13:02.054Z',
tzid: 'Europe/Stockholm',
freq: 0,
count: 1,
},
createdBy: null,
updatedBy: null,
createdAt: '2024-09-09T13:13:07.825Z',
updatedAt: '2024-09-09T13:13:07.825Z',
scopedQuery: null,
},
},
// MW 2: rRule has `interval` and `scopedQuery` is set -> counted for both toggles.
{
id: '2',
type: MAINTENANCE_WINDOW_SAVED_OBJECT_TYPE,
attributes: {
title: 'test_rule_2',
enabled: true,
duration: 1800000,
expirationDate: '2025-09-09T13:13:07.824Z',
events: [],
rRule: {
dtstart: '2024-09-09T13:13:02.054Z',
tzid: 'Europe/Stockholm',
freq: 3,
interval: 1,
byweekday: ['SU'],
},
createdBy: null,
updatedBy: null,
createdAt: '2024-09-09T13:13:07.825Z',
updatedAt: '2024-09-09T13:13:07.825Z',
scopedQuery: {
filters: [],
kql: 'kibana.alert.job_errors_results.job_id : * ',
dsl: '{"bool":{"must":[],"filter":[{"bool":{"should":[{"exists":{"field":"kibana.alert.job_errors_results.job_id"}}],"minimum_should_match":1}}],"should":[],"must_not":[]}}',
},
},
},
// MW 3: rRule has `interval` but `scopedQuery` is null -> counted for the repeat toggle only.
{
id: '3',
type: MAINTENANCE_WINDOW_SAVED_OBJECT_TYPE,
attributes: {
title: 'test_rule_3',
enabled: true,
duration: 1800000,
expirationDate: '2025-09-09T13:13:07.824Z',
events: [],
rRule: {
dtstart: '2024-09-09T13:13:02.054Z',
tzid: 'Europe/Stockholm',
freq: 3,
interval: 1,
byweekday: ['TU'],
},
createdBy: null,
updatedBy: null,
createdAt: '2024-09-09T13:13:07.825Z',
updatedAt: '2024-09-09T13:13:07.825Z',
scopedQuery: null,
},
},
],
};

describe('kibana index telemetry', () => {
beforeEach(() => {
Expand Down Expand Up @@ -420,4 +506,94 @@ describe('kibana index telemetry', () => {
});
});
});

// Happy path: the mocked finder yields a single page containing all three fixture
// maintenance windows, and every one of them is counted.
describe('getMWTelemetry', () => {
test('should return MW telemetry', async () => {
// Replace the finder factory with one whose find() is an async generator yielding the fixture page.
savedObjectsClient.createPointInTimeFinder = jest.fn().mockReturnValue({
close: jest.fn(),
find: jest.fn().mockImplementation(async function* () {
yield mockedResponse;
}),
});
const telemetry = await getMWTelemetry({
savedObjectsClient,
logger,
});

// The finder must be created across all namespaces and fetch only the two fields being counted.
expect(savedObjectsClient.createPointInTimeFinder).toHaveBeenCalledWith({
type: MAINTENANCE_WINDOW_SAVED_OBJECT_TYPE,
namespaces: ['*'],
perPage: 100,
fields: ['rRule', 'scopedQuery'],
});
expect(telemetry).toStrictEqual({
count_mw_total: 3,
count_mw_with_repeat_toggle_on: 2,
count_mw_with_filter_alert_toggle_on: 1,
hasErrors: false,
});
});
});

// Failure path: despite the test name, getMWTelemetry does not reject. When the finder
// throws, it resolves with zeroed counters, hasErrors: true and the error's message,
// and logs a warning.
test('should throw the error', async () => {
savedObjectsClient.createPointInTimeFinder = jest.fn().mockReturnValue({
close: jest.fn(),
find: jest.fn().mockImplementation(async function* () {
throw thrownError;
}),
});

const telemetry = await getMWTelemetry({
savedObjectsClient,
logger,
});

expect(savedObjectsClient.createPointInTimeFinder).toHaveBeenCalledWith({
type: MAINTENANCE_WINDOW_SAVED_OBJECT_TYPE,
namespaces: ['*'],
perPage: 100,
fields: ['rRule', 'scopedQuery'],
});

expect(telemetry).toStrictEqual({
count_mw_total: 0,
count_mw_with_repeat_toggle_on: 0,
count_mw_with_filter_alert_toggle_on: 0,
hasErrors: true,
errorMessage: 'Fail',
});
expect(logger.warn).toHaveBeenCalled();
const loggerCall = logger.warn.mock.calls[0][0];
const loggerMeta = logger.warn.mock.calls[0][1];
// The message ends in `{}` because the implementation interpolates JSON.stringify(err),
// and an Error has no own enumerable properties to serialize.
expect(loggerCall).toBe('Error executing alerting telemetry task: getTotalMWCount - {}');
expect(loggerMeta?.tags).toEqual(['alerting', 'telemetry-failed']);
expect(loggerMeta?.error?.stack_trace).toBeDefined();
});

// Limit handling: iteration stops early once the running total exceeds maxDocuments.
test('should stop on MW max limit count', async () => {
savedObjectsClient.createPointInTimeFinder = jest.fn().mockReturnValue({
close: jest.fn(),
find: jest.fn().mockImplementation(async function* () {
yield mockedResponse;
}),
});
const telemetry = await getMWTelemetry({
savedObjectsClient,
logger,
maxDocuments: 1,
});

expect(savedObjectsClient.createPointInTimeFinder).toHaveBeenCalledWith({
type: MAINTENANCE_WINDOW_SAVED_OBJECT_TYPE,
namespaces: ['*'],
perPage: 100,
fields: ['rRule', 'scopedQuery'],
});
// Two windows (not one) are counted with maxDocuments: 1: the implementation checks
// `countMWTotal > maxDocuments` before incrementing, so it stops only at the third object.
expect(telemetry).toStrictEqual({
count_mw_total: 2,
count_mw_with_repeat_toggle_on: 1,
count_mw_with_filter_alert_toggle_on: 1,
hasErrors: false,
});
});
});
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import type {
AggregationsTermsAggregateBase,
AggregationsStringTermsBucketKeys,
} from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
import { ElasticsearchClient, Logger } from '@kbn/core/server';
import { ElasticsearchClient, Logger, ISavedObjectsRepository } from '@kbn/core/server';

import {
ConnectorsByConsumersBucket,
Expand All @@ -23,13 +23,21 @@ import { AlertingUsage } from '../types';
import { NUM_ALERTING_RULE_TYPES } from '../alerting_usage_collector';
import { parseSimpleRuleTypeBucket } from './parse_simple_rule_type_bucket';
import { groupRulesBySearchType } from './group_rules_by_search_type';
import { MAINTENANCE_WINDOW_SAVED_OBJECT_TYPE } from '../../../common';
import { MaintenanceWindowAttributes } from '../../data/maintenance_window/types';

/**
 * Options bag carrying an Elasticsearch client, an alert index name and a logger.
 * NOTE(review): presumably consumed by the Elasticsearch-backed collectors in this file
 * (e.g. getTotalCountAggregations) — their full signatures are not visible here.
 */
interface Opts {
esClient: ElasticsearchClient;
alertIndex: string;
logger: Logger;
}

/** Options accepted by getMWTelemetry. */
interface MWOpts {
// Repository used to create the point-in-time finder over maintenance window saved objects.
savedObjectsClient: ISavedObjectsRepository;
// Receives a warning when telemetry collection fails.
logger: Logger;
// Threshold at which iteration stops; getMWTelemetry defaults it to TELEMETRY_MW_COUNT_LIMIT.
maxDocuments?: number;
}

type GetTotalCountsResults = Pick<
AlertingUsage,
| 'count_total'
Expand All @@ -48,6 +56,14 @@ type GetTotalCountsResults = Pick<
| 'connectors_per_alert'
> & { errorMessage?: string; hasErrors: boolean };

/**
 * Result of getMWTelemetry: the three maintenance window counters taken from AlertingUsage,
 * plus error reporting. `errorMessage` is only set on the failure path, where `hasErrors` is true.
 */
type GetMWTelemetryResults = Pick<
AlertingUsage,
'count_mw_total' | 'count_mw_with_repeat_toggle_on' | 'count_mw_with_filter_alert_toggle_on'
> & {
errorMessage?: string;
hasErrors: boolean;
};

interface GetTotalCountInUseResults {
countTotal: number;
countByType: Record<string, number>;
Expand All @@ -56,6 +72,8 @@ interface GetTotalCountInUseResults {
hasErrors: boolean;
}

// Default for getMWTelemetry's `maxDocuments`: iteration over maintenance window
// saved objects stops once the running total exceeds this value.
const TELEMETRY_MW_COUNT_LIMIT = 10000;

export async function getTotalCountAggregations({
esClient,
alertIndex,
Expand Down Expand Up @@ -490,3 +508,60 @@ export async function getTotalCountInUse({
};
}
}

/**
 * Collects maintenance window (MW) usage counters for alerting telemetry.
 *
 * Pages through maintenance window saved objects in all namespaces (100 per page,
 * fetching only the `rRule` and `scopedQuery` fields) and counts:
 *  - `count_mw_total`: maintenance windows visited;
 *  - `count_mw_with_repeat_toggle_on`: those whose `rRule` has an own `interval` property;
 *  - `count_mw_with_filter_alert_toggle_on`: those with a truthy `scopedQuery`.
 *
 * Errors raised while creating or iterating the finder are caught: a warning is logged
 * and the function resolves with zeroed counters, `hasErrors: true` and `errorMessage`.
 *
 * @param opts.savedObjectsClient - repository used to create the point-in-time finder
 * @param opts.logger - receives the warning on failure
 * @param opts.maxDocuments - threshold at which iteration stops (defaults to TELEMETRY_MW_COUNT_LIMIT)
 * @returns the three counters plus `hasErrors` (and `errorMessage` on failure)
 */
export async function getMWTelemetry({
  savedObjectsClient,
  logger,
  maxDocuments = TELEMETRY_MW_COUNT_LIMIT,
}: MWOpts): Promise<GetMWTelemetryResults> {
  try {
    const mwFinder = savedObjectsClient.createPointInTimeFinder<MaintenanceWindowAttributes>({
      type: MAINTENANCE_WINDOW_SAVED_OBJECT_TYPE,
      namespaces: ['*'],
      perPage: 100,
      fields: ['rRule', 'scopedQuery'],
    });

    let countMWTotal = 0;
    let countMWWithRepeatToggleON = 0;
    let countMWWithFilterAlertToggleON = 0;
    try {
      mwLoop: for await (const response of mwFinder.find()) {
        for (const mwSavedObject of response.saved_objects) {
          // NOTE(review): the check uses `>` and runs before the increment, so up to
          // maxDocuments + 1 windows are counted. Kept as-is because the existing unit
          // test pins this behavior; switch to `>=` together with that test if a strict
          // cap is intended.
          if (countMWTotal > maxDocuments) break mwLoop;
          countMWTotal += 1;

          const { rRule, scopedQuery } = mwSavedObject.attributes;
          // scopedQuery is null when the "Filter alerts" toggle is off
          if (scopedQuery) {
            countMWWithFilterAlertToggleON += 1;
          }
          // rRule has no `interval` property when the "Repeat" toggle is off.
          // Guard against a missing rRule so one malformed document does not throw
          // and zero out the whole telemetry result.
          if (rRule != null && Object.hasOwn(rRule, 'interval')) {
            countMWWithRepeatToggleON += 1;
          }
        }
      }
    } finally {
      // Always release the point-in-time finder, including when iteration throws
      // or the loop is left early via `break`.
      await mwFinder.close();
    }

    return {
      hasErrors: false,
      count_mw_total: countMWTotal,
      count_mw_with_repeat_toggle_on: countMWWithRepeatToggleON,
      count_mw_with_filter_alert_toggle_on: countMWWithFilterAlertToggleON,
    };
  } catch (err) {
    // String(err) also copes with a thrown null/undefined, where err.toString() would throw.
    const errorMessage = err?.message ? err.message : String(err);
    logger.warn(
      `Error executing alerting telemetry task: getTotalMWCount - ${JSON.stringify(err)}`,
      {
        tags: ['alerting', 'telemetry-failed'],
        error: { stack_trace: err?.stack },
      }
    );
    return {
      hasErrors: true,
      errorMessage,
      count_mw_total: 0,
      count_mw_with_repeat_toggle_on: 0,
      count_mw_with_filter_alert_toggle_on: 0,
    };
  }
}
Loading

0 comments on commit eabb102

Please sign in to comment.