Skip to content
Draft
5 changes: 5 additions & 0 deletions packages/elastic_agent/changelog.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
# newer versions go on top
- version: "2.6.4"
changes:
- description: Adds alerting rule templates
type: enhancement
link: https://github.com/elastic/integrations/pull/15572
- version: "2.6.3"
changes:
- description: Elastic Agent memory charts now prioritise RSS memory for more accurate usage reporting.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
{
"id": "elastic-agent-cpu-usage-spike-rule",
"type": "alerting_rule_template",
"attributes": {
"name": "[Elastic Agent] CPU usage spike",
"tags": ["Elastic Agent", "Resource Consumption"],
"ruleTypeId": ".es-query",
"schedule": {
"interval": "1m"
},
"params": {
"searchType": "esqlQuery",
"timeWindowSize": 7,
"timeWindowUnit": "m",
"threshold": [0],
"thresholdComparator": ">",
"size": 100,
"esqlQuery": {
"esql": "FROM metrics-*\n | WHERE process.executable LIKE \"*elastic*agent*\"\n | STATS cpu_process_pct = MAX(system.process.cpu.total.pct) * 100\n BY elastic_agent.id, process.name,\n time_bucket = BUCKET(@timestamp, 1 minute)\n // Count the 1 minute timebuckets that are above 80% by process and agent\n | WHERE cpu_process_pct >= 80\n | STATS count_above_threshold = COUNT(*)\n BY elastic_agent.id, process.name\n // Alert if there are 5 or more occurences\n | WHERE count_above_threshold >= 5"
},
"aggType": "count",
"groupBy": "row",
"termSize": 5,
"sourceFields": [],
"timeField": "@timestamp",
"excludeHitsFromPreviousRun": true
},
"alertDelay": {
"active": 1
}
},
"coreMigrationVersion": "8.8.0",
"typeMigrationVersion": "10.1.0"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
{
"id": "elastic-agent-dropped-events",
"type": "alerting_rule_template",
"attributes": {
"name": "[Elastic Agent] Dropped events",
"tags": ["Elastic Agent", "Pipeline and Queues"],
"ruleTypeId": ".es-query",
"schedule": {
"interval": "1m"
},
"params": {
"searchType": "esqlQuery",
"timeWindowSize": 5,
"timeWindowUnit": "m",
"threshold": [0],
"thresholdComparator": ">",
"size": 100,
"esqlQuery": {
"esql": "TS metrics-*\n| WHERE data_stream.dataset LIKE \"elastic_agent.*beat\"\n| STATS events_dropped_rate = max(rate(beat.stats.libbeat.pipeline.events.dropped)), pipeline_acked_rate = max(rate(beat.stats.libbeat.pipeline.queue.acked)) BY time_bucket = bucket(@timestamp,5minute), elastic_agent.id, component.id\n| EVAL percent_drop_rate = (events_dropped_rate / pipeline_acked_rate)\n| WHERE percent_drop_rate >= 0.05\n\n"
},
"aggType": "count",
"groupBy": "row",
"termSize": 5,
"sourceFields": [],
"timeField": "@timestamp",
"excludeHitsFromPreviousRun": true
},
"alertDelay": {
"active": 1
}
},
"coreMigrationVersion": "8.8.0",
"typeMigrationVersion": "10.1.0"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
{
"id": "elastic-agent-excessive-memory-usage-rule",
"type": "alerting_rule_template",
"attributes": {
"name": "[Elastic Agent] Excessive memory usage",
"tags": ["Elastic Agent", "Resource Consumption"],
"ruleTypeId": ".es-query",
"schedule": {
"interval": "1m"
},
"params": {
"searchType": "esqlQuery",
"timeWindowSize": 5,
"timeWindowUnit": "m",
"threshold": [0],
"thresholdComparator": ">",
"size": 100,
"esqlQuery": {
"esql": "FROM metrics-*\n| WHERE process.executable LIKE \"*elastic*agent*\"\n| STATS max_memory_per_process = MAX(system.process.memory.rss.pct * 100) BY agent.id, process.name\n| STATS total_memory_usage = SUM(max_memory_per_process) BY agent.id\n| WHERE total_memory_usage > 50"
},
"aggType": "count",
"groupBy": "row",
"termSize": 5,
"sourceFields": [],
"timeField": "@timestamp",
"excludeHitsFromPreviousRun": true
},
"alertDelay": {
"active": 1
}
},
"coreMigrationVersion": "8.8.0",
"typeMigrationVersion": "10.1.0"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
{
"id": "elastic-agent-excessive-restarts",
"type": "alerting_rule_template",
"attributes": {
"name": "[Elastic Agent] Excessive restarts",
"tags": ["Elastic Agent"],
"ruleTypeId": ".es-query",
"schedule": {
"interval": "1m"
},
"params": {
"searchType": "esqlQuery",
"timeWindowSize": 5,
"timeWindowUnit": "m",
"threshold": [0],
"thresholdComparator": ">",
"size": 100,
"esqlQuery": {
"esql": "FROM metrics-* \n| WHERE process.executable LIKE \"*elastic*agent*\"\n| STATS restart_count = COUNT_DISTINCT(process.cpu.start_time) BY elastic_agent.id, process.name, bucket(@timestamp,5min) \n| WHERE restart_count > 10"
},
"aggType": "count",
"groupBy": "row",
"termSize": 5,
"sourceFields": [],
"timeField": "@timestamp",
"excludeHitsFromPreviousRun": true
},
"alertDelay": {
"active": 1
}
},
"coreMigrationVersion": "8.8.0",
"typeMigrationVersion": "10.1.0"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
{
"id": "elastic-agent-high-pipeline-queue",
"type": "alerting_rule_template",
"attributes": {
"name": "[Elastic Agent] High pipeline queue",
"tags": ["Elastic Agent", "Pipeline and Queues"],
"ruleTypeId": ".es-query",
"schedule": {
"interval": "1m"
},
"params": {
"searchType": "esqlQuery",
"timeWindowSize": 5,
"timeWindowUnit": "m",
"threshold": [0],
"thresholdComparator": ">",
"size": 100,
"esqlQuery": {
"esql": "TS metrics-*\n| WHERE data_stream.dataset == \"elastic_agent.*beat\"\n| STATS pipeline_queue_pct = MAX(beat.stats.libbeat.pipeline.queue.filled.pct) * 100 BY elastic_agent.id, process.name\n| WHERE pipeline_queue_pct >= 90"
},
"aggType": "count",
"groupBy": "row",
"termSize": 5,
"sourceFields": [],
"timeField": "@timestamp",
"excludeHitsFromPreviousRun": true
},
"alertDelay": {
"active": 1
}
},
"coreMigrationVersion": "8.8.0",
"typeMigrationVersion": "10.1.0"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
{
"id": "elastic-agent-output-errors",
"type": "alerting_rule_template",
"attributes": {
"name": "[Elastic Agent] Output errors",
"tags": ["Elastic Agent", "Pipeline and Queues"],
"ruleTypeId": ".es-query",
"schedule": {
"interval": "1m"
},
"params": {
"searchType": "esqlQuery",
"timeWindowSize": 10,
"timeWindowUnit": "m",
"threshold": [0],
"thresholdComparator": ">",
"size": 100,
"esqlQuery": {
"esql": "TS metrics-*\n| WHERE data_stream.dataset LIKE \"elastic_agent.*beat\"\n| STATS errors_rate = MAX(RATE(beat.stats.libbeat.output.write.errors)) BY time_bucket = BUCKET(@timestamp,5minute), elastic_agent.id, component.id\n| EVAL errors_per_min = errors_rate * 60\n| WHERE errors_per_min > 5"
},
"aggType": "count",
"groupBy": "row",
"termSize": 5,
"sourceFields": [],
"timeField": "@timestamp",
"excludeHitsFromPreviousRun": true
},
"alertDelay": {
"active": 1
}
},
"coreMigrationVersion": "8.8.0",
"typeMigrationVersion": "10.1.0"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
{
"id": "elastic-agent-unhealthy-status",
"type": "alerting_rule_template",
"attributes": {
"name": "[Elastic Agent] Unhealthy status",
"tags": ["Elastic Agent"],
"ruleTypeId": ".es-query",
"schedule": {
"interval": "1m"
},
"params": {
"searchType": "esqlQuery",
"timeWindowSize": 5,
"timeWindowUnit": "m",
"threshold": [0],
"thresholdComparator": ">",
"size": 100,
"esqlQuery": {
"esql": "FROM logs-* \n| WHERE data_stream.dataset == \"elastic_agent.status_change\" and agentless == false and status == \"error\""
},
"aggType": "count",
"groupBy": "row",
"termSize": 5,
"sourceFields": [],
"timeField": "@timestamp",
"excludeHitsFromPreviousRun": true
},
"alertDelay": {
"active": 1
}
},
"coreMigrationVersion": "8.8.0",
"typeMigrationVersion": "10.1.0"
}
4 changes: 2 additions & 2 deletions packages/elastic_agent/manifest.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
name: elastic_agent
title: Elastic Agent
version: 2.6.3
version: 2.6.4
description: Collect logs and metrics from Elastic Agents.
type: integration
format_version: 3.1.4
format_version: 3.5.0
categories: ["elastic_stack"]
conditions:
kibana:
Expand Down