-
Notifications
You must be signed in to change notification settings - Fork 301
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2558 from douavue/dvue/k8s-otel-alert-policies
Dvue/k8s otel alert policies
- Loading branch information
Showing
22 changed files
with
1,424 additions
and
0 deletions.
There are no files selected for viewing
76 changes: 76 additions & 0 deletions
76
alert-policies/kubernetes-opentelemetry/ContainerCPUThrottling.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
name: Container cpu throttling is high
# Human-readable summary of what this condition detects
description: |
  Alert when container is being throttled > 25% of the time for more than 5 minutes
# Condition type; one of BASELINE | STATIC
type: STATIC

# NRQL query evaluated by the condition.
# Divides throttled CFS periods by total CFS periods and multiplies by 100,
# so the value compared against terms.threshold is a percentage.
# YOUR_CLUSTER_NAME / YOUR_NAMESPACE_NAME are placeholders to fill in.
nrql:
  query: >-
    from Metric select
    latest(container_cpu_cfs_throttled_periods_total) /
    latest(container_cpu_cfs_periods_total)* 100
    where k8s.cluster.name in ('YOUR_CLUSTER_NAME')
    and k8s.namespace.name in ('YOUR_NAMESPACE_NAME')
    facet k8s.container.name, k8s.pod.name, k8s.namespace.name, k8s.cluster.name

# Direction in which baseline is set (Default: LOWER_ONLY)
# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY

# How the NRQL query value(s) are aggregated before being compared to terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Comparison operator applied against the threshold
    operator: ABOVE
    # Value that triggers a violation (percent, see query above)
    threshold: 25
    # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL

  # Adding a Warning threshold is optional
  # - priority: WARNING
  #   # Operator used to compare against the threshold.
  #   operator: ABOVE
  #   # Value that triggers a violation
  #   threshold: 0
  #   # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions
  #   thresholdDuration: 60
  #   # How many data points must be in violation for the duration
  #   thresholdOccurrences: AT_LEAST_ONCE

# Loss of Signal Settings
expiration:
  # Close open violations if signal is lost (Default: false)
  closeViolationsOnExpiration: true
  # Open "Loss of Signal" violation if signal is lost (Default: false)
  openViolationOnExpiration: false
  # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false'
  expirationDuration: 300

# Advanced Signal Settings
signal:
  # How long to wait for data that belongs in each aggregation window
  aggregationDelay: 60  # seconds
  # Method that decides when an aggregation window is complete and can be evaluated for violations
  aggregationMethod: EVENT_FLOW
  # How long to wait after each data point arrives to make sure the whole batch was processed
  aggregationTimer: null  # seconds
  # Duration of the time window used to evaluate the NRQL query
  aggregationWindow: 300  # seconds; 30 seconds <= x < 15 minutes
  # Type of value used to fill gaps (empty windows)
  fillOption: NONE  # defaults to STATIC
  # Value used for filling when the static fill option is selected
  fillValue: null  # default
  # Gathers data in overlapping time windows to smooth the chart line, making trends easier to spot
  slideBy: 60  # seconds
  # How long to wait before starting to evaluate a signal against the thresholds in this condition
  evaluationDelay: 60

# OPTIONAL: URL of runbook to be sent with notification (explicit null = not set)
runbookUrl: null

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 21600
76 changes: 76 additions & 0 deletions
76
alert-policies/kubernetes-opentelemetry/ContainerHighCPUUtil.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
name: Container high cpu utilization
# Human-readable summary of what this condition detects
description: |
  Alert when the average container cpu utilization (vs. Limit) is > 90% for more than 5 minutes
# Condition type; one of BASELINE | STATIC
type: STATIC

# NRQL query evaluated by the condition.
# Averages container.cpu.utilization per container / pod / namespace / cluster.
# YOUR_CLUSTER_NAME / YOUR_NAMESPACE_NAME are placeholders to fill in.
# NOTE(review): the threshold of 90 assumes container.cpu.utilization is
# reported on a 0-100 scale relative to the limit - confirm the metric's unit.
nrql:
  query: >-
    from Metric select average(container.cpu.utilization)
    where k8s.cluster.name in ('YOUR_CLUSTER_NAME')
    and k8s.namespace.name in ('YOUR_NAMESPACE_NAME')
    facet k8s.container.name, k8s.pod.name, k8s.namespace.name, k8s.cluster.name

# Direction in which baseline is set (Default: LOWER_ONLY)
# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY

# How the NRQL query value(s) are aggregated before being compared to terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Comparison operator applied against the threshold
    operator: ABOVE
    # Value that triggers a violation
    threshold: 90
    # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL

  # Adding a Warning threshold is optional
  # - priority: WARNING
  #   # Operator used to compare against the threshold.
  #   operator: ABOVE
  #   # Value that triggers a violation
  #   threshold: 0
  #   # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions
  #   thresholdDuration: 60
  #   # How many data points must be in violation for the duration
  #   thresholdOccurrences: AT_LEAST_ONCE

# Loss of Signal Settings
expiration:
  # Close open violations if signal is lost (Default: false)
  closeViolationsOnExpiration: true
  # Open "Loss of Signal" violation if signal is lost (Default: false)
  openViolationOnExpiration: false
  # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false'
  expirationDuration: 300

# Advanced Signal Settings
signal:
  # How long to wait for data that belongs in each aggregation window
  aggregationDelay: 60  # seconds
  # Method that decides when an aggregation window is complete and can be evaluated for violations
  aggregationMethod: EVENT_FLOW
  # How long to wait after each data point arrives to make sure the whole batch was processed
  aggregationTimer: null  # seconds
  # Duration of the time window used to evaluate the NRQL query
  aggregationWindow: 300  # seconds; 30 seconds <= x < 15 minutes
  # Type of value used to fill gaps (empty windows)
  fillOption: NONE  # defaults to STATIC
  # Value used for filling when the static fill option is selected
  fillValue: null  # default
  # Gathers data in overlapping time windows to smooth the chart line, making trends easier to spot
  slideBy: 60  # seconds
  # How long to wait before starting to evaluate a signal against the thresholds in this condition
  evaluationDelay: 60

# OPTIONAL: URL of runbook to be sent with notification (explicit null = not set)
runbookUrl: null

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 21600
76 changes: 76 additions & 0 deletions
76
alert-policies/kubernetes-opentelemetry/ContainerHighMemUtil.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
name: Container high memory utilization
# Description and details
description: |
  Alert when the average container memory utilization (vs. Limit) is > 90% for more than 5 minutes
# Type of alert: BASELINE | STATIC
type: STATIC

# NRQL query
# Working-set bytes divided by the container's memory limit, multiplied by 100
# so the result is a percentage on the same scale as terms.threshold (90).
# Without the "* 100" the query yields a raw ratio and the 90 threshold
# would not correspond to the "> 90%" stated in the description.
# YOUR_CLUSTER_NAME / YOUR_NAMESPACE_NAME are placeholders to fill in.
nrql:
  query: "from Metric select max(container_memory_working_set_bytes) / filter(max(kube_pod_container_resource_limits), where resource = 'memory') * 100 where k8s.cluster.name in ('YOUR_CLUSTER_NAME') and k8s.namespace.name in ('YOUR_NAMESPACE_NAME') facet pod, container, k8s.namespace.name, k8s.cluster.name"

# Direction in which baseline is set (Default: LOWER_ONLY)
# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY

# Function used to aggregate the NRQL query value(s) for comparison to the terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# List of Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Operator used to compare against the threshold.
    operator: ABOVE
    # Value that triggers a violation (percent of the memory limit, see query above)
    threshold: 90
    # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL

  # Adding a Warning threshold is optional
  # - priority: WARNING
  #   # Operator used to compare against the threshold.
  #   operator: ABOVE
  #   # Value that triggers a violation
  #   threshold: 0
  #   # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions
  #   thresholdDuration: 60
  #   # How many data points must be in violation for the duration
  #   thresholdOccurrences: AT_LEAST_ONCE

# Loss of Signal Settings
expiration:
  # Close open violations if signal is lost (Default: false)
  closeViolationsOnExpiration: true
  # Open "Loss of Signal" violation if signal is lost (Default: false)
  openViolationOnExpiration: false
  # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false'
  expirationDuration: 300

# Advanced Signal Settings
signal:
  # How long we wait for data that belongs in each aggregation window
  aggregationDelay: 60  # seconds
  # The method that determines when we consider an aggregation window to complete so that we can evaluate the signals for violations.
  aggregationMethod: EVENT_FLOW
  # How long we wait after each data point arrives to make sure we've processed the whole batch.
  aggregationTimer: null  # seconds
  # Controls the duration of the time window used to evaluate the NRQL query
  aggregationWindow: 300  # seconds; 30 seconds <= x < 15 minutes
  # Option that determines the type of value that should be used to fill gaps (empty windows).
  fillOption: NONE  # defaults to STATIC
  # If using the static fill option, this value is used for filling.
  fillValue: null  # default
  # This setting gathers data in overlapping time windows to smooth the chart line, making it easier to spot trends.
  slideBy: 60  # seconds
  # Evaluation delay is how long we wait before we start evaluating a signal against the thresholds in this condition.
  evaluationDelay: 60

# OPTIONAL: URL of runbook to be sent with notification (explicit null = not set)
runbookUrl: null

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 21600
73 changes: 73 additions & 0 deletions
73
alert-policies/kubernetes-opentelemetry/ContainerRestarting.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
name: Container is Restarting

# Human-readable summary of what this condition detects
description: |
  Alert when the container restart count is greater than 0 in a sliding 5 minute window
# Condition type; one of BASELINE | STATIC
type: STATIC

# NRQL query evaluated by the condition.
# Sums kube_pod_container_status_restarts_total per container / pod /
# namespace / cluster.
# YOUR_CLUSTER_NAME / YOUR_NAMESPACE_NAME are placeholders to fill in.
nrql:
  query: >-
    from Metric select sum(kube_pod_container_status_restarts_total)
    where metricName = 'kube_pod_container_status_restarts_total'
    and k8s.cluster.name in ('YOUR_CLUSTER_NAME')
    and k8s.namespace.name in ('YOUR_NAMESPACE_NAME')
    FACET k8s.container.name, k8s.pod.name, k8s.namespace.name, k8s.cluster.name

# Direction in which baseline is set (Default: LOWER_ONLY)
# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY

# How the NRQL query value(s) are aggregated before being compared to terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Comparison operator applied against the threshold
    operator: ABOVE
    # Value that triggers a violation
    threshold: 0
    # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL

  # Adding a Warning threshold is optional
  # - priority: WARNING
  #   operator: ABOVE
  #   threshold: 1
  #   thresholdDuration: 300
  #   thresholdOccurrences: ALL | AT_LEAST_ONCE

# Loss of Signal Settings
expiration:
  # Close open violations if signal is lost (Default: false)
  closeViolationsOnExpiration: true
  # Open "Loss of Signal" violation if signal is lost (Default: false)
  openViolationOnExpiration: false
  # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false'
  expirationDuration: 300

# Advanced Signal Settings
signal:
  # How long to wait for data that belongs in each aggregation window
  aggregationDelay: 60  # seconds
  # Method that decides when an aggregation window is complete and can be evaluated for violations
  aggregationMethod: EVENT_FLOW
  # How long to wait after each data point arrives to make sure the whole batch was processed
  aggregationTimer: null  # seconds
  # Duration of the time window used to evaluate the NRQL query
  aggregationWindow: 300  # seconds; 30 seconds <= x < 15 minutes
  # Type of value used to fill gaps (empty windows)
  fillOption: NONE  # defaults to STATIC
  # Value used for filling when the static fill option is selected
  fillValue: null  # default
  # Gathers data in overlapping time windows to smooth the chart line, making trends easier to spot
  slideBy: 60  # seconds
  # How long to wait before starting to evaluate a signal against the thresholds in this condition
  evaluationDelay: 60

# OPTIONAL: URL of runbook to be sent with notification (explicit null = not set)
runbookUrl: null

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 21600
73 changes: 73 additions & 0 deletions
73
alert-policies/kubernetes-opentelemetry/ContainerWaiting.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
name: Container is Waiting

# Human-readable summary of what this condition detects
description: |
  Alert when a container is Waiting for more than 5 minutes
# Condition type; one of BASELINE | STATIC
type: STATIC

# NRQL query evaluated by the condition.
# Counts distinct pod names whose container_phase is 'waiting', per
# container / pod / namespace / cluster.
# YOUR_CLUSTER_NAME / YOUR_NAMESPACE_NAME are placeholders to fill in.
nrql:
  query: >-
    from Metric select uniqueCount(k8s.pod.name)
    WHERE container_phase = 'waiting'
    and k8s.cluster.name in ('YOUR_CLUSTER_NAME')
    and k8s.namespace.name in ('YOUR_NAMESPACE_NAME')
    FACET k8s.container.name, k8s.pod.name, k8s.namespace.name, k8s.cluster.name

# Direction in which baseline is set (Default: LOWER_ONLY)
# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY

# How the NRQL query value(s) are aggregated before being compared to terms.threshold (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Comparison operator applied against the threshold
    operator: ABOVE
    # Value that triggers a violation
    threshold: 0
    # Time in seconds; 120 - 3600, must be a multiple of 60 for Baseline conditions
    thresholdDuration: 300
    # How many data points must be in violation for the duration
    thresholdOccurrences: ALL

  # Adding a Warning threshold is optional
  # - priority: WARNING
  #   operator: ABOVE
  #   threshold: 1
  #   thresholdDuration: 300
  #   thresholdOccurrences: ALL | AT_LEAST_ONCE

# Loss of Signal Settings
expiration:
  # Close open violations if signal is lost (Default: false)
  closeViolationsOnExpiration: true
  # Open "Loss of Signal" violation if signal is lost (Default: false)
  openViolationOnExpiration: false
  # Time in seconds; Max value: 172800 (48hrs), null if closeViolationsOnExpiration and openViolationOnExpiration are both 'false'
  expirationDuration: 300

# Advanced Signal Settings
signal:
  # How long to wait for data that belongs in each aggregation window
  aggregationDelay: 60  # seconds
  # Method that decides when an aggregation window is complete and can be evaluated for violations
  aggregationMethod: EVENT_FLOW
  # How long to wait after each data point arrives to make sure the whole batch was processed
  aggregationTimer: null  # seconds
  # Duration of the time window used to evaluate the NRQL query
  aggregationWindow: 60  # seconds; 30 seconds <= x < 15 minutes
  # Type of value used to fill gaps (empty windows)
  fillOption: NONE  # defaults to STATIC
  # Value used for filling when the static fill option is selected
  fillValue: null  # default
  # Gathers data in overlapping time windows to smooth the chart line; null here, so no sliding window is configured
  slideBy: null  # seconds
  # How long to wait before starting to evaluate a signal against the thresholds in this condition
  evaluationDelay: 60

# OPTIONAL: URL of runbook to be sent with notification (explicit null = not set)
runbookUrl: null

# Duration after which a violation automatically closes
# Time in seconds; 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 21600
Oops, something went wrong.