Skip to content

Commit

Permalink
Merge pull request #2558 from douavue/dvue/k8s-otel-alert-policies
Browse files Browse the repository at this point in the history
Dvue/k8s otel alert policies
  • Loading branch information
nr-mlosier authored Oct 17, 2024
2 parents 0186d82 + 5439de3 commit c5ed531
Show file tree
Hide file tree
Showing 22 changed files with 1,424 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
name: Container cpu throttling is high
# Human-readable summary of what this condition detects
description: |
  Alert when container is being throttled > 25% of the time for more than 5 minutes
# Condition type; one of BASELINE | STATIC
type: STATIC

# NRQL evaluated by the condition: throttled CFS periods expressed as a
# percentage of all CFS periods, faceted per container / pod / namespace / cluster.
# Replace YOUR_CLUSTER_NAME and YOUR_NAMESPACE_NAME before use.
nrql:
  query: "from Metric select latest(container_cpu_cfs_throttled_periods_total) / latest(container_cpu_cfs_periods_total)* 100 where k8s.cluster.name in ('YOUR_CLUSTER_NAME') and k8s.namespace.name in ('YOUR_NAMESPACE_NAME') facet k8s.container.name, k8s.pod.name, k8s.namespace.name, k8s.cluster.name"

# Only relevant to BASELINE conditions (Default: LOWER_ONLY)
# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY

# How the query result is reduced before comparison with terms.threshold
# (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Comparison applied between the query value and the threshold
    operator: ABOVE
    # Violation boundary; the query result is a percentage
    threshold: 25
    # Seconds; 120 - 3600, and a multiple of 60 for Baseline conditions
    thresholdDuration: 300
    # Whether every data point in the duration must violate, or just one
    thresholdOccurrences: ALL

  # A Warning threshold may optionally be added
  # - priority: WARNING
  #   # Comparison applied between the query value and the threshold
  #   operator: ABOVE
  #   # Violation boundary
  #   threshold: 0
  #   # Seconds; 120 - 3600, and a multiple of 60 for Baseline conditions
  #   thresholdDuration: 60
  #   # Whether every data point in the duration must violate, or just one
  #   thresholdOccurrences: AT_LEAST_ONCE

# Loss of Signal Settings
expiration:
  # When the signal disappears, close any open violations (Default: false)
  closeViolationsOnExpiration: true
  # When the signal disappears, open a "Loss of Signal" violation (Default: false)
  openViolationOnExpiration: false
  # Seconds; at most 172800 (48hrs). Use null when both flags above are 'false'
  expirationDuration: 300

# Advanced Signal Settings
signal:
  # Wait time for late data belonging to an aggregation window
  aggregationDelay: 60  # seconds
  # Strategy for deciding that an aggregation window is complete and can be evaluated
  aggregationMethod: EVENT_FLOW
  # Wait time after each data point arrives, so the whole batch is processed
  aggregationTimer: null  # seconds
  # Length of the time window over which the NRQL query is evaluated
  aggregationWindow: 300  # seconds; 30 seconds <= x < 15 minutes
  # Kind of value used to fill gaps (empty windows)
  fillOption: NONE  # defaults to STATIC
  # Value used for filling when the static fill option is selected
  fillValue: null  # default
  # Step between overlapping (sliding) windows; smooths the chart line
  slideBy: 60  # seconds
  # Wait time before the signal starts being evaluated against the thresholds
  evaluationDelay: 60

# OPTIONAL: runbook URL included with notifications (none set)
runbookUrl: null

# Seconds after which a violation closes automatically
# Range 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 21600
76 changes: 76 additions & 0 deletions alert-policies/kubernetes-opentelemetry/ContainerHighCPUUtil.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
name: Container high cpu utilization
# Human-readable summary of what this condition detects
description: |
  Alert when the average container cpu utilization (vs. Limit) is > 90% for more than 5 minutes
# Condition type; one of BASELINE | STATIC
type: STATIC

# NRQL evaluated by the condition: average of container.cpu.utilization,
# faceted per container / pod / namespace / cluster.
# Replace YOUR_CLUSTER_NAME and YOUR_NAMESPACE_NAME before use.
# NOTE(review): the threshold of 90 assumes this metric is reported on a
# 0-100 scale relative to the limit - confirm against your collector's output.
nrql:
  query: "from Metric select average(container.cpu.utilization) where k8s.cluster.name in ('YOUR_CLUSTER_NAME') and k8s.namespace.name in ('YOUR_NAMESPACE_NAME') facet k8s.container.name, k8s.pod.name, k8s.namespace.name, k8s.cluster.name"

# Only relevant to BASELINE conditions (Default: LOWER_ONLY)
# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY

# How the query result is reduced before comparison with terms.threshold
# (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Comparison applied between the query value and the threshold
    operator: ABOVE
    # Violation boundary
    threshold: 90
    # Seconds; 120 - 3600, and a multiple of 60 for Baseline conditions
    thresholdDuration: 300
    # Whether every data point in the duration must violate, or just one
    thresholdOccurrences: ALL

  # A Warning threshold may optionally be added
  # - priority: WARNING
  #   # Comparison applied between the query value and the threshold
  #   operator: ABOVE
  #   # Violation boundary
  #   threshold: 0
  #   # Seconds; 120 - 3600, and a multiple of 60 for Baseline conditions
  #   thresholdDuration: 60
  #   # Whether every data point in the duration must violate, or just one
  #   thresholdOccurrences: AT_LEAST_ONCE

# Loss of Signal Settings
expiration:
  # When the signal disappears, close any open violations (Default: false)
  closeViolationsOnExpiration: true
  # When the signal disappears, open a "Loss of Signal" violation (Default: false)
  openViolationOnExpiration: false
  # Seconds; at most 172800 (48hrs). Use null when both flags above are 'false'
  expirationDuration: 300

# Advanced Signal Settings
signal:
  # Wait time for late data belonging to an aggregation window
  aggregationDelay: 60  # seconds
  # Strategy for deciding that an aggregation window is complete and can be evaluated
  aggregationMethod: EVENT_FLOW
  # Wait time after each data point arrives, so the whole batch is processed
  aggregationTimer: null  # seconds
  # Length of the time window over which the NRQL query is evaluated
  aggregationWindow: 300  # seconds; 30 seconds <= x < 15 minutes
  # Kind of value used to fill gaps (empty windows)
  fillOption: NONE  # defaults to STATIC
  # Value used for filling when the static fill option is selected
  fillValue: null  # default
  # Step between overlapping (sliding) windows; smooths the chart line
  slideBy: 60  # seconds
  # Wait time before the signal starts being evaluated against the thresholds
  evaluationDelay: 60

# OPTIONAL: runbook URL included with notifications (none set)
runbookUrl: null

# Seconds after which a violation closes automatically
# Range 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 21600
76 changes: 76 additions & 0 deletions alert-policies/kubernetes-opentelemetry/ContainerHighMemUtil.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
name: Container high memory utilization
# Human-readable summary of what this condition detects
# NOTE(review): the text says "average" but the query below uses max();
# confirm which aggregation is intended.
description: |
  Alert when the average container memory utilization (vs. Limit) is > 90% for more than 5 minutes
# Condition type; one of BASELINE | STATIC
type: STATIC

# NRQL evaluated by the condition: working-set bytes divided by the memory
# resource limit, faceted per pod / container / namespace / cluster.
# The ratio is multiplied by 100 so the result is a percentage; without the
# scaling it is a 0-1 fraction that could never exceed the threshold of 90.
# Replace YOUR_CLUSTER_NAME and YOUR_NAMESPACE_NAME before use.
nrql:
  query: "from Metric select max(container_memory_working_set_bytes) / filter(max(kube_pod_container_resource_limits), where resource = 'memory') * 100 where k8s.cluster.name in ('YOUR_CLUSTER_NAME') and k8s.namespace.name in ('YOUR_NAMESPACE_NAME') facet pod, container, k8s.namespace.name, k8s.cluster.name"

# Only relevant to BASELINE conditions (Default: LOWER_ONLY)
# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY

# How the query result is reduced before comparison with terms.threshold
# (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Comparison applied between the query value and the threshold
    operator: ABOVE
    # Violation boundary, in percent of the memory limit
    threshold: 90
    # Seconds; 120 - 3600, and a multiple of 60 for Baseline conditions
    thresholdDuration: 300
    # Whether every data point in the duration must violate, or just one
    thresholdOccurrences: ALL

  # A Warning threshold may optionally be added
  # - priority: WARNING
  #   # Comparison applied between the query value and the threshold
  #   operator: ABOVE
  #   # Violation boundary
  #   threshold: 0
  #   # Seconds; 120 - 3600, and a multiple of 60 for Baseline conditions
  #   thresholdDuration: 60
  #   # Whether every data point in the duration must violate, or just one
  #   thresholdOccurrences: AT_LEAST_ONCE

# Loss of Signal Settings
expiration:
  # When the signal disappears, close any open violations (Default: false)
  closeViolationsOnExpiration: true
  # When the signal disappears, open a "Loss of Signal" violation (Default: false)
  openViolationOnExpiration: false
  # Seconds; at most 172800 (48hrs). Use null when both flags above are 'false'
  expirationDuration: 300

# Advanced Signal Settings
signal:
  # Wait time for late data belonging to an aggregation window
  aggregationDelay: 60  # seconds
  # Strategy for deciding that an aggregation window is complete and can be evaluated
  aggregationMethod: EVENT_FLOW
  # Wait time after each data point arrives, so the whole batch is processed
  aggregationTimer: null  # seconds
  # Length of the time window over which the NRQL query is evaluated
  aggregationWindow: 300  # seconds; 30 seconds <= x < 15 minutes
  # Kind of value used to fill gaps (empty windows)
  fillOption: NONE  # defaults to STATIC
  # Value used for filling when the static fill option is selected
  fillValue: null  # default
  # Step between overlapping (sliding) windows; smooths the chart line
  slideBy: 60  # seconds
  # Wait time before the signal starts being evaluated against the thresholds
  evaluationDelay: 60

# OPTIONAL: runbook URL included with notifications (none set)
runbookUrl: null

# Seconds after which a violation closes automatically
# Range 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 21600
73 changes: 73 additions & 0 deletions alert-policies/kubernetes-opentelemetry/ContainerRestarting.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
name: Container is Restarting

# Human-readable summary of what this condition detects
description: |
  Alert when the container restart count is greater than 0 in a sliding 5 minute window
# Condition type; one of BASELINE | STATIC
type: STATIC

# NRQL evaluated by the condition: sum of kube_pod_container_status_restarts_total,
# faceted per container / pod / namespace / cluster.
# Replace YOUR_CLUSTER_NAME and YOUR_NAMESPACE_NAME before use.
nrql:
  query: "from Metric select sum(kube_pod_container_status_restarts_total) where metricName = 'kube_pod_container_status_restarts_total' and k8s.cluster.name in ('YOUR_CLUSTER_NAME') and k8s.namespace.name in ('YOUR_NAMESPACE_NAME') FACET k8s.container.name, k8s.pod.name, k8s.namespace.name, k8s.cluster.name"

# Only relevant to BASELINE conditions (Default: LOWER_ONLY)
# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY

# How the query result is reduced before comparison with terms.threshold
# (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Comparison applied between the query value and the threshold
    operator: ABOVE
    # Violation boundary
    threshold: 0
    # Seconds; 120 - 3600, and a multiple of 60 for Baseline conditions
    thresholdDuration: 300
    # Whether every data point in the duration must violate, or just one
    thresholdOccurrences: ALL

  # A Warning threshold may optionally be added
  # - priority: WARNING
  #   operator: ABOVE
  #   threshold: 1
  #   thresholdDuration: 300
  #   thresholdOccurrences: ALL | AT_LEAST_ONCE

# Loss of Signal Settings
expiration:
  # When the signal disappears, close any open violations (Default: false)
  closeViolationsOnExpiration: true
  # When the signal disappears, open a "Loss of Signal" violation (Default: false)
  openViolationOnExpiration: false
  # Seconds; at most 172800 (48hrs). Use null when both flags above are 'false'
  expirationDuration: 300

# Advanced Signal Settings
signal:
  # Wait time for late data belonging to an aggregation window
  aggregationDelay: 60  # seconds
  # Strategy for deciding that an aggregation window is complete and can be evaluated
  aggregationMethod: EVENT_FLOW
  # Wait time after each data point arrives, so the whole batch is processed
  aggregationTimer: null  # seconds
  # Length of the time window over which the NRQL query is evaluated
  aggregationWindow: 300  # seconds; 30 seconds <= x < 15 minutes
  # Kind of value used to fill gaps (empty windows)
  fillOption: NONE  # defaults to STATIC
  # Value used for filling when the static fill option is selected
  fillValue: null  # default
  # Step between overlapping (sliding) windows; smooths the chart line
  slideBy: 60  # seconds
  # Wait time before the signal starts being evaluated against the thresholds
  evaluationDelay: 60

# OPTIONAL: runbook URL included with notifications (none set)
runbookUrl: null

# Seconds after which a violation closes automatically
# Range 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 21600
73 changes: 73 additions & 0 deletions alert-policies/kubernetes-opentelemetry/ContainerWaiting.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
name: Container is Waiting

# Human-readable summary of what this condition detects
description: |
  Alert when a container is Waiting for more than 5 minutes
# Condition type; one of BASELINE | STATIC
type: STATIC

# NRQL evaluated by the condition: unique count of pod names where
# container_phase = 'waiting', faceted per container / pod / namespace / cluster.
# Replace YOUR_CLUSTER_NAME and YOUR_NAMESPACE_NAME before use.
nrql:
  query: "from Metric select uniqueCount(k8s.pod.name) WHERE container_phase = 'waiting' and k8s.cluster.name in ('YOUR_CLUSTER_NAME') and k8s.namespace.name in ('YOUR_NAMESPACE_NAME') FACET k8s.container.name, k8s.pod.name, k8s.namespace.name, k8s.cluster.name"

# Only relevant to BASELINE conditions (Default: LOWER_ONLY)
# baselineDirection: LOWER_ONLY | UPPER_AND_LOWER | UPPER_ONLY

# How the query result is reduced before comparison with terms.threshold
# (Default: SINGLE_VALUE)
valueFunction: SINGLE_VALUE

# Critical and Warning thresholds for the condition
terms:
  - priority: CRITICAL
    # Comparison applied between the query value and the threshold
    operator: ABOVE
    # Violation boundary
    threshold: 0
    # Seconds; 120 - 3600, and a multiple of 60 for Baseline conditions
    thresholdDuration: 300
    # Whether every data point in the duration must violate, or just one
    thresholdOccurrences: ALL

  # A Warning threshold may optionally be added
  # - priority: WARNING
  #   operator: ABOVE
  #   threshold: 1
  #   thresholdDuration: 300
  #   thresholdOccurrences: ALL | AT_LEAST_ONCE

# Loss of Signal Settings
expiration:
  # When the signal disappears, close any open violations (Default: false)
  closeViolationsOnExpiration: true
  # When the signal disappears, open a "Loss of Signal" violation (Default: false)
  openViolationOnExpiration: false
  # Seconds; at most 172800 (48hrs). Use null when both flags above are 'false'
  expirationDuration: 300

# Advanced Signal Settings
signal:
  # Wait time for late data belonging to an aggregation window
  aggregationDelay: 60  # seconds
  # Strategy for deciding that an aggregation window is complete and can be evaluated
  aggregationMethod: EVENT_FLOW
  # Wait time after each data point arrives, so the whole batch is processed
  aggregationTimer: null  # seconds
  # Length of the time window over which the NRQL query is evaluated
  aggregationWindow: 60  # seconds; 30 seconds <= x < 15 minutes
  # Kind of value used to fill gaps (empty windows)
  fillOption: NONE  # defaults to STATIC
  # Value used for filling when the static fill option is selected
  fillValue: null  # default
  # Step between overlapping (sliding) windows; null here, so no slide step is set
  slideBy: null  # seconds
  # Wait time before the signal starts being evaluated against the thresholds
  evaluationDelay: 60

# OPTIONAL: runbook URL included with notifications (none set)
runbookUrl: null

# Seconds after which a violation closes automatically
# Range 300 - 2592000 (Default: 86400 [1 day])
violationTimeLimitSeconds: 21600
Loading

0 comments on commit c5ed531

Please sign in to comment.