Skip to content

Commit

Permalink
Add dd amazon mq (cloudposse#29)
Browse files Browse the repository at this point in the history
* Initial list of amq interesting metrics

* Adding cpu alert

* Update cpu alert message

* Auto Format

* Initial list of amq interesting metrics

* Adding cpu alert

* Update cpu alert message

* Best baseline monitors that apply broadly to most amq users

* Best baseline monitors that apply broadly to most amq users

* Fixing amq-heap-usage query to match alert threshold

* Moving mq-current-connections-count to anomoly

Co-authored-by: cloudpossebot <11232728+cloudpossebot@users.noreply.github.com>
  • Loading branch information
srhopkins and cloudpossebot authored May 5, 2021
1 parent a9184c1 commit 705447c
Showing 1 changed file with 174 additions and 0 deletions.
174 changes: 174 additions & 0 deletions catalog/monitors/amq.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
# The official Datadog API documentation with available query parameters & alert types:
# https://docs.datadoghq.com/api/v1/monitors/#create-a-monitor

amq-cpu-utilization:
name: "(AMQ) CPU Utilization above 90%"
type: metric alert
query: |
avg(last_15m):avg:aws.amazonmq.cpu_utilization{*} by {broker} > 90
message: |
{{#is_warning}}
({broker}) - CPU Utilization above 85%
{{/is_warning}}
{{#is_alert}}
({broker}) - CPU Utilization above 90%
{{/is_alert}}
escalation_message: ""
tags: [ "ManagedBy:Terraform" ]
notify_no_data: false
notify_audit: true
require_full_window: true
enable_logs_sample: false
force_delete: true
include_tags: true
locked: false
renotify_interval: 60
timeout_h: 60
evaluation_delay: 60
new_host_delay: 300
no_data_timeframe: 10
threshold_windows: { }
thresholds:
critical: 90
warning: 85
#unknown:
#ok:
#critical_recovery:
#warning_recovery:

amq-heap-usage:
name: "(AMQ) JVM heap usage above 95%"
type: metric alert
query: |
avg(last_15m):avg:aws.amazonmq.heap_usage{*} by {broker} > 95
message: |
{{#is_warning}}
({broker}) - JVM heap usage above 90%
{{/is_warning}}
{{#is_alert}}
({broker}) - JVM heap usage above 95%
{{/is_alert}}
escalation_message: ""
tags: [ "ManagedBy:Terraform" ]
notify_no_data: false
notify_audit: true
require_full_window: true
enable_logs_sample: false
force_delete: true
include_tags: true
locked: false
renotify_interval: 60
timeout_h: 60
evaluation_delay: 60
new_host_delay: 300
no_data_timeframe: 10
threshold_windows: { }
thresholds:
critical: 95
warning: 90
#unknown:
#ok:
#critical_recovery:
#warning_recovery:

amq-network-in:
name: "(AMQ) Anomaly of a large variance in network-in bytes"
type: metric alert
query: |
avg(last_4h):anomalies(avg:aws.amazonmq.network_in{*} by {broker}, 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='hourly') >= 1
message: |
{{#is_alert}}
({broker}) - Anomaly of a large variance in network-in bytes
{{/is_alert}}
escalation_message: ""
tags: [ "ManagedBy:Terraform" ]
notify_no_data: false
notify_audit: true
require_full_window: true
enable_logs_sample: false
force_delete: true
include_tags: true
locked: false
renotify_interval: 60
timeout_h: 60
evaluation_delay: 900
new_host_delay: 300
no_data_timeframe: 10
threshold_windows:
trigger_window: "last_15m"
recovery_window: "last_15m"
thresholds:
critical: 1
#warning:
#unknown:
#ok:
critical_recovery: 0
#warning_recovery:

amq-network-out:
name: "(AMQ) Anomaly of a large variance in network-out bytes"
type: metric alert
query: |
avg(last_4h):anomalies(avg:aws.amazonmq.network_out{*} by {broker}, 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='hourly') >= 1
message: |
{{#is_alert}}
({broker}) - Anomaly of a large variance in network-out bytes
{{/is_alert}}
escalation_message: ""
tags: [ "ManagedBy:Terraform" ]
notify_no_data: false
notify_audit: true
require_full_window: true
enable_logs_sample: false
force_delete: true
include_tags: true
locked: false
renotify_interval: 60
timeout_h: 60
evaluation_delay: 900
new_host_delay: 300
no_data_timeframe: 10
threshold_windows:
trigger_window: "last_15m"
recovery_window: "last_15m"
thresholds:
critical: 1
#warning:
#unknown:
#ok:
critical_recovery: 0
#warning_recovery:

amq-current-connections-count:
name: "(AMQ) Anomaly of a large variance in broker connections"
type: metric alert
query: |
avg(last_4h):anomalies(avg:aws.amazonmq.current_connections_count{*}.as_count(), 'basic', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true') >= 1
message: |
{{#is_alert}}
({broker}) - Anomaly of a large variance in broker connections
{{/is_alert}}
escalation_message: ""
tags: [ "ManagedBy:Terraform" ]
notify_no_data: false
notify_audit: true
require_full_window: true
enable_logs_sample: false
force_delete: true
include_tags: true
locked: false
renotify_interval: 60
timeout_h: 60
evaluation_delay: 900
new_host_delay: 300
no_data_timeframe: 10
threshold_windows:
trigger_window: "last_15m"
recovery_window: "last_15m"
thresholds:
critical: 1
#warning:
#unknown:
#ok:
critical_recovery: 0
#warning_recovery:

0 comments on commit 705447c

Please sign in to comment.