Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add redshift monitors #33

Merged
merged 3 commits into from
Jun 8, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
309 changes: 309 additions & 0 deletions catalog/monitors/redshift.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,309 @@
# The official Datadog API documentation with available query parameters & alert types:
# https://docs.datadoghq.com/api/v1/monitors/#create-a-monitor

redshift-health-status:
name: (Redshift) Health Status
type: metric alert
query: |
min(last_1h):min:aws.redshift.health_status{*} by {clusteridentifier} <= 0
message: |
{{#is_alert}}
({clusteridentifier}) Redshift cluster is failing health checks
{{/is_alert}}
escalation_message: ""
tags: [ "ManagedBy:Terraform" ]
notify_no_data: false
notify_audit: true
require_full_window: true
enable_logs_sample: false
force_delete: true
include_tags: true
locked: false
renotify_interval: 60
timeout_h: 60
evaluation_delay: 60
new_host_delay: 300
no_data_timeframe: 10
threshold_windows: { }
thresholds:
critical: 0
#warning:
#unknown:
#ok:
#critical_recovery:
#warning_recovery:

redshift-database-connections:
name: (Redshift) Anomaly of a large variance in Redshift connection count
type: metric alert
query: |
avg(last_4h):anomalies(avg:aws.redshift.database_connections{*} by {clusteridentifier}, 'basic', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true') >= 1
message: |
{{#is_alert}}
({clusteridentifier}) Anomaly of a large variance in Redshift connection count
{{/is_alert}}
escalation_message: ""
tags: [ "ManagedBy:Terraform" ]
notify_no_data: false
notify_audit: true
require_full_window: true
enable_logs_sample: false
force_delete: true
include_tags: true
locked: false
renotify_interval: 60
timeout_h: 60
evaluation_delay: 60
new_host_delay: 300
no_data_timeframe: 10
threshold_windows:
trigger_window: "last_15m"
recovery_window: "last_15m"
thresholds:
critical: 1
#warning:
#unknown:
#ok:
critical_recovery: 0
#warning_recovery:

redshift-cpuutilization:
name: (Redshift) CPU Utilization above 90%
type: metric alert
query: |
avg(last_15m):avg:aws.redshift.cpuutilization{*} by {clusteridentifier} > 90
message: |
{{#is_warning}}
({clusteridentifier}) CPU Utilization above 85%
{{/is_warning}}
{{#is_alert}}
({clusteridentifier}) CPU Utilization above 90%
{{/is_alert}}
escalation_message: ""
tags: [ "ManagedBy:Terraform" ]
notify_no_data: false
notify_audit: true
require_full_window: true
enable_logs_sample: false
force_delete: true
include_tags: true
locked: false
renotify_interval: 60
timeout_h: 60
evaluation_delay: 60
new_host_delay: 300
no_data_timeframe: 10
threshold_windows: { }
thresholds:
critical: 90
warning: 85
#unknown:
#ok:
#critical_recovery:
#warning_recovery:

redshift-write-latency:
name: (Redshift) Write latency/cluster
type: metric alert
query: |
avg(last_15m):avg:aws.redshift.write_latency{*} by {clusteridentifier} > 15
message: |
{{#is_warning}}
({clusteridentifier}) Redshift write latency is > 10s
{{/is_warning}}
{{#is_alert}}
({clusteridentifier}) Redshift write latency is > 15s
{{/is_alert}}
escalation_message: ""
tags: [ "ManagedBy:Terraform" ]
notify_no_data: false
notify_audit: true
require_full_window: true
enable_logs_sample: false
force_delete: true
include_tags: true
locked: false
renotify_interval: 60
timeout_h: 60
evaluation_delay: 60
new_host_delay: 300
no_data_timeframe: 10
threshold_windows: { }
thresholds:
critical: 15
warning: 10
#unknown:
#ok:
#critical_recovery:
#warning_recovery:

redshift-disk-space-used:
name: (Redshift) Percent disk space used/cluster
type: metric alert
query: |
avg(last_15m):avg:aws.redshift.percentage_disk_space_used{*} by {clusteridentifier} > 85
message: |
{{#is_warning}}
({clusteridentifier}) Redshift disk used is > 80%
{{/is_warning}}
{{#is_alert}}
({clusteridentifier}) Redshift disk used is > 85%
{{/is_alert}}
escalation_message: ""
tags: [ "ManagedBy:Terraform" ]
notify_no_data: false
notify_audit: true
require_full_window: true
enable_logs_sample: false
force_delete: true
include_tags: true
locked: false
renotify_interval: 60
timeout_h: 60
evaluation_delay: 60
new_host_delay: 300
no_data_timeframe: 10
threshold_windows: { }
thresholds:
critical: 85
warning: 80
#unknown:
#ok:
#critical_recovery:
#warning_recovery:

redshift-network-receive:
name: (Redshift) Network throughput - received
type: metric alert
query: |
avg(last_4h):anomalies(avg:aws.redshift.network_receive_throughput{*} by {clusteridentifier}.as_count(), 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='hourly') >= 1
message: |
{{#is_alert}}
({clusteridentifier}) Anomaly of a large variance in Redshift network traffic received
{{/is_alert}}
escalation_message: ""
tags: [ "ManagedBy:Terraform" ]
notify_no_data: false
notify_audit: true
require_full_window: true
enable_logs_sample: false
force_delete: true
include_tags: true
locked: false
renotify_interval: 60
timeout_h: 60
evaluation_delay: 60
new_host_delay: 300
no_data_timeframe: 10
threshold_windows:
trigger_window: "last_15m"
recovery_window: "last_15m"
thresholds:
critical: 1
#warning:
#unknown:
#ok:
critical_recovery: 0
#warning_recovery:

redshift-network-transmit:
name: (Redshift) Network throughput - sent
type: metric alert
query: |
avg(last_4h):anomalies(avg:aws.redshift.network_transmit_throughput{*} by {clusteridentifier}.as_count(), 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='hourly') >= 1
message: |
{{#is_alert}}
({clusteridentifier}) Anomaly of a large variance in Redshift network traffic sent
{{/is_alert}}
escalation_message: ""
tags: [ "ManagedBy:Terraform" ]
notify_no_data: false
notify_audit: true
require_full_window: true
enable_logs_sample: false
force_delete: true
include_tags: true
locked: false
renotify_interval: 60
timeout_h: 60
evaluation_delay: 60
new_host_delay: 300
no_data_timeframe: 10
threshold_windows:
trigger_window: "last_15m"
recovery_window: "last_15m"
thresholds:
critical: 1
#warning:
#unknown:
#ok:
critical_recovery: 0
#warning_recovery:

redshift-read-throughput:
name: (Redshift) Read throughput/cluster
type: metric alert
query: |
avg(last_4h):anomalies(avg:aws.redshift.read_throughput{*} by {clusteridentifier}.as_count(), 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='hourly') >= 1
message: |
{{#is_alert}}
({clusteridentifier}) Anomaly of a large variance in Redshift read throughput
{{/is_alert}}
escalation_message: ""
tags: [ "ManagedBy:Terraform" ]
notify_no_data: false
notify_audit: true
require_full_window: true
enable_logs_sample: false
force_delete: true
include_tags: true
locked: false
renotify_interval: 60
timeout_h: 60
evaluation_delay: 60
new_host_delay: 300
no_data_timeframe: 10
threshold_windows:
trigger_window: "last_15m"
recovery_window: "last_15m"
thresholds:
critical: 1
#warning:
#unknown:
#ok:
critical_recovery: 0
#warning_recovery:

redshift-write-throughput:
name: (Redshift) Write throughput/cluster
type: metric alert
query: |
avg(last_4h):anomalies(avg:aws.redshift.write_throughput{*} by {clusteridentifier}.as_count(), 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='hourly') >= 1
message: |
{{#is_alert}}
({clusteridentifier}) Anomaly of a large variance in Redshift write throughput
{{/is_alert}}
escalation_message: ""
tags: [ "ManagedBy:Terraform" ]
notify_no_data: false
notify_audit: true
require_full_window: true
enable_logs_sample: false
force_delete: true
include_tags: true
locked: false
renotify_interval: 60
timeout_h: 60
evaluation_delay: 60
new_host_delay: 300
no_data_timeframe: 10
threshold_windows:
trigger_window: "last_15m"
recovery_window: "last_15m"
thresholds:
critical: 1
#warning:
#unknown:
#ok:
critical_recovery: 0
#warning_recovery: