cloudposse · srhopkins · Jun 8, 2021 · Jun 8, 2021 · Jun 8, 2021 · Jun 8, 2021
diff --git a/catalog/monitors/redshift.yaml b/catalog/monitors/redshift.yaml
@@ -0,0 +1,309 @@
+# The official Datadog API documentation with available query parameters & alert types:
+# https://docs.datadoghq.com/api/v1/monitors/#create-a-monitor
+
+redshift-health-status:
+  name: (Redshift) Health Status
+  type: metric alert
+  query: |
+    min(last_1h):min:aws.redshift.health_status{*} by {clusteridentifier} <= 0
+  message: |
+    {{#is_alert}}
+    ({clusteridentifier}) Redshift cluster is failing health checks
+    {{/is_alert}}
+  escalation_message: ""
+  tags: [ "ManagedBy:Terraform" ]
+  notify_no_data: false
+  notify_audit: true
+  require_full_window: true
+  enable_logs_sample: false
+  force_delete: true
+  include_tags: true
+  locked: false
+  renotify_interval: 60
+  timeout_h: 60
+  evaluation_delay: 60
+  new_host_delay: 300
+  no_data_timeframe: 10
+  threshold_windows: { }
+  thresholds:
+    critical: 0
+    #warning:
+    #unknown:
+    #ok:
+    #critical_recovery:
+    #warning_recovery:
+
+redshift-database-connections:
+  name: (Redshift) Anomaly of a large variance in Redshift connection count
+  type: metric alert
+  query: |
+    avg(last_4h):anomalies(avg:aws.redshift.database_connections{*} by {clusteridentifier}, 'basic', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true') >= 1
+  message: |
+    {{#is_alert}}
+    ({clusteridentifier}) Anomaly of a large variance in Redshift connection count
+    {{/is_alert}}
+  escalation_message: ""
+  tags: [ "ManagedBy:Terraform" ]
+  notify_no_data: false
+  notify_audit: true
+  require_full_window: true
+  enable_logs_sample: false
+  force_delete: true
+  include_tags: true
+  locked: false
+  renotify_interval: 60
+  timeout_h: 60
+  evaluation_delay: 60
+  new_host_delay: 300
+  no_data_timeframe: 10
+  threshold_windows:
+    trigger_window: "last_15m"
+    recovery_window: "last_15m"
+  thresholds:
+    critical: 1
+    #warning:
+    #unknown:
+    #ok:
+    critical_recovery: 0
+    #warning_recovery:
+
+redshift-cpuutilization:
+  name: (Redshift) CPU Utilization above 90%
+  type: metric alert
+  query: |
+    avg(last_15m):avg:aws.redshift.cpuutilization{*} by {clusteridentifier} > 90
+  message: |
+    {{#is_warning}}
+    ({clusteridentifier}) CPU Utilization above 85%
+    {{/is_warning}}
+    {{#is_alert}}
+    ({clusteridentifier}) CPU Utilization above 90%
+    {{/is_alert}}
+  escalation_message: ""
+  tags: [ "ManagedBy:Terraform" ]
+  notify_no_data: false
+  notify_audit: true
+  require_full_window: true
+  enable_logs_sample: false
+  force_delete: true
+  include_tags: true
+  locked: false
+  renotify_interval: 60
+  timeout_h: 60
+  evaluation_delay: 60
+  new_host_delay: 300
+  no_data_timeframe: 10
+  threshold_windows: { }
+  thresholds:
+    critical: 90
+    warning: 85
+    #unknown:
+    #ok:
+    #critical_recovery:
+    #warning_recovery:
+
+redshift-write-latency:
+  name: (Redshift) Write latency/cluster
+  type: metric alert
+  query: |
+    avg(last_15m):avg:aws.redshift.write_latency{*} by {clusteridentifier} > 15
+  message: |
+    {{#is_warning}}
+    ({clusteridentifier}) Redshift write latency is > 10s
+    {{/is_warning}}
+    {{#is_alert}}
+    ({clusteridentifier}) Redshift write latency is > 15s
+    {{/is_alert}}
+  escalation_message: ""
+  tags: [ "ManagedBy:Terraform" ]
+  notify_no_data: false
+  notify_audit: true
+  require_full_window: true
+  enable_logs_sample: false
+  force_delete: true
+  include_tags: true
+  locked: false
+  renotify_interval: 60
+  timeout_h: 60
+  evaluation_delay: 60
+  new_host_delay: 300
+  no_data_timeframe: 10
+  threshold_windows: { }
+  thresholds:
+    critical: 15
+    warning: 10
+    #unknown:
+    #ok:
+    #critical_recovery:
+    #warning_recovery:
+
+redshift-disk-space-used:
+  name: (Redshift) Percent disk space used/cluster
+  type: metric alert
+  query: |
+    avg(last_15m):avg:aws.redshift.percentage_disk_space_used{*} by {clusteridentifier} > 85
+  message: |
+    {{#is_warning}}
+    ({clusteridentifier}) Redshift disk used is > 80%
+    {{/is_warning}}
+    {{#is_alert}}
+    ({clusteridentifier}) Redshift disk used is > 85%
+    {{/is_alert}}
+  escalation_message: ""
+  tags: [ "ManagedBy:Terraform" ]
+  notify_no_data: false
+  notify_audit: true
+  require_full_window: true
+  enable_logs_sample: false
+  force_delete: true
+  include_tags: true
+  locked: false
+  renotify_interval: 60
+  timeout_h: 60
+  evaluation_delay: 60
+  new_host_delay: 300
+  no_data_timeframe: 10
+  threshold_windows: { }
+  thresholds:
+    critical: 85
+    warning: 80
+    #unknown:
+    #ok:
+    #critical_recovery:
+    #warning_recovery:
+
+redshift-network-receive:
+  name: (Redshift) Network throughput - received
+  type: metric alert
+  query: |
+    avg(last_4h):anomalies(avg:aws.redshift.network_receive_throughput{*} by {clusteridentifier}.as_count(), 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='hourly') >= 1
+  message: |
+    {{#is_alert}}
+    ({clusteridentifier}) Anomaly of a large variance in Redshift network traffic received
+    {{/is_alert}}
+  escalation_message: ""
+  tags: [ "ManagedBy:Terraform" ]
+  notify_no_data: false
+  notify_audit: true
+  require_full_window: true
+  enable_logs_sample: false
+  force_delete: true
+  include_tags: true
+  locked: false
+  renotify_interval: 60
+  timeout_h: 60
+  evaluation_delay: 60
+  new_host_delay: 300
+  no_data_timeframe: 10
+  threshold_windows:
+    trigger_window: "last_15m"
+    recovery_window: "last_15m"
+  thresholds:
+    critical: 1
+    #warning:
+    #unknown:
+    #ok:
+    critical_recovery: 0
+    #warning_recovery:
+
+redshift-network-transmit:
+  name: (Redshift) Network throughput - sent
+  type: metric alert
+  query: |
+    avg(last_4h):anomalies(avg:aws.redshift.network_transmit_throughput{*} by {clusteridentifier}.as_count(), 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='hourly') >= 1
+  message: |
+    {{#is_alert}}
+    ({clusteridentifier}) Anomaly of a large variance in Redshift network traffic sent
+    {{/is_alert}}
+  escalation_message: ""
+  tags: [ "ManagedBy:Terraform" ]
+  notify_no_data: false
+  notify_audit: true
+  require_full_window: true
+  enable_logs_sample: false
+  force_delete: true
+  include_tags: true
+  locked: false
+  renotify_interval: 60
+  timeout_h: 60
+  evaluation_delay: 60
+  new_host_delay: 300
+  no_data_timeframe: 10
+  threshold_windows:
+    trigger_window: "last_15m"
+    recovery_window: "last_15m"
+  thresholds:
+    critical: 1
+    #warning:
+    #unknown:
+    #ok:
+    critical_recovery: 0
+    #warning_recovery:
+
+redshift-read-throughput:
+  name: (Redshift) Read throughput/cluster
+  type: metric alert
+  query: |
+    avg(last_4h):anomalies(avg:aws.redshift.read_throughput{*} by {clusteridentifier}.as_count(), 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='hourly') >= 1
+  message: |
+    {{#is_alert}}
+    ({clusteridentifier}) Anomaly of a large variance in Redshift read throughput
+    {{/is_alert}}
+  escalation_message: ""
+  tags: [ "ManagedBy:Terraform" ]
+  notify_no_data: false
+  notify_audit: true
+  require_full_window: true
+  enable_logs_sample: false
+  force_delete: true
+  include_tags: true
+  locked: false
+  renotify_interval: 60
+  timeout_h: 60
+  evaluation_delay: 60
+  new_host_delay: 300
+  no_data_timeframe: 10
+  threshold_windows:
+    trigger_window: "last_15m"
+    recovery_window: "last_15m"
+  thresholds:
+    critical: 1
+    #warning:
+    #unknown:
+    #ok:
+    critical_recovery: 0
+    #warning_recovery:
+
+redshift-write-throughput:
+  name: (Redshift) Write throughput/cluster
+  type: metric alert
+  query: |
+    avg(last_4h):anomalies(avg:aws.redshift.write_throughput{*} by {clusteridentifier}.as_count(), 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='hourly') >= 1
+  message: |
+    {{#is_alert}}
+    ({clusteridentifier}) Anomaly of a large variance in Redshift write throughput
+    {{/is_alert}}
+  escalation_message: ""
+  tags: [ "ManagedBy:Terraform" ]
+  notify_no_data: false
+  notify_audit: true
+  require_full_window: true
+  enable_logs_sample: false
+  force_delete: true
+  include_tags: true
+  locked: false
+  renotify_interval: 60
+  timeout_h: 60
+  evaluation_delay: 60
+  new_host_delay: 300
+  no_data_timeframe: 10
+  threshold_windows:
+    trigger_window: "last_15m"
+    recovery_window: "last_15m"
+  thresholds:
+    critical: 1
+    #warning:
+    #unknown:
+    #ok:
+    critical_recovery: 0
+    #warning_recovery: