diff --git a/active_directory/assets/monitors/ldap_binding.json b/active_directory/assets/monitors/ldap_binding.json index ad05bfe726fc4..60c3bc1193c8c 100644 --- a/active_directory/assets/monitors/ldap_binding.json +++ b/active_directory/assets/monitors/ldap_binding.json @@ -1,29 +1,36 @@ { - "name": "[Active Directory] Elevated LDAP binding duration for host {{host.name}}", - "type": "query alert", - "query": "avg(last_5m):avg:active_directory.ldap.bind_time{*} by {host} > 30", + "version": 2, + "created_at": "2021-04-20", + "last_updated_at": "2021-04-20", + "title": "Elevated LDAP binding duration for host {{host.name}}", + "tags": [ + "integration:active-directory" + ], + "description": "Notifies when Active Directory LDAP binding duration goes above 15ms for a specific host.", + "definition": { "message": "{{#is_alert}}\n\nAlert: the LDAP binding duration for host {{host.name}} went above {{threshold}}, current value is: {{value}} ms \n\n{{/is_alert}} \n{{#is_warning}}\n\nWarning: the LDAP binding duration for host {{host.name}} went above {{threshold}}, current value is: {{value}} ms\n\n{{/is_warning}} \n\nLDAP bind times that start to exceed 15 or 30 ms may be an indication network issues are present.", - "tags": [ - "integration:active_directory" - ], + "name": "[Active Directory] Elevated LDAP binding duration for host {{host.name}}", "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 30, - "warning": 15 - } + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "thresholds": { + "critical": 30, + "warning": 15 + }, + "timeout_h": 0 }, "priority": null, - "recommended_monitor_metadata": { - "description": "Notifies when Active Directory LDAP binding duration goes above 15ms for a specific host." 
- } + "query": "avg(last_5m):avg:active_directory.ldap.bind_time{*} by {host} > 30", + "tags": [ + "integration:active_directory" + ], + "type": "query alert" + } } \ No newline at end of file diff --git a/active_directory/assets/monitors/ldap_binding_successful.json b/active_directory/assets/monitors/ldap_binding_successful.json index 2c71c825541ca..23ec7c7789b03 100644 --- a/active_directory/assets/monitors/ldap_binding_successful.json +++ b/active_directory/assets/monitors/ldap_binding_successful.json @@ -1,33 +1,40 @@ { - "name": "[Active Directory] Anomalous number of successful LDAP bindings for host: {{host.name}}", - "type": "query alert", - "query": "avg(last_12h):anomalies(avg:active_directory.ldap.successful_binds_persec{*} by {host}, 'agile', 4, direction='both', alert_window='last_15m', interval=120, count_default_zero='true', seasonality='hourly') >= 1", - "message": "There is an anomalous number of successful LDAP bindings for host: {{host.name}} ", - "tags": [ - "integration:active_directory" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1, - "critical_recovery": 0 - }, - "threshold_windows": { - "trigger_window": "last_15m", - "recovery_window": "last_15m" - } - }, - "priority": null, - "recommended_monitor_metadata": { - "description": "Notifies when Active Directory number of successful LDAP bindings becomes anomalous for a specific host." - } -} + "version": 2, + "created_at": "2021-04-20", + "last_updated_at": "2021-04-20", + "title": "Anomalous number of successful LDAP bindings for host: {{host.name}}", + "tags": [ + "integration:active-directory" + ], + "description": "Notifies when Active Directory number of successful LDAP bindings becomes anomalous for a specific host.", + "definition": { + "message": "There is an anomalous number of successful LDAP bindings for host: {{host.name}} ", + "name": "[Active Directory] Anomalous number of successful LDAP bindings for host: {{host.name}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": true, + "threshold_windows": { + "recovery_window": "last_15m", + "trigger_window": "last_15m" + }, + "thresholds": { + "critical": 1, + "critical_recovery": 0 + }, + "timeout_h": 0 + }, + "priority": null, + "query": "avg(last_12h):anomalies(avg:active_directory.ldap.successful_binds_persec{*} by {host}, 'agile', 4, direction='both', alert_window='last_15m', interval=120, count_default_zero='true', seasonality='hourly') >= 1", + "tags": [ + "integration:active_directory" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/active_directory/assets/monitors/ldap_client_sessions.json b/active_directory/assets/monitors/ldap_client_sessions.json index 4b595cd0adac8..12fa21755225f 100644 --- a/active_directory/assets/monitors/ldap_client_sessions.json +++ b/active_directory/assets/monitors/ldap_client_sessions.json @@ -1,33 +1,40 @@ { - "name": "[Active Directory] Anomalous number of sessions for connected LDAP clients for host: {{host.name}}", - "type": "query alert", - "query": "avg(last_12h):anomalies(avg:active_directory.ldap.client_sessions{*} by {host}, 'agile', 5, 
direction='both', alert_window='last_15m', interval=120, count_default_zero='true', seasonality='daily') >= 1", - "message": "There is an anomalous number of sessions for connected LDAP clients for host: {{host.name}} ", - "tags": [ - "integration:active_directory" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1, - "critical_recovery": 0 - }, - "threshold_windows": { - "trigger_window": "last_15m", - "recovery_window": "last_15m" - } - }, - "priority": null, - "recommended_monitor_metadata": { - "description": "Notifies when Active Directory number of sessions for connected LDAP clients becomes anomalous for a specific host." - } + "version": 2, + "created_at": "2021-04-20", + "last_updated_at": "2021-04-20", + "title": "Anomalous number of sessions for connected LDAP clients for host: {{host.name}}", + "tags": [ + "integration:active-directory" + ], + "description": "Notifies when Active Directory number of sessions for connected LDAP clients becomes anomalous for a specific host.", + "definition": { + "message": "There is an anomalous number of sessions for connected LDAP clients for host: {{host.name}} ", + "name": "[Active Directory] Anomalous number of sessions for connected LDAP clients for host: {{host.name}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": true, + "threshold_windows": { + "recovery_window": "last_15m", + "trigger_window": "last_15m" + }, + "thresholds": { + "critical": 1, + "critical_recovery": 0 + }, + "timeout_h": 0 + }, + "priority": null, + "query": "avg(last_12h):anomalies(avg:active_directory.ldap.client_sessions{*} by {host}, 'agile', 5, direction='both', alert_window='last_15m', interval=120, count_default_zero='true', seasonality='daily') >= 1", + "tags": [ + "integration:active_directory" + ], + "type": "query alert" + } } \ No newline at end of file diff --git a/activemq/assets/monitors/activemq_artemis_high_disk_store.json b/activemq/assets/monitors/activemq_artemis_high_disk_store.json new file mode 100644 index 0000000000000..5df60f106e98d --- /dev/null +++ b/activemq/assets/monitors/activemq_artemis_high_disk_store.json @@ -0,0 +1,37 @@ +{ + "version": 2, + "created_at": "2021-03-18", + "last_updated_at": "2021-03-18", + "title": "High disk store percentage on {{host.name}}", + "tags": [ + "integration:activemq" + ], + "description": "Notify your team when disk store usage exceeds thresholds.", + "definition": { + "message": "Please check host {{host.name}}, as disk store usage is abnormally high at {{value}}.", + "name": "[ActiveMQ Artemis] High disk store percentage on {{host.name}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": true, + "silenced": {}, + "thresholds": { + "critical": 0.95, + "warning": 0.8 + }, + "timeout_h": 0 + }, + "priority": null, + "query": "avg(last_10m):avg:activemq.artemis.disk_store_usage_pct{*} > 0.95", + "tags": [ + "integration:activemq" + ], + "type": "query alert" + } +} \ No 
newline at end of file diff --git a/activemq/assets/monitors/activemq_artemis_unrouted_messages.json b/activemq/assets/monitors/activemq_artemis_unrouted_messages.json new file mode 100644 index 0000000000000..fdf3f81693ab7 --- /dev/null +++ b/activemq/assets/monitors/activemq_artemis_unrouted_messages.json @@ -0,0 +1,41 @@ +{ + "version": 2, + "created_at": "2021-03-18", + "last_updated_at": "2021-03-18", + "title": "High number of unrouted messages on address {{address.name}}", + "tags": [ + "integration:activemq" + ], + "description": "Notify your team when unrouted messages are unexpected.", + "definition": { + "message": "The number of unrouted messages on address {{address.name}} has exceeded its normal range to {{value}} messages.", + "name": "[ActiveMQ Artemis] High number of unrouted messages on address {{address.name}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": true, + "silenced": {}, + "threshold_windows": { + "recovery_window": "last_15m", + "trigger_window": "last_15m" + }, + "thresholds": { + "critical": 1, + "critical_recovery": 0 + }, + "timeout_h": 0 + }, + "priority": null, + "query": "avg(last_4h):anomalies(avg:activemq.artemis.address.unrouted_messages{*} by {address}.as_count(), 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='hourly') >= 1", + "tags": [ + "integration:activemq" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/activemq/assets/recommended_monitors/activemq_artemis_high_disk_store.json b/activemq/assets/recommended_monitors/activemq_artemis_high_disk_store.json deleted file mode 100644 index 73137bcc803c4..0000000000000 --- a/activemq/assets/recommended_monitors/activemq_artemis_high_disk_store.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "name": "[ActiveMQ Artemis] High disk store percentage on {{host.name}}", - "type": "query alert", - "query": "avg(last_10m):avg:activemq.artemis.disk_store_usage_pct{*} > 0.95", - "message": "Please check host {{host.name}}, as disk store usage is abnormally high at {{value}}.", - "tags": [ - "integration:activemq" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "silenced": {}, - "include_tags": true, - "no_data_timeframe": null, - "require_full_window": true, - "new_host_delay": 300, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "thresholds": { - "critical": 0.95, - "warning": 0.8 - } - }, - "priority": null, - "recommended_monitor_metadata": { - "description": "Notify your team when disk store usage exceeds thresholds." 
- } -} \ No newline at end of file diff --git a/activemq/assets/recommended_monitors/activemq_artemis_unrouted_messages.json b/activemq/assets/recommended_monitors/activemq_artemis_unrouted_messages.json deleted file mode 100644 index a349532b187fa..0000000000000 --- a/activemq/assets/recommended_monitors/activemq_artemis_unrouted_messages.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "name": "[ActiveMQ Artemis] High number of unrouted messages on address {{address.name}}", - "type": "query alert", - "query": "avg(last_4h):anomalies(avg:activemq.artemis.address.unrouted_messages{*} by {address}.as_count(), 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='hourly') >= 1", - "message": "The number of unrouted messages on address {{address.name}} has exceeded its normal range to {{value}} messages.", - "tags": [ - "integration:activemq" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "silenced": {}, - "include_tags": true, - "no_data_timeframe": null, - "require_full_window": true, - "new_host_delay": 300, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "threshold_windows": { - "recovery_window": "last_15m", - "trigger_window": "last_15m" - }, - "thresholds": { - "critical": 1, - "critical_recovery": 0 - } - }, - "priority": null, - "recommended_monitor_metadata": { - "description": "Notify your team when unrouted messages are unexpected." - } -} diff --git a/activemq/manifest.json b/activemq/manifest.json index 8556fae5f053a..5e1c1932a3ccc 100644 --- a/activemq/manifest.json +++ b/activemq/manifest.json @@ -55,8 +55,8 @@ "artemis": "assets/dashboards/artemis_dashboard.json" }, "monitors": { - "[ActiveMQ Artemis] High unrouted messages": "assets/recommended_monitors/activemq_artemis_unrouted_messages.json", - "[ActiveMQ Artemis] High disk store usage": "assets/recommended_monitors/activemq_artemis_high_disk_store.json" + "[ActiveMQ Artemis] High unrouted messages": "assets/monitors/activemq_artemis_unrouted_messages.json", + "[ActiveMQ Artemis] High disk store usage": "assets/monitors/activemq_artemis_high_disk_store.json" }, "saved_views": { "activemq_processes": "assets/saved_views/activemq_processes.json" diff --git a/airbyte/assets/monitors/long_running_jobs.json b/airbyte/assets/monitors/long_running_jobs.json index 827bd339001bb..c30070df3de43 100644 --- a/airbyte/assets/monitors/long_running_jobs.json +++ b/airbyte/assets/monitors/long_running_jobs.json @@ -1,26 +1,33 @@ { - "name": "Airbyte: Too Many Long-Running Jobs", - "type": "query alert", - "query": "avg(last_5m):avg:airbyte.metrics_reporter.num_unusually_long_syncs{*} / avg:airbyte.metrics_reporter.num_running_jobs{*} * 100 > 5", - "message": "{{#is_alert}}\n{{value}}% jobs are taking too long to complete which is higher than the threshold of {{ok_threshold}}%.\n{{/is_alert}} \n\n{{#is_recovery}}\nThe share of long-running jobs is back to normal!\n{{/is_recovery}}", - "tags": [ - "integration:airbyte" - ], - "options": { - "thresholds": { - "critical": 5 - }, - "notify_audit": false, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": 0, - "include_tags": false, - "new_host_delay": 300, - "silenced": {} - }, - "priority": null, - "restricted_roles": null, - "recommended_monitor_metadata": { - "description": "Notify your team when there are too many long-running jobs." 
+ "version": 2, + "created_at": "2023-08-01", + "last_updated_at": "2023-08-01", + "title": "Airbyte: Too Many Long-Running Jobs", + "tags": [ + "integration:airbyte" + ], + "description": "Notify your team when there are too many long-running jobs.", + "definition": { + "message": "{{#is_alert}}\n{{value}}% jobs are taking too long to complete which is higher than the threshold of {{ok_threshold}}%.\n{{/is_alert}} \n\n{{#is_recovery}}\nThe share of long-running jobs is back to normal!\n{{/is_recovery}}", + "name": "Airbyte: Too Many Long-Running Jobs", + "options": { + "include_tags": false, + "new_host_delay": 300, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "silenced": {}, + "thresholds": { + "critical": 5 + } + }, + "priority": null, + "query": "avg(last_5m):avg:airbyte.metrics_reporter.num_unusually_long_syncs{*} / avg:airbyte.metrics_reporter.num_running_jobs{*} * 100 > 5", + "restricted_roles": null, + "tags": [ + "integration:airbyte" + ], + "type": "query alert" } -} +} \ No newline at end of file diff --git a/apache/assets/monitors/apache_low_idle_workers.json b/apache/assets/monitors/apache_low_idle_workers.json new file mode 100644 index 0000000000000..3dc0feabd4a5d --- /dev/null +++ b/apache/assets/monitors/apache_low_idle_workers.json @@ -0,0 +1,40 @@ +{ + "version": 2, + "created_at": "2021-02-12", + "last_updated_at": "2021-02-23", + "title": "Low number of idle workers", + "tags": [ + "integration:apache" + ], + "description": "Notify your team when the number of idle workers is running low.", + "definition": { + "message": "The number of idle workers is abnormally low: {{value}}. You may see slower request processing times.", + "name": "[Apache] Low number of idle workers", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": true, + "threshold_windows": { + "recovery_window": "last_5m", + "trigger_window": "last_15m" + }, + "thresholds": { + "critical": 1, + "critical_recovery": 0.3, + "warning": 0.8 + }, + "timeout_h": 0 + }, + "query": "avg(last_4h):anomalies(avg:apache.performance.idle_workers{*} by {host}, 'basic', 2, direction='below', alert_window='last_15m', interval=60, count_default_zero='true') >= 1", + "tags": [ + "integration:apache" + ], + "type": "query alert" + } +} diff --git a/apache/assets/monitors/high_keep_alive_and_cpu.json b/apache/assets/monitors/high_keep_alive_and_cpu.json new file mode 100644 index 0000000000000..9509de1b79c4f --- /dev/null +++ b/apache/assets/monitors/high_keep_alive_and_cpu.json @@ -0,0 +1,41 @@ +{ + "version": 2, + "created_at": "2021-02-23", + "last_updated_at": "2021-02-23", + "title": "resource utilization", + "tags": [ + "integration:apache" + ], + "description": "Notify your team when the number of keep-alive async connections and the CPU load are both running high.", + "definition": { + "message": "High number of keep-alive async connections, combined with high CPU. 
You may want to lower the maximum number of simultaneous connections to the server (MaxRequestWorkers), and/or decrease the KeepAliveTimeout to avoid holding connections open longer than necessary.\n\n", + "name": "[Apache] resource utilization", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": true, + "threshold_windows": { + "recovery_window": "last_15m", + "trigger_window": "last_15m" + }, + "thresholds": { + "critical": 1, + "critical_recovery": 0, + "warning": 0.8, + "warning_recovery": 0.6 + }, + "timeout_h": 0 + }, + "query": "avg(last_4h):anomalies(+ avg:apache.performance.cpu_load{*} by {host}, 'agile', 2, direction='above', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='hourly') >= 1", + "tags": [ + "integration:apache" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/apache/assets/recommended_monitors/apache_low_idle_workers.json b/apache/assets/recommended_monitors/apache_low_idle_workers.json deleted file mode 100644 index e3f9b57b736d5..0000000000000 --- a/apache/assets/recommended_monitors/apache_low_idle_workers.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "name": "[Apache] Low number of idle workers", - "type": "query alert", - "query": "avg(last_4h):anomalies(avg:apache.performance.idle_workers{*} by {host}, 'basic', 2, direction='below', alert_window='last_15m', interval=60, count_default_zero='true') >= 1", - "message": "The number of idle workers is abnormally low: {{value}}. You may see slower request processing times.", - "tags": [ - "integration:apache" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1, - "warning": 0.8, - "critical_recovery": 0.3 - }, - "threshold_windows": { - "trigger_window": "last_5m", - "recovery_window": "last_5m" - } - }, - "recommended_monitor_metadata": { - "description": "Notify your team when the number of idle workers is running low." - } -} diff --git a/apache/assets/recommended_monitors/high_keep_alive_and_cpu.json b/apache/assets/recommended_monitors/high_keep_alive_and_cpu.json deleted file mode 100644 index f26974b79bbf7..0000000000000 --- a/apache/assets/recommended_monitors/high_keep_alive_and_cpu.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "name": "[Apache] resource utilization", - "type": "query alert", - "query": "avg(last_4h):anomalies(+ avg:apache.performance.cpu_load{*} by {host}, 'agile', 2, direction='above', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='hourly') >= 1", - "message": "High number of keep-alive async connections, combined with high CPU. 
You may want to lower the maximum number of simultaneous connections to the server (MaxRequestWorkers), and/or decrease the KeepAliveTimeout to avoid holding connections open longer than necessary.\n\n", - "tags": [ - "integration:apache" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1, - "warning": 0.8, - "critical_recovery": 0, - "warning_recovery": 0.6 - }, - "threshold_windows": { - "trigger_window": "last_15m", - "recovery_window": "last_15m" - } - }, - "recommended_monitor_metadata": { - "description": "Notify your team when the number of keep-alive async connections and the CPU load are both running high." - } -} diff --git a/apache/manifest.json b/apache/manifest.json index d2842a446ec76..a4142706303c5 100644 --- a/apache/manifest.json +++ b/apache/manifest.json @@ -52,8 +52,8 @@ "apache": "assets/dashboards/apache_dashboard.json" }, "monitors": { - "[Apache] Low number of idle workers": "assets/recommended_monitors/apache_low_idle_workers.json", - "[Apache] resource utilization": "assets/recommended_monitors/high_keep_alive_and_cpu.json" + "[Apache] Low number of idle workers": "assets/monitors/apache_low_idle_workers.json", + "[Apache] resource utilization": "assets/monitors/high_keep_alive_and_cpu.json" }, "saved_views": { "4xx_errors": "assets/saved_views/4xx_errors.json", diff --git a/arangodb/assets/monitors/high_server_kernel_mode.json b/arangodb/assets/monitors/high_server_kernel_mode.json new file mode 100644 index 0000000000000..51f465fe286c9 --- /dev/null +++ b/arangodb/assets/monitors/high_server_kernel_mode.json @@ -0,0 +1,38 @@ +{ + "version": 2, + "created_at": "2022-06-01", + "last_updated_at": "2022-06-01", + "title": "High server Kernel mode percentage usage", + "tags": [ + "integration:arangodb" + ], + "description": "Notifies when ArangoDB's server Kernel mode usage is higher than usual", + "definition": { + "message": "{{#is_warning}}Kernel mode usage on server is more than 70% on host {{arangodb_host.name}}{{/is_warning}} \\n\n{{#is_alert}}Kernel mode usage on server is more than 80% on host {{arangodb_host.name}}{{/is_alert}}\\n\\n\n{{#is_recovery}}Kernel mode usage on server is less than 70% on host {{arangodb_host.name}}{{/is_recovery}}", + "name": "[ArangoDB] High server Kernel mode percentage usage", + "options": { + "avalanche_window": 10, + "include_tags": true, + "new_host_delay": 300, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "silenced": {}, + "threshold_windows": { + "recovery_window": "last_15m", + "trigger_window": "last_15m" + }, + "thresholds": { + "critical": 0.8, + "critical_recovery": 0, + "warning": 0.7 + } + }, + "query": "avg(last_4h):anomalies(avg:arangodb.server.kernel_mode.percent{*}, 'basic', 2, direction='both', interval=60, alert_window='last_15m', count_default_zero='true') >= 0.8", + "tags": [ + "integration:arangodb" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/arangodb/assets/monitors/high_server_user_mode.json b/arangodb/assets/monitors/high_server_user_mode.json new file mode 100644 index 0000000000000..e2abf6bc0dfa0 --- /dev/null +++ b/arangodb/assets/monitors/high_server_user_mode.json @@ -0,0 +1,38 @@ +{ + "version": 2, + "created_at": "2022-06-01", + "last_updated_at": 
"2022-06-01", + "title": "High server User mode percentage usage", + "tags": [ + "integration:arangodb" + ], + "description": "Notifies when ArangoDB's server User mode usage is higher than usual", + "definition": { + "message": "{{#is_warning}}User mode usage on server is more than 70% on host {{arangodb_host.name}}{{/is_warning}} \\n\n{{#is_alert}}User mode usage on server is more than 80% on host {{arangodb_host.name}}{{/is_alert}}\\n\\n\n{{#is_recovery}}User mode usage on server is less than 70% on host {{arangodb_host.name}}{{/is_recovery}}", + "name": "[ArangoDB] High server User mode percentage usage", + "options": { + "avalanche_window": 10, + "include_tags": true, + "new_host_delay": 300, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "silenced": {}, + "threshold_windows": { + "recovery_window": "last_15m", + "trigger_window": "last_15m" + }, + "thresholds": { + "critical": 0.8, + "critical_recovery": 0, + "warning": 0.7 + } + }, + "query": "avg(last_4h):anomalies(avg:arangodb.server.user_mode.percent{*}, 'basic', 2, direction='both', interval=60, alert_window='last_15m', count_default_zero='true') >= 0.8", + "tags": [ + "integration:arangodb" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/arangodb/assets/recommended_monitors/high_server_kernel_mode.json b/arangodb/assets/recommended_monitors/high_server_kernel_mode.json deleted file mode 100644 index c9cdf47adda4c..0000000000000 --- a/arangodb/assets/recommended_monitors/high_server_kernel_mode.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "name": "[ArangoDB] High server Kernel mode percentage usage", - "type": "query alert", - "query": "avg(last_4h):anomalies(avg:arangodb.server.kernel_mode.percent{*}, 'basic', 2, direction='both', interval=60, alert_window='last_15m', count_default_zero='true') >= 0.8", - "message": "{{#is_warning}}Kernel mode usage on server is more than 70% on host {{arangodb_host.name}}{{/is_warning}} \\n\n{{#is_alert}}Kernel mode usage on server is more than 80% on host {{arangodb_host.name}}{{/is_alert}}\\n\\n\n{{#is_recovery}}Kernel mode usage on server is less than 70% on host {{arangodb_host.name}}{{/is_recovery}}", - "tags": [ - "integration:arangodb" - ], - "options": { - "notify_audit": false, - "silenced": {}, - "include_tags": true, - "thresholds": { - "critical": 0.8, - "warning": 0.7, - "critical_recovery": 0 - }, - "require_full_window": false, - "new_host_delay": 300, - "notify_no_data": false, - "renotify_interval": 0, - "avalanche_window": 10, - "threshold_windows": { - "recovery_window": "last_15m", - "trigger_window": "last_15m" - } - }, - "recommended_monitor_metadata": { - "description": "Notifies when ArangoDB's server Kernel mode usage is higher than usual" - } -} diff --git a/arangodb/assets/recommended_monitors/high_server_user_mode.json b/arangodb/assets/recommended_monitors/high_server_user_mode.json deleted file mode 100644 index fba02372af057..0000000000000 --- a/arangodb/assets/recommended_monitors/high_server_user_mode.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "name": "[ArangoDB] High server User mode percentage usage", - "type": "query alert", - "query": "avg(last_4h):anomalies(avg:arangodb.server.user_mode.percent{*}, 'basic', 2, direction='both', interval=60, alert_window='last_15m', count_default_zero='true') >= 0.8", - "message": "{{#is_warning}}User mode usage on server is more than 70% on host {{arangodb_host.name}}{{/is_warning}} \\n\n{{#is_alert}}User mode usage on server is more 
than 80% on host {{arangodb_host.name}}{{/is_alert}}\\n\\n\n{{#is_recovery}}User mode usage on server is less than 70% on host {{arangodb_host.name}}{{/is_recovery}}", - "tags": [ - "integration:arangodb" - ], - "options": { - "notify_audit": false, - "silenced": {}, - "include_tags": true, - "thresholds": { - "critical": 0.8, - "warning": 0.7, - "critical_recovery": 0 - }, - "require_full_window": false, - "new_host_delay": 300, - "notify_no_data": false, - "renotify_interval": 0, - "avalanche_window": 10, - "threshold_windows": { - "recovery_window": "last_15m", - "trigger_window": "last_15m" - } - }, - "recommended_monitor_metadata": { - "description": "Notifies when ArangoDB's server User mode usage is higher than usual" - } -} diff --git a/arangodb/manifest.json b/arangodb/manifest.json index 3308d5565c9a0..89eb62cac65cf 100644 --- a/arangodb/manifest.json +++ b/arangodb/manifest.json @@ -49,8 +49,8 @@ "ArangoDB Overview": "assets/dashboards/arangodb_overview.json" }, "monitors": { - "[ArangoDB] High server Kernel mode percentage usage": "assets/recommended_monitors/high_server_kernel_mode.json", - "[ArangoDB] High server User mode percentage usage": "assets/recommended_monitors/high_server_user_mode.json" + "[ArangoDB] High server Kernel mode percentage usage": "assets/monitors/high_server_kernel_mode.json", + "[ArangoDB] High server User mode percentage usage": "assets/monitors/high_server_user_mode.json" }, "logs": { "source": "arangodb" diff --git a/argocd/assets/monitors/application_sync_status.json b/argocd/assets/monitors/application_sync_status.json new file mode 100644 index 0000000000000..40d9ab37cfe5d --- /dev/null +++ b/argocd/assets/monitors/application_sync_status.json @@ -0,0 +1,32 @@ +{ + "version": 2, + "created_at": "2023-01-20", + "last_updated_at": "2023-01-20", + "title": "Application Sync Status", + "tags": [ + "integration:argocd" + ], + "description": "Notify your team when your applications are not synced in Argo CD", + "definition": { + "message": "{{#is_alert}}\nApplication {{name.name}} has been reporting with a sync_status:{{sync_status.name}} for the last 30 minutes.\n{{/is_alert}}", + "name": "[ArgoCD] Application Sync Status", + "options": { + "include_tags": true, + "new_group_delay": 60, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "thresholds": { + "critical": 1 + } + }, + "priority": null, + "query": "max(last_30m):default_zero(avg:argocd.app_controller.app.info{!sync_status:synced} by {sync_status,name}) >= 1", + "restricted_roles": null, + "tags": [ + "integration:argocd" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/argocd/assets/recommended_monitors/application_sync_status.json b/argocd/assets/recommended_monitors/application_sync_status.json deleted file mode 100644 index 76a473e7d8f43..0000000000000 --- a/argocd/assets/recommended_monitors/application_sync_status.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "name": "[ArgoCD] Application Sync Status", - "type": "query alert", - "query": "max(last_30m):default_zero(avg:argocd.app_controller.app.info{!sync_status:synced} by {sync_status,name}) >= 1", - "message": "{{#is_alert}}\nApplication {{name.name}} has been reporting with a sync_status:{{sync_status.name}} for the last 30 minutes.\n{{/is_alert}}", - "tags": [ - "integration:argocd" - ], - "options": { - "thresholds": { - "critical": 1 - }, - "notify_audit": false, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": 0, 
- "include_tags": true, - "new_group_delay": 60 - }, - "priority": null, - "restricted_roles": null, - "recommended_monitor_metadata": { - "description": "Notify your team when your applications are not synced in Argo CD" - } -} \ No newline at end of file diff --git a/argocd/manifest.json b/argocd/manifest.json index 36dae29fefca2..8ee6a2339d537 100644 --- a/argocd/manifest.json +++ b/argocd/manifest.json @@ -55,7 +55,7 @@ "Argo CD Overview": "assets/dashboards/argo_cd_overview.json" }, "monitors": { - "Sync Status": "assets/recommended_monitors/application_sync_status.json" + "Sync Status": "assets/monitors/application_sync_status.json" } }, "author": { diff --git a/avi_vantage/assets/monitors/error_rate_monitor.json b/avi_vantage/assets/monitors/error_rate_monitor.json index 29c597c519ca6..d271aaf556d23 100644 --- a/avi_vantage/assets/monitors/error_rate_monitor.json +++ b/avi_vantage/assets/monitors/error_rate_monitor.json @@ -1,28 +1,35 @@ { - "name": "[Avi Vantage] Virtual service {{virtualservice_name.name}} has a high number of errors", - "type": "query alert", - "query": "avg(last_5m):avg:avi_vantage.l7_client.pct_response_errors{*} by {virtualservice_name,host} > 70", - "message": "{{#is_alert}}\nVirtual service {{virtualservice_name.name}} is experiencing a very high number of errors.\n{{/is_alert}} \n\n{{#is_recovery}}\nError rate of virtual service {{virtualservice_name.name}} is back to a lower level.\n{{/is_recovery}} ", - "tags": [ - "integration:avi_vantage" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 70 - } - }, - "priority": null, - "recommended_monitor_metadata": { - "description": "Notify your team when one of Avi Virtual Service is experiencing a high percentage of errors." 
- } + "version": 2, + "created_at": "2021-08-05", + "last_updated_at": "2021-08-06", + "title": "Virtual service {{virtualservice_name.name}} has a high number of errors", + "tags": [ + "integration:avi-vantage" + ], + "description": "Notify your team when one of Avi Virtual Service is experiencing a high percentage of errors.", + "definition": { + "message": "{{#is_alert}}\nVirtual service {{virtualservice_name.name}} is experiencing a very high number of errors.\n{{/is_alert}} \n\n{{#is_recovery}}\nError rate of virtual service {{virtualservice_name.name}} is back to a lower level.\n{{/is_recovery}} ", + "name": "[Avi Vantage] Virtual service {{virtualservice_name.name}} has a high number of errors", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": true, + "thresholds": { + "critical": 70 + }, + "timeout_h": 0 + }, + "priority": null, + "query": "avg(last_5m):avg:avi_vantage.l7_client.pct_response_errors{*} by {virtualservice_name,host} > 70", + "tags": [ + "integration:avi_vantage" + ], + "type": "query alert" + } } \ No newline at end of file diff --git a/azure_iot_edge/assets/monitors/disk_usage.json b/azure_iot_edge/assets/monitors/disk_usage.json index b8ca32172a363..47b2cd8a06538 100644 --- a/azure_iot_edge/assets/monitors/disk_usage.json +++ b/azure_iot_edge/assets/monitors/disk_usage.json @@ -1,31 +1,38 @@ { - "name": "[Azure IoT Edge] IoT Edge device {{host}} is running out of available disk space", - "type": "query alert", - "query": "max(last_1h):avg:azure.iot_edge.edge_agent.available_disk_space_bytes{*} by {host} / avg:azure.iot_edge.edge_agent.total_disk_space_bytes{*} by {host}.rollup(max, 60) * 100 < 10", - "message": "Please check device {{host}}, as Edge Agent reports that available disk space has dropped below {{threshold}}%.", - "tags": [ - "integration:azure_iot_edge" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "silenced": {}, - "include_tags": true, - "no_data_timeframe": null, - "require_full_window": true, - "new_host_delay": 300, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "thresholds": { - "critical": 10, - "warning": 25, - "critical_recovery": 11, - "warning_recovery": 26 - } - }, - "recommended_monitor_metadata": { - "description": "Triggers an alert when an IoT Edge device is running out of available disk space" - } -} + "version": 2, + "created_at": "2020-10-22", + "last_updated_at": "2020-10-22", + "title": "IoT Edge device {{host}} is running out of available disk space", + "tags": [ + "integration:azure-iot-edge" + ], + "description": "Triggers an alert when an IoT Edge device is running out of available disk space", + "definition": { + "message": "Please check device {{host}}, as Edge Agent reports that available disk space has dropped below {{threshold}}%.", + "name": "[Azure IoT Edge] IoT Edge device {{host}} is running out of available disk space", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": true, + "silenced": {}, + "thresholds": { + "critical": 10, + "critical_recovery": 11, + "warning": 25, + "warning_recovery": 26 + }, + "timeout_h": 0 + }, + "query": 
"max(last_1h):avg:azure.iot_edge.edge_agent.available_disk_space_bytes{*} by {host} / avg:azure.iot_edge.edge_agent.total_disk_space_bytes{*} by {host}.rollup(max, 60) * 100 < 10", + "tags": [ + "integration:azure_iot_edge" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/azure_iot_edge/assets/monitors/edgehub_retries.json b/azure_iot_edge/assets/monitors/edgehub_retries.json index 7356b47c51145..dbea8e511cb07 100644 --- a/azure_iot_edge/assets/monitors/edgehub_retries.json +++ b/azure_iot_edge/assets/monitors/edgehub_retries.json @@ -1,32 +1,39 @@ { - "name": "[Azure IoT Edge] Rate of Edge Hub operations retries is higher than usual on device device {{host}}", - "type": "query alert", - "query": "avg(last_1h):anomalies(per_minute(avg:azure.iot_edge.edge_hub.operation_retry_total{*} by {host}), 'basic', 2, direction='above', alert_window='last_15m', interval=60, count_default_zero='true') >= 1", - "message": "Please check device {{host}}, as Edge Hub reports a rate of operation retries of {{value}} per minute, which is higher than usual.", - "tags": [ - "integration:azure_iot_edge" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1, - "critical_recovery": 0 - }, - "threshold_windows": { - "trigger_window": "last_15m", - "recovery_window": "last_15m" - } - }, - "recommended_monitor_metadata": { - "description": "Notifies when rate of Edge Hub operation retries is higher than usual" - } -} + "version": 2, + "created_at": "2020-10-22", + "last_updated_at": "2020-10-22", + "title": "Rate of Edge Hub operations retries is higher than usual on device device {{host}}", + "tags": [ + "integration:azure-iot-edge" + ], + "description": "Notifies when rate of Edge Hub operation retries is higher than usual", + "definition": { + "message": "Please check device {{host}}, as Edge Hub reports a rate of operation retries of {{value}} per minute, which is higher than usual.", + "name": "[Azure IoT Edge] Rate of Edge Hub operations retries is higher than usual on device device {{host}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "threshold_windows": { + "recovery_window": "last_15m", + "trigger_window": "last_15m" + }, + "thresholds": { + "critical": 1, + "critical_recovery": 0 + }, + "timeout_h": 0 + }, + "query": "avg(last_1h):anomalies(per_minute(avg:azure.iot_edge.edge_hub.operation_retry_total{*} by {host}), 'basic', 2, direction='above', alert_window='last_15m', interval=60, count_default_zero='true') >= 1", + "tags": [ + "integration:azure_iot_edge" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/azure_iot_edge/assets/monitors/iothub_syncs.json b/azure_iot_edge/assets/monitors/iothub_syncs.json index 65aaa4c6eb18c..c145213d2c246 100644 --- a/azure_iot_edge/assets/monitors/iothub_syncs.json +++ b/azure_iot_edge/assets/monitors/iothub_syncs.json @@ -1,32 +1,39 @@ { - "name": "[Azure IoT Edge] Rate of unsuccessful syncs with IoT Hub is higher than usual on device {{host}}", - "type": "query alert", - "query": 
"avg(last_1h):anomalies(per_minute(avg:azure.iot_edge.edge_agent.unsuccessful_iothub_syncs_total{*} by {host}), 'basic', 2, direction='above', alert_window='last_15m', interval=60, count_default_zero='true') >= 1", - "message": "Number of unsuccessful syncs between Edge Agent and IoT Hub on device {{host}} is at {{value}} per minute, which is higher than usual.", - "tags": [ - "integration:azure_iot_edge" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1, - "critical_recovery": 0 - }, - "threshold_windows": { - "trigger_window": "last_15m", - "recovery_window": "last_15m" - } - }, - "recommended_monitor_metadata": { - "description": "Notifies when unsuccessful syncs between Edge Agent and IoT Hub are higher than usual" - } -} + "version": 2, + "created_at": "2020-10-22", + "last_updated_at": "2020-10-22", + "title": "Rate of unsuccessful syncs with IoT Hub is higher than usual on device {{host}}", + "tags": [ + "integration:azure-iot-edge" + ], + "description": "Notifies when unsuccessful syncs between Edge Agent and IoT Hub are higher than usual", + "definition": { + "message": "Number of unsuccessful syncs between Edge Agent and IoT Hub on device {{host}} is at {{value}} per minute, which is higher than usual.", + "name": "[Azure IoT Edge] Rate of unsuccessful syncs with IoT Hub is higher than usual on device {{host}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "threshold_windows": { + "recovery_window": "last_15m", + "trigger_window": "last_15m" + }, + "thresholds": { + "critical": 1, + "critical_recovery": 0 + }, + "timeout_h": 0 + }, + "query": "avg(last_1h):anomalies(per_minute(avg:azure.iot_edge.edge_agent.unsuccessful_iothub_syncs_total{*} by {host}), 'basic', 2, direction='above', alert_window='last_15m', interval=60, count_default_zero='true') >= 1", + "tags": [ + "integration:azure_iot_edge" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/azure_iot_edge/assets/monitors/memory_usage.json b/azure_iot_edge/assets/monitors/memory_usage.json index b40035c85a9c8..30a754a59a216 100644 --- a/azure_iot_edge/assets/monitors/memory_usage.json +++ b/azure_iot_edge/assets/monitors/memory_usage.json @@ -1,31 +1,38 @@ { - "name": "[Azure IoT Edge] IoT Edge device {{host}} is running out of memory", - "type": "query alert", - "query": "max(last_1h):avg:azure.iot_edge.edge_agent.used_memory_bytes{*} by {host} / avg:azure.iot_edge.edge_agent.total_memory_bytes{*} by {host}.rollup(max, 60) * 100 > 80", - "message": "Please check device {{host}}, as Edge Agent reports usage of more than {{threshold}}% of available RAM for the last hour.", - "tags": [ - "integration:azure_iot_edge" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "silenced": {}, - "include_tags": true, - "no_data_timeframe": null, - "require_full_window": true, - "new_host_delay": 300, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "thresholds": { - "critical": 80, - "warning": 65, - "critical_recovery": 79, - "warning_recovery": 64 - } - }, - "recommended_monitor_metadata": { 
- "description": "Triggers an alert when an IoT Edge device is running out of memory" - } -} + "version": 2, + "created_at": "2020-10-22", + "last_updated_at": "2020-10-22", + "title": "IoT Edge device {{host}} is running out of memory", + "tags": [ + "integration:azure-iot-edge" + ], + "description": "Triggers an alert when an IoT Edge device is running out of memory", + "definition": { + "message": "Please check device {{host}}, as Edge Agent reports usage of more than {{threshold}}% of available RAM for the last hour.", + "name": "[Azure IoT Edge] IoT Edge device {{host}} is running out of memory", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": true, + "silenced": {}, + "thresholds": { + "critical": 80, + "critical_recovery": 79, + "warning": 65, + "warning_recovery": 64 + }, + "timeout_h": 0 + }, + "query": "max(last_1h):avg:azure.iot_edge.edge_agent.used_memory_bytes{*} by {host} / avg:azure.iot_edge.edge_agent.total_memory_bytes{*} by {host}.rollup(max, 60) * 100 > 80", + "tags": [ + "integration:azure_iot_edge" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/boundary/assets/monitors/active_connections.json b/boundary/assets/monitors/active_connections.json index 62ef70ec7cd49..b729bd5039698 100644 --- a/boundary/assets/monitors/active_connections.json +++ b/boundary/assets/monitors/active_connections.json @@ -1,28 +1,35 @@ { + "version": 2, + "created_at": "2022-09-14", + "last_updated_at": "2022-09-14", + "title": "High active connections", + "tags": [ + "integration:boundary" + ], + "description": "Notify your team when there is a high number of active connections.", + "definition": { + "message": "There is a high number ({{value}}) of active connections.", "name": "[Boundary] High active connections", - "type": "query alert", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": true, + "thresholds": { + "critical": 90, + "warning": 70 + }, + "timeout_h": 0 + }, "query": "avg(last_5m):boundary.worker.proxy.websocket.active_connections{*} by {endpoint} > 90", - "message": "There is a high number ({{value}}) of active connections.", "tags": [ - "integration:boundary" + "integration:boundary" ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 90, - "warning": 70 - } - }, - "recommended_monitor_metadata": { - "description": "Notify your team when there is a high number of active connections." 
- } -} + "type": "query alert" + } +} \ No newline at end of file diff --git a/calico/assets/monitors/dataplane_failures.json b/calico/assets/monitors/dataplane_failures.json index 4ff3f9940046a..d625489617fbb 100644 --- a/calico/assets/monitors/dataplane_failures.json +++ b/calico/assets/monitors/dataplane_failures.json @@ -1,28 +1,37 @@ { - "name": "[Calico] dataplane failure", - "type": "query alert", - "query": "sum(last_5m):avg:calico.felix.int_dataplane_failures{*}.as_count() > 1", - "message": "Calico encountered a problem with dataplane.", - "tags": ["integration:calico"], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": false, - "notify_no_data": true, - "renotify_interval": "0", - "evaluation_delay": 10, - "escalation_message": "", - "no_data_timeframe": 10, - "include_tags": true, - "thresholds": { - "critical": 1, - "warning": 0 - } - }, - "priority": null, - "recommended_monitor_metadata": { - "description": "Get notified when Calico encounters a dataplane failure." + "version": 2, + "created_at": "2022-03-18", + "last_updated_at": "2022-03-18", + "title": "dataplane failure", + "tags": [ + "integration:calico" + ], + "description": "Get notified when Calico encounters a dataplane failure.", + "definition": { + "message": "Calico encountered a problem with dataplane.", + "name": "[Calico] dataplane failure", + "options": { + "escalation_message": "", + "evaluation_delay": 10, + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": 10, + "notify_audit": false, + "notify_no_data": true, + "renotify_interval": "0", + "require_full_window": false, + "thresholds": { + "critical": 1, + "warning": 0 + }, + "timeout_h": 0 + }, + "priority": null, + "query": "sum(last_5m):avg:calico.felix.int_dataplane_failures{*}.as_count() > 1", + "tags": [ + "integration:calico" + ], + "type": "query alert" } -} +} \ No newline at end of file diff --git a/calico/assets/monitors/ipset_error.json b/calico/assets/monitors/ipset_error.json index 5bc0363efb096..4b45eae4d272b 100644 --- a/calico/assets/monitors/ipset_error.json +++ b/calico/assets/monitors/ipset_error.json @@ -1,28 +1,37 @@ { - "name": "[Calico] error with ipsets", - "type": "query alert", - "query": "sum(last_5m):avg:calico.felix.ipset.errors{*}.as_count() > 1", - "message": "Calico encountered a problem applying networking rules using ipset.", - "tags": ["integration:calico"], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": false, - "notify_no_data": true, - "renotify_interval": "0", - "evaluation_delay": 10, - "escalation_message": "", - "no_data_timeframe": 10, - "include_tags": true, - "thresholds": { - "critical": 1, - "warning": 0 - } - }, - "priority": null, - "recommended_monitor_metadata": { - "description": "Get notified when Calico fails to apply networking rules using ipset." 
+ "version": 2, + "created_at": "2022-03-18", + "last_updated_at": "2022-03-18", + "title": "error with ipsets", + "tags": [ + "integration:calico" + ], + "description": "Get notified when Calico fails to apply networking rules using ipset.", + "definition": { + "message": "Calico encountered a problem applying networking rules using ipset.", + "name": "[Calico] error with ipsets", + "options": { + "escalation_message": "", + "evaluation_delay": 10, + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": 10, + "notify_audit": false, + "notify_no_data": true, + "renotify_interval": "0", + "require_full_window": false, + "thresholds": { + "critical": 1, + "warning": 0 + }, + "timeout_h": 0 + }, + "priority": null, + "query": "sum(last_5m):avg:calico.felix.ipset.errors{*}.as_count() > 1", + "tags": [ + "integration:calico" + ], + "type": "query alert" } -} +} \ No newline at end of file diff --git a/calico/assets/monitors/iptables_restore_errors.json b/calico/assets/monitors/iptables_restore_errors.json index d42995bbd3cf2..253a33d45ad12 100644 --- a/calico/assets/monitors/iptables_restore_errors.json +++ b/calico/assets/monitors/iptables_restore_errors.json @@ -1,28 +1,37 @@ { - "name": "[Calico] error with iptables restore", - "type": "query alert", - "query": "sum(last_5m):avg:calico.felix.iptables.restore_errors{*}.as_count() > 1", - "message": "Calico encountered a problem while restoring iptables.", - "tags": ["integration:calico"], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": false, - "notify_no_data": true, - "renotify_interval": "0", - "evaluation_delay": 10, - "escalation_message": "", - "no_data_timeframe": 10, - "include_tags": true, - "thresholds": { - "critical": 1, - "warning": 0 - } - }, - "priority": null, - "recommended_monitor_metadata": { - "description": "Get notified when Calico fails to restore iptables." 
+ "version": 2, + "created_at": "2022-03-18", + "last_updated_at": "2022-03-18", + "title": "error with iptables restore", + "tags": [ + "integration:calico" + ], + "description": "Get notified when Calico fails to restore iptables.", + "definition": { + "message": "Calico encountered a problem while restoring iptables.", + "name": "[Calico] error with iptables restore", + "options": { + "escalation_message": "", + "evaluation_delay": 10, + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": 10, + "notify_audit": false, + "notify_no_data": true, + "renotify_interval": "0", + "require_full_window": false, + "thresholds": { + "critical": 1, + "warning": 0 + }, + "timeout_h": 0 + }, + "priority": null, + "query": "sum(last_5m):avg:calico.felix.iptables.restore_errors{*}.as_count() > 1", + "tags": [ + "integration:calico" + ], + "type": "query alert" } -} +} \ No newline at end of file diff --git a/calico/assets/monitors/iptables_save_errors.json b/calico/assets/monitors/iptables_save_errors.json index 66481f86b9014..33c91936a83b0 100644 --- a/calico/assets/monitors/iptables_save_errors.json +++ b/calico/assets/monitors/iptables_save_errors.json @@ -1,28 +1,37 @@ { - "name": "[Calico] error with iptables save", - "type": "query alert", - "query": "sum(last_5m):avg:calico.felix.iptables.save_errors{*}.as_count() > 1", - "message": "Calico encountered a problem while saving iptables rules.", - "tags": ["integration:calico"], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": false, - "notify_no_data": true, - "renotify_interval": "0", - "evaluation_delay": 10, - "escalation_message": "", - "no_data_timeframe": 10, - "include_tags": true, - "thresholds": { - "critical": 1, - "warning": 0 - } - }, - "priority": null, - "recommended_monitor_metadata": { - "description": "Get notified when Calico fails to save iptables rules." 
+ "version": 2, + "created_at": "2022-03-18", + "last_updated_at": "2022-03-18", + "title": "error with iptables save", + "tags": [ + "integration:calico" + ], + "description": "Get notified when Calico fails to save iptables rules.", + "definition": { + "message": "Calico encountered a problem while saving iptables rules.", + "name": "[Calico] error with iptables save", + "options": { + "escalation_message": "", + "evaluation_delay": 10, + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": 10, + "notify_audit": false, + "notify_no_data": true, + "renotify_interval": "0", + "require_full_window": false, + "thresholds": { + "critical": 1, + "warning": 0 + }, + "timeout_h": 0 + }, + "priority": null, + "query": "sum(last_5m):avg:calico.felix.iptables.save_errors{*}.as_count() > 1", + "tags": [ + "integration:calico" + ], + "type": "query alert" } -} +} \ No newline at end of file diff --git a/calico/manifest.json b/calico/manifest.json index 4a4597340402b..cdfb5646d7ebf 100644 --- a/calico/manifest.json +++ b/calico/manifest.json @@ -55,10 +55,10 @@ "source": "calico" }, "monitors": { - "[calico] monitor ipsets error": "./assets/monitors/ipset_error.json", - "[calico] monitor iptables save errors": "./assets/monitors/iptables_save_errors.json", - "[calico] monitor iptables restore errors": "./assets/monitors/iptables_restore_errors.json", - "[calico] monitor dataplane failures": "./assets/monitors/dataplane_failures.json" + "[calico] monitor ipsets error": "assets/monitors/ipset_error.json", + "[calico] monitor iptables save errors": "assets/monitors/iptables_save_errors.json", + "[calico] monitor iptables restore errors": "assets/monitors/iptables_restore_errors.json", + "[calico] monitor dataplane failures": "assets/monitors/dataplane_failures.json" } } } \ No newline at end of file diff --git a/citrix_hypervisor/assets/monitors/host_cpu_high.json b/citrix_hypervisor/assets/monitors/host_cpu_high.json new file mode 100644 index 0000000000000..a9680e267616d --- /dev/null +++ b/citrix_hypervisor/assets/monitors/host_cpu_high.json @@ -0,0 +1,35 @@ +{ + "version": 2, + "created_at": "2021-09-24", + "last_updated_at": "2021-09-24", + "title": "CPU load is high on host {{citrix_hypervisor_host.name}}", + "tags": [ + "integration:citrix-hypervisor" + ], + "description": "Get notified when Citrix Hypervisor CPU usage is high.", + "definition": { + "message": "{{#is_warning}}CPU is over 80% on host {{citrix_hypervisor_host.name}}{{/is_warning}} \n{{#is_alert}}CPU is over 80% on VM {{citrix_hypervisor_host.name}}{{/is_alert}}\n\n{{#is_recovery}}CPU load is less than 80% on VM {{citrix_hypervisor_host.name}} {{/is_recovery}} ", + "name": "[Citrix Hypervisor] CPU load is high on host {{citrix_hypervisor_host.name}} ", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_group_delay": 300, + "no_data_timeframe": null, + "notify_audit": true, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": false, + "thresholds": { + "critical": 90, + "warning": 80 + }, + "timeout_h": 0 + }, + "query": "avg(last_5m):avg:citrix_hypervisor.host.cpu{*} by {citrix_hypervisor_host} > 90", + "tags": [ + "integration:citrix_hypervisor" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/citrix_hypervisor/assets/monitors/vm_cpu_high.json b/citrix_hypervisor/assets/monitors/vm_cpu_high.json new file mode 100644 index 0000000000000..ec2949666fc1a --- /dev/null +++ 
b/citrix_hypervisor/assets/monitors/vm_cpu_high.json @@ -0,0 +1,35 @@ +{ + "version": 2, + "created_at": "2021-09-24", + "last_updated_at": "2021-09-24", + "title": "CPU load is high on VM {{citrix_hypervisor_vm.name}}", + "tags": [ + "integration:citrix-hypervisor" + ], + "description": "Get notified when Citrix Hypervisor VMs CPU usage is high.", + "definition": { + "message": "{{#is_warning}}CPU is over 80% on VM {{citrix_hypervisor_vm.name}}{{/is_warning}} \n{{#is_alert}}CPU is over 80% on VM {{citrix_hypervisor_vm.name}}{{/is_alert}}\n\n{{#is_recovery}}CPU load is less than 80% on VM {{citrix_hypervisor_vm.name}} {{/is_recovery}} ", + "name": "[Citrix Hypervisor] CPU load is high on VM {{citrix_hypervisor_vm.name}} ", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_group_delay": 300, + "no_data_timeframe": null, + "notify_audit": true, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": false, + "thresholds": { + "critical": 90, + "warning": 80 + }, + "timeout_h": 0 + }, + "query": "avg(last_5m):avg:citrix_hypervisor.vm.cpu{*} by {citrix_hypervisor_vm} > 90", + "tags": [ + "integration:citrix_hypervisor" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/citrix_hypervisor/assets/recommended_monitors/host_cpu_high.json b/citrix_hypervisor/assets/recommended_monitors/host_cpu_high.json deleted file mode 100644 index b00878de0e38a..0000000000000 --- a/citrix_hypervisor/assets/recommended_monitors/host_cpu_high.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "name": "[Citrix Hypervisor] CPU load is high on host {{citrix_hypervisor_host.name}} ", - "type": "query alert", - "query": "avg(last_5m):avg:citrix_hypervisor.host.cpu{*} by {citrix_hypervisor_host} > 90", - "message": "{{#is_warning}}CPU is over 80% on host {{citrix_hypervisor_host.name}}{{/is_warning}} \n{{#is_alert}}CPU is over 80% on VM {{citrix_hypervisor_host.name}}{{/is_alert}}\n\n{{#is_recovery}}CPU load is less than 80% on VM {{citrix_hypervisor_host.name}} {{/is_recovery}} ", - "tags": [ - "integration:citrix_hypervisor" - ], - "options": { - "notify_audit": true, - "locked": false, - "timeout_h": 0, - "new_group_delay": 300, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 90, - "warning": 80 - } - }, - "recommended_monitor_metadata": { - "description": "Get notified when Citrix Hypervisor CPU usage is high." 
- } -} diff --git a/citrix_hypervisor/assets/recommended_monitors/vm_cpu_high.json b/citrix_hypervisor/assets/recommended_monitors/vm_cpu_high.json deleted file mode 100644 index c72964f850272..0000000000000 --- a/citrix_hypervisor/assets/recommended_monitors/vm_cpu_high.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "name": "[Citrix Hypervisor] CPU load is high on VM {{citrix_hypervisor_vm.name}} ", - "type": "query alert", - "query": "avg(last_5m):avg:citrix_hypervisor.vm.cpu{*} by {citrix_hypervisor_vm} > 90", - "message": "{{#is_warning}}CPU is over 80% on VM {{citrix_hypervisor_vm.name}}{{/is_warning}} \n{{#is_alert}}CPU is over 80% on VM {{citrix_hypervisor_vm.name}}{{/is_alert}}\n\n{{#is_recovery}}CPU load is less than 80% on VM {{citrix_hypervisor_vm.name}} {{/is_recovery}} ", - "tags": [ - "integration:citrix_hypervisor" - ], - "options": { - "notify_audit": true, - "locked": false, - "timeout_h": 0, - "new_group_delay": 300, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 90, - "warning": 80 - } - }, - "recommended_monitor_metadata": { - "description": "Get notified when Citrix Hypervisor VMs CPU usage is high." - } -} diff --git a/citrix_hypervisor/manifest.json b/citrix_hypervisor/manifest.json index 0dcfff1658abb..4681c9c54d4d9 100644 --- a/citrix_hypervisor/manifest.json +++ b/citrix_hypervisor/manifest.json @@ -45,8 +45,8 @@ } }, "monitors": { - "VM CPU high": "assets/recommended_monitors/vm_cpu_high.json", - "Host CPU high": "assets/recommended_monitors/host_cpu_high.json" + "VM CPU high": "assets/monitors/vm_cpu_high.json", + "Host CPU high": "assets/monitors/host_cpu_high.json" }, "logs": { "source": "citrix_hypervisor" diff --git a/cloudera/assets/monitors/cloudera_high_cpu.json b/cloudera/assets/monitors/cloudera_high_cpu.json new file mode 100644 index 0000000000000..667a6c957d801 --- /dev/null +++ b/cloudera/assets/monitors/cloudera_high_cpu.json @@ -0,0 +1,30 @@ +{ + "version": 2, + "created_at": "2023-01-30", + "last_updated_at": "2023-01-30", + "title": "High CPU % usage across hosts for {{cluster.name}}", + "tags": [ + "integration:cloudera" + ], + "description": "Notify your team when there has been high CPU % usage.", + "definition": { + "message": "The CPU % usage across hosts for {{cluster.name}} is high. Please check to see what the issue is.", + "name": "[Cloudera] High CPU % usage across hosts for {{cluster.name}}", + "options": { + "include_tags": false, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "thresholds": { + "critical": 50, + "warning": 40 + } + }, + "query": "avg(last_5m):avg:cloudera.cluster.cpu_percent_across_hosts{*} > 50", + "tags": [ + "integration:cloudera" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/cloudera/assets/recommended_monitors/cloudera_high_cpu.json b/cloudera/assets/recommended_monitors/cloudera_high_cpu.json deleted file mode 100644 index b2c52fbdd1327..0000000000000 --- a/cloudera/assets/recommended_monitors/cloudera_high_cpu.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "name": "[Cloudera] High CPU % usage across hosts for {{cluster.name}}", - "type": "query alert", - "query": "avg(last_5m):avg:cloudera.cluster.cpu_percent_across_hosts{*} > 50", - "message": "The CPU % usage across hosts for {{cluster.name}} is high. 
Please check to see what the issue is.", - "tags": [ - "integration:cloudera" - ], - "options": { - "thresholds": { - "critical": 50, - "warning": 40 - }, - "notify_audit": false, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": 0, - "include_tags": false - }, - "recommended_monitor_metadata": { - "description": "Notify your team when there has been high CPU % usage." - } - -} diff --git a/cloudera/manifest.json b/cloudera/manifest.json index 35bb1f3cb1961..3a0e6a26c75d4 100644 --- a/cloudera/manifest.json +++ b/cloudera/manifest.json @@ -43,7 +43,7 @@ "Cloudera Data Platform Overview": "assets/dashboards/cloudera_overview.json" }, "monitors": { - "Cloudera High CPU Usage": "assets/recommended_monitors/cloudera_high_cpu.json" + "Cloudera High CPU Usage": "assets/monitors/cloudera_high_cpu.json" } }, "author": { diff --git a/confluent_platform/assets/monitors/unclean_leader_election.json b/confluent_platform/assets/monitors/unclean_leader_election.json index 7c2ddd7ac8750..4779f113a46ea 100644 --- a/confluent_platform/assets/monitors/unclean_leader_election.json +++ b/confluent_platform/assets/monitors/unclean_leader_election.json @@ -1,28 +1,35 @@ { - "name": "[Confluent Platform] Unclean leader election", - "type": "query alert", - "query": "avg(last_5m):avg:confluent.kafka.controller.unclean_leader_elections_per_sec.rate{*} by {instance} > 0", - "message": "An unclean leader election has occurred in your cluster. This is an indication of potential data loss. \n\nIf this unclean leader election was not intentional, consider disabling unclean leader election in your Broker settings.", - "tags": [ - "integration:confluent_platform" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "silenced": {}, - "include_tags": true, - "no_data_timeframe": null, - "require_full_window": true, - "new_host_delay": 300, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "thresholds": { - "critical": 0 - } - }, - "recommended_monitor_metadata": { - "description": "Get notified if an unclean leader election has taken place." - } -} + "version": 2, + "created_at": "2021-10-28", + "last_updated_at": "2021-10-28", + "title": "Unclean leader election", + "tags": [ + "integration:confluent-platform" + ], + "description": "Get notified if an unclean leader election has taken place.", + "definition": { + "message": "An unclean leader election has occurred in your cluster. This is an indication of potential data loss. 
\n\nIf this unclean leader election was not intentional, consider disabling unclean leader election in your Broker settings.", + "name": "[Confluent Platform] Unclean leader election", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": true, + "silenced": {}, + "thresholds": { + "critical": 0 + }, + "timeout_h": 0 + }, + "query": "avg(last_5m):avg:confluent.kafka.controller.unclean_leader_elections_per_sec.rate{*} by {instance} > 0", + "tags": [ + "integration:confluent_platform" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/confluent_platform/assets/monitors/unused_partition.json b/confluent_platform/assets/monitors/unused_partition.json index 215cb898d1c86..2f82fd953f028 100644 --- a/confluent_platform/assets/monitors/unused_partition.json +++ b/confluent_platform/assets/monitors/unused_partition.json @@ -1,27 +1,34 @@ { - "name": "[Confluent Platform] Unused topic partition", - "type": "query alert", - "query": "avg(last_5m):avg:confluent.kafka.server.topic.bytes_in_per_sec.rate{*} by {topic-partition} + avg:confluent.kafka.server.topic.bytes_out_per_sec.rate{*} by {topic-partition} <= 0", - "message": "The partition {{topic-partition.name}} appears to be unused. \n\nVerify that this is intentional and be aware that unused partitions increase Broker overhead, reducing efficiency. \n\nDeleting unused partitions is recommended.", - "tags": [ - "integration:confluent_platform" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 0 - } - }, - "recommended_monitor_metadata": { - "description": "Get notified if a partition has not seen bytes produced or consumed." + "version": 2, + "created_at": "2021-10-28", + "last_updated_at": "2021-10-28", + "title": "Unused topic partition", + "tags": [ + "integration:confluent-platform" + ], + "description": "Get notified if a partition has not seen bytes produced or consumed.", + "definition": { + "message": "The partition {{topic-partition.name}} appears to be unused. \n\nVerify that this is intentional and be aware that unused partitions increase Broker overhead, reducing efficiency. 
\n\nDeleting unused partitions is recommended.", + "name": "[Confluent Platform] Unused topic partition", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": true, + "thresholds": { + "critical": 0 + }, + "timeout_h": 0 + }, + "query": "avg(last_5m):avg:confluent.kafka.server.topic.bytes_in_per_sec.rate{*} by {topic-partition} + avg:confluent.kafka.server.topic.bytes_out_per_sec.rate{*} by {topic-partition} <= 0", + "tags": [ + "integration:confluent_platform" + ], + "type": "query alert" } -} +} \ No newline at end of file diff --git a/coredns/assets/monitors/coredns_cache_hits_low.json b/coredns/assets/monitors/coredns_cache_hits_low.json index 013d00a60ecdd..03c3a7b566652 100644 --- a/coredns/assets/monitors/coredns_cache_hits_low.json +++ b/coredns/assets/monitors/coredns_cache_hits_low.json @@ -1,32 +1,39 @@ { - "name": "[CoreDNS] Cache hits count is low", - "type": "query alert", - "query": "avg(last_4h):anomalies(avg:coredns.cache_hits_count{*}.as_count(), 'basic', 2, direction='below', alert_window='last_15m', interval=60, count_default_zero='true') >= 1", - "message": "The number of cache hits is lower than usual", - "tags": [ - "integration:coredns" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1, - "critical_recovery": 0 - }, - "threshold_windows": { - "trigger_window": "last_15m", - "recovery_window": "last_15m" - } - }, - "recommended_monitor_metadata": { - "description": "Notify your team when the number of cache hits is lower than usual." 
- } -} + "version": 2, + "created_at": "2021-02-26", + "last_updated_at": "2021-02-26", + "title": "Cache hits count is low", + "tags": [ + "integration:coredns" + ], + "description": "Notify your team when the number of cache hits is lower than usual.", + "definition": { + "message": "The number of cache hits is lower than usual", + "name": "[CoreDNS] Cache hits count is low", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": true, + "threshold_windows": { + "recovery_window": "last_15m", + "trigger_window": "last_15m" + }, + "thresholds": { + "critical": 1, + "critical_recovery": 0 + }, + "timeout_h": 0 + }, + "query": "avg(last_4h):anomalies(avg:coredns.cache_hits_count{*}.as_count(), 'basic', 2, direction='below', alert_window='last_15m', interval=60, count_default_zero='true') >= 1", + "tags": [ + "integration:coredns" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/coredns/assets/monitors/coredns_request_duration_high.json b/coredns/assets/monitors/coredns_request_duration_high.json index f23b6a77715ec..a91a7480beb04 100644 --- a/coredns/assets/monitors/coredns_request_duration_high.json +++ b/coredns/assets/monitors/coredns_request_duration_high.json @@ -1,28 +1,35 @@ { - "name": "[CoreDNS] Request duration is high on {{host.name}}", - "type": "query alert", - "query": "sum(last_5m):avg:coredns.request_duration.seconds{*}.as_count() >= 0.02", - "message": "Request duration process is high on {{host.name}} ({{value}} s)", - "tags": [ - "integration:coredns" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 0.02, - "warning": 0.01 - } - }, - "recommended_monitor_metadata": { - "description": "Notify your team when the request duration is too high." 
- } -} + "version": 2, + "created_at": "2021-02-26", + "last_updated_at": "2021-02-26", + "title": "Request duration is high on {{host.name}}", + "tags": [ + "integration:coredns" + ], + "description": "Notify your team when the request duration is too high.", + "definition": { + "message": "Request duration process is high on {{host.name}} ({{value}} s)", + "name": "[CoreDNS] Request duration is high on {{host.name}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": false, + "thresholds": { + "critical": 0.02, + "warning": 0.01 + }, + "timeout_h": 0 + }, + "query": "sum(last_5m):avg:coredns.request_duration.seconds{*}.as_count() >= 0.02", + "tags": [ + "integration:coredns" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/dcgm/assets/monitors/gpu_temperature.json b/dcgm/assets/monitors/gpu_temperature.json index 8b7a28e80b126..a3f3e68079009 100644 --- a/dcgm/assets/monitors/gpu_temperature.json +++ b/dcgm/assets/monitors/gpu_temperature.json @@ -1,29 +1,36 @@ { - "name": "🌡️ DCGM: GPU Temperature is High", - "type": "query alert", - "query": "avg(last_5m):avg:dcgm.temperature{*} by {host} > 85", + "version": 2, + "created_at": "2023-07-06", + "last_updated_at": "2023-07-06", + "title": "🌡️ DCGM: GPU Temperature is High", + "tags": [ + "integration:dcgm" + ], + "description": "Notify your team when a GPU's temperature is too high.", + "definition": { "message": "{{#is_alert}}\nYour GPU on {{host.name}} is running hot, please check it.\n{{/is_alert}}", - "tags": [ - "integration:dcgm" - ], + "name": "🌡️ DCGM: GPU Temperature is High", "options": { - "thresholds": { - "critical": 85 - }, - "notify_audit": false, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": 0, - "include_tags": true, - "new_group_delay": 60, - "avalanche_window": 10, - "silenced": { - "*": null - } + "avalanche_window": 10, + "include_tags": true, + "new_group_delay": 60, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "silenced": { + "*": null + }, + "thresholds": { + "critical": 85 + } }, "priority": null, + "query": "avg(last_5m):avg:dcgm.temperature{*} by {host} > 85", "restricted_roles": null, - "recommended_monitor_metadata": { - "description": "Notify your team when a GPU's temperature is too high." 
- } -} + "tags": [ + "integration:dcgm" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/dcgm/assets/monitors/memory_usage.json b/dcgm/assets/monitors/memory_usage.json index 2760ee1b5e53e..13f4aec29b60c 100644 --- a/dcgm/assets/monitors/memory_usage.json +++ b/dcgm/assets/monitors/memory_usage.json @@ -1,26 +1,33 @@ { - "name": "⚠️ DCGM: GPU Memory Usage is High", - "type": "query alert", - "query": "avg(last_5m):avg:dcgm.mem.copy_utilization{*} > 90", + "version": 2, + "created_at": "2023-07-06", + "last_updated_at": "2023-07-06", + "title": "⚠️ DCGM: GPU Memory Usage is High", + "tags": [ + "integration:dcgm" + ], + "description": "Notify your team when a GPU's memory usage is too high.", + "definition": { "message": "{{#is_alert}}\nYour GPU memory usage is high: {{value}}%.\n{{/is_alert}}", - "tags": [ - "integration:dcgm" - ], + "name": "⚠️ DCGM: GPU Memory Usage is High", "options": { - "thresholds": { - "critical": 90 - }, - "notify_audit": false, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": 0, - "include_tags": false, - "avalanche_window": 10, - "silenced": {} + "avalanche_window": 10, + "include_tags": false, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "silenced": {}, + "thresholds": { + "critical": 90 + } }, "priority": null, + "query": "avg(last_5m):avg:dcgm.mem.copy_utilization{*} > 90", "restricted_roles": null, - "recommended_monitor_metadata": { - "description": "Notify your team when a GPU's memory usage is too high." - } -} + "tags": [ + "integration:dcgm" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/dcgm/assets/monitors/xid_errors.json b/dcgm/assets/monitors/xid_errors.json index 0d9422409ab58..646f2f33211dd 100644 --- a/dcgm/assets/monitors/xid_errors.json +++ b/dcgm/assets/monitors/xid_errors.json @@ -1,30 +1,37 @@ { - "name": "⛔️ DCGM: XID Errors Detected", - "type": "query alert", - "query": "avg(last_5m):avg:dcgm.xid_errors{*} > 5", + "version": 2, + "created_at": "2023-07-06", + "last_updated_at": "2023-07-06", + "title": "⛔️ DCGM: XID Errors Detected", + "tags": [ + "integration:dcgm" + ], + "description": "Notify your team when at least one XID error occurs in the last 5 minutes.", + "definition": { "message": "{{#is_alert}}\nAn XID error occurred.\n{{/is_alert}}", - "tags": [ - "integration:dcgm" - ], + "name": "⛔️ DCGM: XID Errors Detected", "options": { - "thresholds": { - "critical": 5, - "warning": 0 - }, - "notify_audit": false, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": 0, - "include_tags": false, - "avalanche_window": 10, - "new_host_delay": 300, - "silenced": { - "*": null - } + "avalanche_window": 10, + "include_tags": false, + "new_host_delay": 300, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "silenced": { + "*": null + }, + "thresholds": { + "critical": 5, + "warning": 0 + } }, "priority": null, + "query": "avg(last_5m):avg:dcgm.xid_errors{*} > 5", "restricted_roles": null, - "recommended_monitor_metadata": { - "description": "Notify your team when at least one XID error occurs in the last 5 minutes." 
- } -} + "tags": [ + "integration:dcgm" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/elastic/assets/monitors/elastic_pending_tasks_high.json b/elastic/assets/monitors/elastic_pending_tasks_high.json index 0f1d373188ea4..548f56e966f0d 100644 --- a/elastic/assets/monitors/elastic_pending_tasks_high.json +++ b/elastic/assets/monitors/elastic_pending_tasks_high.json @@ -1,28 +1,35 @@ { - "name": "[ElasticSearch] Number of pending tasks is high", - "type": "query alert", - "query": "avg(last_5m):sum:elasticsearch.pending_tasks_total{*} > 15", - "message": "Number of pending tasks (all priority) is high: {{value}} tasks", - "tags": [ - "integration:elastic" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 15, - "warning": 10 - } - }, - "recommended_monitor_metadata": { - "description": "Get notified when the number of pending tasks is high." - } -} + "version": 2, + "created_at": "2021-01-08", + "last_updated_at": "2021-03-05", + "title": "Number of pending tasks is high", + "tags": [ + "integration:elasticsearch" + ], + "description": "Get notified when the number of pending tasks is high.", + "definition": { + "message": "Number of pending tasks (all priority) is high: {{value}} tasks", + "name": "[ElasticSearch] Number of pending tasks is high", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": true, + "thresholds": { + "critical": 15, + "warning": 10 + }, + "timeout_h": 0 + }, + "query": "avg(last_5m):sum:elasticsearch.pending_tasks_total{*} > 15", + "tags": [ + "integration:elastic" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/elastic/assets/monitors/elastic_query_latency_high.json b/elastic/assets/monitors/elastic_query_latency_high.json index a059d85e727b2..b848fa9e04958 100644 --- a/elastic/assets/monitors/elastic_query_latency_high.json +++ b/elastic/assets/monitors/elastic_query_latency_high.json @@ -1,28 +1,35 @@ { - "name": "[ElasticSearch] Time spent on queries is high on {{host.name}}", - "type": "query alert", - "query": "avg(last_5m):monotonic_diff( avg:elasticsearch.search.query.time{*} by {host} ) / monotonic_diff( avg:elasticsearch.search.query.total{*} by {host} ) > 3", - "message": "The time spent on processing queries is increasing ({{value}} s/query).\nYou may want to look for potential resource bottlenecks, or investigate whether you need to [optimize your queries.](https://www.datadoghq.com/blog/elasticsearch-performance-scaling-problems/)", - "tags": [ - "integration:elastic" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 3, - "warning": 2 - } - }, - "recommended_monitor_metadata": { - "description": "Get notified when the query latency is high." 
- } -} + "version": 2, + "created_at": "2021-03-05", + "last_updated_at": "2021-03-05", + "title": "Time spent on queries is high on {{host.name}}", + "tags": [ + "integration:elasticsearch" + ], + "description": "Get notified when the query latency is high.", + "definition": { + "message": "The time spent on processing queries is increasing ({{value}} s/query).\nYou may want to look for potential resource bottlenecks, or investigate whether you need to [optimize your queries.](https://www.datadoghq.com/blog/elasticsearch-performance-scaling-problems/)", + "name": "[ElasticSearch] Time spent on queries is high on {{host.name}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": true, + "thresholds": { + "critical": 3, + "warning": 2 + }, + "timeout_h": 0 + }, + "query": "avg(last_5m):monotonic_diff( avg:elasticsearch.search.query.time{*} by {host} ) / monotonic_diff( avg:elasticsearch.search.query.total{*} by {host} ) > 3", + "tags": [ + "integration:elastic" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/elastic/assets/monitors/elastic_query_load_high.json b/elastic/assets/monitors/elastic_query_load_high.json index d5200d0c17a5a..319bcd53bf0eb 100644 --- a/elastic/assets/monitors/elastic_query_load_high.json +++ b/elastic/assets/monitors/elastic_query_load_high.json @@ -1,28 +1,35 @@ { - "name": "[ElasticSearch] Query load is high on {{host.name}} ", - "type": "query alert", - "query": "avg(last_5m):sum:elasticsearch.search.query.current{*} by {host} > 10", - "message": "Number of queries currently in progress is high ({{value}} queries)", - "tags": [ - "integration:elastic" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 10, - "warning": 8 - } - }, - "recommended_monitor_metadata": { - "description": "Get notified when the query load is high." 
- } -} + "version": 2, + "created_at": "2021-01-08", + "last_updated_at": "2021-03-05", + "title": "Query load is high on {{host.name}}", + "tags": [ + "integration:elasticsearch" + ], + "description": "Get notified when the query load is high.", + "definition": { + "message": "Number of queries currently in progress is high ({{value}} queries)", + "name": "[ElasticSearch] Query load is high on {{host.name}} ", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": true, + "thresholds": { + "critical": 10, + "warning": 8 + }, + "timeout_h": 0 + }, + "query": "avg(last_5m):sum:elasticsearch.search.query.current{*} by {host} > 10", + "tags": [ + "integration:elastic" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/elastic/assets/monitors/elastic_requests.json b/elastic/assets/monitors/elastic_requests.json index d9e8576c9680d..d7315fdc9254c 100644 --- a/elastic/assets/monitors/elastic_requests.json +++ b/elastic/assets/monitors/elastic_requests.json @@ -1,28 +1,35 @@ { - "name": "[ElasticSearch] Unsuccessful requests rate is high", - "type": "query alert", - "query": "avg(last_5m):100 * monotonic_diff( sum:elasticsearch.get.missing.total{*} ) / monotonic_diff( sum:elasticsearch.get.total{*} ) > 15", - "message": "Unsuccessful requests rate is high: document is missing in {{value}}% of requests.", - "tags": [ - "integration:elastic" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 15, - "warning": 10 - } - }, - "recommended_monitor_metadata": { - "description": "Get notified when the unsuccessful requests rate is high." 
- } -} + "version": 2, + "created_at": "2021-03-05", + "last_updated_at": "2021-03-05", + "title": "Unsuccessful requests rate is high", + "tags": [ + "integration:elasticsearch" + ], + "description": "Get notified when the unsuccessful requests rate is high.", + "definition": { + "message": "Unsuccessful requests rate is high: document is missing in {{value}}% of requests.", + "name": "[ElasticSearch] Unsuccessful requests rate is high", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": true, + "thresholds": { + "critical": 15, + "warning": 10 + }, + "timeout_h": 0 + }, + "query": "avg(last_5m):100 * monotonic_diff( sum:elasticsearch.get.missing.total{*} ) / monotonic_diff( sum:elasticsearch.get.total{*} ) > 15", + "tags": [ + "integration:elastic" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/foundationdb/assets/monitors/conflicts.json b/foundationdb/assets/monitors/conflicts.json index b5a8eac8167df..e7b83a7b5bf79 100644 --- a/foundationdb/assets/monitors/conflicts.json +++ b/foundationdb/assets/monitors/conflicts.json @@ -1,29 +1,36 @@ { - "name": "FoundationDB High Level of Conflicted Transactions", - "type": "query alert", - "query": "avg(last_5m):100 * ( avg:foundationdb.workload.transactions.conflicted.hz{*} / avg:foundationdb.workload.transactions.started.hz{*} ) > 10", - "message": "A significant proportion of transactions started are ending up in a conflicted state. This may indicate a query design problem.", - "tags": [ - "integration:fdb" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 24, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 10, - "warning": 5 - } - }, - "priority": null, - "recommended_monitor_metadata": { - "description": "Get notified when a significant portion of transactions started are ending up in a conflicted state." - } -} + "version": 2, + "created_at": "2022-03-14", + "last_updated_at": "2022-05-25", + "title": "FoundationDB High Level of Conflicted Transactions", + "tags": [ + "integration:foundationdb" + ], + "description": "Get notified when a significant portion of transactions started are ending up in a conflicted state.", + "definition": { + "message": "A significant proportion of transactions started are ending up in a conflicted state. 
This may indicate a query design problem.", + "name": "FoundationDB High Level of Conflicted Transactions", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": true, + "thresholds": { + "critical": 10, + "warning": 5 + }, + "timeout_h": 24 + }, + "priority": null, + "query": "avg(last_5m):100 * ( avg:foundationdb.workload.transactions.conflicted.hz{*} / avg:foundationdb.workload.transactions.started.hz{*} ) > 10", + "tags": [ + "integration:fdb" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/foundationdb/assets/monitors/errors_logged.json b/foundationdb/assets/monitors/errors_logged.json index 42ad82b1c9f59..bccc59d9d7021 100644 --- a/foundationdb/assets/monitors/errors_logged.json +++ b/foundationdb/assets/monitors/errors_logged.json @@ -1,54 +1,61 @@ { - "name": "FoundationDB Errors Logged", - "type": "log alert", - "query": "logs(\"service:foundationdb @Severity:>=40\").index(\"*\").rollup(\"count\").by(\"host\").last(\"10m\") > 3", - "message": "FoundationDB has logged errors. This indicates that one of the assumptions of the database has been violated; such situations are generally caused by hardware failures.", - "tags": [ - "integration:fdb" - ], - "options": { - "thresholds": { - "critical": 3, - "warning": 1, - "comparison": ">", - "period": { - "name": "10 minute average", - "value": "last_10m", - "text": "10 minutes", - "no_data_timeframe": 20, - "seconds": 600, - "digit": 10, - "unit": "minutes", - "tense": "last" - }, - "timeAggregator": "avg" - }, - "queryConfig": { - "indexes": [], - "track": "logs", - "queryIsFailed": false, - "queryString": "service:foundationdb @Severity:>=40", - "timeRange": { - "from": 1628061270003, - "to": 1628075670003, - "live": true - } - }, - "enable_logs_sample": true, - "notify_audit": false, - "aggregation": { - "metric": "count", - "type": "count", - "groupBy": [ - "core_host" - ] - }, - "restriction_query": null, - "escalation_message": "", - "groupby_simple_monitor": false, - "renotify_interval": 0 - }, - "recommended_monitor_metadata": { - "description": "Get notified when there is an error logged" - } -} + "version": 2, + "created_at": "2022-03-14", + "last_updated_at": "2022-05-25", + "title": "FoundationDB Errors Logged", + "tags": [ + "integration:foundationdb" + ], + "description": "Get notified when there is an error logged", + "definition": { + "message": "FoundationDB has logged errors. 
This indicates that one of the assumptions of the database has been violated; such situations are generally caused by hardware failures.", + "name": "FoundationDB Errors Logged", + "options": { + "aggregation": { + "groupBy": [ + "core_host" + ], + "metric": "count", + "type": "count" + }, + "enable_logs_sample": true, + "escalation_message": "", + "groupby_simple_monitor": false, + "notify_audit": false, + "queryConfig": { + "indexes": [], + "queryIsFailed": false, + "queryString": "service:foundationdb @Severity:>=40", + "timeRange": { + "from": 1628061270003, + "live": true, + "to": 1628075670003 + }, + "track": "logs" + }, + "renotify_interval": 0, + "restriction_query": null, + "thresholds": { + "comparison": ">", + "critical": 3, + "period": { + "digit": 10, + "name": "10 minute average", + "no_data_timeframe": 20, + "seconds": 600, + "tense": "last", + "text": "10 minutes", + "unit": "minutes", + "value": "last_10m" + }, + "timeAggregator": "avg", + "warning": 1 + } + }, + "query": "logs(\"service:foundationdb @Severity:>=40\").index(\"*\").rollup(\"count\").by(\"host\").last(\"10m\") > 3", + "tags": [ + "integration:fdb" + ], + "type": "log alert" + } +} \ No newline at end of file diff --git a/foundationdb/assets/monitors/high_durability_lag.json b/foundationdb/assets/monitors/high_durability_lag.json index bdc1ee8771e2e..a6b87299cf34a 100644 --- a/foundationdb/assets/monitors/high_durability_lag.json +++ b/foundationdb/assets/monitors/high_durability_lag.json @@ -1,29 +1,36 @@ { + "version": 2, + "created_at": "2022-03-14", + "last_updated_at": "2022-05-25", + "title": "FoundationDB High Durability Lag", + "tags": [ + "integration:foundationdb" + ], + "description": "Get notified when there is a very high durability lag detected", + "definition": { + "message": "{{#is_alert}}\nThe storage process has very high durability lag - that is, the time it takes for data fetched by the storage server to be written to disk.\n{{/is_alert}}\n{{#is_warning}}\nThe storage process has higher than usual durability lag - that is, the time it takes for data fetched by the storage server to be written to disk.\n{{/is_warning}}\n\nThis may indicate that the database cluster cannot keep up with the workload. Sometimes, storage servers can also struggle if faced with a huge number of reads of the keys they store, preventing progress on writes.", "name": "FoundationDB High Durability Lag", - "type": "query alert", - "query": "avg(last_5m):avg:foundationdb.process.role.durability_lag.seconds{fdb_role:storage} by {fdb_process} > 60", - "message": "{{#is_alert}}\nThe storage process has very high durability lag - that is, the time it takes for data fetched by the storage server to be written to disk.\n{{/is_alert}}\n{{#is_warning}}\nThe storage process has higher than usual durability lag - that is, the time it takes for data fetched by the storage server to be written to disk.\n{{/is_warning}}\n\nThis may indicate that the database cluster cannot keep up with the workload. 
Sometimes, storage servers can also struggle if faced with a huge number of reads of the keys they store, preventing progress on writes.", - "tags": [ - "integration:fdb" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 1, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 60, - "warning": 15 - } - }, - "priority": null, - "recommended_monitor_metadata": { - "description": "Get notified when there is a very high durability lag detected" - } -} + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": true, + "thresholds": { + "critical": 60, + "warning": 15 + }, + "timeout_h": 1 + }, + "priority": null, + "query": "avg(last_5m):avg:foundationdb.process.role.durability_lag.seconds{fdb_role:storage} by {fdb_process} > 60", + "tags": [ + "integration:fdb" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/foundationdb/assets/monitors/log_queue_spill.json b/foundationdb/assets/monitors/log_queue_spill.json index 306ed87f38798..55abe44cb3758 100644 --- a/foundationdb/assets/monitors/log_queue_spill.json +++ b/foundationdb/assets/monitors/log_queue_spill.json @@ -1,29 +1,36 @@ { - "name": "FoundationDB Log Queue Reaching Spill Threshold", - "type": "query alert", - "query": "avg(last_5m):avg:foundationdb.process.role.queue_length{fdb_role:log} > 1600000000", - "message": "{{#is_warning}}\nThe FoundationDB log queue is approaching the spill threshold, implying that one or more storage server is struggling to keep up with the workload.\n{{/is_warning}}\n\n{{#is_alert}}\nThe FoundationDB log queue has reached the spill threshold, where logs have not been taken and persisted by storage servers soon enough. This may mean the database workload is more than the cluster can sustain.\n{{/is_alert}}", - "tags": [ - "integration:fdb" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 1, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1600000000, - "warning": 1200000000 - } - }, - "priority": null, - "recommended_monitor_metadata": { - "description": "Get notified when the log queue is approaching the spill threshold." - } -} + "version": 2, + "created_at": "2022-03-14", + "last_updated_at": "2022-05-25", + "title": "FoundationDB Log Queue Reaching Spill Threshold", + "tags": [ + "integration:foundationdb" + ], + "description": "Get notified when the log queue is approaching the spill threshold.", + "definition": { + "message": "{{#is_warning}}\nThe FoundationDB log queue is approaching the spill threshold, implying that one or more storage server is struggling to keep up with the workload.\n{{/is_warning}}\n\n{{#is_alert}}\nThe FoundationDB log queue has reached the spill threshold, where logs have not been taken and persisted by storage servers soon enough. 
This may mean the database workload is more than the cluster can sustain.\n{{/is_alert}}", + "name": "FoundationDB Log Queue Reaching Spill Threshold", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": true, + "thresholds": { + "critical": 1600000000, + "warning": 1200000000 + }, + "timeout_h": 1 + }, + "priority": null, + "query": "avg(last_5m):avg:foundationdb.process.role.queue_length{fdb_role:log} > 1600000000", + "tags": [ + "integration:fdb" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/foundationdb/assets/monitors/low_disk_space.json b/foundationdb/assets/monitors/low_disk_space.json index 4ef14e53ee932..7e8a5a3a4cdce 100644 --- a/foundationdb/assets/monitors/low_disk_space.json +++ b/foundationdb/assets/monitors/low_disk_space.json @@ -1,29 +1,36 @@ { - "name": "FoundationDB Low Disk Space", - "type": "query alert", - "query": "avg(last_5m):min:foundationdb.process.disk.free_bytes{*} by {fdb_process} < 200000000", - "message": "A FoundationDB process is running very low on disk space. This may soon impact its ability to process new transactions.", - "tags": [ - "integration:fdb" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 200000000, - "warning": 500000000 - } - }, - "priority": null, - "recommended_monitor_metadata": { - "description": "Get notified when the FoundationDB process is running very low on disk space." - } -} + "version": 2, + "created_at": "2022-03-14", + "last_updated_at": "2022-05-25", + "title": "FoundationDB Low Disk Space", + "tags": [ + "integration:foundationdb" + ], + "description": "Get notified when the FoundationDB process is running very low on disk space.", + "definition": { + "message": "A FoundationDB process is running very low on disk space. 
This may soon impact its ability to process new transactions.", + "name": "FoundationDB Low Disk Space", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": true, + "thresholds": { + "critical": 200000000, + "warning": 500000000 + }, + "timeout_h": 0 + }, + "priority": null, + "query": "avg(last_5m):min:foundationdb.process.disk.free_bytes{*} by {fdb_process} < 200000000", + "tags": [ + "integration:fdb" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/foundationdb/assets/monitors/read_latency_probe.json b/foundationdb/assets/monitors/read_latency_probe.json index df3f67a64cea4..50870778f0c56 100644 --- a/foundationdb/assets/monitors/read_latency_probe.json +++ b/foundationdb/assets/monitors/read_latency_probe.json @@ -1,29 +1,36 @@ { - "name": "FoundationDB Read Latency Probe", - "type": "query alert", - "query": "avg(last_5m):avg:foundationdb.latency_probe.read_seconds{*} > 5", - "message": "{{#is_alert}}\nA read operation performed on the FoundationDB cluster to measure current latency took a long time.\n{{/is_alert}} \n{{#is_warning}}\nA read operation performed on the FoundationDB cluster to measure current latency took longer than would normally be expected.\n{{/is_warning}}\n{{#is_no_data}}\nNo FoundationDB latency probe data was received. This may indicate cluster unavailability.\n{{/is_no_data}} ", - "tags": [ - "integration:fdb" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 1, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": true, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": 15, - "include_tags": true, - "thresholds": { - "critical": 5, - "warning": 2 - } - }, - "priority": null, - "recommended_monitor_metadata": { - "description": "Get notified when a read operation performed on the FoundationDB cluster to measure current latency took a long time." - } -} + "version": 2, + "created_at": "2022-03-14", + "last_updated_at": "2022-05-25", + "title": "FoundationDB Read Latency Probe", + "tags": [ + "integration:foundationdb" + ], + "description": "Get notified when a read operation performed on the FoundationDB cluster to measure current latency took a long time.", + "definition": { + "message": "{{#is_alert}}\nA read operation performed on the FoundationDB cluster to measure current latency took a long time.\n{{/is_alert}} \n{{#is_warning}}\nA read operation performed on the FoundationDB cluster to measure current latency took longer than would normally be expected.\n{{/is_warning}}\n{{#is_no_data}}\nNo FoundationDB latency probe data was received. 
This may indicate cluster unavailability.\n{{/is_no_data}} ", + "name": "FoundationDB Read Latency Probe", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": 15, + "notify_audit": false, + "notify_no_data": true, + "renotify_interval": "0", + "require_full_window": true, + "thresholds": { + "critical": 5, + "warning": 2 + }, + "timeout_h": 1 + }, + "priority": null, + "query": "avg(last_5m):avg:foundationdb.latency_probe.read_seconds{*} > 5", + "tags": [ + "integration:fdb" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/foundationdb/assets/monitors/rejections.json b/foundationdb/assets/monitors/rejections.json index 3807215b999df..5fb46cc632ba1 100644 --- a/foundationdb/assets/monitors/rejections.json +++ b/foundationdb/assets/monitors/rejections.json @@ -1,29 +1,36 @@ { - "name": "FoundationDB High Level of Rejected Transactions", - "type": "query alert", - "query": "avg(last_5m):100 * ( avg:foundationdb.workload.transactions.rejected_for_queued_too_long.hz{*} / avg:foundationdb.workload.transactions.started.hz{*} ) > 10", - "message": "A significant proportion of transactions started are being rejected having been queued for too long. This may indicate that the database cluster cannot keep up with the workload.", - "tags": [ - "integration:fdb" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 24, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 10, - "warning": 5 - } - }, - "priority": null, - "recommended_monitor_metadata": { - "description": "Get notified when a significant proportion of transactions started are being rejected." - } -} + "version": 2, + "created_at": "2022-03-14", + "last_updated_at": "2022-05-25", + "title": "FoundationDB High Level of Rejected Transactions", + "tags": [ + "integration:foundationdb" + ], + "description": "Get notified when a significant proportion of transactions started are being rejected.", + "definition": { + "message": "A significant proportion of transactions started are being rejected having been queued for too long. 
This may indicate that the database cluster cannot keep up with the workload.\n{{/is_alert}}",
+    "name": "FoundationDB High Level of Rejected Transactions",
+    "options": {
+      "escalation_message": "",
+      "include_tags": true,
+      "locked": false,
+      "new_host_delay": 300,
+      "no_data_timeframe": null,
+      "notify_audit": false,
+      "notify_no_data": false,
+      "renotify_interval": "0",
+      "require_full_window": true,
+      "thresholds": {
+        "critical": 10,
+        "warning": 5
+      },
+      "timeout_h": 24
+    },
+    "priority": null,
+    "query": "avg(last_5m):100 * ( avg:foundationdb.workload.transactions.rejected_for_queued_too_long.hz{*} / avg:foundationdb.workload.transactions.started.hz{*} ) > 10",
+    "tags": [
+      "integration:fdb"
+    ],
+    "type": "query alert"
+  }
+} \ No newline at end of file
diff --git a/foundationdb/assets/monitors/service_check.json b/foundationdb/assets/monitors/service_check.json
index 31cb3028f6990..9d02ae04e21e4 100644
--- a/foundationdb/assets/monitors/service_check.json
+++ b/foundationdb/assets/monitors/service_check.json
@@ -1,26 +1,33 @@
 {
-  "name": "FoundationDB Status Check",
-  "type": "service check",
-  "query": "\"foundationdb.can_connect\".over(\"*\").by(\"*\").last(6).count_by_status()",
-  "message": "{{#is_alert}}\nCould not connect to the FoundationDB cluster or otherwise failed to obtain status information.\n{{/is_alert}} \n{{^is_warning}}\nA connection to the fhe FoundationDB cluster could be established and status information retrieved. However, there are degraded processes.\n{{/is_warning}} ",
-  "tags": [
-    "integration:fdb"
-  ],
-  "options": {
-    "renotify_interval": 0,
-    "timeout_h": 4,
-    "thresholds": {
-      "ok": 2,
-      "warning": 5,
-      "critical": 2
-    },
-    "notify_no_data": false,
-    "no_data_timeframe": 2,
-    "notify_audit": false,
-    "new_host_delay": 300,
-    "escalation_message": ""
-  },
-  "recommended_monitor_metadata": {
-    "description": "Get notified when the agent could not connect to the FoundationDB cluster or otherwise failed to obtain status."
-  }
-}
+  "version": 2,
+  "created_at": "2022-03-14",
+  "last_updated_at": "2023-07-24",
+  "title": "FoundationDB Status Check",
+  "tags": [
+    "integration:foundationdb"
+  ],
+  "description": "Get notified when the agent could not connect to the FoundationDB cluster or otherwise failed to obtain status.",
+  "definition": {
+    "message": "{{#is_alert}}\nCould not connect to the FoundationDB cluster or otherwise failed to obtain status information.\n{{/is_alert}} \n{{^is_warning}}\nA connection to the FoundationDB cluster could be established and status information retrieved. 
However, there are degraded processes.\n{{/is_warning}} ", + "name": "FoundationDB Status Check", + "options": { + "escalation_message": "", + "new_host_delay": 300, + "no_data_timeframe": 2, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "thresholds": { + "critical": 2, + "ok": 2, + "warning": 5 + }, + "timeout_h": 4 + }, + "query": "\"foundationdb.can_connect\".over(\"*\").by(\"*\").last(6).count_by_status()", + "tags": [ + "integration:fdb" + ], + "type": "service check" + } +} \ No newline at end of file diff --git a/foundationdb/assets/monitors/transaction_commit_latency.json b/foundationdb/assets/monitors/transaction_commit_latency.json index 6846253eb3175..97f5495b33183 100644 --- a/foundationdb/assets/monitors/transaction_commit_latency.json +++ b/foundationdb/assets/monitors/transaction_commit_latency.json @@ -1,29 +1,36 @@ { - "name": "FoundationDB Transaction Commit Latency Probe", - "type": "query alert", - "query": "avg(last_5m):avg:foundationdb.latency_probe.commit_seconds{*} > 5", - "message": "{{#is_alert}}\nA transaction performed on the FoundationDB cluster to measure current latency took a long time to commit.\n{{/is_alert}} \n{{#is_warning}}\nA transaction performed on the FoundationDB cluster to measure current latency took a longer time to commit than would normally be expected.\n{{/is_warning}}", - "tags": [ - "integration:fdb" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 4, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 5, - "warning": 2 - } - }, - "priority": null, - "recommended_monitor_metadata": { - "description": "Get notified when a transaction performed on the FoundationDB cluster to measure current latency took a long time to commit." 
- } -} + "version": 2, + "created_at": "2022-03-14", + "last_updated_at": "2022-05-25", + "title": "FoundationDB Transaction Commit Latency Probe", + "tags": [ + "integration:foundationdb" + ], + "description": "Get notified when a transaction performed on the FoundationDB cluster to measure current latency took a long time to commit.", + "definition": { + "message": "{{#is_alert}}\nA transaction performed on the FoundationDB cluster to measure current latency took a long time to commit.\n{{/is_alert}} \n{{#is_warning}}\nA transaction performed on the FoundationDB cluster to measure current latency took a longer time to commit than would normally be expected.\n{{/is_warning}}", + "name": "FoundationDB Transaction Commit Latency Probe", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": true, + "thresholds": { + "critical": 5, + "warning": 2 + }, + "timeout_h": 4 + }, + "priority": null, + "query": "avg(last_5m):avg:foundationdb.latency_probe.commit_seconds{*} > 5", + "tags": [ + "integration:fdb" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/foundationdb/assets/monitors/transaction_start_latency.json b/foundationdb/assets/monitors/transaction_start_latency.json index ff4bf64595528..fb294e91077fa 100644 --- a/foundationdb/assets/monitors/transaction_start_latency.json +++ b/foundationdb/assets/monitors/transaction_start_latency.json @@ -1,29 +1,36 @@ { - "name": "FoundationDB Transaction Start Latency Probe", - "type": "query alert", - "query": "avg(last_5m):avg:foundationdb.latency_probe.transaction_start_seconds{*} > 5", - "message": "{{#is_alert}}\nA transaction performed on the FoundationDB cluster to measure current latency took a long time to start.\n{{/is_alert}} \n{{#is_warning}}\nA transaction performed on the FoundationDB cluster to measure current latency took a longer time to start than would normally be expected.\n{{/is_warning}}\n", - "tags": [ - "integration:fdb" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 4, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 5, - "warning": 2 - } - }, - "priority": null, - "recommended_monitor_metadata": { - "description": "Get notified when a transaction performed on the FoundationDB cluster to measure current latency took a long time to start." 
- } -} + "version": 2, + "created_at": "2022-03-14", + "last_updated_at": "2022-05-25", + "title": "FoundationDB Transaction Start Latency Probe", + "tags": [ + "integration:foundationdb" + ], + "description": "Get notified when a transaction performed on the FoundationDB cluster to measure current latency took a long time to start.", + "definition": { + "message": "{{#is_alert}}\nA transaction performed on the FoundationDB cluster to measure current latency took a long time to start.\n{{/is_alert}} \n{{#is_warning}}\nA transaction performed on the FoundationDB cluster to measure current latency took a longer time to start than would normally be expected.\n{{/is_warning}}\n", + "name": "FoundationDB Transaction Start Latency Probe", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": true, + "thresholds": { + "critical": 5, + "warning": 2 + }, + "timeout_h": 4 + }, + "priority": null, + "query": "avg(last_5m):avg:foundationdb.latency_probe.transaction_start_seconds{*} > 5", + "tags": [ + "integration:fdb" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/glusterfs/assets/monitors/brick_status.json b/glusterfs/assets/monitors/brick_status.json index 9493ab1ea7a38..fd71c432b6ce5 100644 --- a/glusterfs/assets/monitors/brick_status.json +++ b/glusterfs/assets/monitors/brick_status.json @@ -1,31 +1,38 @@ { - "name": "[GlusterFS] Increased bricks are offline", - "type": "query alert", - "query": "avg(last_5m):avg:glusterfs.cluster.nodes.count{*} - avg:glusterfs.volume.online{*} > 1", - "message": "Get notified when bricks become offline.", - "tags": [ + "version": 2, + "created_at": "2021-01-12", + "last_updated_at": "2021-01-12", + "title": "Increased bricks are offline", + "tags": [ + "integration:glusterfs" + ], + "description": "Notify your team when more than one brick is down for your GlusterFS cluster.", + "definition": { + "message": "Get notified when bricks become offline.", + "name": "[GlusterFS] Increased bricks are offline", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": true, + "thresholds": { + "critical": 1, + "critical_recovery": 0, + "warning": 0.5, + "warning_recovery": 0 + }, + "timeout_h": 0 + }, + "priority": null, + "query": "avg(last_5m):avg:glusterfs.cluster.nodes.count{*} - avg:glusterfs.volume.online{*} > 1", + "tags": [ "integration:glusterfs" ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1, - "warning": 0.5, - "critical_recovery": 0, - "warning_recovery": 0 - } - }, - "priority": null, - "recommended_monitor_metadata": { - "description": "Notify your team when more than one brick is down for your GlusterFS cluster." 
- } + "type": "query alert" + } } \ No newline at end of file diff --git a/haproxy/assets/monitors/backend_dreq.json b/haproxy/assets/monitors/backend_dreq.json index c736aadeac381..a1dd10d67b0fc 100644 --- a/haproxy/assets/monitors/backend_dreq.json +++ b/haproxy/assets/monitors/backend_dreq.json @@ -1,30 +1,39 @@ { - "name": "[HAProxy] High number of backend denied responses for host: {{host.name}}", - "type": "query alert", - "query": "avg(last_4h):anomalies(avg:haproxy.backend.denied.resp_rate{*} by {host}, 'agile', 2, direction='above', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='daily') >= 1", - "message": "The number of backend denied responses due to security restrictions for host: {{host.name}} is above normal.\n\nA malicious attacker or misconfigured application could be to blame. More information on designing ACLs for HAProxy can be found in the [Introduction to HAProxy ACLs blog post](https://www.haproxy.com/blog/introduction-to-haproxy-acls/).", - "tags": ["integration:haproxy"], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1, - "critical_recovery": 0 + "version": 2, + "created_at": "2021-03-01", + "last_updated_at": "2021-03-01", + "title": "High number of backend denied responses for host: {{host.name}}", + "tags": [ + "integration:haproxy" + ], + "description": "Notifies when HAProxy denied backend responses are higher than usual for a specific host.", + "definition": { + "message": "The number of backend denied responses due to security restrictions for host: {{host.name}} is above normal.\n\nA malicious attacker or misconfigured application could be to blame. More information on designing ACLs for HAProxy can be found in the [Introduction to HAProxy ACLs blog post](https://www.haproxy.com/blog/introduction-to-haproxy-acls/).", + "name": "[HAProxy] High number of backend denied responses for host: {{host.name}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": true, + "threshold_windows": { + "recovery_window": "last_15m", + "trigger_window": "last_15m" + }, + "thresholds": { + "critical": 1, + "critical_recovery": 0 + }, + "timeout_h": 0 }, - "threshold_windows": { - "trigger_window": "last_15m", - "recovery_window": "last_15m" - } - }, - "recommended_monitor_metadata": { - "description": "Notifies when HAProxy denined backend responses are higher than usual for a specific host."
+ "query": "avg(last_4h):anomalies(avg:haproxy.backend.denied.resp_rate{*} by {host}, 'agile', 2, direction='above', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='daily') >= 1", + "tags": [ + "integration:haproxy" + ], + "type": "query alert" } -} +} \ No newline at end of file diff --git a/haproxy/assets/monitors/backend_econ.json b/haproxy/assets/monitors/backend_econ.json index 8f8fb1f8c87c8..ac26cb0324c7e 100644 --- a/haproxy/assets/monitors/backend_econ.json +++ b/haproxy/assets/monitors/backend_econ.json @@ -1,30 +1,39 @@ { - "name": "[HAProxy] Number of backend connection failures for host: {{host.name}} is above normal.", - "type": "query alert", - "query": "avg(last_4h):anomalies(avg:haproxy.backend.errors.con_rate{*} by {host}, 'agile', 2, direction='above', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='daily') >= 1", - "message": "There is a higher number of backend connection failures for host: {{host.name}}\n\nNote: This monitored metric doesn't only includes failed backend requests but additionally includes general backend errors, like a backend without an active frontend. Correlating this metric with `haproxy.backend.errors.resp_rate` and response codes from both your frontend and backend servers will give you a better idea of the causes of an increase in backend connection errors.", - "tags": ["integration:haproxy"], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1, - "critical_recovery": 0 + "version": 2, + "created_at": "2021-03-01", + "last_updated_at": "2021-03-01", + "title": "Number of backend connection failures for host: {{host.name}} is above normal.", + "tags": [ + "integration:haproxy" + ], + "description": "Notifies when the number of HAProxy backend connection errors is above normal for a specific host.", + "definition": { + "message": "There is a higher number of backend connection failures for host: {{host.name}}\n\nNote: This monitored metric doesn't only include failed backend requests but additionally includes general backend errors, like a backend without an active frontend. Correlating this metric with `haproxy.backend.errors.resp_rate` and response codes from both your frontend and backend servers will give you a better idea of the causes of an increase in backend connection errors.", + "name": "[HAProxy] Number of backend connection failures for host: {{host.name}} is above normal.", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": true, + "threshold_windows": { + "recovery_window": "last_15m", + "trigger_window": "last_15m" + }, + "thresholds": { + "critical": 1, + "critical_recovery": 0 + }, + "timeout_h": 0 }, - "threshold_windows": { - "trigger_window": "last_15m", - "recovery_window": "last_15m" - } - }, - "recommended_monitor_metadata": { - "description": "Notifies when HAProxy the number of backend connection errors is above normal for a specific host."
+ "query": "avg(last_4h):anomalies(avg:haproxy.backend.errors.con_rate{*} by {host}, 'agile', 2, direction='above', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='daily') >= 1", + "tags": [ + "integration:haproxy" + ], + "type": "query alert" } -} +} \ No newline at end of file diff --git a/haproxy/assets/monitors/backend_queue_time.json b/haproxy/assets/monitors/backend_queue_time.json index 90e6e74621477..4018607f326a8 100644 --- a/haproxy/assets/monitors/backend_queue_time.json +++ b/haproxy/assets/monitors/backend_queue_time.json @@ -1,25 +1,34 @@ { - "name": "[HAProxy] Backend queue time went above 500ms for host: {{host.name}}", - "type": "query alert", - "query": "max(last_5m):avg:haproxy.backend.queue.time{*} by {host} > 500", - "message": "The average queue time for host: {{host.name}} just reached: {{value}}.\n", - "tags": ["integration:haproxy"], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 500 - } - }, - "recommended_monitor_metadata": { - "description": "Notifies when the HAProxy backend queue time went above 500ms for a specific host." + "version": 2, + "created_at": "2021-03-01", + "last_updated_at": "2021-03-01", + "title": "Backend queue time went above 500ms for host: {{host.name}}", + "tags": [ + "integration:haproxy" + ], + "description": "Notifies when the HAProxy backend queue time went above 500ms for a specific host.", + "definition": { + "message": "The average queue time for host: {{host.name}} just reached: {{value}}.\n", + "name": "[HAProxy] Backend queue time went above 500ms for host: {{host.name}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": true, + "thresholds": { + "critical": 500 + }, + "timeout_h": 0 + }, + "query": "max(last_5m):avg:haproxy.backend.queue.time{*} by {host} > 500", + "tags": [ + "integration:haproxy" + ], + "type": "query alert" } -} +} \ No newline at end of file diff --git a/haproxy/assets/monitors/backend_rtime.json b/haproxy/assets/monitors/backend_rtime.json index 94b7fee05b50d..de9df228f510a 100644 --- a/haproxy/assets/monitors/backend_rtime.json +++ b/haproxy/assets/monitors/backend_rtime.json @@ -1,25 +1,34 @@ { - "name": "[HAProxy] Backend response time is above 500ms for host: {{host.name}}", - "type": "query alert", - "query": "avg(last_5m):avg:haproxy.backend.response.time{*} by {host} > 500", - "message": "The average backend response time for host: {{host.name}} is on average at: {{value}} over the last 5min.\n", - "tags": ["integration:haproxy"], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 500 - } - }, - "recommended_monitor_metadata": { - "description": "Notifies when the HAProxy backend response time is above 500ms for a specific host." 
+ "version": 2, + "created_at": "2021-03-01", + "last_updated_at": "2021-03-01", + "title": "Backend response time is above 500ms for host: {{host.name}}", + "tags": [ + "integration:haproxy" + ], + "description": "Notifies when the HAProxy backend response time is above 500ms for a specific host.", + "definition": { + "message": "The average backend response time for host: {{host.name}} is on average at: {{value}} over the last 5min.\n", + "name": "[HAProxy] Backend response time is above 500ms for host: {{host.name}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": true, + "thresholds": { + "critical": 500 + }, + "timeout_h": 0 + }, + "query": "avg(last_5m):avg:haproxy.backend.response.time{*} by {host} > 500", + "tags": [ + "integration:haproxy" + ], + "type": "query alert" } -} +} \ No newline at end of file diff --git a/haproxy/assets/monitors/backend_sessions.json b/haproxy/assets/monitors/backend_sessions.json index 9dcc4a57302ca..09ddd604435c5 100644 --- a/haproxy/assets/monitors/backend_sessions.json +++ b/haproxy/assets/monitors/backend_sessions.json @@ -1,26 +1,35 @@ { - "name": "[HAProxy] High amount of backend session usage for host: {{host.name}}", - "type": "query alert", - "query": "avg(last_5m):avg:haproxy.backend.session.pct{*} by {host} > 80", - "message": "{{#is_alert}}\n\nALERT: The amount of backend sessions in use for host: {{host.name}} reached {{value}} for a detection threshold of {{threshold}} %\n\nWhen reaching the session limit HAProxy will deny additional clients until resource consumption drops. It could be time to either modify HAProxy’s configuration to allow more sessions, or migrate your HAProxy server to a bigger box.\n\n{{/is_alert}} \n\n{{#is_warning}}\n\nWARNING: The amount of backend sessions in use for host: {{host.name}} reached {{value}} for a detection threshold of {{threshold}} %\n\n{{/is_warning}} ", - "tags": ["integration:haproxy"], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 80, - "warning": 60 - } - }, - "recommended_monitor_metadata": { - "description": "Notifies when HAProxy backend sessions usage is approaching the maximum defined for a specific host." + "version": 2, + "created_at": "2021-03-01", + "last_updated_at": "2021-03-01", + "title": "High amount of backend session usage for host: {{host.name}}", + "tags": [ + "integration:haproxy" + ], + "description": "Notifies when HAProxy backend sessions usage is approaching the maximum defined for a specific host.", + "definition": { + "message": "{{#is_alert}}\n\nALERT: The amount of backend sessions in use for host: {{host.name}} reached {{value}} for a detection threshold of {{threshold}} %\n\nWhen reaching the session limit HAProxy will deny additional clients until resource consumption drops. 
It could be time to either modify HAProxy’s configuration to allow more sessions, or migrate your HAProxy server to a bigger box.\n\n{{/is_alert}} \n\n{{#is_warning}}\n\nWARNING: The amount of backend sessions in use for host: {{host.name}} reached {{value}} for a detection threshold of {{threshold}} %\n\n{{/is_warning}} ", + "name": "[HAProxy] High amount of backend session usage for host: {{host.name}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "thresholds": { + "critical": 80, + "warning": 60 + }, + "timeout_h": 0 + }, + "query": "avg(last_5m):avg:haproxy.backend.session.pct{*} by {host} > 80", + "tags": [ + "integration:haproxy" + ], + "type": "query alert" } -} +} \ No newline at end of file diff --git a/haproxy/assets/monitors/frontend_4xx.json b/haproxy/assets/monitors/frontend_4xx.json index c5e4396b145f5..90a1f0175cc4f 100644 --- a/haproxy/assets/monitors/frontend_4xx.json +++ b/haproxy/assets/monitors/frontend_4xx.json @@ -1,30 +1,39 @@ { - "name": "[HAProxy] Anomalous number of frontend 4xx HTTP responses for host: {{host.name}}", - "type": "query alert", - "query": "avg(last_4h):anomalies(avg:haproxy.frontend.response.4xx{*} by {host}, 'agile', 2, direction='above', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='daily') >= 1", - "message": "An anomalous number of HAProxy frontend 4xx HTTP responses for host: {{host.name}} has been detected over the last 15mins.", - "tags": ["integration:haproxy"], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1, - "critical_recovery": 0 + "version": 2, + "created_at": "2021-03-01", + "last_updated_at": "2021-03-01", + "title": "Anomalous number of frontend 4xx HTTP responses for host: {{host.name}}", + "tags": [ + "integration:haproxy" + ], + "description": "Notifies when HAProxy frontend 4xx errors are higher than usual for a specific host.", + "definition": { + "message": "An anomalous number of HAProxy frontend 4xx HTTP responses for host: {{host.name}} has been detected over the last 15mins.", + "name": "[HAProxy] Anomalous number of frontend 4xx HTTP responses for host: {{host.name}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "threshold_windows": { + "recovery_window": "last_15m", + "trigger_window": "last_15m" + }, + "thresholds": { + "critical": 1, + "critical_recovery": 0 + }, + "timeout_h": 0 }, - "threshold_windows": { - "trigger_window": "last_15m", - "recovery_window": "last_15m" - } - }, - "recommended_monitor_metadata": { - "description": "Notifies when HAProxy frontend 4xx errors are higher than usual for a specific host." 
+ "query": "avg(last_4h):anomalies(avg:haproxy.frontend.response.4xx{*} by {host}, 'agile', 2, direction='above', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='daily') >= 1", + "tags": [ + "integration:haproxy" + ], + "type": "query alert" } -} +} \ No newline at end of file diff --git a/haproxy/assets/monitors/frontend_5xx.json b/haproxy/assets/monitors/frontend_5xx.json index a12f7eab7e983..9f47c0f58eec0 100644 --- a/haproxy/assets/monitors/frontend_5xx.json +++ b/haproxy/assets/monitors/frontend_5xx.json @@ -1,30 +1,39 @@ { - "name": "[HAProxy] Anomalous number of frontend 5xx HTTP responses for host: {{host.name}}", - "type": "query alert", - "query": "avg(last_4h):anomalies(avg:haproxy.frontend.response.5xx{*} by {host}, 'agile', 2, direction='above', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='daily') >= 1", - "message": "An anomalous number of HAProxy frontend 5xx HTTP responses for host: {{host.name}} has been detected over the last 15mins.", - "tags": ["integration:haproxy"], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1, - "critical_recovery": 0 + "version": 2, + "created_at": "2021-03-01", + "last_updated_at": "2021-03-01", + "title": "Anomalous number of frontend 5xx HTTP responses for host: {{host.name}}", + "tags": [ + "integration:haproxy" + ], + "description": "Notifies when HAProxy frontend 5xx errors are higher than usual for a specific host.", + "definition": { + "message": "An anomalous number of HAProxy frontend 5xx HTTP responses for host: {{host.name}} has been detected over the last 15mins.", + "name": "[HAProxy] Anomalous number of frontend 5xx HTTP responses for host: {{host.name}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "threshold_windows": { + "recovery_window": "last_15m", + "trigger_window": "last_15m" + }, + "thresholds": { + "critical": 1, + "critical_recovery": 0 + }, + "timeout_h": 0 }, - "threshold_windows": { - "trigger_window": "last_15m", - "recovery_window": "last_15m" - } - }, - "recommended_monitor_metadata": { - "description": "Notifies when HAProxy frontend 5xx errors are higher than usual for a specific host." 
+ "query": "avg(last_4h):anomalies(avg:haproxy.frontend.response.5xx{*} by {host}, 'agile', 2, direction='above', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='daily') >= 1", + "tags": [ + "integration:haproxy" + ], + "type": "query alert" } -} +} \ No newline at end of file diff --git a/haproxy/assets/monitors/frontend_dreq.json b/haproxy/assets/monitors/frontend_dreq.json index 5c10b2d86bc5a..8157d8370ae76 100644 --- a/haproxy/assets/monitors/frontend_dreq.json +++ b/haproxy/assets/monitors/frontend_dreq.json @@ -1,30 +1,39 @@ { - "name": "[HAProxy] High number of frontend denied requests for host: {{host.name}}", - "type": "query alert", - "query": "avg(last_4h):anomalies(avg:haproxy.frontend.denied.req_rate{*} by {host}, 'agile', 2, direction='above', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='daily') >= 1", - "message": "The number of frontend denied requests due to security restrictions for host: {{host.name}} is above normal.\n\nA malicious attacker or misconfigured application could be to blame. More information on designing ACLs for HAProxy can be found in the [Introduction to HAProxy ACLs blog post](https://www.haproxy.com/blog/introduction-to-haproxy-acls/).", - "tags": ["integration:haproxy"], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1, - "critical_recovery": 0 + "version": 2, + "created_at": "2021-03-01", + "last_updated_at": "2021-03-01", + "title": "High number of frontend denied requests for host: {{host.name}}", + "tags": [ + "integration:haproxy" + ], + "description": "Notifies when HAProxy denied frontend requests are higher than usual for a specific host.", + "definition": { + "message": "The number of frontend denied requests due to security restrictions for host: {{host.name}} is above normal.\n\nA malicious attacker or misconfigured application could be to blame. More information on designing ACLs for HAProxy can be found in the [Introduction to HAProxy ACLs blog post](https://www.haproxy.com/blog/introduction-to-haproxy-acls/).", + "name": "[HAProxy] High number of frontend denied requests for host: {{host.name}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": true, + "threshold_windows": { + "recovery_window": "last_15m", + "trigger_window": "last_15m" + }, + "thresholds": { + "critical": 1, + "critical_recovery": 0 + }, + "timeout_h": 0 }, - "threshold_windows": { - "trigger_window": "last_15m", - "recovery_window": "last_15m" - } - }, - "recommended_monitor_metadata": { - "description": "Notifies when HAProxy denined frontend requests are higher than usual for a specific host."
+ "query": "avg(last_4h):anomalies(avg:haproxy.frontend.denied.req_rate{*} by {host}, 'agile', 2, direction='above', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='daily') >= 1", + "tags": [ + "integration:haproxy" + ], + "type": "query alert" } -} +} \ No newline at end of file diff --git a/haproxy/assets/monitors/frontend_ereq.json b/haproxy/assets/monitors/frontend_ereq.json index 996b165d87c5d..3bdc46164a3e8 100644 --- a/haproxy/assets/monitors/frontend_ereq.json +++ b/haproxy/assets/monitors/frontend_ereq.json @@ -1,30 +1,39 @@ { - "name": "[HAProxy] Number of client-side request error for {{host.name}} is above normal.", - "type": "query alert", - "query": "avg(last_4h):anomalies(avg:haproxy.frontend.errors.req_rate{*} by {host}, 'agile', 2, direction='above', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='daily') >= 1", - "message": "There is a higher number of client-side request error for host: {{host.name}}\n\nClient-side request errors could have a number of causes:\n\n- Client terminates before sending request\n- Read error from client\n- Client timeout\n- Client terminated connection\n- Request was tarpitted/subject to ACL", - "tags": ["integration:haproxy"], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1, - "critical_recovery": 0 + "version": 2, + "created_at": "2021-03-01", + "last_updated_at": "2021-03-01", + "title": "Number of client-side request errors for {{host.name}} is above normal.", + "tags": [ + "integration:haproxy" + ], + "description": "Notifies when the HAProxy frontend request error rate is above normal for a specific host.", + "definition": { + "message": "There is a higher number of client-side request errors for host: {{host.name}}\n\nClient-side request errors could have a number of causes:\n\n- Client terminates before sending request\n- Read error from client\n- Client timeout\n- Client terminated connection\n- Request was tarpitted/subject to ACL", + "name": "[HAProxy] Number of client-side request errors for {{host.name}} is above normal.", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": true, + "threshold_windows": { + "recovery_window": "last_15m", + "trigger_window": "last_15m" + }, + "thresholds": { + "critical": 1, + "critical_recovery": 0 + }, + "timeout_h": 0 }, - "threshold_windows": { - "trigger_window": "last_15m", - "recovery_window": "last_15m" - } - }, - "recommended_monitor_metadata": { - "description": "Notifies when HAProxy frontend error request rate is above normal for a specific host."
+ "query": "avg(last_4h):anomalies(avg:haproxy.frontend.errors.req_rate{*} by {host}, 'agile', 2, direction='above', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='daily') >= 1", + "tags": [ + "integration:haproxy" + ], + "type": "query alert" } -} +} \ No newline at end of file diff --git a/haproxy/assets/monitors/frontend_sessions.json b/haproxy/assets/monitors/frontend_sessions.json index c62c60a8eb8f0..61f4b5735dd96 100644 --- a/haproxy/assets/monitors/frontend_sessions.json +++ b/haproxy/assets/monitors/frontend_sessions.json @@ -1,26 +1,35 @@ { - "name": "[HAProxy] High amount of frontend session usage for host: {{host.name}}", - "type": "query alert", - "query": "avg(last_5m):avg:haproxy.frontend.session.pct{*} by {host} > 80", - "message": "{{#is_alert}}\n\nALERT: The amount of frontend sessions in use for host: {{host.name}} reached {{value}} for a detection threshold of {{threshold}} %\n\nWhen reaching the session limit HAProxy will deny additional clients until resource consumption drops. It could be time to either modify HAProxy’s configuration to allow more sessions, or migrate your HAProxy server to a bigger box.\n\n{{/is_alert}} \n\n{{#is_warning}}\n\nWARNING: The amount of frontend sessions in use for host: {{host.name}} reached {{value}} for a detection threshold of {{threshold}} %\n\n{{/is_warning}} ", - "tags": ["integration:haproxy"], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 80, - "warning": 60 - } - }, - "recommended_monitor_metadata": { - "description": "Notifies when HAProxy frontend sessions usage is approaching the maximum defined for a specific host." + "version": 2, + "created_at": "2021-03-01", + "last_updated_at": "2021-03-01", + "title": "High amount of frontend session usage for host: {{host.name}}", + "tags": [ + "integration:haproxy" + ], + "description": "Notifies when HAProxy frontend sessions usage is approaching the maximum defined for a specific host.", + "definition": { + "message": "{{#is_alert}}\n\nALERT: The amount of frontend sessions in use for host: {{host.name}} reached {{value}} for a detection threshold of {{threshold}} %\n\nWhen reaching the session limit HAProxy will deny additional clients until resource consumption drops. 
It could be time to either modify HAProxy’s configuration to allow more sessions, or migrate your HAProxy server to a bigger box.\n\n{{/is_alert}} \n\n{{#is_warning}}\n\nWARNING: The amount of frontend sessions in use for host: {{host.name}} reached {{value}} for a detection threshold of {{threshold}} %\n\n{{/is_warning}} ", + "name": "[HAProxy] High amount of frontend session usage for host: {{host.name}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "thresholds": { + "critical": 80, + "warning": 60 + }, + "timeout_h": 0 + }, + "query": "avg(last_5m):avg:haproxy.frontend.session.pct{*} by {host} > 80", + "tags": [ + "integration:haproxy" + ], + "type": "query alert" } -} +} \ No newline at end of file diff --git a/haproxy/assets/monitors/request_rate.json b/haproxy/assets/monitors/request_rate.json index 605fbb889de35..8edb6fb3b6821 100644 --- a/haproxy/assets/monitors/request_rate.json +++ b/haproxy/assets/monitors/request_rate.json @@ -1,31 +1,39 @@ { - "name": "[HAProxy] Anomalous frontend request rate for host {{host.name}}", - "type": "query alert", - "query": "avg(last_4h):anomalies(avg:haproxy.frontend.requests.rate{*} by {host}, 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='daily') >= 1", - "message": "There is an anomaly in the amount of frontend requests handled by HAProxy on host: {{host.name}} ", - "tags": ["integration:haproxy"], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1, - "critical_recovery": 0 + "version": 2, + "created_at": "2021-03-01", + "last_updated_at": "2021-03-01", + "title": "Anomalous frontend request rate for host {{host.name}}", + "tags": [ + "integration:haproxy" + ], + "description": "Notifies when HAProxy experiences an anomalous number of frontend request rate for a specific host.", + "definition": { + "message": "There is an anomaly in the amount of frontend requests handled by HAProxy on host: {{host.name}} ", + "name": "[HAProxy] Anomalous frontend request rate for host {{host.name}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": true, + "threshold_windows": { + "recovery_window": "last_15m", + "trigger_window": "last_15m" + }, + "thresholds": { + "critical": 1, + "critical_recovery": 0 + }, + "timeout_h": 0 }, - "threshold_windows": { - "trigger_window": "last_15m", - "recovery_window": "last_15m" - } - }, - - "recommended_monitor_metadata": { - "description": "Notifies when HAProxy experiences an anomalous number of frontend request rate for a specific host." 
+ "query": "avg(last_4h):anomalies(avg:haproxy.frontend.requests.rate{*} by {host}, 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='daily') >= 1", + "tags": [ + "integration:haproxy" + ], + "type": "query alert" } -} +} \ No newline at end of file diff --git a/helm/assets/monitors/monitor_failed_releases.json b/helm/assets/monitors/monitor_failed_releases.json index 1a0e7bf648cf9..c569b86598ef4 100644 --- a/helm/assets/monitors/monitor_failed_releases.json +++ b/helm/assets/monitors/monitor_failed_releases.json @@ -1,26 +1,33 @@ { - "name": "Helm release {{helm_namespace.name}}/{{helm_release.name}} failed on {{kube_cluster_name.name}}", - "type": "service check", - "query": "\"helm.release_state\".over(\"*\").by(\"helm_namespace\",\"helm_release\",\"helm_storage\",\"kube_cluster_name\").last(6).count_by_status()", - "message": "The Helm release deployment {{helm_namespace.name}}/{{helm_release.name}} (using storage {{helm_storage.name}}) has failed on {{kube_cluster_name.name}}.", + "version": 2, + "created_at": "2022-05-27", + "last_updated_at": "2022-05-27", + "title": "Helm release {{helm_namespace.name}}/{{helm_release.name}} failed on {{kube_cluster_name.name}}", "tags": [ "integration:helm" ], - "options": { - "renotify_interval": 0, - "timeout_h": 0, - "thresholds": { - "ok": 1, - "warning": 1, - "critical": 5 + "description": "Get notified when the latest revision of a Helm release is in \"failed\" state.", + "definition": { + "message": "The Helm release deployment {{helm_namespace.name}}/{{helm_release.name}} (using storage {{helm_storage.name}}) has failed on {{kube_cluster_name.name}}.", + "name": "Helm release {{helm_namespace.name}}/{{helm_release.name}} failed on {{kube_cluster_name.name}}", + "options": { + "escalation_message": "", + "new_group_delay": 60, + "no_data_timeframe": 2, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "thresholds": { + "critical": 5, + "ok": 1, + "warning": 1 + }, + "timeout_h": 0 }, - "notify_no_data": false, - "no_data_timeframe": 2, - "notify_audit": false, - "escalation_message": "", - "new_group_delay": 60 - }, - "recommended_monitor_metadata": { - "description": "Get notified when the latest revision of a Helm release is in \"failed\" state." 
+ "query": "\"helm.release_state\".over(\"*\").by(\"helm_namespace\",\"helm_release\",\"helm_storage\",\"kube_cluster_name\").last(6).count_by_status()", + "tags": [ + "integration:helm" + ], + "type": "service check" } -} +} \ No newline at end of file diff --git a/hudi/assets/monitors/commit_duration.json b/hudi/assets/monitors/commit_duration.json index c176f56e1d778..9638ca8979d03 100644 --- a/hudi/assets/monitors/commit_duration.json +++ b/hudi/assets/monitors/commit_duration.json @@ -1,34 +1,41 @@ { - "name": "[Hudi] Anomalous commit duration for table {{table_name.name}}", - "type": "query alert", - "query": "avg(last_12h):anomalies(avg:hudi.action.duration{action:commit} by {table_name,host}, 'agile', 3, direction='above', alert_window='last_2h', interval=600, count_default_zero='true', seasonality='daily') >= 1", - "message": "There is an anomaly in the Hudi commit duration on host: {{host.host}} for the table {{table_name.name}} \n\n\n`hudi.action.duration` measures the amount of time it took to successfully perform a commit on a batch of records in milliseconds.", - "tags": [ - "integration:hudi" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_group_delay": 60, - "require_full_window": false, - "notify_no_data": true, - "renotify_interval": "0", - "renotify_occurrences": null, - "renotify_statuses": null, - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1, - "critical_recovery": 0 - }, - "threshold_windows": { - "trigger_window": "last_2h", - "recovery_window": "last_1h" - } - }, - "recommended_monitor_metadata": { - "description": "Get notified when the commit duration is higher than normal." - } -} + "version": 2, + "created_at": "2021-12-14", + "last_updated_at": "2021-12-14", + "title": "Anomalous commit duration for table {{table_name.name}}", + "tags": [ + "integration:hudi" + ], + "description": "Get notified when the commit duration is higher than normal.", + "definition": { + "message": "There is an anomaly in the Hudi commit duration on host: {{host.host}} for the table {{table_name.name}} \n\n\n`hudi.action.duration` measures the amount of time it took to successfully perform a commit on a batch of records in milliseconds.", + "name": "[Hudi] Anomalous commit duration for table {{table_name.name}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_group_delay": 60, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": true, + "renotify_interval": "0", + "renotify_occurrences": null, + "renotify_statuses": null, + "require_full_window": false, + "threshold_windows": { + "recovery_window": "last_1h", + "trigger_window": "last_2h" + }, + "thresholds": { + "critical": 1, + "critical_recovery": 0 + }, + "timeout_h": 0 + }, + "query": "avg(last_12h):anomalies(avg:hudi.action.duration{action:commit} by {table_name,host}, 'agile', 3, direction='above', alert_window='last_2h', interval=600, count_default_zero='true', seasonality='daily') >= 1", + "tags": [ + "integration:hudi" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/iis/assets/monitors/err.json b/iis/assets/monitors/err.json index e63ad94e06355..77d5ec7450019 100644 --- a/iis/assets/monitors/err.json +++ b/iis/assets/monitors/err.json @@ -1,34 +1,41 @@ { - "name": "[IIS] Increase of 404 error per second for site: {{site.name}}", - "type": "query alert", - "query": "avg(last_4h):anomalies(avg:iis.errors.not_found{*} by 
{site}, 'agile', 2, direction='above', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='daily') >= 1", - "message": "An increase of not found errors per second for site: {{site.name}} has been detected over the last 15mins. Typically reported as an HTTP 404 response code.", - "tags": [ - "integration:iis" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1, - "critical_recovery": 0 - }, - "threshold_windows": { - "trigger_window": "last_15m", - "recovery_window": "last_15m" - } - }, - "priority": null, - "restricted_roles": null, - "recommended_monitor_metadata": { - "description": "Notifies when IIS not found error per second are higher than usual for a specific site." - } -} + "version": 2, + "created_at": "2021-02-24", + "last_updated_at": "2021-02-24", + "title": "Increase of 404 error per second for site: {{site.name}}", + "tags": [ + "integration:iis" + ], + "description": "Notifies when IIS not found error per second are higher than usual for a specific site.", + "definition": { + "message": "An increase of not found errors per second for site: {{site.name}} has been detected over the last 15mins. Typically reported as an HTTP 404 response code.", + "name": "[IIS] Increase of 404 error per second for site: {{site.name}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": true, + "threshold_windows": { + "recovery_window": "last_15m", + "trigger_window": "last_15m" + }, + "thresholds": { + "critical": 1, + "critical_recovery": 0 + }, + "timeout_h": 0 + }, + "priority": null, + "query": "avg(last_4h):anomalies(avg:iis.errors.not_found{*} by {site}, 'agile', 2, direction='above', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='daily') >= 1", + "restricted_roles": null, + "tags": [ + "integration:iis" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/iis/assets/monitors/lock.json b/iis/assets/monitors/lock.json index 3028d7a280bbc..4a6abf3a88a41 100644 --- a/iis/assets/monitors/lock.json +++ b/iis/assets/monitors/lock.json @@ -1,34 +1,41 @@ { - "name": "[IIS] Increase of locked error per second for site: {{site.name}}", - "type": "query alert", - "query": "avg(last_4h):anomalies(avg:iis.errors.locked{*} by {site}, 'agile', 2, direction='above', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='daily') >= 1", - "message": "An increase of locked errors per second for site: {{site.name}} has been detected over the last 15mins. 
Typically reported as an HTTP 423 response code.", - "tags": [ - "integration:iis" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1, - "critical_recovery": 0 - }, - "threshold_windows": { - "trigger_window": "last_15m", - "recovery_window": "last_15m" - } - }, - "priority": null, - "restricted_roles": null, - "recommended_monitor_metadata": { - "description": "Notifies when IIS not locked error per second are higher than usual for a specific site." - } -} + "version": 2, + "created_at": "2021-02-24", + "last_updated_at": "2021-02-24", + "title": "Increase of locked error per second for site: {{site.name}}", + "tags": [ + "integration:iis" + ], + "description": "Notifies when IIS locked errors per second are higher than usual for a specific site.", + "definition": { + "message": "An increase of locked errors per second for site: {{site.name}} has been detected over the last 15mins. Typically reported as an HTTP 423 response code.", + "name": "[IIS] Increase of locked error per second for site: {{site.name}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": true, + "threshold_windows": { + "recovery_window": "last_15m", + "trigger_window": "last_15m" + }, + "thresholds": { + "critical": 1, + "critical_recovery": 0 + }, + "timeout_h": 0 + }, + "priority": null, + "query": "avg(last_4h):anomalies(avg:iis.errors.locked{*} by {site}, 'agile', 2, direction='above', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='daily') >= 1", + "restricted_roles": null, + "tags": [ + "integration:iis" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/iis/assets/monitors/req.json b/iis/assets/monitors/req.json index 8392bb18d67cb..4504f3c7ccc73 100644 --- a/iis/assets/monitors/req.json +++ b/iis/assets/monitors/req.json @@ -1,34 +1,41 @@ { - "name": "[IIS] Anomalous amount of requests for site: {{site.name}}", - "type": "query alert", - "query": "avg(last_4h):anomalies(avg:iis.httpd_request_method.get{*} by {site} + avg:iis.httpd_request_method.put{*} by {site} + avg:iis.httpd_request_method.head{*} by {site} + avg:iis.httpd_request_method.delete{*} by {site} + avg:iis.httpd_request_method.options{*} by {site}, 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='daily') >= 1", - "message": "An anomalous amount of requests for site: {{site.name}} has been detected over the last 15mins.", - "tags": [ - "integration:iis" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1, - "critical_recovery": 0 - }, - "threshold_windows": { - "trigger_window": "last_15m", - "recovery_window": "last_15m" - } - }, - "priority": null, - "restricted_roles": null, - "recommended_monitor_metadata": { - "description": "Notifies when IIS requests are higher or lower than usual for a specific site."
- } + "version": 2, + "created_at": "2021-02-24", + "last_updated_at": "2021-02-24", + "title": "Anomalous amount of requests for site: {{site.name}}", + "tags": [ + "integration:iis" + ], + "description": "Notifies when IIS requests are higher or lower than usual for a specific site.", + "definition": { + "message": "An anomalous amount of requests for site: {{site.name}} has been detected over the last 15mins.", + "name": "[IIS] Anomalous amount of requests for site: {{site.name}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": true, + "threshold_windows": { + "recovery_window": "last_15m", + "trigger_window": "last_15m" + }, + "thresholds": { + "critical": 1, + "critical_recovery": 0 + }, + "timeout_h": 0 + }, + "priority": null, + "query": "avg(last_4h):anomalies(avg:iis.httpd_request_method.get{*} by {site} + avg:iis.httpd_request_method.put{*} by {site} + avg:iis.httpd_request_method.head{*} by {site} + avg:iis.httpd_request_method.delete{*} by {site} + avg:iis.httpd_request_method.options{*} by {site}, 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='daily') >= 1", + "restricted_roles": null, + "tags": [ + "integration:iis" + ], + "type": "query alert" + } } \ No newline at end of file diff --git a/istio/assets/monitors/failed_sidecar_injection.json b/istio/assets/monitors/failed_sidecar_injection.json index 6b18185ec0b6f..cf0f58ae0939b 100644 --- a/istio/assets/monitors/failed_sidecar_injection.json +++ b/istio/assets/monitors/failed_sidecar_injection.json @@ -1,29 +1,36 @@ { - "name": "[Istio] Increased number of failed sidecar injection requests", - "type": "query alert", - "query": "sum(last_5m):avg:istio.sidecar_injection.requests_total{*}.as_count() - avg:istio.sidecar_injection.success_total{*}.as_count() > 3", - "message": "Monitor Istio for failed sidecar injection requests. Increasing failed requests may signify other issues in your Istio mesh and requires attention.", - "tags": [ + "version": 2, + "created_at": "2021-01-11", + "last_updated_at": "2021-01-11", + "title": "Increased number of failed sidecar injection requests", + "tags": [ + "integration:istio" + ], + "description": "Notify your team when there is more than 1 failed sidecar injection request in your Istio integration.", + "definition": { + "message": "Monitor Istio for failed sidecar injection requests. 
Increasing failed requests may signify other issues in your Istio mesh and requires attention.", + "name": "[Istio] Increased number of failed sidecar injection requests", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": true, + "thresholds": { + "critical": 3, + "warning": 1 + }, + "timeout_h": 0 + }, + "priority": null, + "query": "sum(last_5m):avg:istio.sidecar_injection.requests_total{*}.as_count() - avg:istio.sidecar_injection.success_total{*}.as_count() > 3", + "tags": [ "integration:istio" ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 3, - "warning": 1 - } - }, - "priority": null, - "recommended_monitor_metadata": { - "description": "Notify your team when there is more than 1 failed sidecar injection requesst in your Istio integration." - } -} + "type": "query alert" + } +} \ No newline at end of file diff --git a/kafka/assets/monitors/broker_produce_latency.json b/kafka/assets/monitors/broker_produce_latency.json index 0f1d8da5e3b80..7ddcebb106fc7 100644 --- a/kafka/assets/monitors/broker_produce_latency.json +++ b/kafka/assets/monitors/broker_produce_latency.json @@ -1,30 +1,37 @@ { - "name": "[Kafka] High produce latency: {{value}} reqs/s on broker {{instance.name}}", - "type": "query alert", - "query": "avg(last_5m):avg:kafka.request.produce.time.99percentile{*} by {instance} > 200", - "message": "{{#is_alert}}\n\nALERT: The p99 produce latency on broker {{instance.name}} reached: {{value}}.\n\n{{/is_alert}} \n\n{{#is_warning}}\n\nWARNING: The p99 produce latency on broker {{instance.name}} reached: {{value}}.\n\n{{/is_warning}} \n\n\n**Potential Impacts**\n\n - Client timeouts\n - Delays in the ability of clients to process their workload\n - Could be a leading indicator that the broker is falling behind\n due to lack of capacity or a performance-impacting incident.\n\n**Recommended Actions**\n\n - Investigate the state of the broker\n - Consider topic rebalancing if the traffic on a given topic has\n outstripped the resources available to it\n - Consider expanding capacity by adding additional brokers\n - Broker restart or replacement can help in some situations.\n If TCP memory is high, and increasing in correlation with the\n load, this could mean that the disk is struggling to keep up.\n Restarting kafka has shown some immediate benefits when it comes\n to reducing the load.", - "tags": [ - "integration:kafka" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": true, - "renotify_interval": 0, - "escalation_message": "", - "no_data_timeframe": 10, - "include_tags": true, - "thresholds": { - "critical": 200, - "warning": 100 - } - }, - "priority": null, - "restricted_roles": null, - "recommended_monitor_metadata": { - "description": "Get notified when your Kafka brokers have high p99 produce latency." 
- } + "version": 2, + "created_at": "2022-01-19", + "last_updated_at": "2022-01-19", + "title": "High produce latency: {{value}} reqs/s on broker {{instance.name}}", + "tags": [ + "integration:kafka" + ], + "description": "Get notified when your Kafka brokers have high p99 produce latency.", + "definition": { + "message": "{{#is_alert}}\n\nALERT: The p99 produce latency on broker {{instance.name}} reached: {{value}}.\n\n{{/is_alert}} \n\n{{#is_warning}}\n\nWARNING: The p99 produce latency on broker {{instance.name}} reached: {{value}}.\n\n{{/is_warning}} \n\n\n**Potential Impacts**\n\n - Client timeouts\n - Delays in the ability of clients to process their workload\n - Could be a leading indicator that the broker is falling behind\n due to lack of capacity or a performance-impacting incident.\n\n**Recommended Actions**\n\n - Investigate the state of the broker\n - Consider topic rebalancing if the traffic on a given topic has\n outstripped the resources available to it\n - Consider expanding capacity by adding additional brokers\n - Broker restart or replacement can help in some situations.\n If TCP memory is high, and increasing in correlation with the\n load, this could mean that the disk is struggling to keep up.\n Restarting kafka has shown some immediate benefits when it comes\n to reducing the load.", + "name": "[Kafka] High produce latency: {{value}} reqs/s on broker {{instance.name}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": 10, + "notify_audit": false, + "notify_no_data": true, + "renotify_interval": 0, + "require_full_window": true, + "thresholds": { + "critical": 200, + "warning": 100 + }, + "timeout_h": 0 + }, + "priority": null, + "query": "avg(last_5m):avg:kafka.request.produce.time.99percentile{*} by {instance} > 200", + "restricted_roles": null, + "tags": [ + "integration:kafka" + ], + "type": "query alert" + } } \ No newline at end of file diff --git a/kafka/assets/monitors/kafka_high_producer_request_rate.json b/kafka/assets/monitors/kafka_high_producer_request_rate.json new file mode 100644 index 0000000000000..02ec499d35476 --- /dev/null +++ b/kafka/assets/monitors/kafka_high_producer_request_rate.json @@ -0,0 +1,40 @@ +{ + "version": 2, + "created_at": "2021-02-12", + "last_updated_at": "2021-02-12", + "title": "High request rate on producer {{host.name}}", + "tags": [ + "integration:kafka" + ], + "description": "Notify your team when a producer has a high request rate.", + "definition": { + "message": "The request rate on a producer is abnormally high: {{value}} request/s.", + "name": "[Kafka] High request rate on producer {{host.name}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": true, + "threshold_windows": { + "recovery_window": "last_5m", + "trigger_window": "last_5m" + }, + "thresholds": { + "critical": 1, + "critical_recovery": 0.2, + "warning": 0.8 + }, + "timeout_h": 0 + }, + "query": "avg(last_1h):anomalies(avg:kafka.producer.request_rate{*}, 'basic', 2, direction='above', alert_window='last_5m', interval=20, count_default_zero='true') >= 1", + "tags": [ + "integration:kafka" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/kafka/assets/monitors/kafka_offline_partition.json b/kafka/assets/monitors/kafka_offline_partition.json new file mode 
100644 index 0000000000000..cf634a80e8956 --- /dev/null +++ b/kafka/assets/monitors/kafka_offline_partition.json @@ -0,0 +1,35 @@ +{ + "version": 2, + "created_at": "2021-01-08", + "last_updated_at": "2021-02-12", + "title": "Offline partition on {{host.name}}", + "tags": [ + "integration:kafka" + ], + "description": "Notify your team when a partition has no active leader.", + "definition": { + "message": "Partition without an active leader detected", + "name": "[Kafka] Offline partition on {{host.name}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": true, + "thresholds": { + "critical": 1, + "warning": 0 + }, + "timeout_h": 0 + }, + "query": "avg(last_5m):avg:kafka.replication.offline_partitions_count{*} > 1", + "tags": [ + "integration:kafka" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/kafka/assets/recommended_monitors/kafka_high_producer_request_rate.json b/kafka/assets/recommended_monitors/kafka_high_producer_request_rate.json deleted file mode 100644 index 87aba4c2241d9..0000000000000 --- a/kafka/assets/recommended_monitors/kafka_high_producer_request_rate.json +++ /dev/null @@ -1,33 +0,0 @@ -{ - "name": "[Kafka] High request rate on producer {{host.name}}", - "type": "query alert", - "query": "avg(last_1h):anomalies(avg:kafka.producer.request_rate{*}, 'basic', 2, direction='above', alert_window='last_5m', interval=20, count_default_zero='true') >= 1", - "message": "The request rate on a producer is abnormally high: {{value}} request/s.", - "tags": [ - "integration:kafka" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1, - "warning": 0.8, - "critical_recovery": 0.2 - }, - "threshold_windows": { - "trigger_window": "last_5m", - "recovery_window": "last_5m" - } - }, - "recommended_monitor_metadata": { - "description": "Notify your team when a producer has a high request rate." - } -} diff --git a/kafka/assets/recommended_monitors/kafka_offline_partition.json b/kafka/assets/recommended_monitors/kafka_offline_partition.json deleted file mode 100644 index f0e58feb44062..0000000000000 --- a/kafka/assets/recommended_monitors/kafka_offline_partition.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "name": "[Kafka] Offline partition on {{host.name}}", - "type": "query alert", - "query": "avg(last_5m):avg:kafka.replication.offline_partitions_count{*} > 1", - "message": "Partition without an active leader detected", - "tags": [ - "integration:kafka" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1, - "warning": 0 - } - }, - "recommended_monitor_metadata": { - "description": "Notify your team when a partition has no active leader." 
- } -} diff --git a/kafka/manifest.json b/kafka/manifest.json index 4584f45e77a2e..c2b095fa50ebc 100644 --- a/kafka/manifest.json +++ b/kafka/manifest.json @@ -55,8 +55,8 @@ }, "monitors": { "[Kafka] High produce latency on broker": "assets/monitors/broker_produce_latency.json", - "[Kafka] High producer request rate": "assets/recommended_monitors/kafka_high_producer_request_rate.json", - "[Kafka] Offline partition": "assets/recommended_monitors/kafka_offline_partition.json" + "[Kafka] High producer request rate": "assets/monitors/kafka_high_producer_request_rate.json", + "[Kafka] Offline partition": "assets/monitors/kafka_offline_partition.json" }, "saved_views": { "error_warning_status": "assets/saved_views/error_warning_status.json", diff --git a/kubernetes/assets/monitors/monitor_deployments_replicas.json b/kubernetes/assets/monitors/monitor_deployments_replicas.json index 8d83915b4d112..74b998f0c8920 100644 --- a/kubernetes/assets/monitors/monitor_deployments_replicas.json +++ b/kubernetes/assets/monitors/monitor_deployments_replicas.json @@ -1,26 +1,33 @@ { - "name": "[kubernetes] Monitor Kubernetes Deployments Replica Pods", - "type": "query alert", - "query": "avg(last_15m):avg:kubernetes_state.deployment.replicas_desired{*} by {kube_cluster_name,kube_namespace,kube_deployment} - avg:kubernetes_state.deployment.replicas_available{*} by {kube_cluster_name,kube_namespace,kube_deployment} >= 2", - "message": "More than one Deployments Replica's pods are down in Deployment {{kube_namespace.name}}/{{kube_deployment.name}}.", - "tags": [ - "integration:kubernetes" - ], - "options": { - "notify_audit": true, - "locked": false, - "timeout_h": 0, - "include_tags": true, - "require_full_window": false, - "new_host_delay": 300, - "notify_no_data": true, - "renotify_interval": 0, - "escalation_message": "", - "thresholds": { - "critical": 2 - } - }, - "recommended_monitor_metadata": { - "description": "Notify your team when more than one replica pod is down for your Kubernetes integration." 
- } + "version": 2, + "created_at": "2020-07-28", + "last_updated_at": "2022-12-07", + "title": "Monitor Kubernetes Deployments Replica Pods", + "tags": [ + "integration:kubernetes" + ], + "description": "Notify your team when more than one replica pod is down for your Kubernetes integration.", + "definition": { + "message": "More than one Deployments Replica's pods are down in Deployment {{kube_namespace.name}}/{{kube_deployment.name}}.", + "name": "[kubernetes] Monitor Kubernetes Deployments Replica Pods", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "notify_audit": true, + "notify_no_data": true, + "renotify_interval": 0, + "require_full_window": false, + "thresholds": { + "critical": 2 + }, + "timeout_h": 0 + }, + "query": "avg(last_15m):avg:kubernetes_state.deployment.replicas_desired{*} by {kube_cluster_name,kube_namespace,kube_deployment} - avg:kubernetes_state.deployment.replicas_available{*} by {kube_cluster_name,kube_namespace,kube_deployment} >= 2", + "tags": [ + "integration:kubernetes" + ], + "type": "query alert" + } } \ No newline at end of file diff --git a/kubernetes/assets/monitors/monitor_node_unavailable.json b/kubernetes/assets/monitors/monitor_node_unavailable.json index 7a665a3145537..d5f32dd15a2a8 100644 --- a/kubernetes/assets/monitors/monitor_node_unavailable.json +++ b/kubernetes/assets/monitors/monitor_node_unavailable.json @@ -1,27 +1,34 @@ { - "name": "[kubernetes] Monitor Unschedulable Kubernetes Nodes", - "type": "query alert", - "query": "max(last_15m):default_zero(sum:kubernetes_state.node.status{status:schedulable} by {kube_cluster_name} * 100 / sum:kubernetes_state.node.status{*} by {kube_cluster_name}) < 80", - "message": "More than 20% of nodes are unschedulable on ({{kube_cluster_name.name}} cluster). \n Keep in mind that this might be expected based on your infrastructure.", - "tags": [ - "integration:kubernetes" - ], - "options": { - "notify_audit": true, - "locked": false, - "timeout_h": 0, - "include_tags": true, - "require_full_window": false, - "new_host_delay": 300, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "thresholds": { - "critical": 80, - "warning": 90 - } - }, - "recommended_monitor_metadata": { - "description": "Get notified when nodes are unavailable for your Kubernetes integration." - } -} + "version": 2, + "created_at": "2020-07-28", + "last_updated_at": "2023-07-31", + "title": "Monitor Unschedulable Kubernetes Nodes", + "tags": [ + "integration:kubernetes" + ], + "description": "Get notified when nodes are unavailable for your Kubernetes integration.", + "definition": { + "message": "More than 20% of nodes are unschedulable on ({{kube_cluster_name.name}} cluster). 
\n Keep in mind that this might be expected based on your infrastructure.", + "name": "[kubernetes] Monitor Unschedulable Kubernetes Nodes", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "notify_audit": true, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "thresholds": { + "critical": 80, + "warning": 90 + }, + "timeout_h": 0 + }, + "query": "max(last_15m):default_zero(sum:kubernetes_state.node.status{status:schedulable} by {kube_cluster_name} * 100 / sum:kubernetes_state.node.status{*} by {kube_cluster_name}) < 80", + "tags": [ + "integration:kubernetes" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/kubernetes/assets/monitors/monitor_pod_crashloopbackoff.json b/kubernetes/assets/monitors/monitor_pod_crashloopbackoff.json index 2163551079fdd..19e28565892f5 100644 --- a/kubernetes/assets/monitors/monitor_pod_crashloopbackoff.json +++ b/kubernetes/assets/monitors/monitor_pod_crashloopbackoff.json @@ -1,27 +1,33 @@ { - "name": "[kubernetes] Pod {{pod_name.name}} is CrashloopBackOff on namespace {{kube_namespace.name}}", - "type": "query alert", - "query": "max(last_10m):default_zero(max:kubernetes_state.container.status_report.count.waiting{reason:crashloopbackoff} by {kube_cluster_name,kube_namespace,pod_name}) >= 1", - "message": "pod {{pod_name.name}} is in CrashloopBackOff on {{kube_namespace.name}} \n This alert could generate several alerts for a bad deployment. Adjust the thresholds of the query to suit your infrastructure.", - "tags": [ - "integration:kubernetes" - ], - "options": { - "notify_audit": true, - "locked": false, - "timeout_h": 0, - "include_tags": true, - "require_full_window": false, - "new_host_delay": 300, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "thresholds": { - "critical": 1 - } - }, - "recommended_monitor_metadata": { - "description": "Get notified when a pod is in a CrashloopBackOff state for your Kubernetes integration." - } -} - + "version": 2, + "created_at": "2020-07-28", + "last_updated_at": "2023-07-05", + "title": "Pod {{pod_name.name}} is CrashloopBackOff on namespace {{kube_namespace.name}}", + "tags": [ + "integration:kubernetes" + ], + "description": "Get notified when a pod is in a CrashloopBackOff state for your Kubernetes integration.", + "definition": { + "message": "pod {{pod_name.name}} is in CrashloopBackOff on {{kube_namespace.name}} \n This alert could generate several alerts for a bad deployment. 
Adjust the thresholds of the query to suit your infrastructure.", + "name": "[kubernetes] Pod {{pod_name.name}} is CrashloopBackOff on namespace {{kube_namespace.name}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "notify_audit": true, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "thresholds": { + "critical": 1 + }, + "timeout_h": 0 + }, + "query": "max(last_10m):default_zero(max:kubernetes_state.container.status_report.count.waiting{reason:crashloopbackoff} by {kube_cluster_name,kube_namespace,pod_name}) >= 1", + "tags": [ + "integration:kubernetes" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/kubernetes/assets/monitors/monitor_pod_imagepullbackoff.json b/kubernetes/assets/monitors/monitor_pod_imagepullbackoff.json index 2664211085d24..e8bf604e42593 100644 --- a/kubernetes/assets/monitors/monitor_pod_imagepullbackoff.json +++ b/kubernetes/assets/monitors/monitor_pod_imagepullbackoff.json @@ -1,26 +1,33 @@ { - "name": "[kubernetes] Pod {{pod_name.name}} is ImagePullBackOff on namespace {{kube_namespace.name}}", - "type": "query alert", - "query": "max(last_10m):default_zero(max:kubernetes_state.container.status_report.count.waiting{reason:imagepullbackoff} by {kube_cluster_name,kube_namespace,pod_name}) >= 1", - "message": "pod {{pod_name.name}} is in ImagePullBackOff on {{kube_namespace.name}} \n This could happen for several reasons, for example a bad image path or tag or if the credentials for pulling images are not configured properly.", - "tags": [ - "integration:kubernetes" - ], - "options": { - "notify_audit": true, - "locked": false, - "timeout_h": 0, - "include_tags": true, - "require_full_window": false, - "new_host_delay": 300, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "thresholds": { - "critical": 1 - } - }, - "recommended_monitor_metadata": { - "description": "Get notified when a pod is in a ImagePullBackOff state for your Kubernetes integration." 
- } -} + "version": 2, + "created_at": "2020-09-15", + "last_updated_at": "2023-07-31", + "title": "Pod {{pod_name.name}} is ImagePullBackOff on namespace {{kube_namespace.name}}", + "tags": [ + "integration:kubernetes" + ], + "description": "Get notified when a pod is in a ImagePullBackOff state for your Kubernetes integration.", + "definition": { + "message": "pod {{pod_name.name}} is in ImagePullBackOff on {{kube_namespace.name}} \n This could happen for several reasons, for example a bad image path or tag or if the credentials for pulling images are not configured properly.", + "name": "[kubernetes] Pod {{pod_name.name}} is ImagePullBackOff on namespace {{kube_namespace.name}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "notify_audit": true, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "thresholds": { + "critical": 1 + }, + "timeout_h": 0 + }, + "query": "max(last_10m):default_zero(max:kubernetes_state.container.status_report.count.waiting{reason:imagepullbackoff} by {kube_cluster_name,kube_namespace,pod_name}) >= 1", + "tags": [ + "integration:kubernetes" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/kubernetes/assets/monitors/monitor_pods_failed_state.json b/kubernetes/assets/monitors/monitor_pods_failed_state.json index f715858592058..441e9e3903ed4 100644 --- a/kubernetes/assets/monitors/monitor_pods_failed_state.json +++ b/kubernetes/assets/monitors/monitor_pods_failed_state.json @@ -1,29 +1,35 @@ { - "name": "[kubernetes] Monitor Kubernetes Failed Pods in Namespaces", - "type": "query alert", - "query": "change(avg(last_5m),last_5m):default_zero(sum:kubernetes_state.pod.status_phase{pod_phase:failed} by {kube_cluster_name,kube_namespace}) > 10", - "message": "More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. Change the threshold to suit your needs.", - "tags": [ - "integration:kubernetes" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 10, - "warning": 5 - } - }, - "recommended_monitor_metadata": { - "description": "Get notified when more than 10 pods are failing for a given Kubernetes cluster." - } - -} + "version": 2, + "created_at": "2020-07-28", + "last_updated_at": "2023-07-31", + "title": "Monitor Kubernetes Failed Pods in Namespaces", + "tags": [ + "integration:kubernetes" + ], + "description": "Get notified when more than 10 pods are failing for a given Kubernetes cluster.", + "definition": { + "message": "More than ten pods are failing in ({{kube_cluster_name.name}} cluster). \n The threshold of ten pods varies depending on your infrastructure. 
Change the threshold to suit your needs.", + "name": "[kubernetes] Monitor Kubernetes Failed Pods in Namespaces", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "thresholds": { + "critical": 10, + "warning": 5 + }, + "timeout_h": 0 + }, + "query": "change(avg(last_5m),last_5m):default_zero(sum:kubernetes_state.pod.status_phase{pod_phase:failed} by {kube_cluster_name,kube_namespace}) > 10", + "tags": [ + "integration:kubernetes" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/kubernetes/assets/monitors/monitor_pods_restarting.json b/kubernetes/assets/monitors/monitor_pods_restarting.json index 6dcb41665a80e..60e07e5ef29aa 100644 --- a/kubernetes/assets/monitors/monitor_pods_restarting.json +++ b/kubernetes/assets/monitors/monitor_pods_restarting.json @@ -1,27 +1,34 @@ { - "name": "[kubernetes] Monitor Kubernetes Pods Restarting", - "type": "query alert", - "query": "change(max(last_5m),last_5m):exclude_null(sum:kubernetes.containers.restarts{*} by {kube_cluster_name,pod_name}) > 5", - "message": "Pod {{pod_name.name}} restarted multiple times in the last five minutes.", - "tags": [ - "integration:kubernetes" - ], - "options": { - "notify_audit": true, - "locked": false, - "timeout_h": 0, - "include_tags": true, - "require_full_window": false, - "new_host_delay": 300, - "notify_no_data": true, - "renotify_interval": 0, - "escalation_message": "", - "thresholds": { - "critical": 5, - "warning": 3 - } - }, - "recommended_monitor_metadata": { - "description": "Get notified when pods restart multiple times for your Kubernetes integration." 
- } -} + "version": 2, + "created_at": "2020-07-28", + "last_updated_at": "2023-07-03", + "title": "Monitor Kubernetes Pods Restarting", + "tags": [ + "integration:kubernetes" + ], + "description": "Get notified when pods restart multiple times for your Kubernetes integration.", + "definition": { + "message": "Pod {{pod_name.name}} restarted multiple times in the last five minutes.", + "name": "[kubernetes] Monitor Kubernetes Pods Restarting", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "notify_audit": true, + "notify_no_data": true, + "renotify_interval": 0, + "require_full_window": false, + "thresholds": { + "critical": 5, + "warning": 3 + }, + "timeout_h": 0 + }, + "query": "change(max(last_5m),last_5m):exclude_null(sum:kubernetes.containers.restarts{*} by {kube_cluster_name,pod_name}) > 5", + "tags": [ + "integration:kubernetes" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/kubernetes/assets/monitors/monitor_statefulset_replicas.json b/kubernetes/assets/monitors/monitor_statefulset_replicas.json index c58133b9647b3..9e1fe24bb6358 100644 --- a/kubernetes/assets/monitors/monitor_statefulset_replicas.json +++ b/kubernetes/assets/monitors/monitor_statefulset_replicas.json @@ -1,27 +1,34 @@ { - "name": "[kubernetes] Monitor Kubernetes Statefulset Replicas", - "type": "query alert", - "query": "max(last_15m):sum:kubernetes_state.statefulset.replicas_desired{*} by {kube_cluster_name,kube_namespace,kube_stateful_set} - sum:kubernetes_state.statefulset.replicas_ready{*} by {kube_cluster_name,kube_namespace,kube_stateful_set} >= 2", - "message": "More than one Statefulset Replica's pods are down in Statefulset {{kube_namespace.name}}/{{kube_stateful_set.name}}. This might present an unsafe situation for any further manual operations, such as killing other pods.", - "tags": [ - "integration:kubernetes" - ], - "options": { - "notify_audit": true, - "locked": false, - "timeout_h": 0, - "include_tags": true, - "require_full_window": false, - "new_host_delay": 300, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "thresholds": { - "critical": 2, - "warning": 1 - } - }, - "recommended_monitor_metadata": { - "description": "Get notified when more than one Stateful Replica pod is down for your Kubernetes integration." - } -} + "version": 2, + "created_at": "2020-07-28", + "last_updated_at": "2022-12-07", + "title": "Monitor Kubernetes Statefulset Replicas", + "tags": [ + "integration:kubernetes" + ], + "description": "Get notified when more than one Stateful Replica pod is down for your Kubernetes integration.", + "definition": { + "message": "More than one Statefulset Replica's pods are down in Statefulset {{kube_namespace.name}}/{{kube_stateful_set.name}}. 
This might present an unsafe situation for any further manual operations, such as killing other pods.", + "name": "[kubernetes] Monitor Kubernetes Statefulset Replicas", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "notify_audit": true, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "thresholds": { + "critical": 2, + "warning": 1 + }, + "timeout_h": 0 + }, + "query": "max(last_15m):sum:kubernetes_state.statefulset.replicas_desired{*} by {kube_cluster_name,kube_namespace,kube_stateful_set} - sum:kubernetes_state.statefulset.replicas_ready{*} by {kube_cluster_name,kube_namespace,kube_stateful_set} >= 2", + "tags": [ + "integration:kubernetes" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/langchain/assets/monitors/error_rate.json b/langchain/assets/monitors/error_rate.json new file mode 100644 index 0000000000000..1a66c1df934ce --- /dev/null +++ b/langchain/assets/monitors/error_rate.json @@ -0,0 +1,32 @@ +{ + "version": 2, + "created_at": "2023-07-21", + "last_updated_at": "2023-07-21", + "title": "LangChain service has a high error rate", + "tags": [ + "integration:langchain" + ], + "description": "Notify your team when requests made with LangChain have increased error rates", + "definition": { + "message": "{{#is_alert}}\n\nALERT: The error rate of your LangChain requests is higher than normal. The error rate is currently over {{value}} errors per request.\n{{/is_alert}}", + "name": "[LangChain] LangChain service has a high error rate", + "options": { + "include_tags": false, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "thresholds": { + "critical": 0.1, + "warning": 0.05 + } + }, + "priority": null, + "query": "sum(last_5m):(sum:trace.langchain.request.errors{*}.as_count() / sum:trace.langchain.request.hits{*}.as_count()) > 0.1", + "restricted_roles": null, + "tags": [ + "integration:langchain" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/langchain/assets/monitors/request_duration.json b/langchain/assets/monitors/request_duration.json new file mode 100644 index 0000000000000..b460517e2b5e8 --- /dev/null +++ b/langchain/assets/monitors/request_duration.json @@ -0,0 +1,31 @@ +{ + "version": 2, + "created_at": "2023-07-21", + "last_updated_at": "2023-07-21", + "title": "Request Duration Spike", + "tags": [ + "integration:langchain" + ], + "description": "Notify your team when requests made with LangChain have increased latency", + "definition": { + "message": "{{#is_alert}}\n\nALERT: The duration of your LangChain requests is higher than normal. The average LangChain request duration is currently over {{value}} seconds. 
\n\n{{/is_alert}}", + "name": "[LangChain] Request Duration Spike", + "options": { + "include_tags": false, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "thresholds": { + "critical": 10 + } + }, + "priority": null, + "query": "avg(last_5m):avg:system.load.1{*} > 10", + "restricted_roles": null, + "tags": [ + "integration:langchain" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/langchain/assets/recommended_monitors/error_rate.json b/langchain/assets/recommended_monitors/error_rate.json deleted file mode 100644 index ab10f117b84ac..0000000000000 --- a/langchain/assets/recommended_monitors/error_rate.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "name": "[LangChain] LangChain service has a high error rate", - "type": "query alert", - "query": "sum(last_5m):(sum:trace.langchain.request.errors{*}.as_count() / sum:trace.langchain.request.hits{*}.as_count()) > 0.1", - "message": "{{#is_alert}}\n\nALERT: The error rate of your LangChain requests is higher than normal. The error rate is currently over {{value}} errors per request.\n{{/is_alert}}", - "tags": [ - "integration:langchain" - ], - "options": { - "thresholds": { - "critical": 0.1, - "warning": 0.05 - }, - "notify_audit": false, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": 0, - "include_tags": false - }, - "priority": null, - "restricted_roles": null, - "recommended_monitor_metadata": { - "description": "Notify your team when requests made with LangChain have increased error rates" - } -} \ No newline at end of file diff --git a/langchain/assets/recommended_monitors/request_duration.json b/langchain/assets/recommended_monitors/request_duration.json deleted file mode 100644 index a0e3d0fe13197..0000000000000 --- a/langchain/assets/recommended_monitors/request_duration.json +++ /dev/null @@ -1,24 +0,0 @@ -{ - "name": "[LangChain] Request Duration Spike", - "type": "query alert", - "query": "avg(last_5m):avg:system.load.1{*} > 10", - "message": "{{#is_alert}}\n\nALERT: The duration of your LangChain requests is higher than normal. The average LangChain request duration is currently over {{value}} seconds. 
\n\n{{/is_alert}}", - "tags": [ - "integration:langchain" - ], - "options": { - "thresholds": { - "critical": 10 - }, - "notify_audit": false, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": 0, - "include_tags": false - }, - "priority": null, - "restricted_roles": null, - "recommended_monitor_metadata": { - "description": "Notify your team when requests made with LangChain have increased latency" - } -} \ No newline at end of file diff --git a/langchain/manifest.json b/langchain/manifest.json index d97216ac86ef1..8e4386a50c12a 100644 --- a/langchain/manifest.json +++ b/langchain/manifest.json @@ -40,8 +40,8 @@ "LangChain Overview Dashboard": "assets/dashboards/overview_dashboard.json" }, "monitors": { - "Request Latency": "assets/recommended_monitors/request_duration.json", - "Error Rate": "assets/recommended_monitors/error_rate.json" + "Request Latency": "assets/monitors/request_duration.json", + "Error Rate": "assets/monitors/error_rate.json" } }, "author": { diff --git a/marklogic/assets/monitors/marklogic_high_load.json b/marklogic/assets/monitors/marklogic_high_load.json new file mode 100644 index 0000000000000..9d76993359a2b --- /dev/null +++ b/marklogic/assets/monitors/marklogic_high_load.json @@ -0,0 +1,35 @@ +{ + "version": 2, + "created_at": "2020-12-23", + "last_updated_at": "2020-12-23", + "title": "Forest Processing Load is High on {{host.name}}", + "tags": [ + "integration:marklogic" + ], + "description": "Notify your team when your host has a high forest load.", + "definition": { + "message": "Average forest load has been higher than 2G during the past 5 minutes", + "name": "[MarkLogic] Forest Processing Load is High on {{host.name}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": true, + "thresholds": { + "critical": 2000000000, + "warning": 1000000000 + }, + "timeout_h": 0 + }, + "query": "avg(last_5m):avg:marklogic.forests.total_rate{*} > 2000000000", + "tags": [ + "integration:marklogic" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/marklogic/assets/monitors/marklogic_long_requests.json b/marklogic/assets/monitors/marklogic_long_requests.json new file mode 100644 index 0000000000000..50f67dc593a92 --- /dev/null +++ b/marklogic/assets/monitors/marklogic_long_requests.json @@ -0,0 +1,35 @@ +{ + "version": 2, + "created_at": "2020-12-23", + "last_updated_at": "2020-12-23", + "title": "Active requests are taking too long on average", + "tags": [ + "integration:marklogic" + ], + "description": "Notify your team when your host is taking too long to process requests.", + "definition": { + "message": "Average request length is greater than 0.5 seconds for the past 5 minutes", + "name": "[MarkLogic] Active requests are taking too long on average", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": true, + "thresholds": { + "critical": 0.5, + "warning": 0.25 + }, + "timeout_h": 0 + }, + "query": "avg(last_5m):avg:marklogic.requests.mean_seconds{*} > 0.5", + "tags": [ + "integration:marklogic" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/marklogic/assets/monitors/marklogic_low_cache.json 
b/marklogic/assets/monitors/marklogic_low_cache.json new file mode 100644 index 0000000000000..39908353515a6 --- /dev/null +++ b/marklogic/assets/monitors/marklogic_low_cache.json @@ -0,0 +1,35 @@ +{ + "version": 2, + "created_at": "2020-12-23", + "last_updated_at": "2020-12-23", + "title": "{{host.name}} cache is not large enough to handle new requests", + "tags": [ + "integration:marklogic" + ], + "description": "Notify your team when your host's cache is getting too low to handle new requests.", + "definition": { + "message": "{{#is_alert}} \nTo fix:\n1. Consider adding memory to the cache.\n{{/is_alert}}\n\n{{#is_recovery}}\nWhew, there's now enough cache to safely handle new requests!\n{{/is_recovery}}", + "name": "[MarkLogic] {{host.name}} cache is not large enough to handle new requests", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": true, + "thresholds": { + "critical": 0.006, + "warning": 0.005 + }, + "timeout_h": 0 + }, + "query": "change(avg(last_5m),last_5m):avg:marklogic.forests.query_read_rate{*} / avg:marklogic.forests.list_cache_hit_rate{*} > 0.006", + "tags": [ + "integration:marklogic" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/marklogic/assets/recommended_monitors/marklogic_high_load.json b/marklogic/assets/recommended_monitors/marklogic_high_load.json deleted file mode 100644 index 60140d62787f9..0000000000000 --- a/marklogic/assets/recommended_monitors/marklogic_high_load.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "name": "[MarkLogic] Forest Processing Load is High on {{host.name}}", - "type": "query alert", - "query": "avg(last_5m):avg:marklogic.forests.total_rate{*} > 2000000000", - "message": "Average forest load has been higher than 2G during the past 5 minutes", - "tags": [ - "integration:marklogic" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 2000000000, - "warning": 1000000000 - } - }, - "recommended_monitor_metadata": { - "description": "Notify your team when your host has a high forest load." - } -} diff --git a/marklogic/assets/recommended_monitors/marklogic_long_requests.json b/marklogic/assets/recommended_monitors/marklogic_long_requests.json deleted file mode 100644 index f1735e664ec2e..0000000000000 --- a/marklogic/assets/recommended_monitors/marklogic_long_requests.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "name": "[MarkLogic] Active requests are taking too long on average", - "type": "query alert", - "query": "avg(last_5m):avg:marklogic.requests.mean_seconds{*} > 0.5", - "message": "Average request length is greater than 0.5 seconds for the past 5 minutes", - "tags": [ - "integration:marklogic" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 0.5, - "warning": 0.25 - } - }, - "recommended_monitor_metadata": { - "description": "Notify your team when your host is taking too long to process requests." 
- } -} \ No newline at end of file diff --git a/marklogic/assets/recommended_monitors/marklogic_low_cache.json b/marklogic/assets/recommended_monitors/marklogic_low_cache.json deleted file mode 100644 index 6588de50cad28..0000000000000 --- a/marklogic/assets/recommended_monitors/marklogic_low_cache.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "name": "[MarkLogic] {{host.name}} cache is not large enough to handle new requests", - "type": "query alert", - "query": "change(avg(last_5m),last_5m):avg:marklogic.forests.query_read_rate{*} / avg:marklogic.forests.list_cache_hit_rate{*} > 0.006", - "message": "{{#is_alert}} \nTo fix:\n1. Consider adding memory to the cache.\n{{/is_alert}}\n\n{{#is_recovery}}\nWhew, there's now enough cache to safely handle new requests!\n{{/is_recovery}}", - "tags": [ - "integration:marklogic" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 0.006, - "warning": 0.005 - } - }, - "recommended_monitor_metadata": { - "description": "Notify your team when your host's cache is getting too low to handle new requests." - } -} \ No newline at end of file diff --git a/marklogic/manifest.json b/marklogic/manifest.json index 1f38a8cf9de53..7042309de32be 100644 --- a/marklogic/manifest.json +++ b/marklogic/manifest.json @@ -51,9 +51,9 @@ "MarkLogic - Overview": "assets/dashboards/overview.json" }, "monitors": { - "Marklogic low cache": "assets/recommended_monitors/marklogic_low_cache.json", - "Marklogic high load": "assets/recommended_monitors/marklogic_high_load.json", - "Marklogic long requests": "assets/recommended_monitors/marklogic_long_requests.json" + "Marklogic low cache": "assets/monitors/marklogic_low_cache.json", + "Marklogic high load": "assets/monitors/marklogic_high_load.json", + "Marklogic long requests": "assets/monitors/marklogic_long_requests.json" }, "saved_views": { "marklogic_processes": "assets/saved_views/marklogic_processes.json" diff --git a/mongo/assets/monitors/high_connections.json b/mongo/assets/monitors/high_connections.json index b3415d368c259..337ebabe0204c 100644 --- a/mongo/assets/monitors/high_connections.json +++ b/mongo/assets/monitors/high_connections.json @@ -1,28 +1,35 @@ { - "name": "[MongoDB] High incoming connections", - "type": "query alert", - "query": "avg(last_5m):100 * sum:mongodb.connections.current{*} by {replset_name} / ( sum:mongodb.connections.current{*} by {replset_name} + sum:mongodb.connections.available{*} by {replset_name} ) > 90", - "message": "The number of incoming connections is reaching the maximum. {{value}} % of the available connections have been used on {{replset_name.name}}", - "tags": [ - "integration:mongodb" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 90, - "warning": 70 - } - }, - "recommended_monitor_metadata": { - "description": "Notify your team when incoming connections are close to reaching the maximum available." 
+ "version": 2, + "created_at": "2020-08-05", + "last_updated_at": "2021-01-11", + "title": "High incoming connections", + "tags": [ + "integration:mongodb" + ], + "description": "Notify your team when incoming connections are close to reaching the maximum available.", + "definition": { + "message": "The number of incoming connections is reaching the maximum. {{value}} % of the available connections have been used on {{replset_name.name}}", + "name": "[MongoDB] High incoming connections", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": true, + "thresholds": { + "critical": 90, + "warning": 70 + }, + "timeout_h": 0 + }, + "query": "avg(last_5m):100 * sum:mongodb.connections.current{*} by {replset_name} / ( sum:mongodb.connections.current{*} by {replset_name} + sum:mongodb.connections.available{*} by {replset_name} ) > 90", + "tags": [ + "integration:mongodb" + ], + "type": "query alert" } -} +} \ No newline at end of file diff --git a/mysql/assets/monitors/replica_running.json b/mysql/assets/monitors/replica_running.json index 5a5f17a9c2e94..9671b8a6fb1c5 100644 --- a/mysql/assets/monitors/replica_running.json +++ b/mysql/assets/monitors/replica_running.json @@ -1,25 +1,32 @@ { - "name": "[MySQL] Replica {{host.name}} is not running properly", - "type": "service check", - "query": "\"mysql.replication.replica_running\".over(\"*\").by(\"*\").last(2).count_by_status()", - "message": "Replica_IO_Running and/or Replica_SQL_Running is not running on replica {{host.name}}. Consider investigating to restore full data replication.", - "tags": [ - "integration:mysql" + "version": 2, + "created_at": "2021-02-16", + "last_updated_at": "2023-07-24", + "title": "Replica {{host.name}} is not running properly", + "tags": [ + "integration:mysql" + ], + "description": "Notify your team when a replica is not running properly.", + "definition": { + "message": "Replica_IO_Running and/or Replica_SQL_Running is not running on replica {{host.name}}. Consider investigating to restore full data replication.", + "name": "[MySQL] Replica {{host.name}} is not running properly", + "options": { + "new_host_delay": 300, + "no_data_timeframe": 2, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "thresholds": { + "critical": 1, + "ok": 1, + "warning": 1 + }, + "timeout_h": 0 + }, + "query": "\"mysql.replication.replica_running\".over(\"*\").by(\"*\").last(2).count_by_status()", + "tags": [ + "integration:mysql" ], - "options": { - "notify_audit": false, - "renotify_interval": 0, - "timeout_h": 0, - "new_host_delay": 300, - "notify_no_data": false, - "no_data_timeframe": 2, - "thresholds": { - "critical": 1, - "warning": 1, - "ok": 1 - } - }, - "recommended_monitor_metadata": { - "description": "Notify your team when a replica is not running properly." 
- } -} + "type": "service check" + } +} \ No newline at end of file diff --git a/mysql/assets/monitors/select_query_rate.json b/mysql/assets/monitors/select_query_rate.json index d299bccc693a3..1b4743f4f76fd 100644 --- a/mysql/assets/monitors/select_query_rate.json +++ b/mysql/assets/monitors/select_query_rate.json @@ -1,33 +1,40 @@ { - "name": "[MySQL] Unusual drop in SELECT query rate on server {{host.name}}", - "type": "query alert", - "query": "avg(last_1h):anomalies(avg:mysql.performance.com_select{*}, 'basic', 2, direction='below', alert_window='last_15m', interval=20, count_default_zero='true') >= 1", - "message": "Get notified of drastic and prolonged drops in SELECT query throughput.", - "tags": [ + "version": 2, + "created_at": "2021-02-16", + "last_updated_at": "2021-02-16", + "title": "Unusual drop in SELECT query rate on server {{host.name}}", + "tags": [ + "integration:mysql" + ], + "description": "Notify your team when a drastic drop in SELECT queries occurs.", + "definition": { + "message": "Get notified of drastic and prolonged drops in SELECT query throughput.", + "name": "[MySQL] Unusual drop in SELECT query rate on server {{host.name}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": true, + "threshold_windows": { + "recovery_window": "last_15m", + "trigger_window": "last_15m" + }, + "thresholds": { + "critical": 1, + "critical_recovery": 0 + }, + "timeout_h": 0 + }, + "priority": null, + "query": "avg(last_1h):anomalies(avg:mysql.performance.com_select{*}, 'basic', 2, direction='below', alert_window='last_15m', interval=20, count_default_zero='true') >= 1", + "tags": [ "integration:mysql" ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1, - "critical_recovery": 0 - }, - "threshold_windows": { - "trigger_window": "last_15m", - "recovery_window": "last_15m" - } - }, - "priority": null, - "recommended_monitor_metadata": { - "description": "Notify your team when a drastic drop in SELECT queries occurs." 
- } -} + "type": "query alert" + } +} \ No newline at end of file diff --git a/nginx/assets/monitors/4xx.json b/nginx/assets/monitors/4xx.json index c61a19eefb2b6..1ef697dda3575 100644 --- a/nginx/assets/monitors/4xx.json +++ b/nginx/assets/monitors/4xx.json @@ -1,32 +1,39 @@ { - "name": "[NGINX] 4xx Errors higher than usual", - "type": "query alert", - "query": "avg(last_1h):anomalies(avg:nginx.upstream.peers.responses.4xx{*} by {upstream}, 'basic', 2, direction='above', alert_window='last_15m', interval=60, count_default_zero='true') >= 1", - "message": "Number of 4xx errors on NGINX upstreams is at {{value}} which is higher than usual.", - "tags": [ - "integration:nginx" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1, - "critical_recovery": 0 - }, - "threshold_windows": { - "trigger_window": "last_15m", - "recovery_window": "last_15m" - } - }, - "recommended_monitor_metadata": { - "description": "Notifies when NGINX upstream 4xx errors are higher than usual" - } -} + "version": 2, + "created_at": "2020-09-16", + "last_updated_at": "2020-10-21", + "title": "4xx Errors higher than usual", + "tags": [ + "integration:nginx" + ], + "description": "Notifies when NGINX upstream 4xx errors are higher than usual", + "definition": { + "message": "Number of 4xx errors on NGINX upstreams is at {{value}} which is higher than usual.", + "name": "[NGINX] 4xx Errors higher than usual", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "threshold_windows": { + "recovery_window": "last_15m", + "trigger_window": "last_15m" + }, + "thresholds": { + "critical": 1, + "critical_recovery": 0 + }, + "timeout_h": 0 + }, + "query": "avg(last_1h):anomalies(avg:nginx.upstream.peers.responses.4xx{*} by {upstream}, 'basic', 2, direction='above', alert_window='last_15m', interval=60, count_default_zero='true') >= 1", + "tags": [ + "integration:nginx" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/nginx/assets/monitors/5xx.json b/nginx/assets/monitors/5xx.json index b73d259fdd2d8..1a81823330e3e 100644 --- a/nginx/assets/monitors/5xx.json +++ b/nginx/assets/monitors/5xx.json @@ -1,32 +1,39 @@ { - "name": "[NGINX] 5xx Errors higher than usual", - "type": "query alert", - "query": "avg(last_1h):anomalies(avg:nginx.upstream.peers.responses.5xx{*} by {upstream}, 'basic', 2, direction='above', alert_window='last_15m', interval=60, count_default_zero='true') >= 1", - "message": "Number of 5xx errors on NGINX upstreams is at {{value}} which is higher than usual.", - "tags": [ - "integration:nginx" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1, - "critical_recovery": 0 - }, - "threshold_windows": { - "trigger_window": "last_15m", - "recovery_window": "last_15m" - } - }, - "recommended_monitor_metadata": { - "description": "Notifies when NGINX upstream 5xx errors are higher than usual" - } 
-} + "version": 2, + "created_at": "2020-09-16", + "last_updated_at": "2020-10-21", + "title": "5xx Errors higher than usual", + "tags": [ + "integration:nginx" + ], + "description": "Notifies when NGINX upstream 5xx errors are higher than usual", + "definition": { + "message": "Number of 5xx errors on NGINX upstreams is at {{value}} which is higher than usual.", + "name": "[NGINX] 5xx Errors higher than usual", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "threshold_windows": { + "recovery_window": "last_15m", + "trigger_window": "last_15m" + }, + "thresholds": { + "critical": 1, + "critical_recovery": 0 + }, + "timeout_h": 0 + }, + "query": "avg(last_1h):anomalies(avg:nginx.upstream.peers.responses.5xx{*} by {upstream}, 'basic', 2, direction='above', alert_window='last_15m', interval=60, count_default_zero='true') >= 1", + "tags": [ + "integration:nginx" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/nginx/assets/monitors/upstream_peer_fails.json b/nginx/assets/monitors/upstream_peer_fails.json index e7509017c1d92..eb1ac3f7ea559 100644 --- a/nginx/assets/monitors/upstream_peer_fails.json +++ b/nginx/assets/monitors/upstream_peer_fails.json @@ -1,32 +1,39 @@ { - "name": "[NGINX] Upstream peers fails", - "type": "query alert", - "query": "avg(last_1h):anomalies(avg:nginx.stream.upstream.peers.fails{*} by {upstream}, 'basic', 2, direction='above', alert_window='last_15m', interval=60, count_default_zero='true') >= 1", - "message": "NGINX upstream peer failures are higher than usual at {{value}}.", - "tags": [ - "integration:nginx" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1, - "critical_recovery": 0 - }, - "threshold_windows": { - "trigger_window": "last_15m", - "recovery_window": "last_15m" - } - }, - "recommended_monitor_metadata": { - "description": "Notify your team when NGINX upstream peer failures are higher than usual" - } -} + "version": 2, + "created_at": "2020-09-16", + "last_updated_at": "2020-10-21", + "title": "Upstream peers fails", + "tags": [ + "integration:nginx" + ], + "description": "Notify your team when NGINX upstream peer failures are higher than usual", + "definition": { + "message": "NGINX upstream peer failures are higher than usual at {{value}}.", + "name": "[NGINX] Upstream peers fails", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "threshold_windows": { + "recovery_window": "last_15m", + "trigger_window": "last_15m" + }, + "thresholds": { + "critical": 1, + "critical_recovery": 0 + }, + "timeout_h": 0 + }, + "query": "avg(last_1h):anomalies(avg:nginx.stream.upstream.peers.fails{*} by {upstream}, 'basic', 2, direction='above', alert_window='last_15m', interval=60, count_default_zero='true') >= 1", + "tags": [ + "integration:nginx" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/openai/assets/monitors/api_token_usage.json 
b/openai/assets/monitors/api_token_usage.json new file mode 100644 index 0000000000000..84d76b2b2f16e --- /dev/null +++ b/openai/assets/monitors/api_token_usage.json @@ -0,0 +1,35 @@ +{ + "version": 2, + "created_at": "2023-08-09", + "last_updated_at": "2023-08-09", + "title": "Abnormally high token usage", + "tags": [ + "integration:openai" + ], + "description": "Notify your team when OpenAI token usage is abnormally high", + "definition": { + "message": "{{#is_alert}}\n\nALERT: Your OpenAI usage in the last hour has been abnormally high\n\n{{/is_alert}}", + "name": "[OpenAI] Abnormally high token usage ", + "options": { + "include_tags": false, + "no_data_timeframe": 10, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "threshold_windows": { + "recovery_window": "last_15m", + "trigger_window": "last_30m" + }, + "thresholds": { + "critical": 1, + "critical_recovery": 0 + } + }, + "query": "avg(last_12h):anomalies(sum:openai.api.usage.n_context{*}.as_count() + sum:openai.api.usage.n_generated{*}.as_count(), 'basic', 2, direction='both', interval=120, alert_window='last_30m', count_default_zero='true', seasonality='hourly') >= 1", + "tags": [ + "integration:openai" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/openai/assets/monitors/request_limits.json b/openai/assets/monitors/request_limits.json new file mode 100644 index 0000000000000..7b2d11b108e13 --- /dev/null +++ b/openai/assets/monitors/request_limits.json @@ -0,0 +1,33 @@ +{ + "version": 2, + "created_at": "2023-05-08", + "last_updated_at": "2023-05-08", + "title": "Request Limits", + "tags": [ + "integration:openai" + ], + "description": "Notify your team when you are close to your rate limits for OpenAI APIs", + "definition": { + "message": "{{#is_alert}}\n\nALERT: Your OpenAI usage is getting close to the rate limits for your account. You have {{value}} remaining requests. \n\n{{/is_alert}}", + "name": "[OpenAI] Request Limits", + "options": { + "include_tags": false, + "new_host_delay": 300, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "silenced": {}, + "thresholds": { + "critical": 1000 + } + }, + "priority": null, + "query": "avg(last_5m):avg:openai.ratelimit.remaining.requests{*} < 1000", + "restricted_roles": null, + "tags": [ + "integration:openai" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/openai/assets/monitors/tokens_limits.json b/openai/assets/monitors/tokens_limits.json new file mode 100644 index 0000000000000..f74bba47746da --- /dev/null +++ b/openai/assets/monitors/tokens_limits.json @@ -0,0 +1,33 @@ +{ + "version": 2, + "created_at": "2023-05-08", + "last_updated_at": "2023-05-08", + "title": "Token per min Limits", + "tags": [ + "integration:openai" + ], + "description": "Notify your team when you are close to your Token per min limits for OpenAI APIs", + "definition": { + "message": "{{#is_alert}}\n\nALERT: Your OpenAI token usage is getting close to the rate limits for your account. You have {{value}} remaining tokens. 
\n\n{{/is_alert}}", + "name": "[OpenAI] Token per min Limits", + "options": { + "include_tags": false, + "new_host_delay": 300, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "silenced": {}, + "thresholds": { + "critical": 50000 + } + }, + "priority": null, + "query": "avg(last_5m):avg:openai.ratelimit.remaining.tokens{*} < 50000", + "restricted_roles": null, + "tags": [ + "integration:openai" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/openai/assets/recommended_monitors/api_token_usage.json b/openai/assets/recommended_monitors/api_token_usage.json deleted file mode 100644 index 2413859883e8c..0000000000000 --- a/openai/assets/recommended_monitors/api_token_usage.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "name": "[OpenAI] Abnormally high token usage ", - "type": "query alert", - "query": "avg(last_12h):anomalies(sum:openai.api.usage.n_context{*}.as_count() + sum:openai.api.usage.n_generated{*}.as_count(), 'basic', 2, direction='both', interval=120, alert_window='last_30m', count_default_zero='true', seasonality='hourly') >= 1", - "message": "{{#is_alert}}\n\nALERT: Your OpenAI usage in the last hour has been abnormally high\n\n{{/is_alert}}", - "tags": [ - "integration:openai" - ], - "options": { - "thresholds": { - "critical": 1, - "critical_recovery": 0 - }, - "notify_audit": false, - "require_full_window": false, - "notify_no_data": false, - "no_data_timeframe": 10, - "renotify_interval": 0, - "threshold_windows": { - "trigger_window": "last_30m", - "recovery_window": "last_15m" - }, - "include_tags": false - }, - "recommended_monitor_metadata": { - "description": "Notify your team when OpenAI token usage is abnormally high" - } -} diff --git a/openai/assets/recommended_monitors/request_limits.json b/openai/assets/recommended_monitors/request_limits.json deleted file mode 100644 index 62692450ee582..0000000000000 --- a/openai/assets/recommended_monitors/request_limits.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "name": "[OpenAI] Request Limits", - "type": "query alert", - "query": "avg(last_5m):avg:openai.ratelimit.remaining.requests{*} < 1000", - "message": "{{#is_alert}}\n\nALERT: Your OpenAI usage is getting close to the rate limits for your account. You have {{value}} remaining requests. \n\n{{/is_alert}}", - "tags": [ - "integration:openai" - ], - "options": { - "thresholds": { - "critical": 1000 - }, - "notify_audit": false, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": 0, - "include_tags": false, - "new_host_delay": 300, - "silenced": {} - }, - "priority": null, - "restricted_roles": null, - "recommended_monitor_metadata": { - "description": "Notify your team when you are close to your rate limits for OpenAI APIs" - } -} \ No newline at end of file diff --git a/openai/assets/recommended_monitors/tokens_limits.json b/openai/assets/recommended_monitors/tokens_limits.json deleted file mode 100644 index 5f6b2e44bb5a1..0000000000000 --- a/openai/assets/recommended_monitors/tokens_limits.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "name": "[OpenAI] Token per min Limits", - "type": "query alert", - "query": "avg(last_5m):avg:openai.ratelimit.remaining.tokens{*} < 50000", - "message": "{{#is_alert}}\n\nALERT: Your OpenAI token usage is getting close to the rate limits for your account. You have {{value}} remaining tokens. 
\n\n{{/is_alert}}", - "tags": [ - "integration:openai" - ], - "options": { - "thresholds": { - "critical": 50000 - }, - "notify_audit": false, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": 0, - "include_tags": false, - "new_host_delay": 300, - "silenced": {} - }, - "priority": null, - "restricted_roles": null, - "recommended_monitor_metadata": { - "description": "Notify your team when you are close to your Token per min limits for OpenAI APIs" - } -} \ No newline at end of file diff --git a/openai/manifest.json b/openai/manifest.json index 52a1629e04819..4c65127826284 100644 --- a/openai/manifest.json +++ b/openai/manifest.json @@ -59,9 +59,9 @@ "OpenAI Usage Overview": "assets/dashboards/usage_overview_dashboard.json" }, "monitors": { - "Request Limits": "assets/recommended_monitors/request_limits.json", - "Token per min Limits": "assets/recommended_monitors/tokens_limits.json", - "Abnormally High Token Usage": "assets/recommended_monitors/api_token_usage.json" + "Request Limits": "assets/monitors/request_limits.json", + "Token per min Limits": "assets/monitors/tokens_limits.json", + "Abnormally High Token Usage": "assets/monitors/api_token_usage.json" } }, "author": { diff --git a/otel/assets/recommended_monitors/otel_refused_spans.json b/otel/assets/recommended_monitors/otel_refused_spans.json index 225637f23b0aa..265107a8a91e3 100644 --- a/otel/assets/recommended_monitors/otel_refused_spans.json +++ b/otel/assets/recommended_monitors/otel_refused_spans.json @@ -1,26 +1,33 @@ { - "name": "[OTel Collector] Refused Spans", - "type": "query alert", - "query": "avg(last_10m):avg:otelcol_receiver_refused_spans{*} by {host,receiver}.as_rate() > 100", - "message": "The OpenTelemetry Collector receiver {{receiver.name}} is refusing {{value}} spans per second for host: {{host.name}}.", - "tags": [ - "integration:opentelemetry" - ], - "options": { - "thresholds": { - "critical": 100 - }, - "notify_audit": false, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": 0, - "include_tags": true, - "new_group_delay": 60, - "silenced": {} - }, - "priority": null, - "restricted_roles": null, - "recommended_monitor_metadata": { - "description": "Notifies when the OpenTelemetry Collector receiver is refusing spans" - } -} + "version": 2, + "created_at": "2023-08-16", + "last_updated_at": "2023-08-16", + "title": "Refused Spans", + "tags": [ + "integration:otel" + ], + "description": "Notifies when the OpenTelemetry Collector receiver is refusing spans", + "definition": { + "message": "The OpenTelemetry Collector receiver {{receiver.name}} is refusing {{value}} spans per second for host: {{host.name}}.", + "name": "[OTel Collector] Refused Spans", + "options": { + "include_tags": true, + "new_group_delay": 60, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "silenced": {}, + "thresholds": { + "critical": 100 + } + }, + "priority": null, + "query": "avg(last_10m):avg:otelcol_receiver_refused_spans{*} by {host,receiver}.as_rate() > 100", + "restricted_roles": null, + "tags": [ + "integration:opentelemetry" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/postgres/assets/monitors/percent_usage_connections.json b/postgres/assets/monitors/percent_usage_connections.json index f5ef72bc6a5eb..08feea43e951a 100644 --- a/postgres/assets/monitors/percent_usage_connections.json +++ b/postgres/assets/monitors/percent_usage_connections.json @@ -1,29 +1,36 @@ { - 
"name": "[Postgres] Number of connections is approaching connection limit on {{host.name}}", - "type": "query alert", - "query": "avg(last_15m):avg:postgresql.percent_usage_connections{*} > 0.9", - "message": "Please check host {{host.name}}, as the Postgres connection pool is approaching saturation.", - "tags": [ - "integration:postgres" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 0.9, - "warning": 0.8 - } - }, - "priority": null, - "recommended_monitor_metadata": { - "description": "Notify your team when a host's connection pool approaches saturation." - } -} + "version": 2, + "created_at": "2021-03-17", + "last_updated_at": "2023-07-24", + "title": "Number of connections is approaching connection limit on {{host.name}}", + "tags": [ + "integration:postgres" + ], + "description": "Notify your team when a host's connection pool approaches saturation.", + "definition": { + "message": "Please check host {{host.name}}, as the Postgres connection pool is approaching saturation.", + "name": "[Postgres] Number of connections is approaching connection limit on {{host.name}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": true, + "thresholds": { + "critical": 0.9, + "warning": 0.8 + }, + "timeout_h": 0 + }, + "priority": null, + "query": "avg(last_15m):avg:postgresql.percent_usage_connections{*} > 0.9", + "tags": [ + "integration:postgres" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/postgres/assets/monitors/replication_delay.json b/postgres/assets/monitors/replication_delay.json index 691f0dabe207f..610822b9cd6b4 100644 --- a/postgres/assets/monitors/replication_delay.json +++ b/postgres/assets/monitors/replication_delay.json @@ -1,33 +1,40 @@ { - "name": "[Postgres] Replication delay is abnormally high on {{host.name}}", - "type": "query alert", - "query": "avg(last_1h):anomalies(avg:postgresql.replication_delay{*}, 'basic', 2, direction='above', alert_window='last_15m', interval=20, count_default_zero='true') >= 1", - "message": "Please check host {{host.name}}, as replication delay has been abnormally high.", - "tags": [ - "integration:postgres" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1, - "critical_recovery": 0 - }, - "threshold_windows": { - "trigger_window": "last_15m", - "recovery_window": "last_15m" - } - }, - "priority": null, - "recommended_monitor_metadata": { - "description": "Notify your team when Postgres replication delay is unusually high." 
- } -} + "version": 2, + "created_at": "2021-02-16", + "last_updated_at": "2021-03-17", + "title": "Replication delay is abnormally high on {{host.name}}", + "tags": [ + "integration:postgres" + ], + "description": "Notify your team when Postgres replication delay is unusually high.", + "definition": { + "message": "Please check host {{host.name}}, as replication delay has been abnormally high.", + "name": "[Postgres] Replication delay is abnormally high on {{host.name}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": true, + "threshold_windows": { + "recovery_window": "last_15m", + "trigger_window": "last_15m" + }, + "thresholds": { + "critical": 1, + "critical_recovery": 0 + }, + "timeout_h": 0 + }, + "priority": null, + "query": "avg(last_1h):anomalies(avg:postgresql.replication_delay{*}, 'basic', 2, direction='above', alert_window='last_15m', interval=20, count_default_zero='true') >= 1", + "tags": [ + "integration:postgres" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/rabbitmq/assets/monitors/disk_usage.json b/rabbitmq/assets/monitors/disk_usage.json index ca3633425ba2b..901a7f6695ff0 100644 --- a/rabbitmq/assets/monitors/disk_usage.json +++ b/rabbitmq/assets/monitors/disk_usage.json @@ -1,29 +1,36 @@ { - "name": "[RabbitMQ] Level of disk usage is too high for host: {{host.name}} ", - "type": "query alert", - "query": "avg(last_5m):avg:rabbitmq.node.mem_used{*} by {host} / avg:system.mem.total{*} by {host} * 100 > 35", - "message": "RabbitMQ is using too many resources on host: {{host.name}}. It may block connections and won't be able to perform many internal operations.", + "version": 2, + "created_at": "2021-03-26", + "last_updated_at": "2021-03-26", + "title": "Level of disk usage is too high for host: {{host.name}}", "tags": [ "integration:rabbitmq" ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 35, - "warning": 30 - } - }, - "priority": null, - "recommended_monitor_metadata": { - "description": "Notify your team when a host's disk usage reaches critical levels." + "description": "Notify your team when a host's disk usage reaches critical levels.", + "definition": { + "message": "RabbitMQ is using too many resources on host: {{host.name}}. 
It may block connections and won't be able to perform many internal operations.", + "name": "[RabbitMQ] Level of disk usage is too high for host: {{host.name}} ", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": true, + "thresholds": { + "critical": 35, + "warning": 30 + }, + "timeout_h": 0 + }, + "priority": null, + "query": "avg(last_5m):avg:rabbitmq.node.mem_used{*} by {host} / avg:system.mem.total{*} by {host} * 100 > 35", + "tags": [ + "integration:rabbitmq" + ], + "type": "query alert" } -} +} \ No newline at end of file diff --git a/rabbitmq/assets/monitors/disk_usage_prometheus.json b/rabbitmq/assets/monitors/disk_usage_prometheus.json index 7394d6da8693b..bb50fa072ddb0 100644 --- a/rabbitmq/assets/monitors/disk_usage_prometheus.json +++ b/rabbitmq/assets/monitors/disk_usage_prometheus.json @@ -1,31 +1,38 @@ { - "name": "[RabbitMQ - OMV2] Disk Space is Low", - "type": "query alert", - "query": "avg(last_5m):avg:rabbitmq.alarms.free_disk_space.watermark{*} >= 1", - "message": "{{#is_alert}}\nDisk space is extremely low. \nThis can be caused by an overload of messages or persistent messages that are being mismanaged. \nPlease resolve.\n{{/is_alert}}", - "tags": [ - "integration:rabbitmq" - ], - "options": { - "thresholds": { - "critical": 1 - }, - "notify_audit": false, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": 180, - "include_tags": false, - "renotify_statuses": [ - "alert" - ], - "escalation_message": "Disk space is still extremely low. \nThis can be caused by an overload of messages or persistent messages that are being mismanaged. \nPlease resolve.", - "avalanche_window": 10, - "new_host_delay": 300, - "silenced": {} - }, - "priority": 3, - "restricted_roles": null, - "recommended_monitor_metadata": { - "description": "Notify your team when a host's disk usage reaches critical levels." -} -} + "version": 2, + "created_at": "2023-03-22", + "last_updated_at": "2023-03-22", + "title": "Disk Space is Low", + "tags": [ + "integration:rabbitmq" + ], + "description": "Notify your team when a host's disk usage reaches critical levels.", + "definition": { + "message": "{{#is_alert}}\nDisk space is extremely low. \nThis can be caused by an overload of messages or persistent messages that are being mismanaged. \nPlease resolve.\n{{/is_alert}}", + "name": "[RabbitMQ - OMV2] Disk Space is Low", + "options": { + "avalanche_window": 10, + "escalation_message": "Disk space is still extremely low. \nThis can be caused by an overload of messages or persistent messages that are being mismanaged. 
\nPlease resolve.", + "include_tags": false, + "new_host_delay": 300, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 180, + "renotify_statuses": [ + "alert" + ], + "require_full_window": false, + "silenced": {}, + "thresholds": { + "critical": 1 + } + }, + "priority": 3, + "query": "avg(last_5m):avg:rabbitmq.alarms.free_disk_space.watermark{*} >= 1", + "restricted_roles": null, + "tags": [ + "integration:rabbitmq" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/rabbitmq/assets/monitors/message_unack_prometheus.json b/rabbitmq/assets/monitors/message_unack_prometheus.json index 16117a2c0f121..92c7ba2482c64 100644 --- a/rabbitmq/assets/monitors/message_unack_prometheus.json +++ b/rabbitmq/assets/monitors/message_unack_prometheus.json @@ -1,27 +1,34 @@ { - "name": "[RabbitMQ - OMV2] Unacknowledged Messages are higher than usual", - "type": "query alert", - "query": "avg(last_5m):100 * sum:rabbitmq.queue.messages.unacked{*} / (sum:rabbitmq.queue.messages.unacked{*} + sum:rabbitmq.queue.messages.ready{*}) > 120", - "message": "{{#is_alert}}\nThe ratio of unacknowledged to acknowledged messages in this queue is higher than usual. \nPlease check your message pipeline. \n{{/is_alert}}", - "tags": [ - "integration:rabbitmq" - ], - "options": { - "thresholds": { - "critical": 120 - }, - "notify_audit": false, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": 0, - "include_tags": false, - "avalanche_window": 10, - "new_host_delay": 300, - "silenced": {} - }, - "priority": 3, - "restricted_roles": null, - "recommended_monitor_metadata": { - "description": "Notify your team when a RabbitMQ message unacknowledged rate is higher than usual." - } -} + "version": 2, + "created_at": "2023-03-22", + "last_updated_at": "2023-03-22", + "title": "Unacknowledged Messages are higher than usual", + "tags": [ + "integration:rabbitmq" + ], + "description": "Notify your team when a RabbitMQ message unacknowledged rate is higher than usual.", + "definition": { + "message": "{{#is_alert}}\nThe ratio of unacknowledged to acknowledged messages in this queue is higher than usual. \nPlease check your message pipeline. 
\n{{/is_alert}}", + "name": "[RabbitMQ - OMV2] Unacknowledged Messages are higher than usual", + "options": { + "avalanche_window": 10, + "include_tags": false, + "new_host_delay": 300, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "silenced": {}, + "thresholds": { + "critical": 120 + } + }, + "priority": 3, + "query": "avg(last_5m):100 * sum:rabbitmq.queue.messages.unacked{*} / (sum:rabbitmq.queue.messages.unacked{*} + sum:rabbitmq.queue.messages.ready{*}) > 120", + "restricted_roles": null, + "tags": [ + "integration:rabbitmq" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/rabbitmq/assets/monitors/message_unacknowledge_rate_anomaly.json b/rabbitmq/assets/monitors/message_unacknowledge_rate_anomaly.json index 322fb3afa09e4..8068e59697e20 100644 --- a/rabbitmq/assets/monitors/message_unacknowledge_rate_anomaly.json +++ b/rabbitmq/assets/monitors/message_unacknowledge_rate_anomaly.json @@ -1,33 +1,40 @@ { - "name": "[RabbitMQ] Messages unacknowledged rate is higher than usual on: {{host.name}}", - "type": "query alert", - "query": "avg(last_4h):anomalies(avg:rabbitmq.queue.messages_unacknowledged.rate{*} by {rabbitmq_queue,host}, 'agile', 2, direction='above', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='hourly') >= 1", - "message": "The rate at which messages are being delivered without receiving acknowledgement is higher than usual. There may be errors or performance issues downstream.\n\nHost: {{host.name}}\nRabbitMQ Queue: {{rabbitmq_queue.name}}", + "version": 2, + "created_at": "2021-03-26", + "last_updated_at": "2021-03-26", + "title": "Messages unacknowledged rate is higher than usual on: {{host.name}}", "tags": [ "integration:rabbitmq" ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1, - "critical_recovery": 0 + "description": "Notify your team when a RabbitMQ message unacknowledged rate is higher than usual.", + "definition": { + "message": "The rate at which messages are being delivered without receiving acknowledgement is higher than usual. There may be errors or performance issues downstream.\n\nHost: {{host.name}}\nRabbitMQ Queue: {{rabbitmq_queue.name}}", + "name": "[RabbitMQ] Messages unacknowledged rate is higher than usual on: {{host.name}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": true, + "threshold_windows": { + "recovery_window": "last_15m", + "trigger_window": "last_15m" + }, + "thresholds": { + "critical": 1, + "critical_recovery": 0 + }, + "timeout_h": 0 }, - "threshold_windows": { - "trigger_window": "last_15m", - "recovery_window": "last_15m" - } - }, - "priority": null, - "recommended_monitor_metadata": { - "description": "Notify your team when a RabbitMQ message unacknowledged rate is higher than usual." 
+ "priority": null, + "query": "avg(last_4h):anomalies(avg:rabbitmq.queue.messages_unacknowledged.rate{*} by {rabbitmq_queue,host}, 'agile', 2, direction='above', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='hourly') >= 1", + "tags": [ + "integration:rabbitmq" + ], + "type": "query alert" } -} +} \ No newline at end of file diff --git a/redisdb/assets/monitors/high_mem.json b/redisdb/assets/monitors/high_mem.json index f5288eec00e2a..743f150d43c68 100644 --- a/redisdb/assets/monitors/high_mem.json +++ b/redisdb/assets/monitors/high_mem.json @@ -1,30 +1,37 @@ { - "name": "[Redis] High memory consumption", - "type": "query alert", - "query": "avg(last_5m):100 * avg:redis.mem.used{*} / avg:redis.mem.maxmemory{*} > 90", - "message": "{{#is_alert}}\n\nALERT: Redis is consuming {{value}}% of total memory allocated.\n\n{{/is_alert}} \n\n{{#is_warning}}\n\nWARNING: Redis is consuming {{value}}% of total memory allocated.\n\n{{/is_warning}} \n\n", - "tags": [ - "integration:redisdb" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 90, - "warning": 70 - } - }, - "priority": null, - "restricted_roles": null, - "recommended_monitor_metadata": { - "description": "Notify if Redis is consuming a high amount of memory." - } + "version": 2, + "created_at": "2021-02-08", + "last_updated_at": "2021-02-08", + "title": "High memory consumption", + "tags": [ + "integration:redis" + ], + "description": "Notify if Redis is consuming a high amount of memory.", + "definition": { + "message": "{{#is_alert}}\n\nALERT: Redis is consuming {{value}}% of total memory allocated.\n\n{{/is_alert}} \n\n{{#is_warning}}\n\nWARNING: Redis is consuming {{value}}% of total memory allocated.\n\n{{/is_warning}} \n\n", + "name": "[Redis] High memory consumption", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": true, + "thresholds": { + "critical": 90, + "warning": 70 + }, + "timeout_h": 0 + }, + "priority": null, + "query": "avg(last_5m):100 * avg:redis.mem.used{*} / avg:redis.mem.maxmemory{*} > 90", + "restricted_roles": null, + "tags": [ + "integration:redisdb" + ], + "type": "query alert" + } } \ No newline at end of file diff --git a/scylla/assets/monitors/instance_down.json b/scylla/assets/monitors/instance_down.json index d3fab65a8f9ec..418d23bd1cf66 100644 --- a/scylla/assets/monitors/instance_down.json +++ b/scylla/assets/monitors/instance_down.json @@ -1,27 +1,34 @@ { - "name": "[Scylla] Server is shutting down", - "type": "query alert", - "query": "avg(last_1m):max:scylla.node.operation_mode{*} by {server} > 3", - "message": "{{server.name}} is shutting down. 
Current value of {{value}} \n\nThe operation mode of the current nodes can have values of UNKNOWN = 0; STARTING = 1; JOINING = 2; NORMAL = 3; LEAVING = 4; DECOMMISSIONED = 5; DRAINING = 6; DRAINED = 7; MOVING = 8", - "tags": [ - "integration:scylla" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 3 - } - }, - "recommended_monitor_metadata": { - "description": "Notify if Scylla node is in a state other than NORMAL." - } -} + "version": 2, + "created_at": "2020-08-05", + "last_updated_at": "2021-02-03", + "title": "Server is shutting down", + "tags": [ + "integration:scylla" + ], + "description": "Notify if Scylla node is in a state other than NORMAL.", + "definition": { + "message": "{{server.name}} is shutting down. Current value of {{value}} \n\nThe operation mode of the current nodes can have values of UNKNOWN = 0; STARTING = 1; JOINING = 2; NORMAL = 3; LEAVING = 4; DECOMMISSIONED = 5; DRAINING = 6; DRAINED = 7; MOVING = 8", + "name": "[Scylla] Server is shutting down", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": true, + "thresholds": { + "critical": 3 + }, + "timeout_h": 0 + }, + "query": "avg(last_1m):max:scylla.node.operation_mode{*} by {server} > 3", + "tags": [ + "integration:scylla" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/silk/assets/monitors/latency_high.json b/silk/assets/monitors/latency_high.json new file mode 100644 index 0000000000000..16c90fef3f5c9 --- /dev/null +++ b/silk/assets/monitors/latency_high.json @@ -0,0 +1,34 @@ +{ + "version": 2, + "created_at": "2022-02-17", + "last_updated_at": "2022-02-17", + "title": "Latency is high on host {{silk_host.name}}", + "tags": [ + "integration:silk" + ], + "description": "Get notified when Silk latency is high.", + "definition": { + "message": "{{#is_warning}}Latency is over 100ms on host {{silk_host.name}}{{/is_warning}} \\n\n{{#is_alert}}Latency is over 150ms on host {{silk_host.name}}{{/is_alert}}\\n\\n\n{{#is_recovery}}Latency is less than 100ms on host {{silk_host.name}}{{/is_recovery}}", + "name": "[Silk] Latency is high on host {{silk_host.name}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "no_data_timeframe": null, + "notify_audit": true, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "silenced": {}, + "thresholds": { + "critical": 150, + "warning": 100 + } + }, + "query": "avg(last_5m):avg:silk.system.latency.inner{*} by {silk_host}> 150", + "tags": [ + "integration:silk" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/silk/assets/recommended_monitors/latency_high.json b/silk/assets/recommended_monitors/latency_high.json deleted file mode 100644 index 5817f90ae23ac..0000000000000 --- a/silk/assets/recommended_monitors/latency_high.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "name": "[Silk] Latency is high on host {{silk_host.name}}", - "type": "query alert", - "query": "avg(last_5m):avg:silk.system.latency.inner{*} by {silk_host}> 150", - "message": "{{#is_warning}}Latency is over 100ms on host {{silk_host.name}}{{/is_warning}} 
\\n\n{{#is_alert}}Latency is over 150ms on host {{silk_host.name}}{{/is_alert}}\\n\\n\n{{#is_recovery}}Latency is less than 100ms on host {{silk_host.name}}{{/is_recovery}}", - "tags": [ - "integration:silk" - ], - "options": { - "thresholds": { - "critical": 150, - "warning": 100 - }, - "notify_audit": true, - "require_full_window": false, - "notify_no_data": false, - "no_data_timeframe": null, - "renotify_interval": 0, - "locked": false, - "silenced": {}, - "include_tags": true, - "escalation_message": "" - }, - "recommended_monitor_metadata": { - "description": "Get notified when Silk latency is high." - } -} \ No newline at end of file diff --git a/silk/manifest.json b/silk/manifest.json index 472c301a88343..85b1adef2b7a8 100644 --- a/silk/manifest.json +++ b/silk/manifest.json @@ -49,7 +49,7 @@ "Silk - Overview": "assets/dashboards/silk_overview.json" }, "monitors": { - "Latency high": "assets/recommended_monitors/latency_high.json" + "Latency high": "assets/monitors/latency_high.json" } } } diff --git a/singlestore/assets/monitors/license_expiration.json b/singlestore/assets/monitors/license_expiration.json index 909e9a5536aa8..4a35c6ecb5921 100644 --- a/singlestore/assets/monitors/license_expiration.json +++ b/singlestore/assets/monitors/license_expiration.json @@ -1,30 +1,37 @@ { - "name": "[SingleStore] License will expire soon", - "type": "query alert", - "query": "avg(last_5m):cutoff_min(avg:singlestore.seconds_until_expiration{*} by {singlestore_node_name,singlestore_node_id}, -0.1) < 604800", - "message": "{{#is_alert}}\nSingleStore license will expire in 7 days on node {{singlestore_node_name.name}}\n{{/is_alert}} \n\n{{#is_warning}}\nSingleStore license will expire in 30 days on node {{singlestore_node_name.name}}\n{{/is_warning}} ", - "tags": [ - "integration:singlestore" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_group_delay": 60, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 604800, - "warning": 2592000 - } - }, - "priority": null, - "restricted_roles": null, - "recommended_monitor_metadata": { - "description": "Notify your team before your SingleStore license expires." 
- } + "version": 2, + "created_at": "2021-09-29", + "last_updated_at": "2021-09-29", + "title": "License will expire soon", + "tags": [ + "integration:singlestore" + ], + "description": "Notify your team before your SingleStore license expires.", + "definition": { + "message": "{{#is_alert}}\nSingleStore license will expire in 7 days on node {{singlestore_node_name.name}}\n{{/is_alert}} \n\n{{#is_warning}}\nSingleStore license will expire in 30 days on node {{singlestore_node_name.name}}\n{{/is_warning}} ", + "name": "[SingleStore] License will expire soon", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_group_delay": 60, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": false, + "thresholds": { + "critical": 604800, + "warning": 2592000 + }, + "timeout_h": 0 + }, + "priority": null, + "query": "avg(last_5m):cutoff_min(avg:singlestore.seconds_until_expiration{*} by {singlestore_node_name,singlestore_node_id}, -0.1) < 604800", + "restricted_roles": null, + "tags": [ + "integration:singlestore" + ], + "type": "query alert" + } } \ No newline at end of file diff --git a/singlestore/assets/monitors/read_failures.json b/singlestore/assets/monitors/read_failures.json index 1b1081349a231..3b5f92e11025f 100644 --- a/singlestore/assets/monitors/read_failures.json +++ b/singlestore/assets/monitors/read_failures.json @@ -1,35 +1,42 @@ { - "name": "[SingleStore] Read queries failure rate is higher than before", - "type": "query alert", - "query": "avg(last_4h):anomalies(avg:singlestore.successful_read_queries{*} by {singlestore_node_name,singlestore_node_id} / ( avg:singlestore.successful_read_queries{*} by {singlestore_node_name,singlestore_node_id} + avg:singlestore.failed_read_queries{*} by {singlestore_node_name,singlestore_node_id} ), 'agile', 5, direction='below', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='daily') >= 1", - "message": "SingleStore read queries are failing more often on node {{singlestore_node_name.name}}.\n", - "tags": [ - "integration:singlestore" - ], - "options": { - "notify_audit": true, - "locked": false, - "timeout_h": 0, - "new_group_delay": 60, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1, - "warning": 0.5, - "critical_recovery": 0 - }, - "threshold_windows": { - "trigger_window": "last_15m", - "recovery_window": "last_15m" - } - }, - "priority": null, - "restricted_roles": null, - "recommended_monitor_metadata": { - "description": "Notify your team when SingleStore read queries are suddenly failing more than before." 
- } + "version": 2, + "created_at": "2021-09-29", + "last_updated_at": "2021-09-29", + "title": "Read queries failure rate is higher than before", + "tags": [ + "integration:singlestore" + ], + "description": "Notify your team when SingleStore read queries are suddenly failing more than before.", + "definition": { + "message": "SingleStore read queries are failing more often on node {{singlestore_node_name.name}}.\n", + "name": "[SingleStore] Read queries failure rate is higher than before", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_group_delay": 60, + "no_data_timeframe": null, + "notify_audit": true, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": false, + "threshold_windows": { + "recovery_window": "last_15m", + "trigger_window": "last_15m" + }, + "thresholds": { + "critical": 1, + "critical_recovery": 0, + "warning": 0.5 + }, + "timeout_h": 0 + }, + "priority": null, + "query": "avg(last_4h):anomalies(avg:singlestore.successful_read_queries{*} by {singlestore_node_name,singlestore_node_id} / ( avg:singlestore.successful_read_queries{*} by {singlestore_node_name,singlestore_node_id} + avg:singlestore.failed_read_queries{*} by {singlestore_node_name,singlestore_node_id} ), 'agile', 5, direction='below', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='daily') >= 1", + "restricted_roles": null, + "tags": [ + "integration:singlestore" + ], + "type": "query alert" + } } \ No newline at end of file diff --git a/singlestore/assets/monitors/write_failures.json b/singlestore/assets/monitors/write_failures.json index c9ef9143444f4..fbab34f4d19eb 100644 --- a/singlestore/assets/monitors/write_failures.json +++ b/singlestore/assets/monitors/write_failures.json @@ -1,35 +1,42 @@ { - "name": "[SingleStore] Write queries failure rate is higher than before", - "type": "query alert", - "query": "avg(last_4h):anomalies(avg:singlestore.successful_write_queries{*} by {singlestore_node_id,singlestore_node_name} / ( avg:singlestore.successful_write_queries{*} by {singlestore_node_id,singlestore_node_name} + avg:singlestore.failed_write_queries{*} by {singlestore_node_id,singlestore_node_name} ), 'agile', 5, direction='below', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='daily') >= 1", - "message": "SingleStore write queries are failing more often on node {{singlestore_node_name.name}}.\n", - "tags": [ - "integration:singlestore" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_group_delay": 60, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1, - "warning": 0.5, - "critical_recovery": 0 - }, - "threshold_windows": { - "trigger_window": "last_15m", - "recovery_window": "last_15m" - } - }, - "priority": null, - "restricted_roles": null, - "recommended_monitor_metadata": { - "description": "Notify your team when SingleStore write queries are suddenly failing more than before." 
- } + "version": 2, + "created_at": "2021-09-29", + "last_updated_at": "2021-09-29", + "title": "Write queries failure rate is higher than before", + "tags": [ + "integration:singlestore" + ], + "description": "Notify your team when SingleStore write queries are suddenly failing more than before.", + "definition": { + "message": "SingleStore write queries are failing more often on node {{singlestore_node_name.name}}.\n", + "name": "[SingleStore] Write queries failure rate is higher than before", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_group_delay": 60, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": false, + "threshold_windows": { + "recovery_window": "last_15m", + "trigger_window": "last_15m" + }, + "thresholds": { + "critical": 1, + "critical_recovery": 0, + "warning": 0.5 + }, + "timeout_h": 0 + }, + "priority": null, + "query": "avg(last_4h):anomalies(avg:singlestore.successful_write_queries{*} by {singlestore_node_id,singlestore_node_name} / ( avg:singlestore.successful_write_queries{*} by {singlestore_node_id,singlestore_node_name} + avg:singlestore.failed_write_queries{*} by {singlestore_node_id,singlestore_node_name} ), 'agile', 5, direction='below', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='daily') >= 1", + "restricted_roles": null, + "tags": [ + "integration:singlestore" + ], + "type": "query alert" + } } \ No newline at end of file diff --git a/snmp/assets/monitors/device_down.json b/snmp/assets/monitors/device_down.json index 4869d7975dfa2..f67918fbe07e5 100644 --- a/snmp/assets/monitors/device_down.json +++ b/snmp/assets/monitors/device_down.json @@ -1,32 +1,39 @@ { - "name": "[SNMP] Device down alert on {{snmp_device.name}} in namespace {{device_namespace.name}}", - "type": "service check", - "query": "\"snmp.can_check\".over(\"*\").by(\"device_namespace\",\"snmp_device\").last(2).count_by_status()", - "message": "{{#is_alert}} \nA network device with IP {{snmp_device.name}} in namespace {{device_namespace.name}} is reporting CRITICAL and can't be monitored anymore.\n{{/is_alert}}\n\n{{#is_alert_recovery}}\nA network device with IP {{snmp_device.name}} in namespace {{device_namespace.name}} is back online.\n{{/is_alert_recovery}}\n\nTo know more about the status of your device, you can have more information from the [NDM page for the device {{device_namespace.name}}:{{snmp_device.name}}](/infrastructure/devices/graph?inspectedDevice={{device_namespace.name}}%3A{{snmp_device.name}}).", - "tags": [ - "integration:snmp" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "silenced": {}, - "include_tags": false, - "thresholds": { - "warning": 1, - "ok": 1, - "critical": 1 - }, - "notify_no_data": false, - "renotify_interval": 0, - "avalanche_window": 10, - "escalation_message": "", - "new_group_delay": 60, - "no_data_timeframe": 2 - }, - "priority": null, - "restricted_roles": null, - "recommended_monitor_metadata": { - "description": "Notify your team when a SNMP devices is down. Requires Datadog Agent 7.32+ or 6.32+." - } -} + "version": 2, + "created_at": "2021-12-13", + "last_updated_at": "2022-01-04", + "title": "Device down alert on {{snmp_device.name}} in namespace {{device_namespace.name}}", + "tags": [ + "integration:snmp" + ], + "description": "Notify your team when a SNMP devices is down. 
Requires Datadog Agent 7.32+ or 6.32+.", + "definition": { + "message": "{{#is_alert}} \nA network device with IP {{snmp_device.name}} in namespace {{device_namespace.name}} is reporting CRITICAL and can't be monitored anymore.\n{{/is_alert}}\n\n{{#is_alert_recovery}}\nA network device with IP {{snmp_device.name}} in namespace {{device_namespace.name}} is back online.\n{{/is_alert_recovery}}\n\nTo know more about the status of your device, you can have more information from the [NDM page for the device {{device_namespace.name}}:{{snmp_device.name}}](/infrastructure/devices/graph?inspectedDevice={{device_namespace.name}}%3A{{snmp_device.name}}).", + "name": "[SNMP] Device down alert on {{snmp_device.name}} in namespace {{device_namespace.name}}", + "options": { + "avalanche_window": 10, + "escalation_message": "", + "include_tags": false, + "locked": false, + "new_group_delay": 60, + "no_data_timeframe": 2, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "silenced": {}, + "thresholds": { + "critical": 1, + "ok": 1, + "warning": 1 + }, + "timeout_h": 0 + }, + "priority": null, + "query": "\"snmp.can_check\".over(\"*\").by(\"device_namespace\",\"snmp_device\").last(2).count_by_status()", + "restricted_roles": null, + "tags": [ + "integration:snmp" + ], + "type": "service check" + } +} \ No newline at end of file diff --git a/snmp/assets/monitors/device_unreachable.json b/snmp/assets/monitors/device_unreachable.json index f6a18927e847c..57041e6739f92 100644 --- a/snmp/assets/monitors/device_unreachable.json +++ b/snmp/assets/monitors/device_unreachable.json @@ -1,23 +1,30 @@ { - "name": "[SNMP] Device unreachable alert on {{snmp_device.name}} in namespace {{device_namespace.name}}", - "type": "query alert", - "query": "avg(last_5m):max:snmp.device.reachable{*} by {snmp_device,device_namespace} < 0.8", - "message": "{{#is_alert}}\nA network device with IP {{snmp_device.name}} in namespace {{device_namespace.name}} is unreachable and can't be monitored anymore.\n{{/is_alert}}\n\n{{#is_alert_recovery}}\nA network device with IP {{snmp_device.name}} in namespace {{device_namespace.name}} is reachable again.\n{{/is_alert_recovery}}\n\nTo know more about the status of your device, you can have more information from the [NDM page for the device {{device_namespace.name}}:{{snmp_device.name}}](/infrastructure/devices/graph?inspectedDevice={{device_namespace.name}}%3A{{snmp_device.name}}).", + "version": 2, + "created_at": "2023-03-17", + "last_updated_at": "2023-03-17", + "title": "Device unreachable alert on {{snmp_device.name}} in namespace {{device_namespace.name}}", "tags": [ "integration:snmp" ], - "options": { - "thresholds": { - "critical": 0.8 + "description": "Notify your team when a SNMP device is unreachable. 
Requires Datadog Agent 7.43+ or 6.43+.", + "definition": { + "message": "{{#is_alert}}\nA network device with IP {{snmp_device.name}} in namespace {{device_namespace.name}} is unreachable and can't be monitored anymore.\n{{/is_alert}}\n\n{{#is_alert_recovery}}\nA network device with IP {{snmp_device.name}} in namespace {{device_namespace.name}} is reachable again.\n{{/is_alert_recovery}}\n\nTo know more about the status of your device, you can have more information from the [NDM page for the device {{device_namespace.name}}:{{snmp_device.name}}](/infrastructure/devices/graph?inspectedDevice={{device_namespace.name}}%3A{{snmp_device.name}}).", + "name": "[SNMP] Device unreachable alert on {{snmp_device.name}} in namespace {{device_namespace.name}}", + "options": { + "include_tags": false, + "new_group_delay": 60, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "thresholds": { + "critical": 0.8 + } }, - "notify_audit": false, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": 0, - "include_tags": false, - "new_group_delay": 60 - }, - "recommended_monitor_metadata": { - "description": "Notify your team when a SNMP device is unreachable. Requires Datadog Agent 7.43+ or 6.43+." + "query": "avg(last_5m):max:snmp.device.reachable{*} by {snmp_device,device_namespace} < 0.8", + "tags": [ + "integration:snmp" + ], + "type": "query alert" } } \ No newline at end of file diff --git a/snmp/assets/monitors/interface_down.json b/snmp/assets/monitors/interface_down.json index 29ab13857c444..8ec7b7a2da5b1 100644 --- a/snmp/assets/monitors/interface_down.json +++ b/snmp/assets/monitors/interface_down.json @@ -1,23 +1,30 @@ { - "name": "[SNMP] Interface {{interface.name}} down alert on device {{snmp_device.name}} in namespace {{device_namespace.name}}", - "type": "query alert", - "query": "avg(last_5m):default_zero(max:snmp.interface.status{status:down} by {snmp_device,device_namespace,interface_index,interface}) >= 0.8", - "message": "{{#is_alert}}\nInterface {{interface.name}} of network device with IP {{snmp_device.name}} in namespace {{device_namespace.name}} is reporting DOWN.\n{{/is_alert}}\n\n{{#is_alert_recovery}}\nInterface {{interface.name}} of network device with IP {{snmp_device.name}} in namespace {{device_namespace.name}} is back online.\n{{/is_alert_recovery}}\n\nTo know more about the status of your device, you can have more information from the [NDM page for the device {{device_namespace.name}}:{{snmp_device.name}}](/infrastructure/devices/graph?inspectedDevice={{device_namespace.name}}%3A{{snmp_device.name}}&detailsTab=interfaces).", + "version": 2, + "created_at": "2023-05-02", + "last_updated_at": "2023-05-02", + "title": "Interface {{interface.name}} down alert on device {{snmp_device.name}} in namespace {{device_namespace.name}}", "tags": [ "integration:snmp" ], - "options": { - "thresholds": { - "critical": 0.8 + "description": "Notify your team when a SNMP device interface is down. 
Requires Datadog Agent 7.43+ or 6.43+.", + "definition": { + "message": "{{#is_alert}}\nInterface {{interface.name}} of network device with IP {{snmp_device.name}} in namespace {{device_namespace.name}} is reporting DOWN.\n{{/is_alert}}\n\n{{#is_alert_recovery}}\nInterface {{interface.name}} of network device with IP {{snmp_device.name}} in namespace {{device_namespace.name}} is back online.\n{{/is_alert_recovery}}\n\nTo know more about the status of your device, you can have more information from the [NDM page for the device {{device_namespace.name}}:{{snmp_device.name}}](/infrastructure/devices/graph?inspectedDevice={{device_namespace.name}}%3A{{snmp_device.name}}\u0026detailsTab=interfaces).", + "name": "[SNMP] Interface {{interface.name}} down alert on device {{snmp_device.name}} in namespace {{device_namespace.name}}", + "options": { + "include_tags": false, + "new_group_delay": 60, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "thresholds": { + "critical": 0.8 + } }, - "notify_audit": false, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": 0, - "include_tags": false, - "new_group_delay": 60 - }, - "recommended_monitor_metadata": { - "description": "Notify your team when a SNMP device interface is down. Requires Datadog Agent 7.43+ or 6.43+." + "query": "avg(last_5m):default_zero(max:snmp.interface.status{status:down} by {snmp_device,device_namespace,interface_index,interface}) >= 0.8", + "tags": [ + "integration:snmp" + ], + "type": "query alert" } } \ No newline at end of file diff --git a/snmp/assets/monitors/traps_linkDown.json b/snmp/assets/monitors/traps_linkDown.json index f28cf986f6045..7b64c9777f768 100644 --- a/snmp/assets/monitors/traps_linkDown.json +++ b/snmp/assets/monitors/traps_linkDown.json @@ -1,112 +1,119 @@ { - "name": "[SNMP-Traps] Interface went down on device {{snmp_device.name}}", - "type": "log alert", - "query": "formula(\"default_zero(query1) / default_zero(query1) - default_zero(query) / default_zero(query)\").last(\"1m\") > 0.5", - "message": "{{#is_alert}} \nA network device with IP {{snmp_device.name}} in namespace {{device_namespace.name}} is reporting CRITICAL and can't be monitored anymore.\n{{/is_alert}}\n\n{{#is_alert_recovery}}\nA network device with IP {{snmp_device.name}} in namespace {{device_namespace.name}} is back online.\n{{/is_alert_recovery}}\n\nTo know more about the status of your device, you can have more information from the [NDM page for the device {{device_namespace.name}}:{{snmp_device.name}}](/infrastructure/devices/graph?inspectedDevice={{device_namespace.name}}%3A{{snmp_device.name}}).", - "tags": [ - "integration:snmp" - ], - "options": { - "thresholds": { - "critical": 0.5, - "critical_recovery": -0.5 - }, - "enable_logs_sample": true, - "notify_audit": false, - "restriction_query": null, - "on_missing_data": "default", - "include_tags": true, - "new_group_delay": 0, - "variables": [ - { - "data_source": "logs", - "name": "query1", - "indexes": [ - "*" - ], - "compute": { - "aggregation": "count" - }, - "group_by": [ - { - "facet": "snmp_device", - "limit": 5, - "sort": { - "order": "desc", - "aggregation": "count" - } - }, - { - "facet": "device_namespace", - "limit": 5, - "sort": { - "order": "desc", - "aggregation": "count" - } - }, - { - "facet": "@ifIndex", - "limit": 5, - "sort": { - "order": "desc", - "aggregation": "count" - } - } - ], - "search": { - "query": "source:snmp-traps @snmpTrapName:linkDown @ifAdminStatus:up" - 
}, - "storage": "hot" - }, - { - "data_source": "logs", - "name": "query", - "indexes": [ - "*" - ], - "compute": { - "aggregation": "count" - }, - "group_by": [ - { - "facet": "snmp_device", - "limit": 5, - "sort": { - "order": "desc", - "aggregation": "count" - } - }, - { - "facet": "device_namespace", - "limit": 5, - "sort": { - "order": "desc", - "aggregation": "count" - } - }, - { - "facet": "@ifIndex", - "limit": 5, - "sort": { - "order": "desc", - "aggregation": "count" - } - } - ], - "search": { - "query": "source:snmp-traps @snmpTrapName:linkUp @ifAdminStatus:up" - }, - "storage": "hot" - } - ], - "evaluation_delay": 60, - "group_retention_duration": "3d", - "groupby_simple_monitor": false, - "silenced": {} - }, - "priority": null, - "restricted_roles": null, - "recommended_monitor_metadata": { - "description": "Notify your team when a linkDown trap is received. You can use this monitor as a template for setting up any traps monitor." - } + "version": 2, + "created_at": "2023-06-15", + "last_updated_at": "2023-06-27", + "title": "Interface went down on device {{snmp_device.name}}", + "tags": [ + "integration:snmp" + ], + "description": "Notify your team when a linkDown trap is received. You can use this monitor as a template for setting up any traps monitor.", + "definition": { + "message": "{{#is_alert}} \nA network device with IP {{snmp_device.name}} in namespace {{device_namespace.name}} is reporting CRITICAL and can't be monitored anymore.\n{{/is_alert}}\n\n{{#is_alert_recovery}}\nA network device with IP {{snmp_device.name}} in namespace {{device_namespace.name}} is back online.\n{{/is_alert_recovery}}\n\nTo know more about the status of your device, you can have more information from the [NDM page for the device {{device_namespace.name}}:{{snmp_device.name}}](/infrastructure/devices/graph?inspectedDevice={{device_namespace.name}}%3A{{snmp_device.name}}).", + "name": "[SNMP-Traps] Interface went down on device {{snmp_device.name}}", + "options": { + "enable_logs_sample": true, + "evaluation_delay": 60, + "group_retention_duration": "3d", + "groupby_simple_monitor": false, + "include_tags": true, + "new_group_delay": 0, + "notify_audit": false, + "on_missing_data": "default", + "restriction_query": null, + "silenced": {}, + "thresholds": { + "critical": 0.5, + "critical_recovery": -0.5 + }, + "variables": [ + { + "compute": { + "aggregation": "count" + }, + "data_source": "logs", + "group_by": [ + { + "facet": "snmp_device", + "limit": 5, + "sort": { + "aggregation": "count", + "order": "desc" + } + }, + { + "facet": "device_namespace", + "limit": 5, + "sort": { + "aggregation": "count", + "order": "desc" + } + }, + { + "facet": "@ifIndex", + "limit": 5, + "sort": { + "aggregation": "count", + "order": "desc" + } + } + ], + "indexes": [ + "*" + ], + "name": "query1", + "search": { + "query": "source:snmp-traps @snmpTrapName:linkDown @ifAdminStatus:up" + }, + "storage": "hot" + }, + { + "compute": { + "aggregation": "count" + }, + "data_source": "logs", + "group_by": [ + { + "facet": "snmp_device", + "limit": 5, + "sort": { + "aggregation": "count", + "order": "desc" + } + }, + { + "facet": "device_namespace", + "limit": 5, + "sort": { + "aggregation": "count", + "order": "desc" + } + }, + { + "facet": "@ifIndex", + "limit": 5, + "sort": { + "aggregation": "count", + "order": "desc" + } + } + ], + "indexes": [ + "*" + ], + "name": "query", + "search": { + "query": "source:snmp-traps @snmpTrapName:linkUp @ifAdminStatus:up" + }, + "storage": "hot" + } + ] + }, + "priority": null, 
+ "query": "formula(\"default_zero(query1) / default_zero(query1) - default_zero(query) / default_zero(query)\").last(\"1m\") > 0.5", + "restricted_roles": null, + "tags": [ + "integration:snmp" + ], + "type": "log alert" + } } \ No newline at end of file diff --git a/snowflake/assets/monitors/snowflake_failed_logins.json b/snowflake/assets/monitors/snowflake_failed_logins.json new file mode 100644 index 0000000000000..cbaac2f748bff --- /dev/null +++ b/snowflake/assets/monitors/snowflake_failed_logins.json @@ -0,0 +1,34 @@ +{ + "version": 2, + "created_at": "2020-09-16", + "last_updated_at": "2020-12-10", + "title": "Increased Failed Login Attempts", + "tags": [ + "integration:snowflake" + ], + "description": "Notify your team when there has been increased failed login attempts to your account.", + "definition": { + "message": "There has been at least 10 failed login attempts in the past 24 hours.", + "name": "[Snowflake] Increased Failed Login Attempts", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": true, + "thresholds": { + "critical": 10 + }, + "timeout_h": 0 + }, + "query": "sum(last_1d):avg:snowflake.logins.fail.count{*}.as_count() > 10", + "tags": [ + "integration:snowflake" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/snowflake/assets/recommended_monitors/snowflake_failed_logins.json b/snowflake/assets/recommended_monitors/snowflake_failed_logins.json deleted file mode 100644 index ff2168e485c4d..0000000000000 --- a/snowflake/assets/recommended_monitors/snowflake_failed_logins.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "name": "[Snowflake] Increased Failed Login Attempts", - "type": "query alert", - "query": "sum(last_1d):avg:snowflake.logins.fail.count{*}.as_count() > 10", - "message": "There has been at least 10 failed login attempts in the past 24 hours.", - "tags": [ - "integration:snowflake" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 10 - } - }, - "recommended_monitor_metadata": { - "description": "Notify your team when there has been increased failed login attempts to your account." 
- } -} \ No newline at end of file diff --git a/snowflake/manifest.json b/snowflake/manifest.json index ba246baa7ea73..95591ff993573 100644 --- a/snowflake/manifest.json +++ b/snowflake/manifest.json @@ -50,7 +50,7 @@ "Snowflake Organization Metrics": "assets/dashboards/organization_metrics.json" }, "monitors": { - "Snowflake failed logins": "assets/recommended_monitors/snowflake_failed_logins.json" + "Snowflake failed logins": "assets/monitors/snowflake_failed_logins.json" } } } diff --git a/sonarqube/assets/monitors/vulnerabilities.json b/sonarqube/assets/monitors/vulnerabilities.json new file mode 100644 index 0000000000000..db129218662a6 --- /dev/null +++ b/sonarqube/assets/monitors/vulnerabilities.json @@ -0,0 +1,35 @@ +{ + "version": 2, + "created_at": "2020-12-07", + "last_updated_at": "2021-02-03", + "title": "Vulnerabilities", + "tags": [ + "integration:sonarqube" + ], + "description": "Resolve potential vulnerabilities.", + "definition": { + "message": "At least one vulnerability has been detected.", + "name": "[SonarQube] Vulnerabilities", + "options": { + "escalation_message": "Resolve potential vulnerabilities.", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": true, + "notify_no_data": false, + "renotify_interval": 360, + "require_full_window": true, + "thresholds": { + "critical": 0 + }, + "timeout_h": 0 + }, + "priority": 2, + "query": "max(last_5m):avg:sonarqube.security.vulnerabilities{*} > 0", + "tags": [ + "integration:sonarqube" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/sonarqube/assets/recommended_monitors/vulnerabilities.json b/sonarqube/assets/recommended_monitors/vulnerabilities.json deleted file mode 100644 index b344c47b4c4ef..0000000000000 --- a/sonarqube/assets/recommended_monitors/vulnerabilities.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "name": "[SonarQube] Vulnerabilities", - "type": "query alert", - "query": "max(last_5m):avg:sonarqube.security.vulnerabilities{*} > 0", - "message": "At least one vulnerability has been detected.", - "tags": [ - "integration:sonarqube" - ], - "options": { - "notify_audit": true, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": 360, - "escalation_message": "Resolve potential vulnerabilities.", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 0 - } - }, - "priority": 2, - "recommended_monitor_metadata": { - "description": "Resolve potential vulnerabilities." 
- } -} diff --git a/sonarqube/manifest.json b/sonarqube/manifest.json index 3e796e3f4ca2d..32e7ef49e798b 100644 --- a/sonarqube/manifest.json +++ b/sonarqube/manifest.json @@ -49,7 +49,7 @@ "Sonarqube Overview": "assets/dashboards/overview.json" }, "monitors": { - "SonarQube vulnerabilities": "assets/recommended_monitors/vulnerabilities.json" + "SonarQube vulnerabilities": "assets/monitors/vulnerabilities.json" }, "saved_views": { "status_overview": "assets/saved_views/status_overview.json" diff --git a/sqlserver/assets/monitors/sqlserver_ao_not_healthy.json b/sqlserver/assets/monitors/sqlserver_ao_not_healthy.json new file mode 100644 index 0000000000000..b2f0e2ce6f46a --- /dev/null +++ b/sqlserver/assets/monitors/sqlserver_ao_not_healthy.json @@ -0,0 +1,35 @@ +{ + "version": 2, + "created_at": "2021-01-08", + "last_updated_at": "2021-01-08", + "title": "Availability Group is not healthy", + "tags": [ + "integration:sql-server" + ], + "description": "Notify your team when your availability group is not healthy.", + "definition": { + "message": "Availability group has not been healthy for the last 5 minutes", + "name": "[SQLServer] Availability Group is not healthy", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": true, + "thresholds": { + "critical": 1, + "warning": 2 + }, + "timeout_h": 0 + }, + "query": "avg(last_5m):avg:sqlserver.ao.ag_sync_health{*} < 1", + "tags": [ + "integration:sql-server" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/sqlserver/assets/monitors/sqlserver_db_not_online.json b/sqlserver/assets/monitors/sqlserver_db_not_online.json new file mode 100644 index 0000000000000..90bf8873ebfd7 --- /dev/null +++ b/sqlserver/assets/monitors/sqlserver_db_not_online.json @@ -0,0 +1,34 @@ +{ + "version": 2, + "created_at": "2021-01-08", + "last_updated_at": "2021-01-08", + "title": "Database is not online", + "tags": [ + "integration:sql-server" + ], + "description": "Notify your team when your database is not online.", + "definition": { + "message": "SQLServer database is not online for the last 5 minutes", + "name": "[SQLServer] Database is not online", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": true, + "thresholds": { + "critical": 0 + }, + "timeout_h": 0 + }, + "query": "avg(last_5m):avg:sqlserver.database.state{*} > 0", + "tags": [ + "integration:sql-server" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/sqlserver/assets/monitors/sqlserver_db_not_sync.json b/sqlserver/assets/monitors/sqlserver_db_not_sync.json new file mode 100644 index 0000000000000..a58f870a41833 --- /dev/null +++ b/sqlserver/assets/monitors/sqlserver_db_not_sync.json @@ -0,0 +1,34 @@ +{ + "version": 2, + "created_at": "2021-01-08", + "last_updated_at": "2021-01-13", + "title": "Database is not marked for replication sync", + "tags": [ + "integration:sql-server" + ], + "description": "Notify your team when your database is not in sync with its backup.", + "definition": { + "message": "SQLServer database is not marked for replication sync. 
It may not be synced with its backup.", + "name": "[SQLServer] Database is not marked for replication sync", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": true, + "thresholds": { + "critical": 1 + }, + "timeout_h": 0 + }, + "query": "avg(last_5m):avg:sqlserver.database.is_sync_with_backup{*} < 1", + "tags": [ + "integration:sql-server" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/sqlserver/assets/monitors/sqlserver_high_number_failed_auto_param.json b/sqlserver/assets/monitors/sqlserver_high_number_failed_auto_param.json new file mode 100644 index 0000000000000..3e51d480d56a7 --- /dev/null +++ b/sqlserver/assets/monitors/sqlserver_high_number_failed_auto_param.json @@ -0,0 +1,34 @@ +{ + "version": 2, + "created_at": "2021-01-08", + "last_updated_at": "2021-01-08", + "title": "High number of failed auto-parameterization attempts", + "tags": [ + "integration:sql-server" + ], + "description": "Notify your team when a high number of auto-parameterization are failing.", + "definition": { + "message": "There is a high number of failed auto-parameterization attempts in the past 5 minutes", + "name": "[SQLServer] High number of failed auto-parameterization attempts", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": true, + "thresholds": { + "critical": 10 + }, + "timeout_h": 0 + }, + "query": "avg(last_5m):avg:sqlserver.stats.failed_auto_param_attempts{*} > 10", + "tags": [ + "integration:sql-server" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/sqlserver/assets/monitors/sqlserver_high_processes_blocked.json b/sqlserver/assets/monitors/sqlserver_high_processes_blocked.json new file mode 100644 index 0000000000000..f5ee0435012cf --- /dev/null +++ b/sqlserver/assets/monitors/sqlserver_high_processes_blocked.json @@ -0,0 +1,34 @@ +{ + "version": 2, + "created_at": "2021-01-08", + "last_updated_at": "2021-01-08", + "title": "High number of processes blocked", + "tags": [ + "integration:sql-server" + ], + "description": "Notify your team when a high number of processes are being blocked.", + "definition": { + "message": "There is a high number of processes being blocked in the past 5 minutes", + "name": "[SQLServer] High number of processes blocked", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": true, + "thresholds": { + "critical": 50 + }, + "timeout_h": 0 + }, + "query": "avg(last_5m):avg:sqlserver.stats.procs_blocked{*} > 50", + "tags": [ + "integration:sql-server" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/sqlserver/assets/recommended_monitors/sqlserver_ao_not_healthy.json b/sqlserver/assets/recommended_monitors/sqlserver_ao_not_healthy.json deleted file mode 100644 index bdda1ff178af2..0000000000000 --- a/sqlserver/assets/recommended_monitors/sqlserver_ao_not_healthy.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "name": "[SQLServer] Availability Group is not healthy", - "type": "query alert", - "query": 
"avg(last_5m):avg:sqlserver.ao.ag_sync_health{*} < 1", - "message": "Availability group has not been healthy for the last 5 minutes", - "tags": [ - "integration:sql-server" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1, - "warning": 2 - } - }, - "recommended_monitor_metadata": { - "description": "Notify your team when your availability group is not healthy." - } -} \ No newline at end of file diff --git a/sqlserver/assets/recommended_monitors/sqlserver_db_not_online.json b/sqlserver/assets/recommended_monitors/sqlserver_db_not_online.json deleted file mode 100644 index f81c545ed2df2..0000000000000 --- a/sqlserver/assets/recommended_monitors/sqlserver_db_not_online.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "name": "[SQLServer] Database is not online", - "type": "query alert", - "query": "avg(last_5m):avg:sqlserver.database.state{*} > 0", - "message": "SQLServer database is not online for the last 5 minutes", - "tags": [ - "integration:sql-server" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 0 - } - }, - "recommended_monitor_metadata": { - "description": "Notify your team when your database is not online." - } -} \ No newline at end of file diff --git a/sqlserver/assets/recommended_monitors/sqlserver_db_not_sync.json b/sqlserver/assets/recommended_monitors/sqlserver_db_not_sync.json deleted file mode 100644 index 2a259f1945eed..0000000000000 --- a/sqlserver/assets/recommended_monitors/sqlserver_db_not_sync.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "name": "[SQLServer] Database is not marked for replication sync", - "type": "query alert", - "query": "avg(last_5m):avg:sqlserver.database.is_sync_with_backup{*} < 1", - "message": "SQLServer database is not marked for replication sync. It may not be synced with its backup.", - "tags": [ - "integration:sql-server" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1 - } - }, - "recommended_monitor_metadata": { - "description": "Notify your team when your database is not in sync with its backup." 
- } -} \ No newline at end of file diff --git a/sqlserver/assets/recommended_monitors/sqlserver_high_number_failed_auto_param.json b/sqlserver/assets/recommended_monitors/sqlserver_high_number_failed_auto_param.json deleted file mode 100644 index d1df6ec212a8b..0000000000000 --- a/sqlserver/assets/recommended_monitors/sqlserver_high_number_failed_auto_param.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "name": "[SQLServer] High number of failed auto-parameterization attempts", - "type": "query alert", - "query": "avg(last_5m):avg:sqlserver.stats.failed_auto_param_attempts{*} > 10", - "message": "There is a high number of failed auto-parameterization attempts in the past 5 minutes", - "tags": [ - "integration:sql-server" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 10 - } - }, - "recommended_monitor_metadata": { - "description": "Notify your team when a high number of auto-parameterization are failing." - } -} \ No newline at end of file diff --git a/sqlserver/assets/recommended_monitors/sqlserver_high_processes_blocked.json b/sqlserver/assets/recommended_monitors/sqlserver_high_processes_blocked.json deleted file mode 100644 index 1d8cd144870d9..0000000000000 --- a/sqlserver/assets/recommended_monitors/sqlserver_high_processes_blocked.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "name": "[SQLServer] High number of processes blocked", - "type": "query alert", - "query": "avg(last_5m):avg:sqlserver.stats.procs_blocked{*} > 50", - "message": "There is a high number of processes being blocked in the past 5 minutes", - "tags": [ - "integration:sql-server" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 50 - } - }, - "recommended_monitor_metadata": { - "description": "Notify your team when a high number of processes are being blocked." 
- } -} \ No newline at end of file diff --git a/sqlserver/manifest.json b/sqlserver/manifest.json index ab4683e9aca75..a8a2bfdad6b75 100644 --- a/sqlserver/manifest.json +++ b/sqlserver/manifest.json @@ -50,11 +50,11 @@ "sqlserver": "assets/dashboards/sqlserver_dashboard.json" }, "monitors": { - "SQLServer ao not healthy": "assets/recommended_monitors/sqlserver_ao_not_healthy.json", - "SQLServer high processes blocked": "assets/recommended_monitors/sqlserver_high_processes_blocked.json", - "SQLServer high failed auto param": "assets/recommended_monitors/sqlserver_high_number_failed_auto_param.json", - "SQLServer db not online": "assets/recommended_monitors/sqlserver_db_not_online.json", - "SQLServer db not in sync": "assets/recommended_monitors/sqlserver_db_not_sync.json" + "SQLServer ao not healthy": "assets/monitors/sqlserver_ao_not_healthy.json", + "SQLServer high processes blocked": "assets/monitors/sqlserver_high_processes_blocked.json", + "SQLServer high failed auto param": "assets/monitors/sqlserver_high_number_failed_auto_param.json", + "SQLServer db not online": "assets/monitors/sqlserver_db_not_online.json", + "SQLServer db not in sync": "assets/monitors/sqlserver_db_not_sync.json" }, "logs": { "source": "sqlserver" diff --git a/teamcity/assets/monitors/build_status.json b/teamcity/assets/monitors/build_status.json new file mode 100644 index 0000000000000..184e7ea9814b3 --- /dev/null +++ b/teamcity/assets/monitors/build_status.json @@ -0,0 +1,39 @@ +{ + "version": 2, + "created_at": "2022-12-01", + "last_updated_at": "2022-12-01", + "title": "TeamCity Build Status", + "tags": [ + "integration:teamcity" + ], + "description": "Notify your team when your Build Configuration is not healthy.", + "definition": { + "message": "{{#is_alert}}\nBuild configuration {{build_config.name}} experienced a critical build status. \n{{/is_alert}}", + "name": "TeamCity Build Status", + "options": { + "avalanche_window": 10, + "escalation_message": "", + "include_tags": true, + "new_group_delay": 60, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "silenced": { + "*": null + }, + "thresholds": { + "critical": 1, + "ok": 2, + "warning": 1 + }, + "timeout_h": 0 + }, + "priority": null, + "query": "\"teamcity.build.status\".over(\"*\").by(\"build_config\").last(3).count_by_status()", + "restricted_roles": null, + "tags": [ + "integration:teamcity" + ], + "type": "service check" + } +} \ No newline at end of file diff --git a/teamcity/assets/recommended_monitors/build_status.json b/teamcity/assets/recommended_monitors/build_status.json deleted file mode 100644 index b0696ef9cb2cb..0000000000000 --- a/teamcity/assets/recommended_monitors/build_status.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "name": "TeamCity Build Status", - "type": "service check", - "query": "\"teamcity.build.status\".over(\"*\").by(\"build_config\").last(3).count_by_status()", - "message": "{{#is_alert}}\nBuild configuration {{build_config.name}} experienced a critical build status. 
\n{{/is_alert}}", - "tags": [ - "integration:teamcity" - ], - "options": { - "thresholds": { - "critical": 1, - "warning": 1, - "ok": 2 - }, - "notify_audit": false, - "notify_no_data": false, - "renotify_interval": 0, - "timeout_h": 0, - "new_group_delay": 60, - "include_tags": true, - "avalanche_window": 10, - "escalation_message": "", - "silenced": { - "*": null - } - }, - "priority": null, - "restricted_roles": null, - "recommended_monitor_metadata": { - "description": "Notify your team when your Build Configuration is not healthy." - } -} \ No newline at end of file diff --git a/teamcity/manifest.json b/teamcity/manifest.json index 6ff76ef7cb3e4..3bfe7b969afec 100644 --- a/teamcity/manifest.json +++ b/teamcity/manifest.json @@ -56,7 +56,7 @@ "TeamCity Overview": "assets/dashboards/overview.json" }, "monitors": { - "Build Status": "assets/recommended_monitors/build_status.json" + "Build Status": "assets/monitors/build_status.json" }, "saved_views": { "teamcity_processes": "assets/saved_views/teamcity_processes.json" diff --git a/temporal/assets/monitors/FrontendLatency.json b/temporal/assets/monitors/FrontendLatency.json index 51dc1eb5b3d28..98fe16292ae1b 100644 --- a/temporal/assets/monitors/FrontendLatency.json +++ b/temporal/assets/monitors/FrontendLatency.json @@ -1,29 +1,36 @@ { - "name": "Temporal frontend latency is elevated", - "type": "query alert", - "query": "avg(last_5m):sum:temporal.server.service.latency.sum{!operation:poll*,service_name:frontend}.as_rate() / sum:temporal.server.service.latency.count{!operation:poll*,service_name:frontend}.as_rate() > 1000", - "message": "The latency for the Temporal frontend service is elevated ({{ value }}).", - "tags": [ - "integration:temporal", - "service_name:frontend" - ], - "options": { - "thresholds": { - "critical": 1000, - "warning": 500 - }, - "notify_audit": false, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": 0, - "include_tags": false, - "avalanche_window": 10, - "new_host_delay": 300, - "silenced": {} - }, - "priority": null, - "restricted_roles": null, - "recommended_monitor_metadata": { - "description": "Get notified when the frontend latency is elevated." 
- } -} + "version": 2, + "created_at": "2023-04-13", + "last_updated_at": "2023-04-13", + "title": "Temporal frontend latency is elevated", + "tags": [ + "integration:temporal" + ], + "description": "Get notified when the frontend latency is elevated.", + "definition": { + "message": "The latency for the Temporal frontend service is elevated ({{ value }}).", + "name": "Temporal frontend latency is elevated", + "options": { + "avalanche_window": 10, + "include_tags": false, + "new_host_delay": 300, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "silenced": {}, + "thresholds": { + "critical": 1000, + "warning": 500 + } + }, + "priority": null, + "query": "avg(last_5m):sum:temporal.server.service.latency.sum{!operation:poll*,service_name:frontend}.as_rate() / sum:temporal.server.service.latency.count{!operation:poll*,service_name:frontend}.as_rate() > 1000", + "restricted_roles": null, + "tags": [ + "integration:temporal", + "service_name:frontend" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/temporal/assets/monitors/HistoryLatency.json b/temporal/assets/monitors/HistoryLatency.json index 947efdee7e70d..4855e0f143f51 100644 --- a/temporal/assets/monitors/HistoryLatency.json +++ b/temporal/assets/monitors/HistoryLatency.json @@ -1,29 +1,36 @@ { - "name": "Temporal history service latency is elevated", - "type": "query alert", - "query": "avg(last_5m):sum:temporal.server.service.latency.sum{service_name:history}.as_rate() / sum:temporal.server.service.latency.count{service_name:history}.as_rate() > 1000", - "message": "The latency for the Temporal history service is elevated ({{ value }}).", - "tags": [ - "integration:temporal", - "service_name:history" - ], - "options": { - "thresholds": { - "critical": 1000, - "warning": 500 - }, - "notify_audit": false, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": 0, - "include_tags": false, - "avalanche_window": 10, - "new_host_delay": 300, - "silenced": {} - }, - "priority": null, - "restricted_roles": null, - "recommended_monitor_metadata": { - "description": "Get notified when the history service's latency is elevated." 
- } -} + "version": 2, + "created_at": "2023-04-13", + "last_updated_at": "2023-04-13", + "title": "Temporal history service latency is elevated", + "tags": [ + "integration:temporal" + ], + "description": "Get notified when the history service's latency is elevated.", + "definition": { + "message": "The latency for the Temporal history service is elevated ({{ value }}).", + "name": "Temporal history service latency is elevated", + "options": { + "avalanche_window": 10, + "include_tags": false, + "new_host_delay": 300, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "silenced": {}, + "thresholds": { + "critical": 1000, + "warning": 500 + } + }, + "priority": null, + "query": "avg(last_5m):sum:temporal.server.service.latency.sum{service_name:history}.as_rate() / sum:temporal.server.service.latency.count{service_name:history}.as_rate() > 1000", + "restricted_roles": null, + "tags": [ + "integration:temporal", + "service_name:history" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/temporal/assets/monitors/MatchingLatency.json b/temporal/assets/monitors/MatchingLatency.json index b6fdb1690ee26..c92430d77c5eb 100644 --- a/temporal/assets/monitors/MatchingLatency.json +++ b/temporal/assets/monitors/MatchingLatency.json @@ -1,28 +1,35 @@ { - "name": "Temporal matching service latency is elevated", - "type": "query alert", - "query": "avg(last_5m):sum:temporal.server.service.latency.sum{service_name:matching}.as_rate() / sum:temporal.server.service.latency.count{service_name:matching}.as_rate() > 1000", - "message": "The latency for the Temporal matching service is elevated ({{ value }}).", - "tags": [ - "integration:temporal", - "service_name:matching" - ], - "options": { - "thresholds": { - "critical": 1000, - "warning": 500 - }, - "notify_audit": false, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": 0, - "include_tags": false, - "avalanche_window": 10, - "silenced": {} - }, - "priority": null, - "restricted_roles": null, - "recommended_monitor_metadata": { - "description": "Get notified when the matching service's latency is elevated." 
- } -} + "version": 2, + "created_at": "2023-04-13", + "last_updated_at": "2023-04-13", + "title": "Temporal matching service latency is elevated", + "tags": [ + "integration:temporal" + ], + "description": "Get notified when the matching service's latency is elevated.", + "definition": { + "message": "The latency for the Temporal matching service is elevated ({{ value }}).", + "name": "Temporal matching service latency is elevated", + "options": { + "avalanche_window": 10, + "include_tags": false, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "silenced": {}, + "thresholds": { + "critical": 1000, + "warning": 500 + } + }, + "priority": null, + "query": "avg(last_5m):sum:temporal.server.service.latency.sum{service_name:matching}.as_rate() / sum:temporal.server.service.latency.count{service_name:matching}.as_rate() > 1000", + "restricted_roles": null, + "tags": [ + "integration:temporal", + "service_name:matching" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/temporal/assets/monitors/PersistenceLatency.json b/temporal/assets/monitors/PersistenceLatency.json index 88ed3db9255c4..6a4d32edbe3d6 100644 --- a/temporal/assets/monitors/PersistenceLatency.json +++ b/temporal/assets/monitors/PersistenceLatency.json @@ -1,28 +1,35 @@ { - "name": "Temporal persistence latency is elevated", - "type": "query alert", - "query": "avg(last_5m):sum:temporal.server.persistence.latency.sum{*}.as_rate() / sum:temporal.server.persistence.latency.count{*}.as_rate() > 1000", - "message": "The latency for the Temporal persistence is elevated ({{ value }}).", - "tags": [ - "integration:temporal" - ], - "options": { - "thresholds": { - "critical": 1000, - "warning": 500 - }, - "notify_audit": false, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": 0, - "include_tags": false, - "avalanche_window": 10, - "new_host_delay": 300, - "silenced": {} - }, - "priority": null, - "restricted_roles": null, - "recommended_monitor_metadata": { - "description": "Get notified when the persistence's latency is elevated." 
- } -} + "version": 2, + "created_at": "2023-04-13", + "last_updated_at": "2023-04-13", + "title": "Temporal persistence latency is elevated", + "tags": [ + "integration:temporal" + ], + "description": "Get notified when the persistence's latency is elevated.", + "definition": { + "message": "The latency for the Temporal persistence is elevated ({{ value }}).", + "name": "Temporal persistence latency is elevated", + "options": { + "avalanche_window": 10, + "include_tags": false, + "new_host_delay": 300, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "silenced": {}, + "thresholds": { + "critical": 1000, + "warning": 500 + } + }, + "priority": null, + "query": "avg(last_5m):sum:temporal.server.persistence.latency.sum{*}.as_rate() / sum:temporal.server.persistence.latency.count{*}.as_rate() > 1000", + "restricted_roles": null, + "tags": [ + "integration:temporal" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/teradata/assets/recommended_monitors/high_disk_space.json b/teradata/assets/monitors/high_disk_space.json similarity index 63% rename from teradata/assets/recommended_monitors/high_disk_space.json rename to teradata/assets/monitors/high_disk_space.json index 454c7c7298c60..960eaa08ad41b 100644 --- a/teradata/assets/recommended_monitors/high_disk_space.json +++ b/teradata/assets/monitors/high_disk_space.json @@ -1,29 +1,36 @@ { + "version": 2, + "created_at": "2022-07-18", + "last_updated_at": "2022-07-18", + "title": "High database disk space in use", + "tags": [ + "integration:teradata" + ], + "description": "Notifies when Teradata database disk space in use is higher than usual.", + "definition": { + "message": "{{#is_warning}}Total current disk space in use for database {{td_database.name}} on server {{teradata_server.name}} is more than 50%{{/is_warning}} \n{{#is_alert}}Total current disk space in use for database {{td_database.name}} on server {{teradata_server.name}} is more than 75%{{/is_alert}}\n{{#is_recovery}}Total current disk space in use for database {{td_database.name}} on server {{teradata_server.name}} is less than 50%{{/is_recovery}}\n", "name": "[Teradata] High database disk space in use", - "type": "query alert", + "options": { + "new_group_delay": 60, + "no_data_timeframe": 10, + "notify_audit": true, + "notify_no_data": true, + "renotify_interval": 0, + "require_full_window": false, + "threshold_windows": { + "recovery_window": "last_15m", + "trigger_window": "last_15m" + }, + "thresholds": { + "critical": 0.75, + "critical_recovery": 0, + "warning": 0.5 + } + }, "query": "avg(last_15m):anomalies(((avg:teradata.disk_space.curr_perm.total{*} by {host,teradata_server,td_database,td_amp} + avg:teradata.disk_space.curr_spool.total{*} by {host,teradata_server,td_database,td_amp} + avg:teradata.disk_space.curr_temp.total{*} by {host,teradata_server,td_database,td_amp}) / (avg:teradata.disk_space.max_perm.total{*} by {host,teradata_server,td_database,td_amp} + avg:teradata.disk_space.max_spool.total{*} by {host,teradata_server,td_database,td_amp} + avg:teradata.disk_space.max_temp.total{*} by {host,teradata_server,td_database,td_amp})) * 100, 'basic', 2, direction='both', interval=60, alert_window='last_15m', count_default_zero='true', seasonality='hourly') >= 0.75", - "message": "{{#is_warning}}Total current disk space in use for database {{td_database.name}} on server {{teradata_server.name}} is more than 50%{{/is_warning}} \n{{#is_alert}}Total current disk space in use for 
database {{td_database.name}} on server {{teradata_server.name}} is more than 75%{{/is_alert}}\n{{#is_recovery}}Total current disk space in use for database {{td_database.name}} on server {{teradata_server.name}} is less than 50%{{/is_recovery}}\n", "tags": [ - "integration:teradata" + "integration:teradata" ], - "options": { - "thresholds": { - "critical": 0.75, - "critical_recovery": 0, - "warning": 0.5 - }, - "notify_audit": true, - "require_full_window": false, - "notify_no_data": true, - "no_data_timeframe": 10, - "renotify_interval": 0, - "threshold_windows": { - "trigger_window": "last_15m", - "recovery_window": "last_15m" - }, - "new_group_delay": 60 - }, - "recommended_monitor_metadata": { - "description": "Notifies when Teradata database disk space in use is higher than usual." - } + "type": "query alert" + } } \ No newline at end of file diff --git a/teradata/assets/recommended_monitors/low_ready_threads.json b/teradata/assets/monitors/low_ready_threads.json similarity index 50% rename from teradata/assets/recommended_monitors/low_ready_threads.json rename to teradata/assets/monitors/low_ready_threads.json index 864a2ad493234..ef897a4ee629b 100644 --- a/teradata/assets/recommended_monitors/low_ready_threads.json +++ b/teradata/assets/monitors/low_ready_threads.json @@ -1,29 +1,36 @@ { + "version": 2, + "created_at": "2022-07-18", + "last_updated_at": "2022-07-18", + "title": "Low number of ready threads", + "tags": [ + "integration:teradata" + ], + "description": "Notifies when Teradata database ready threads are lower than usual.", + "definition": { + "message": "{{#is_warning}}Number of runnable ready threads on server {{teradata_server.name}} is less than 60%{{/is_warning}} \n{{#is_alert}}Number of runnable ready threads on server {{teradata_server.name}} is more than 80%{{/is_alert}}\n{{#is_recovery}}Number of runnable ready threads on server {{teradata_server.name}} is less than 60%{{/is_recovery}}", "name": "[Teradata] Low number of ready threads", - "type": "query alert", + "options": { + "new_group_delay": 60, + "no_data_timeframe": 10, + "notify_audit": true, + "notify_no_data": true, + "renotify_interval": 0, + "require_full_window": false, + "threshold_windows": { + "recovery_window": "last_15m", + "trigger_window": "last_15m" + }, + "thresholds": { + "critical": 0.8, + "critical_recovery": 0, + "warning": 0.6 + } + }, "query": "avg(last_15m):anomalies((avg:teradata.process.ready{*} by {host,teradata_server} / avg:teradata.process.ready_max{*} by {host,teradata_server}) * 100, 'basic', 2, direction='below', interval=60, alert_window='last_15m', count_default_zero='true', seasonality='hourly') >= 0.8", - "message": "{{#is_warning}}Number of runnable ready threads on server {{teradata_server.name}} is less than 60%{{/is_warning}} \n{{#is_alert}}Number of runnable ready threads on server {{teradata_server.name}} is more than 80%{{/is_alert}}\n{{#is_recovery}}Number of runnable ready threads on server {{teradata_server.name}} is less than 60%{{/is_recovery}}", "tags": [ - "integration:teradata" + "integration:teradata" ], - "options": { - "thresholds": { - "critical": 0.8, - "critical_recovery": 0, - "warning": 0.6 - }, - "notify_audit": true, - "require_full_window": false, - "notify_no_data": true, - "no_data_timeframe": 10, - "renotify_interval": 0, - "threshold_windows": { - "trigger_window": "last_15m", - "recovery_window": "last_15m" - }, - "new_group_delay": 60 - }, - "recommended_monitor_metadata": { - "description": "Notifies when Teradata database ready threads 
are lower than usual." - } + "type": "query alert" + } } \ No newline at end of file diff --git a/teradata/manifest.json b/teradata/manifest.json index fd3f3f93a3b43..2608727567322 100644 --- a/teradata/manifest.json +++ b/teradata/manifest.json @@ -47,8 +47,8 @@ "Teradata Overview": "assets/dashboards/teradata_overview.json" }, "monitors": { - "High disk space": "assets/recommended_monitors/high_disk_space.json", - "Low ready threads": "assets/recommended_monitors/low_ready_threads.json" + "High disk space": "assets/monitors/high_disk_space.json", + "Low ready threads": "assets/monitors/low_ready_threads.json" } } } \ No newline at end of file diff --git a/tomcat/assets/monitors/error_count.json b/tomcat/assets/monitors/error_count.json index 28a8e02b9d2f0..8624f71912e1b 100644 --- a/tomcat/assets/monitors/error_count.json +++ b/tomcat/assets/monitors/error_count.json @@ -1,30 +1,39 @@ { - "name": "[Tomcat] Increase of the errors/second rate for host: {{host.name}}", - "type": "query alert", - "query": "avg(last_4h):anomalies(avg:tomcat.error_count{*} by {host}, 'agile', 2, direction='above', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='daily') >= 1", - "message": "There is an increase of the number of errors per second on all request processors for host: {{host.name}}\n\nErrors indicate an issue with the Tomcat server itself, a host, a deployed application, or an application servlet. This includes errors generated when the Tomcat server runs out of memory, can’t find a requested file or servlet, or is unable to serve a JSP due to syntax errors in the servlet codebase.", - "tags": ["integration:tomcat"], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1, - "critical_recovery": 0 + "version": 2, + "created_at": "2021-03-09", + "last_updated_at": "2021-03-09", + "title": "Increase of the errors/second rate for host: {{host.name}}", + "tags": [ + "integration:tomcat" + ], + "description": "Notifies when Tomcat experiences an increase of error rate for a specific host.", + "definition": { + "message": "There is an increase of the number of errors per second on all request processors for host: {{host.name}}\n\nErrors indicate an issue with the Tomcat server itself, a host, a deployed application, or an application servlet. This includes errors generated when the Tomcat server runs out of memory, can’t find a requested file or servlet, or is unable to serve a JSP due to syntax errors in the servlet codebase.", + "name": "[Tomcat] Increase of the errors/second rate for host: {{host.name}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": true, + "threshold_windows": { + "recovery_window": "last_15m", + "trigger_window": "last_15m" + }, + "thresholds": { + "critical": 1, + "critical_recovery": 0 + }, + "timeout_h": 0 }, - "threshold_windows": { - "trigger_window": "last_15m", - "recovery_window": "last_15m" - } - }, - "recommended_monitor_metadata": { - "description": "Notifies when Tomcat experiences an increase of error rate for a specific host." 
+ "query": "avg(last_4h):anomalies(avg:tomcat.error_count{*} by {host}, 'agile', 2, direction='above', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='daily') >= 1", + "tags": [ + "integration:tomcat" + ], + "type": "query alert" } -} +} \ No newline at end of file diff --git a/tomcat/assets/monitors/max_proc_time.json b/tomcat/assets/monitors/max_proc_time.json index a3f1b0118951e..bf6d015b9e2fe 100644 --- a/tomcat/assets/monitors/max_proc_time.json +++ b/tomcat/assets/monitors/max_proc_time.json @@ -1,30 +1,39 @@ { - "name": "[Tomcat] Anomalous max processing time for host {{host.name}}", - "type": "query alert", - "query": "avg(last_4h):anomalies(avg:tomcat.max_time{*} by {host}, 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='daily') >= 1", - "message": "There is an anomaly in the Tomcat max processing time on host: {{host.name}} \n\n`tomcat.max_time` indicates the maximum amount of time it takes for the server to process one request: from the time an available thread starts processing the request to the time it returns a response. Its value updates whenever the server detects a longer request processing time than the current `tomcat.max_time`.\n\nA spike in max processing time could indicate that a JSP page isn’t loading or an associated process (such as a database query) is taking too long to complete. ", - "tags": ["integration:tomcat"], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1, - "critical_recovery": 0 + "version": 2, + "created_at": "2021-03-09", + "last_updated_at": "2021-03-09", + "title": "Anomalous max processing time for host {{host.name}}", + "tags": [ + "integration:tomcat" + ], + "description": "Notifies when Tomcat experiences an anomalous max processing time for a specific host.", + "definition": { + "message": "There is an anomaly in the Tomcat max processing time on host: {{host.name}} \n\n`tomcat.max_time` indicates the maximum amount of time it takes for the server to process one request: from the time an available thread starts processing the request to the time it returns a response. Its value updates whenever the server detects a longer request processing time than the current `tomcat.max_time`.\n\nA spike in max processing time could indicate that a JSP page isn’t loading or an associated process (such as a database query) is taking too long to complete. ", + "name": "[Tomcat] Anomalous max processing time for host {{host.name}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": true, + "threshold_windows": { + "recovery_window": "last_15m", + "trigger_window": "last_15m" + }, + "thresholds": { + "critical": 1, + "critical_recovery": 0 + }, + "timeout_h": 0 }, - "threshold_windows": { - "trigger_window": "last_15m", - "recovery_window": "last_15m" - } - }, - "recommended_monitor_metadata": { - "description": "Notifies when Tomcat experiences an anomalous max processing time for a specific host." 
+ "query": "avg(last_4h):anomalies(avg:tomcat.max_time{*} by {host}, 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='daily') >= 1", + "tags": [ + "integration:tomcat" + ], + "type": "query alert" } -} +} \ No newline at end of file diff --git a/tomcat/assets/monitors/processing_time.json b/tomcat/assets/monitors/processing_time.json index a6b2689e7c2e6..9ee9d46932f0e 100644 --- a/tomcat/assets/monitors/processing_time.json +++ b/tomcat/assets/monitors/processing_time.json @@ -1,30 +1,39 @@ { - "name": "[Tomcat] Anomalous average processing time for host {{host.name}}", - "type": "query alert", - "query": "avg(last_4h):anomalies(avg:tomcat.processing_time{*} by {host}, 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='daily') >= 1", - "message": "There is an anomaly in the Tomcat average processing time on host: {{host.name}} \n\nIf the processing time increases as traffic increases, then you may not have enough worker threads to process the requests, or your server is reaching its threshold and consuming too much memory.\n\nNote: When compared with the `tomcat.request_count` metric, you can gauge how many requests your server can efficiently handle.", - "tags": ["integration:tomcat"], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1, - "critical_recovery": 0 + "version": 2, + "created_at": "2021-03-09", + "last_updated_at": "2021-03-09", + "title": "Anomalous average processing time for host {{host.name}}", + "tags": [ + "integration:tomcat" + ], + "description": "Notifies when Tomcat experiences an anomalous processing time for a specific host.", + "definition": { + "message": "There is an anomaly in the Tomcat average processing time on host: {{host.name}} \n\nIf the processing time increases as traffic increases, then you may not have enough worker threads to process the requests, or your server is reaching its threshold and consuming too much memory.\n\nNote: When compared with the `tomcat.request_count` metric, you can gauge how many requests your server can efficiently handle.", + "name": "[Tomcat] Anomalous average processing time for host {{host.name}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": true, + "threshold_windows": { + "recovery_window": "last_15m", + "trigger_window": "last_15m" + }, + "thresholds": { + "critical": 1, + "critical_recovery": 0 + }, + "timeout_h": 0 }, - "threshold_windows": { - "trigger_window": "last_15m", - "recovery_window": "last_15m" - } - }, - "recommended_monitor_metadata": { - "description": "Notifies when Tomcat experiences an anomalous processing time for a specific host." 
+ "query": "avg(last_4h):anomalies(avg:tomcat.processing_time{*} by {host}, 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='daily') >= 1", + "tags": [ + "integration:tomcat" + ], + "type": "query alert" } -} +} \ No newline at end of file diff --git a/tomcat/assets/monitors/req_count.json b/tomcat/assets/monitors/req_count.json index 66c7f71514adf..812ece11f7a4c 100644 --- a/tomcat/assets/monitors/req_count.json +++ b/tomcat/assets/monitors/req_count.json @@ -1,30 +1,39 @@ { - "name": "[Tomcat] Anomalous request rate for host {{host.name}}", - "type": "query alert", - "query": "avg(last_4h):anomalies(avg:tomcat.request_count{*} by {host}, 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='daily') >= 1", - "message": "There is an anomaly in the amount of requests handled by Tomcat on host: {{host.name}} ", - "tags": ["integration:tomcat"], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1, - "critical_recovery": 0 + "version": 2, + "created_at": "2021-03-09", + "last_updated_at": "2021-03-09", + "title": "Anomalous request rate for host {{host.name}}", + "tags": [ + "integration:tomcat" + ], + "description": "Notifies when Tomcat experiences an anomalous number of request rate for a specific host.", + "definition": { + "message": "There is an anomaly in the amount of requests handled by Tomcat on host: {{host.name}} ", + "name": "[Tomcat] Anomalous request rate for host {{host.name}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": true, + "threshold_windows": { + "recovery_window": "last_15m", + "trigger_window": "last_15m" + }, + "thresholds": { + "critical": 1, + "critical_recovery": 0 + }, + "timeout_h": 0 }, - "threshold_windows": { - "trigger_window": "last_15m", - "recovery_window": "last_15m" - } - }, - "recommended_monitor_metadata": { - "description": "Notifies when Tomcat experiences an anomalous number of request rate for a specific host." 
+ "query": "avg(last_4h):anomalies(avg:tomcat.request_count{*} by {host}, 'agile', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true', seasonality='daily') >= 1", + "tags": [ + "integration:tomcat" + ], + "type": "query alert" } -} +} \ No newline at end of file diff --git a/tomcat/assets/monitors/thread_busy.json b/tomcat/assets/monitors/thread_busy.json index 512c7a99b60db..85c224484f713 100644 --- a/tomcat/assets/monitors/thread_busy.json +++ b/tomcat/assets/monitors/thread_busy.json @@ -1,26 +1,35 @@ { - "name": "[Tomcat] % of busy threads is high for host: {{host.name}}", - "type": "query alert", - "query": "avg(last_5m):( avg:tomcat.threads.busy{*} by {host} / max:tomcat.threads.max{*} by {host} ) * 100 > 70", - "message": "{{#is_alert}}\n\nALERT: The current amount of busy threads represents {{value}} % of the maximum number of allowed worker threads for host: {{host.name}}\n\n{{/is_alert}} \n\n{{#is_warning}}\n\nWARNING: The current amount of busy threads represents {{value}} % of the maximum number of allowed worker threads for host: {{host.name}}\n\n{{/is_warning}} \n\n", - "tags": ["integration:tomcat"], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 70, - "warning": 50 - } - }, - "recommended_monitor_metadata": { - "description": "Notifies when Tomcat current amount of busy threads represents a high percentage of the maximum number of allowed worker threads for a specific host." + "version": 2, + "created_at": "2021-03-09", + "last_updated_at": "2021-03-09", + "title": "% of busy threads is high for host: {{host.name}}", + "tags": [ + "integration:tomcat" + ], + "description": "Notifies when Tomcat current amount of busy threads represents a high percentage of the maximum number of allowed worker threads for a specific host.", + "definition": { + "message": "{{#is_alert}}\n\nALERT: The current amount of busy threads represents {{value}} % of the maximum number of allowed worker threads for host: {{host.name}}\n\n{{/is_alert}} \n\n{{#is_warning}}\n\nWARNING: The current amount of busy threads represents {{value}} % of the maximum number of allowed worker threads for host: {{host.name}}\n\n{{/is_warning}} \n\n", + "name": "[Tomcat] % of busy threads is high for host: {{host.name}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": true, + "thresholds": { + "critical": 70, + "warning": 50 + }, + "timeout_h": 0 + }, + "query": "avg(last_5m):( avg:tomcat.threads.busy{*} by {host} / max:tomcat.threads.max{*} by {host} ) * 100 > 70", + "tags": [ + "integration:tomcat" + ], + "type": "query alert" } -} +} \ No newline at end of file diff --git a/tomcat/assets/monitors/thread_count_max.json b/tomcat/assets/monitors/thread_count_max.json index b4f742f4a13ba..3a44b13305d80 100644 --- a/tomcat/assets/monitors/thread_count_max.json +++ b/tomcat/assets/monitors/thread_count_max.json @@ -1,26 +1,35 @@ { - "name": "[Tomcat] % of busy threads is high for host: {{host.name}}", - "type": "query alert", - "query": "avg(last_5m):( avg:tomcat.threads.busy{*} by {host} / avg:tomcat.threads.count{*} by {host} ) * 100 > 70", - 
"message": "{{#is_alert}}\n\nALERT: The current amount of busy threads represents {{value}} % of the current amount of threads managed by the thread pool for host: {{host.name}}\n\n{{/is_alert}} \n\n{{#is_warning}}\n\nWARNING: The current amount of busy threads represents {{value}} % of the current amount of threads managed by the thread pool for host: {{host.name}}\n\n{{/is_warning}} \n\nLearn in the [Key metrics for monitoring Tomcat](https://www.datadoghq.com/blog/tomcat-architecture-and-performance/#fine-tuning-tomcat-thread-usage) blog post how you could fine-tune your Tomcat thread usage.", - "tags": ["integration:tomcat"], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 70, - "warning": 50 - } - }, - "recommended_monitor_metadata": { - "description": "Notifies when Tomcat current amount of busy threads represents a high percentage of the current amount of threads managed by the thread pool for a specific host." + "version": 2, + "created_at": "2021-03-09", + "last_updated_at": "2021-03-09", + "title": "% of busy threads is high for host: {{host.name}}", + "tags": [ + "integration:tomcat" + ], + "description": "Notifies when Tomcat current amount of busy threads represents a high percentage of the current amount of threads managed by the thread pool for a specific host.", + "definition": { + "message": "{{#is_alert}}\n\nALERT: The current amount of busy threads represents {{value}} % of the current amount of threads managed by the thread pool for host: {{host.name}}\n\n{{/is_alert}} \n\n{{#is_warning}}\n\nWARNING: The current amount of busy threads represents {{value}} % of the current amount of threads managed by the thread pool for host: {{host.name}}\n\n{{/is_warning}} \n\nLearn in the [Key metrics for monitoring Tomcat](https://www.datadoghq.com/blog/tomcat-architecture-and-performance/#fine-tuning-tomcat-thread-usage) blog post how you could fine-tune your Tomcat thread usage.", + "name": "[Tomcat] % of busy threads is high for host: {{host.name}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": true, + "thresholds": { + "critical": 70, + "warning": 50 + }, + "timeout_h": 0 + }, + "query": "avg(last_5m):( avg:tomcat.threads.busy{*} by {host} / avg:tomcat.threads.count{*} by {host} ) * 100 > 70", + "tags": [ + "integration:tomcat" + ], + "type": "query alert" } -} +} \ No newline at end of file diff --git a/torchserve/assets/monitors/error_ratio.json b/torchserve/assets/monitors/error_ratio.json index ac2d47e2aa4bc..c1f43d40e6146 100644 --- a/torchserve/assets/monitors/error_ratio.json +++ b/torchserve/assets/monitors/error_ratio.json @@ -1,28 +1,35 @@ { - "name": "TorchServe: Requests error ratio is high", - "type": "query alert", - "query": "sum(last_1h):(sum:torchserve.openmetrics.requests.4xx.count{*}.as_count() + sum:torchserve.openmetrics.requests.5xx.count{*}.as_count()) / (sum:torchserve.openmetrics.requests.2xx.count{*}.as_count() + sum:torchserve.openmetrics.requests.4xx.count{*}.as_count() + sum:torchserve.openmetrics.requests.5xx.count{*}.as_count()) > 0.1", - "message": "{{#is_alert}}\\nThe error ratio is 
high!.\\n{{/is_alert}}", - "tags": [ - "integration:torchserve" - ], - "options": { - "thresholds": { - "critical": 0.1, - "warning": 0.05 - }, - "notify_audit": false, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": 0, - "include_tags": false, - "avalanche_window": 10, - "new_host_delay": 300, - "silenced": {} - }, - "priority": null, - "restricted_roles": null, - "recommended_monitor_metadata": { - "description": "Notify your team when the error ratio is too high." - } + "version": 2, + "created_at": "2023-07-04", + "last_updated_at": "2023-07-04", + "title": "TorchServe: Requests error ratio is high", + "tags": [ + "integration:torchserve" + ], + "description": "Notify your team when the error ratio is too high.", + "definition": { + "message": "{{#is_alert}}\\nThe error ratio is high!.\\n{{/is_alert}}", + "name": "TorchServe: Requests error ratio is high", + "options": { + "avalanche_window": 10, + "include_tags": false, + "new_host_delay": 300, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "silenced": {}, + "thresholds": { + "critical": 0.1, + "warning": 0.05 + } + }, + "priority": null, + "query": "sum(last_1h):(sum:torchserve.openmetrics.requests.4xx.count{*}.as_count() + sum:torchserve.openmetrics.requests.5xx.count{*}.as_count()) / (sum:torchserve.openmetrics.requests.2xx.count{*}.as_count() + sum:torchserve.openmetrics.requests.4xx.count{*}.as_count() + sum:torchserve.openmetrics.requests.5xx.count{*}.as_count()) > 0.1", + "restricted_roles": null, + "tags": [ + "integration:torchserve" + ], + "type": "query alert" + } } \ No newline at end of file diff --git a/traffic_server/assets/monitors/4xx.json b/traffic_server/assets/monitors/4xx.json index bdadf4b90bab0..3d655c655121d 100644 --- a/traffic_server/assets/monitors/4xx.json +++ b/traffic_server/assets/monitors/4xx.json @@ -1,32 +1,39 @@ { + "version": 2, + "created_at": "2022-04-21", + "last_updated_at": "2022-04-21", + "title": "4xx Errors higher than usual", + "tags": [ + "integration:traffic-server" + ], + "description": "Notifies when Traffic Server 4xx errors are higher than usual", + "definition": { + "message": "Number of 4xx errors on Traffic Server is at {{value}} which is higher than usual.", "name": "[Traffic Server] 4xx Errors higher than usual", - "type": "query alert", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "threshold_windows": { + "recovery_window": "last_15m", + "trigger_window": "last_15m" + }, + "thresholds": { + "critical": 1, + "critical_recovery": 0 + }, + "timeout_h": 0 + }, "query": "avg(last_1h):anomalies(avg:traffic_server.process.http.code.4xx_responses{*} by {code}, 'basic', 2, direction='above', alert_window='last_15m', interval=60, count_default_zero='true') >= 1", - "message": "Number of 4xx errors on Traffic Server is at {{value}} which is higher than usual.", "tags": [ - "integration:traffic_server" + "integration:traffic_server" ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1, - "critical_recovery": 0 - }, - "threshold_windows": { - 
"trigger_window": "last_15m", - "recovery_window": "last_15m" - } - }, - "recommended_monitor_metadata": { - "description": "Notifies when Traffic Server 4xx errors are higher than usual" - } + "type": "query alert" + } } \ No newline at end of file diff --git a/traffic_server/assets/monitors/5xx.json b/traffic_server/assets/monitors/5xx.json index 0fa38949b9c2c..7383a775d8daa 100644 --- a/traffic_server/assets/monitors/5xx.json +++ b/traffic_server/assets/monitors/5xx.json @@ -1,32 +1,39 @@ { + "version": 2, + "created_at": "2022-04-21", + "last_updated_at": "2022-04-21", + "title": "5xx Errors higher than usual", + "tags": [ + "integration:traffic-server" + ], + "description": "Notifies when Traffic Server 5xx errors are higher than usual", + "definition": { + "message": "Number of 5xx errors on Traffic Server is at {{value}} which is higher than usual.", "name": "[Traffic Server] 5xx Errors higher than usual", - "type": "query alert", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "threshold_windows": { + "recovery_window": "last_15m", + "trigger_window": "last_15m" + }, + "thresholds": { + "critical": 1, + "critical_recovery": 0 + }, + "timeout_h": 0 + }, "query": "avg(last_1h):anomalies(avg:traffic_server.process.http.code.5xx_responses{*} by {code}, 'basic', 2, direction='above', alert_window='last_15m', interval=60, count_default_zero='true') >= 1", - "message": "Number of 5xx errors on Traffic Server is at {{value}} which is higher than usual.", "tags": [ - "integration:traffic_server" + "integration:traffic_server" ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1, - "critical_recovery": 0 - }, - "threshold_windows": { - "trigger_window": "last_15m", - "recovery_window": "last_15m" - } - }, - "recommended_monitor_metadata": { - "description": "Notifies when Traffic Server 5xx errors are higher than usual" - } + "type": "query alert" + } } \ No newline at end of file diff --git a/vault/assets/monitors/vault_S3_time_high.json b/vault/assets/monitors/vault_S3_time_high.json index c97ba67131adf..57fe7b5ec278a 100644 --- a/vault/assets/monitors/vault_S3_time_high.json +++ b/vault/assets/monitors/vault_S3_time_high.json @@ -1,27 +1,34 @@ { - "name": "[Vault] S3 time to access secrets is high", - "type": "query alert", - "query": "avg(last_1m):avg:vault.vault.s3.get.quantile{*} > 50", - "message": "S3 time to access secrets is high in the past 1 minute.", - "tags": [ - "integration:vault" - ], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": true, - "notify_no_data": false, - "renotify_interval": "0", - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 50 - } + "version": 2, + "created_at": "2021-03-08", + "last_updated_at": "2021-03-08", + "title": "S3 time to access secrets is high", + "tags": [ + "integration:vault" + ], + "description": "Notify your team when the time for S3 to access secrets is too high", + "definition": { + "message": "S3 time to access secrets is high in the past 1 minute.", + 
"name": "[Vault] S3 time to access secrets is high", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": "0", + "require_full_window": true, + "thresholds": { + "critical": 50 + }, + "timeout_h": 0 }, - "recommended_monitor_metadata": { - "description": "Notify your team when the time for S3 to access secrets is too high" - } + "query": "avg(last_1m):avg:vault.vault.s3.get.quantile{*} > 50", + "tags": [ + "integration:vault" + ], + "type": "query alert" + } } \ No newline at end of file diff --git a/vertica/assets/monitors/vertica_replication_safety.json b/vertica/assets/monitors/vertica_replication_safety.json index 291eaca1e63e6..1b3e96abfe022 100644 --- a/vertica/assets/monitors/vertica_replication_safety.json +++ b/vertica/assets/monitors/vertica_replication_safety.json @@ -1,31 +1,38 @@ { - "name": "[Vertica] Nodes down above K-safety level", - "type": "query alert", - "query": "max(last_5m):avg:vertica.node.down{*} - avg:vertica.ksafety.current{*} > 1", - "message": "Number of nodes down is above the K-safety, which may lead to data becoming unavailable.", - "tags": [ - "integration:vertica" - ], - "options": { - "notify_audit": false, - "locked": null, - "timeout_h": 0, - "new_host_delay": 300, - "require_full_window": false, - "notify_no_data": false, - "renotify_interval": "0", - "renotify_occurrences": null, - "renotify_statuses": null, - "escalation_message": "", - "no_data_timeframe": null, - "include_tags": true, - "thresholds": { - "critical": 1 - } - }, - "priority": null, - "restricted_roles": null, - "recommended_monitor_metadata": { - "description": "Get notified when the database is at risk of becoming unsafe." 
- } -} + "version": 2, + "created_at": "2022-09-15", + "last_updated_at": "2023-07-24", + "title": "Nodes down above K-safety level", + "tags": [ + "integration:vertica" + ], + "description": "Get notified when the database is at risk of becoming unsafe.", + "definition": { + "message": "Number of nodes down is above the K-safety, which may lead to data becoming unavailable.", + "name": "[Vertica] Nodes down above K-safety level", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": null, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": "0", + "renotify_occurrences": null, + "renotify_statuses": null, + "require_full_window": false, + "thresholds": { + "critical": 1 + }, + "timeout_h": 0 + }, + "priority": null, + "query": "max(last_5m):avg:vertica.node.down{*} - avg:vertica.ksafety.current{*} > 1", + "restricted_roles": null, + "tags": [ + "integration:vertica" + ], + "type": "query alert" + } +} \ No newline at end of file diff --git a/voltdb/assets/monitors/cpu_load.json b/voltdb/assets/monitors/cpu_load.json index 0c8cf29c4d91e..3a628e4c7afb3 100644 --- a/voltdb/assets/monitors/cpu_load.json +++ b/voltdb/assets/monitors/cpu_load.json @@ -1,29 +1,38 @@ { - "name": "[VoltDB] Node {{voltdb_host.name}} is running at very high CPU load", - "type": "query alert", - "query": "avg(last_5m):avg:voltdb.cpu.percent_used{*} by {voltdb_host}.rollup(max, 60) >= 90", - "message": "Please check node {{voltdb_host.name}}, as CPU usage has been over {{threshold}}% for the past 5min.", - "tags": ["integration:voltdb"], - "options": { - "notify_audit": false, - "locked": false, - "timeout_h": 0, - "silenced": {}, - "include_tags": true, - "no_data_timeframe": null, - "require_full_window": true, - "new_host_delay": 300, - "notify_no_data": false, - "renotify_interval": 0, - "escalation_message": "", - "thresholds": { - "critical": 90, - "critical_recovery": 89, - "warning": 75, - "warning_recovery": 74 - } - }, - "recommended_monitor_metadata": { - "description": "Triggers an alert when a VoltDB node is reporting very high CPU usage" + "version": 2, + "created_at": "2021-01-08", + "last_updated_at": "2021-01-13", + "title": "Node {{voltdb_host.name}} is running at very high CPU load", + "tags": [ + "integration:voltdb" + ], + "description": "Triggers an alert when a VoltDB node is reporting very high CPU usage", + "definition": { + "message": "Please check node {{voltdb_host.name}}, as CPU usage has been over {{threshold}}% for the past 5min.", + "name": "[VoltDB] Node {{voltdb_host.name}} is running at very high CPU load", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_host_delay": 300, + "no_data_timeframe": null, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": true, + "silenced": {}, + "thresholds": { + "critical": 90, + "critical_recovery": 89, + "warning": 75, + "warning_recovery": 74 + }, + "timeout_h": 0 + }, + "query": "avg(last_5m):avg:voltdb.cpu.percent_used{*} by {voltdb_host}.rollup(max, 60) >= 90", + "tags": [ + "integration:voltdb" + ], + "type": "query alert" } -} +} \ No newline at end of file diff --git a/weaviate/assets/monitors/node_status.json b/weaviate/assets/monitors/node_status.json index 225154cabf52d..7f738c3a0e05e 100644 --- a/weaviate/assets/monitors/node_status.json +++ b/weaviate/assets/monitors/node_status.json @@ -1,25 +1,32 @@ { - "name": "Weaviate Node 
{weaviate_node.name} on {host.name} is {weaviate_node_status.name}", - "type": "query alert", - "query": "min(last_10m):avg:weaviate.node.status{*} by {host,weaviate_node,weaviate_node_status} >= 1", + "version": 2, + "created_at": "2023-07-24", + "last_updated_at": "2023-07-24", + "title": "Weaviate Node {weaviate_node.name} on {host.name} is {weaviate_node_status.name}", + "tags": [ + "integration:weaviate" + ], + "description": "Notify your team when a Weaviate Node is not 'Healthy'", + "definition": { "message": "{{#is_alert}}\n\nWeaviate Node {{weaviate_node.name}} on {{host.name}} has been {{weaviate_node_status.name}} for the last 10 mins.\n\n{{/is_alert}} \n\n{{#is_recovery}}\n\nWeaviate Node {{weaviate_node.name}} on {{host.name}} has recovered back to {{weaviate_node_status.name}} status for the last 10 mins.\n\n{{/is_recovery}}", - "tags": [ - "integration:weaviate" - ], + "name": "Weaviate Node {weaviate_node.name} on {host.name} is {weaviate_node_status.name}", "options": { - "thresholds": { - "critical": 1 - }, - "notify_audit": false, - "include_tags": false, - "new_group_delay": 60, - "notify_no_data": false, - "avalanche_window": 10, - "silenced": {} + "avalanche_window": 10, + "include_tags": false, + "new_group_delay": 60, + "notify_audit": false, + "notify_no_data": false, + "silenced": {}, + "thresholds": { + "critical": 1 + } }, "priority": null, + "query": "min(last_10m):avg:weaviate.node.status{*} by {host,weaviate_node,weaviate_node_status} >= 1", "restricted_roles": null, - "recommended_monitor_metadata": { - "description": "Notify your team when a Weaviate Node is not 'Healthy'" - } + "tags": [ + "integration:weaviate" + ], + "type": "query alert" + } } \ No newline at end of file diff --git a/weblogic/assets/monitors/active_threads.json b/weblogic/assets/monitors/active_threads.json index 6112f22aab0fb..eb932affe6240 100644 --- a/weblogic/assets/monitors/active_threads.json +++ b/weblogic/assets/monitors/active_threads.json @@ -1,29 +1,36 @@ { - "name": "[WebLogic] % of Active Execute Threads is high on host {{host.name}} for server runtime {{serverruntime.name}}", - "type": "query alert", - "query": "avg(last_5m):((avg:weblogic.threadpool_runtime.execute_threads_total{*} by {host,serverruntime} - avg:weblogic.threadpool_runtime.threads_standby{*} by {host,serverruntime}) / avg:weblogic.threadpool_runtime.execute_threads_total{*} by {host,serverruntime}) * 100 > 70", - "message": "{{#is_alert}}\\n\\nALERT: The current number of active threads represents {{value}} % of the current number of threads managed by the thread pool for host: {{host.name}}, server runtime {{serverruntime.name}}\\n\\n{{/is_alert}} \\n\\n{{#is_warning}}\\n\\nWARNING: The current number of active threads represents {{value}} % of the current number of threads managed by the thread pool for host: {{host.name}}, server runtime {{serverruntime.name}}\\n\\n{{/is_warning}}", - "tags": [ - "integration:weblogic" - ], - "options": { - "thresholds": { - "critical": 70, - "warning": 50 - }, - "notify_audit": false, - "require_full_window": false, - "notify_no_data": false, - "no_data_timeframe": 10, - "renotify_interval": 0, - "locked": false, - "silenced": {}, - "include_tags": true, - "escalation_message": "", - "new_group_delay": 60 - }, - "priority": null, - "recommended_monitor_metadata": { - "description": "Get notified when the percent of Active Execute Threads is high." 
- } + "version": 2, + "created_at": "2022-03-01", + "last_updated_at": "2022-03-01", + "title": "% of Active Execute Threads is high on host {{host.name}} for server runtime {{serverruntime.name}}", + "tags": [ + "integration:weblogic" + ], + "description": "Get notified when the percent of Active Execute Threads is high.", + "definition": { + "message": "{{#is_alert}}\\n\\nALERT: The current number of active threads represents {{value}} % of the current number of threads managed by the thread pool for host: {{host.name}}, server runtime {{serverruntime.name}}\\n\\n{{/is_alert}} \\n\\n{{#is_warning}}\\n\\nWARNING: The current number of active threads represents {{value}} % of the current number of threads managed by the thread pool for host: {{host.name}}, server runtime {{serverruntime.name}}\\n\\n{{/is_warning}}", + "name": "[WebLogic] % of Active Execute Threads is high on host {{host.name}} for server runtime {{serverruntime.name}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "new_group_delay": 60, + "no_data_timeframe": 10, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "silenced": {}, + "thresholds": { + "critical": 70, + "warning": 50 + } + }, + "priority": null, + "query": "avg(last_5m):((avg:weblogic.threadpool_runtime.execute_threads_total{*} by {host,serverruntime} - avg:weblogic.threadpool_runtime.threads_standby{*} by {host,serverruntime}) / avg:weblogic.threadpool_runtime.execute_threads_total{*} by {host,serverruntime}) * 100 > 70", + "tags": [ + "integration:weblogic" + ], + "type": "query alert" + } } \ No newline at end of file diff --git a/weblogic/assets/monitors/stuck_threads.json b/weblogic/assets/monitors/stuck_threads.json index 4994296021781..a839cc6474f82 100644 --- a/weblogic/assets/monitors/stuck_threads.json +++ b/weblogic/assets/monitors/stuck_threads.json @@ -1,28 +1,35 @@ { - "name": "[WebLogic] Work Manager Stuck Threads count has increased {{value}}% over the last hour for server runtime {{serverruntime.name}}, application runtime {{applicationruntime.name}}", - "type": "query alert", - "query": "change(avg(last_5m),last_1h):avg:weblogic.work_manager_runtime.threads_stuck{*} by {host,serverruntime,applicationruntime} > 25", - "message": "{{#is_alert}}\\n\\nALERT: The average number of work manager stuck threads has increased by {{value}}% over the past hour for server runtime:{{serverruntime.name}}, application runtime:{{applicationruntime.name}} on host {{host.name}}.\\n\\n{{/is_alert}} \\n\\n{{#is_warning}}\\n\\nWARNING: The average number of work manager stuck threads has increased by {{value}}% over the past hour for server runtime: {{serverruntime.name}}, application runtime:{{applicationruntime.name}} on host {{host.name}}.\\n\\n{{/is_warning}}", - "tags": [ - "integration:weblogic" - ], - "options": { - "notify_audit": false, - "locked": false, - "silenced": {}, - "include_tags": true, - "thresholds": { - "critical": 25, - "warning": 15 - }, - "require_full_window": false, - "notify_no_data": false, - "no_data_timeframe": 10, - "renotify_interval": 0, - "escalation_message": "" - }, - "priority": null, - "recommended_monitor_metadata": { - "description": "Get notified when the percent change of Work Manager Stuck Threads is high." 
- } + "version": 2, + "created_at": "2022-03-01", + "last_updated_at": "2022-03-01", + "title": "Work Manager Stuck Threads count has increased {{value}}% over the last hour for server runtime {{serverruntime.name}}, application runtime {{applicationruntime.name}}", + "tags": [ + "integration:weblogic" + ], + "description": "Get notified when the percent change of Work Manager Stuck Threads is high.", + "definition": { + "message": "{{#is_alert}}\\n\\nALERT: The average number of work manager stuck threads has increased by {{value}}% over the past hour for server runtime:{{serverruntime.name}}, application runtime:{{applicationruntime.name}} on host {{host.name}}.\\n\\n{{/is_alert}} \\n\\n{{#is_warning}}\\n\\nWARNING: The average number of work manager stuck threads has increased by {{value}}% over the past hour for server runtime: {{serverruntime.name}}, application runtime:{{applicationruntime.name}} on host {{host.name}}.\\n\\n{{/is_warning}}", + "name": "[WebLogic] Work Manager Stuck Threads count has increased {{value}}% over the last hour for server runtime {{serverruntime.name}}, application runtime {{applicationruntime.name}}", + "options": { + "escalation_message": "", + "include_tags": true, + "locked": false, + "no_data_timeframe": 10, + "notify_audit": false, + "notify_no_data": false, + "renotify_interval": 0, + "require_full_window": false, + "silenced": {}, + "thresholds": { + "critical": 25, + "warning": 15 + } + }, + "priority": null, + "query": "change(avg(last_5m),last_1h):avg:weblogic.work_manager_runtime.threads_stuck{*} by {host,serverruntime,applicationruntime} > 25", + "tags": [ + "integration:weblogic" + ], + "type": "query alert" + } } \ No newline at end of file