Skip to content

Commit

Permalink
metrics (ticdc): add changefeed status alert rules (#9265)
Browse files Browse the repository at this point in the history
ref #9266
  • Loading branch information
asddongmen authored Jun 20, 2023
1 parent 10b9850 commit d52a23d
Showing 1 changed file with 40 additions and 25 deletions.
65 changes: 40 additions & 25 deletions metrics/alertmanager/ticdc.rules.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
groups:
- name: alert.rules
rules:
# server related alert rules
- alert: cdc_multiple_owners
expr: sum(rate(ticdc_owner_ownership_counter[30s])) >= 2
for: 1m
Expand All @@ -25,6 +26,19 @@ groups:
value: '{{ $value }}'
summary: cdc cluster has no owner for more than 10 minutes

# changefeed related alert rules
- alert: ticdc_changefeed_failed
expr: (max_over_time(ticdc_owner_status[1m]) == 2) > 0
for: 1m
labels:
env: ENV_LABELS_ENV
level: critical
expr: (max_over_time(ticdc_owner_status[1m]) == 2) > 0
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
value: '{{ $value }}'
summary: cdc changefeed failed, it can not be automatically resumed

- alert: cdc_checkpoint_high_delay
expr: ticdc_owner_checkpoint_ts_lag > 600
for: 1m
Expand All @@ -37,62 +51,63 @@ groups:
value: '{{ $value }}'
summary: cdc owner checkpoint delay more than 10 minutes

- alert: tikv_cdc_min_resolved_ts_no_change_for_1m
expr: changes(tikv_cdc_min_resolved_ts[1m]) < 1 and ON (instance) tikv_cdc_region_resolve_status{status="resolved"} > 0
- alert: ticdc_sink_execution_error
expr: changes(ticdc_sink_execution_error[1m]) > 0
for: 1m
labels:
env: ENV_LABELS_ENV
level: warning
expr: changes(tikv_cdc_min_resolved_ts[1m]) < 1 and ON (instance) tikv_cdc_region_resolve_status{status="resolved"} > 0
expr: changes(ticdc_sink_execution_error[1m]) > 0
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
value: '{{ $labels.instance }}'
summary: tikv cdc min resolved ts no change for 1m
value: '{{ $value }}'
summary: cdc sink execution meets errors

- alert: tikv_cdc_scan_duration_seconds_more_than_10min
expr: histogram_quantile(0.9, rate(tikv_cdc_scan_duration_seconds_bucket{}[1m])) > 600
- alert: ticdc_processor_exit_with_error_count
expr: changes(ticdc_processor_exit_with_error_count[1m]) > 0
for: 1m
labels:
env: ENV_LABELS_ENV
level: warning
expr: histogram_quantile(0.9, rate(tikv_cdc_scan_duration_seconds_bucket{}[1m])) > 600
expr: changes(ticdc_processor_exit_with_error_count[1m]) > 0
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
value: '{{ $value }}'
summary: tikv cdc scan duration seconds more than 10 min
summary: cdc processor exits with error

- alert: ticdc_sink_mysql_execution_error
expr: changes(ticdc_sink_mysql_execution_error[1m]) > 0
- alert: ticdc_changefeed_meet_error
expr: (max_over_time(ticdc_owner_status[1m]) == 1) > 0
for: 1m
labels:
env: ENV_LABELS_ENV
level: warning
expr: changes(ticdc_sink_mysql_execution_error[1m]) > 0
expr: (max_over_time(ticdc_owner_status[1m]) == 1) > 0
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
value: '{{ $value }}'
summary: cdc sink mysql execution meets errors

- alert: ticdc_processor_exit_with_error_count
expr: changes(ticdc_processor_exit_with_error_count[1m]) > 0
summary: cdc changefeed meet error

# tikv related alert rules
- alert: tikv_cdc_min_resolved_ts_no_change_for_1m
expr: changes(tikv_cdc_min_resolved_ts[1m]) < 1 and ON (instance) tikv_cdc_region_resolve_status{status="resolved"} > 0
for: 1m
labels:
env: ENV_LABELS_ENV
level: critical
expr: changes(ticdc_processor_exit_with_error_count[1m]) > 0
level: warning
expr: changes(tikv_cdc_min_resolved_ts[1m]) < 1 and ON (instance) tikv_cdc_region_resolve_status{status="resolved"} > 0
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
value: '{{ $value }}'
summary: cdc processor exits with error
value: '{{ $labels.instance }}'
summary: tikv cdc min resolved ts no change for 1m

- alert: ticdc_memory_abnormal
expr: go_memstats_heap_alloc_bytes{job="ticdc"} > 1e+10
- alert: tikv_cdc_scan_duration_seconds_more_than_10min
expr: histogram_quantile(0.9, rate(tikv_cdc_scan_duration_seconds_bucket{}[1m])) > 600
for: 1m
labels:
env: ENV_LABELS_ENV
level: warning
expr: go_memstats_heap_alloc_bytes{job="ticdc"} > 1e+10
expr: histogram_quantile(0.9, rate(tikv_cdc_scan_duration_seconds_bucket{}[1m])) > 600
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
value: '{{ $value }}'
summary: TiCDC heap memory usage is over 10 GB
summary: tikv cdc scan duration seconds more than 10 min

0 comments on commit d52a23d

Please sign in to comment.