# You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
#
# 147 lines
# 5.7 KiB
# YAML

# Prometheus alerting rules over pd_* and etcd_* metrics.
#
# Layout of every rule:
#   expr         - the PromQL condition that is actually evaluated.
#   for          - the condition must hold this long before the alert fires.
#   labels       - env / level, plus an `expr` label holding a copy of the
#                  rule expression text. Keep that copy identical to the
#                  rule-level `expr`.
#   annotations  - templated description / value / summary.
#
# NOTE(review): rules built on `sum(...)` or `count(...)` without a `by`
# clause aggregate all labels away, so `{{ $labels.instance }}` (and
# `{{ $labels.type }}`) have no label to read for those alerts. The
# templates are left untouched here; confirm whether that is intended.
groups:
- name: alert.rules
  rules:
  # Summed `store_down_count` cluster status above 0 for 1m.
  - alert: PD_cluster_offline_tikv_nums
    expr: sum ( pd_cluster_status{type="store_down_count"} ) > 0
    for: 1m
    labels:
      env: test-cluster
      level: emergency
      expr: sum ( pd_cluster_status{type="store_down_count"} ) > 0
    annotations:
      description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: PD_cluster_offline_tikv_nums

  # 99th percentile of etcd WAL fsync duration (1m rate, per instance/job)
  # above 1 second.
  - alert: PD_etcd_write_disk_latency
    expr: histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) by (instance,job,le) ) > 1
    for: 1m
    labels:
      env: test-cluster
      level: critical
      expr: histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) by (instance,job,le) ) > 1
    annotations:
      description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: PD_etcd_write_disk_latency

  # Summed `miss_peer_region_count` region status above 100 for 1m.
  - alert: PD_miss_peer_region_count
    expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100
    for: 1m
    labels:
      env: test-cluster
      level: critical
      expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100
    annotations:
      description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: PD_miss_peer_region_count

  # Summed `store_disconnected_count` cluster status above 0 for 1m.
  - alert: PD_cluster_lost_connect_tikv_nums
    expr: sum ( pd_cluster_status{type="store_disconnected_count"} ) > 0
    for: 1m
    labels:
      env: test-cluster
      level: warning
      expr: sum ( pd_cluster_status{type="store_disconnected_count"} ) > 0
    annotations:
      description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: PD_cluster_lost_connect_tikv_nums

  # Summed `store_low_space_count` cluster status above 0 for 1m.
  - alert: PD_cluster_low_space
    expr: sum ( pd_cluster_status{type="store_low_space_count"} ) > 0
    for: 1m
    labels:
      env: test-cluster
      level: warning
      expr: sum ( pd_cluster_status{type="store_low_space_count"} ) > 0
    annotations:
      description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: PD_cluster_low_space

  # 99th percentile of etcd peer round-trip time (1m rate, per
  # To/instance/job) above 1 second.
  - alert: PD_etcd_network_peer_latency
    expr: histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) by (To,instance,job,le) ) > 1
    for: 1m
    labels:
      env: test-cluster
      level: warning
      expr: histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) by (To,instance,job,le) ) > 1
    annotations:
      description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: PD_etcd_network_peer_latency

  # 99th percentile of `type="tso"` request handling duration (1m rate,
  # per instance/job) above 0.1 second.
  - alert: PD_tidb_handle_requests_duration
    expr: histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type="tso"}[1m])) by (instance,job,le) ) > 0.1
    for: 1m
    labels:
      env: test-cluster
      level: warning
      expr: histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type="tso"}[1m])) by (instance,job,le) ) > 0.1
    annotations:
      description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: PD_tidb_handle_requests_duration

  # Summed `down_peer_region_count` region status above 0 for 1m.
  - alert: PD_down_peer_region_nums
    expr: sum ( pd_regions_status{type="down_peer_region_count"} ) > 0
    for: 1m
    labels:
      env: test-cluster
      level: warning
      expr: sum ( pd_regions_status{type="down_peer_region_count"} ) > 0
    annotations:
      description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: PD_down_peer_region_nums

  # Summed `incorrect_namespace_region_count` region status above 100 for 1m.
  # The `expr` label previously said `> 0`; it now matches the evaluated
  # threshold of 100 like every other rule in this group.
  - alert: PD_incorrect_namespace_region_count
    expr: sum ( pd_regions_status{type="incorrect_namespace_region_count"} ) > 100
    for: 1m
    labels:
      env: test-cluster
      level: warning
      expr: sum ( pd_regions_status{type="incorrect_namespace_region_count"} ) > 100
    annotations:
      description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: PD_incorrect_namespace_region_count

  # Summed `pending_peer_region_count` region status above 100 for 1m.
  - alert: PD_pending_peer_region_count
    expr: sum( pd_regions_status{type="pending_peer_region_count"} ) > 100
    for: 1m
    labels:
      env: test-cluster
      level: warning
      expr: sum( pd_regions_status{type="pending_peer_region_count"} ) > 100
    annotations:
      description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: PD_pending_peer_region_count

  # At least 2 `pd_server_tso{type="save"}` series changed value within
  # the last 10 minutes.
  - alert: PD_leader_change
    expr: count( changes(pd_server_tso{type="save"}[10m]) > 0 ) >= 2
    for: 1m
    labels:
      env: test-cluster
      level: warning
      expr: count( changes(pd_server_tso{type="save"}[10m]) > 0 ) >= 2
    annotations:
      description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: PD_leader_change

  # Ratio of summed `storage_size` to summed `storage_capacity`, as a
  # percentage, above 80.
  - alert: TiKV_space_used_more_than_80%
    expr: sum(pd_cluster_status{type="storage_size"}) / sum(pd_cluster_status{type="storage_capacity"}) * 100 > 80
    for: 1m
    labels:
      env: test-cluster
      level: warning
      expr: sum(pd_cluster_status{type="storage_size"}) / sum(pd_cluster_status{type="storage_capacity"}) * 100 > 80
    annotations:
      description: 'cluster: test-cluster, type: {{ $labels.type }}, instance: {{ $labels.instance }}, values: {{ $value }}'
      value: '{{ $value }}'
      summary: TiKV_space_used_more_than_80%