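# NOTE(review): the lines below look like web-UI text captured when this file
# was copied from a repository page; they are not rule definitions.
# You cannot select more than 25 topics
# Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
# 147 lines
# 5.7 KiB
# YAML
# 147 lines
# 5.7 KiB
# YAML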
# Prometheus alerting rules built on PD (pd_*) and etcd (etcd_*) metrics.
#
# Conventions used by every rule in this group:
#   * `for: 1m`      - the condition must hold for one minute before firing.
#   * `labels.env`   - fixed to `test-cluster`.
#   * `labels.level` - severity tag (emergency / critical / warning).
#   * `labels.expr`  - a plain-text copy of the rule's `expr`, attached to the
#                      alert as a label. It is NOT evaluated; keep it identical
#                      to `expr` whenever a threshold changes.
#   * `annotations`  - `description`, `value` and `summary` templates.
#
# NOTE(review): rules whose `expr` aggregates with `sum(...)` / `count(...)`
# and no `by (...)` clause produce a result without an `instance` (or `type`)
# label, so `{{ $labels.instance }}` / `{{ $labels.type }}` in their
# descriptions will render empty. Left unchanged here; confirm whether the
# descriptions or the aggregations should be adjusted.
groups:
- name: alert.rules
  rules:
  # Summed pd_cluster_status{type="store_down_count"} is above 0.
  - alert: PD_cluster_offline_tikv_nums
    expr: sum ( pd_cluster_status{type="store_down_count"} ) > 0
    for: 1m
    labels:
      env: test-cluster
      level: emergency
      expr: sum ( pd_cluster_status{type="store_down_count"} ) > 0
    annotations:
      description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: PD_cluster_offline_tikv_nums

  # 0.99 quantile of etcd_disk_wal_fsync_duration_seconds (1m rate),
  # grouped by instance/job, is above 1.
  - alert: PD_etcd_write_disk_latency
    expr: histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) by (instance,job,le) ) > 1
    for: 1m
    labels:
      env: test-cluster
      level: critical
      expr: histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) by (instance,job,le) ) > 1
    annotations:
      description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: PD_etcd_write_disk_latency

  # Summed pd_regions_status{type="miss_peer_region_count"} is above 100.
  - alert: PD_miss_peer_region_count
    expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100
    for: 1m
    labels:
      env: test-cluster
      level: critical
      expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100
    annotations:
      description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: PD_miss_peer_region_count

  # Summed pd_cluster_status{type="store_disconnected_count"} is above 0.
  - alert: PD_cluster_lost_connect_tikv_nums
    expr: sum ( pd_cluster_status{type="store_disconnected_count"} ) > 0
    for: 1m
    labels:
      env: test-cluster
      level: warning
      expr: sum ( pd_cluster_status{type="store_disconnected_count"} ) > 0
    annotations:
      description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: PD_cluster_lost_connect_tikv_nums

  # Summed pd_cluster_status{type="store_low_space_count"} is above 0.
  - alert: PD_cluster_low_space
    expr: sum ( pd_cluster_status{type="store_low_space_count"} ) > 0
    for: 1m
    labels:
      env: test-cluster
      level: warning
      expr: sum ( pd_cluster_status{type="store_low_space_count"} ) > 0
    annotations:
      description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: PD_cluster_low_space

  # 0.99 quantile of etcd_network_peer_round_trip_time_seconds (1m rate),
  # grouped by To/instance/job, is above 1.
  - alert: PD_etcd_network_peer_latency
    expr: histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) by (To,instance,job,le) ) > 1
    for: 1m
    labels:
      env: test-cluster
      level: warning
      expr: histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) by (To,instance,job,le) ) > 1
    annotations:
      description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: PD_etcd_network_peer_latency

  # 0.99 quantile of pd_client_request_handle_requests_duration_seconds for
  # type="tso" (1m rate), grouped by instance/job, is above 0.1.
  - alert: PD_tidb_handle_requests_duration
    expr: histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type="tso"}[1m])) by (instance,job,le) ) > 0.1
    for: 1m
    labels:
      env: test-cluster
      level: warning
      expr: histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type="tso"}[1m])) by (instance,job,le) ) > 0.1
    annotations:
      description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: PD_tidb_handle_requests_duration

  # Summed pd_regions_status{type="down_peer_region_count"} is above 0.
  - alert: PD_down_peer_region_nums
    expr: sum ( pd_regions_status{type="down_peer_region_count"} ) > 0
    for: 1m
    labels:
      env: test-cluster
      level: warning
      expr: sum ( pd_regions_status{type="down_peer_region_count"} ) > 0
    annotations:
      description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: PD_down_peer_region_nums

  # Summed pd_regions_status{type="incorrect_namespace_region_count"} is
  # above 100. The `expr` label carries the same threshold as the rule
  # expression so the alert reports the condition that actually fired.
  - alert: PD_incorrect_namespace_region_count
    expr: sum ( pd_regions_status{type="incorrect_namespace_region_count"} ) > 100
    for: 1m
    labels:
      env: test-cluster
      level: warning
      expr: sum ( pd_regions_status{type="incorrect_namespace_region_count"} ) > 100
    annotations:
      description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: PD_incorrect_namespace_region_count

  # Summed pd_regions_status{type="pending_peer_region_count"} is above 100.
  - alert: PD_pending_peer_region_count
    expr: sum( pd_regions_status{type="pending_peer_region_count"} ) > 100
    for: 1m
    labels:
      env: test-cluster
      level: warning
      expr: sum( pd_regions_status{type="pending_peer_region_count"} ) > 100
    annotations:
      description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: PD_pending_peer_region_count

  # At least 2 pd_server_tso{type="save"} series changed value within the
  # last 10 minutes.
  - alert: PD_leader_change
    expr: count( changes(pd_server_tso{type="save"}[10m]) > 0 ) >= 2
    for: 1m
    labels:
      env: test-cluster
      level: warning
      expr: count( changes(pd_server_tso{type="save"}[10m]) > 0 ) >= 2
    annotations:
      description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: PD_leader_change

  # Ratio of summed pd_cluster_status "storage_size" to "storage_capacity",
  # expressed as a percentage, is above 80.
  - alert: TiKV_space_used_more_than_80%
    expr: sum(pd_cluster_status{type="storage_size"}) / sum(pd_cluster_status{type="storage_capacity"}) * 100 > 80
    for: 1m
    labels:
      env: test-cluster
      level: warning
      expr: sum(pd_cluster_status{type="storage_size"}) / sum(pd_cluster_status{type="storage_capacity"}) * 100 > 80
    annotations:
      description: 'cluster: test-cluster, type: {{ $labels.type }}, instance: {{ $labels.instance }}, values: {{ $value }}'
      value: '{{ $value }}'
      summary: TiKV_space_used_more_than_80%