# Alerting rule group "alert.rules": rules over pd_* and etcd_* metrics,
# all tagged env=test-cluster.
#
# Conventions used by every rule below:
#   - `for: 1m`: the condition must hold for one minute before firing.
#   - `labels.expr` repeats the rule expression as a plain string label.
#     It is NOT evaluated; keep it in sync with the rule's `expr` by hand.
#   - `labels.level` carries the severity (emergency / critical / warning).
#
# NOTE(review): several rules aggregate with a bare `sum(...)` / `count(...)`
# (no `by` clause) yet their descriptions reference `$labels.instance`
# (and `$labels.type` in the last rule). Those labels are presumably empty
# on the aggregated result - confirm whether that is intended.
groups:
  - name: alert.rules
    rules:
      # Summed store_down_count status is above 0.
      - alert: PD_cluster_offline_tikv_nums
        expr: sum ( pd_cluster_status{type="store_down_count"} ) > 0
        for: 1m
        labels:
          env: test-cluster
          level: emergency
          expr: sum ( pd_cluster_status{type="store_down_count"} ) > 0
        annotations:
          description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
          value: '{{ $value }}'
          summary: PD_cluster_offline_tikv_nums

      # 0.99 quantile of etcd WAL fsync duration (1m rate, per instance/job) is above 1.
      - alert: PD_etcd_write_disk_latency
        expr: histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) by (instance,job,le) ) > 1
        for: 1m
        labels:
          env: test-cluster
          level: critical
          expr: histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) by (instance,job,le) ) > 1
        annotations:
          description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
          value: '{{ $value }}'
          summary: PD_etcd_write_disk_latency

      # Summed miss_peer_region_count status is above 100.
      - alert: PD_miss_peer_region_count
        expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100
        for: 1m
        labels:
          env: test-cluster
          level: critical
          expr: sum( pd_regions_status{type="miss_peer_region_count"} ) > 100
        annotations:
          description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
          value: '{{ $value }}'
          summary: PD_miss_peer_region_count

      # Summed store_disconnected_count status is above 0.
      - alert: PD_cluster_lost_connect_tikv_nums
        expr: sum ( pd_cluster_status{type="store_disconnected_count"} ) > 0
        for: 1m
        labels:
          env: test-cluster
          level: warning
          expr: sum ( pd_cluster_status{type="store_disconnected_count"} ) > 0
        annotations:
          description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
          value: '{{ $value }}'
          summary: PD_cluster_lost_connect_tikv_nums

      # Summed store_low_space_count status is above 0.
      - alert: PD_cluster_low_space
        expr: sum ( pd_cluster_status{type="store_low_space_count"} ) > 0
        for: 1m
        labels:
          env: test-cluster
          level: warning
          expr: sum ( pd_cluster_status{type="store_low_space_count"} ) > 0
        annotations:
          description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
          value: '{{ $value }}'
          summary: PD_cluster_low_space

      # 0.99 quantile of etcd peer round-trip time (1m rate, per To/instance/job) is above 1.
      - alert: PD_etcd_network_peer_latency
        expr: histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) by (To,instance,job,le) ) > 1
        for: 1m
        labels:
          env: test-cluster
          level: warning
          expr: histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) by (To,instance,job,le) ) > 1
        annotations:
          description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
          value: '{{ $value }}'
          summary: PD_etcd_network_peer_latency

      # 0.99 quantile of type="tso" request handling duration (1m rate, per instance/job) is above 0.1.
      - alert: PD_tidb_handle_requests_duration
        expr: histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type="tso"}[1m])) by (instance,job,le) ) > 0.1
        for: 1m
        labels:
          env: test-cluster
          level: warning
          expr: histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type="tso"}[1m])) by (instance,job,le) ) > 0.1
        annotations:
          description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
          value: '{{ $value }}'
          summary: PD_tidb_handle_requests_duration

      # Summed down_peer_region_count status is above 0.
      - alert: PD_down_peer_region_nums
        expr: sum ( pd_regions_status{type="down_peer_region_count"} ) > 0
        for: 1m
        labels:
          env: test-cluster
          level: warning
          expr: sum ( pd_regions_status{type="down_peer_region_count"} ) > 0
        annotations:
          description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
          value: '{{ $value }}'
          summary: PD_down_peer_region_nums

      # Summed incorrect_namespace_region_count status is above 100.
      # labels.expr is kept identical to the evaluated expression (threshold 100).
      - alert: PD_incorrect_namespace_region_count
        expr: sum ( pd_regions_status{type="incorrect_namespace_region_count"} ) > 100
        for: 1m
        labels:
          env: test-cluster
          level: warning
          expr: sum ( pd_regions_status{type="incorrect_namespace_region_count"} ) > 100
        annotations:
          description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
          value: '{{ $value }}'
          summary: PD_incorrect_namespace_region_count

      # Summed pending_peer_region_count status is above 100.
      - alert: PD_pending_peer_region_count
        expr: sum( pd_regions_status{type="pending_peer_region_count"} ) > 100
        for: 1m
        labels:
          env: test-cluster
          level: warning
          expr: sum( pd_regions_status{type="pending_peer_region_count"} ) > 100
        annotations:
          description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
          value: '{{ $value }}'
          summary: PD_pending_peer_region_count

      # At least 2 pd_server_tso{type="save"} series changed value within the last 10m.
      - alert: PD_leader_change
        expr: count( changes(pd_server_tso{type="save"}[10m]) > 0 ) >= 2
        for: 1m
        labels:
          env: test-cluster
          level: warning
          expr: count( changes(pd_server_tso{type="save"}[10m]) > 0 ) >= 2
        annotations:
          description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
          value: '{{ $value }}'
          summary: PD_leader_change

      # storage_size / storage_capacity (both summed), as a percentage, is above 80.
      - alert: TiKV_space_used_more_than_80%
        expr: sum(pd_cluster_status{type="storage_size"}) / sum(pd_cluster_status{type="storage_capacity"}) * 100 > 80
        for: 1m
        labels:
          env: test-cluster
          level: warning
          expr: sum(pd_cluster_status{type="storage_size"}) / sum(pd_cluster_status{type="storage_capacity"}) * 100 > 80
        annotations:
          description: 'cluster: test-cluster, type: {{ $labels.type }}, instance: {{ $labels.instance }}, values: {{ $value }}'
          value: '{{ $value }}'
          summary: TiKV_space_used_more_than_80%