You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

351 lines
16 KiB
YAML

# Prometheus alerting rules for TiKV.
# One rule group, `alert.rules`; the `- alert:` entries that follow are its rules.
# NOTE(review): leading indentation appears to have been lost in this copy —
# `rules:` presumably belongs nested under the `- name:` entry; confirm before loading.
groups:
- name: alert.rules
rules:
# Fires when a TiKV process's resident memory has grown by more than 5 GiB
# relative to its value five minutes earlier, and that holds for 5m.
# The `expr` label repeats the rule expression verbatim; keep the two in sync.
- alert: TiKV_memory_used_too_fast
  expr: >-
    process_resident_memory_bytes{job=~"tikv.*"}
    - (process_resident_memory_bytes{job=~"tikv.*"} offset 5m)
    > 5*1024*1024*1024
  for: 5m
  labels:
    level: emergency
    env: test-cluster
    expr: >-
      process_resident_memory_bytes{job=~"tikv.*"}
      - (process_resident_memory_bytes{job=~"tikv.*"} offset 5m)
      > 5*1024*1024*1024
  annotations:
    summary: TiKV memory used too fast
    description: 'cluster: test-cluster, instance: {{ $labels.instance }}, job: {{ $labels.job }}, values: {{ $value }}'
    value: '{{ $value }}'
# Fires when fewer than one successful GC action was counted, summed over every
# reporting series, during the last 6 hours (condition held for 1m).
# sum() without a `by` clause yields a single label-less series, so the
# description no longer prints the instance label (it always rendered empty).
# NOTE(review): when no matching series exists the expression returns nothing
# and the alert stays silent — confirm whether an absent() guard is wanted.
# The `expr` label repeats the rule expression verbatim; keep the two in sync.
- alert: TiKV_GC_can_not_work
  expr: sum(increase(tidb_tikvclient_gc_action_result{type="success"}[6h])) < 1
  for: 1m
  labels:
    env: test-cluster
    level: emergency
    expr: sum(increase(tidb_tikvclient_gc_action_result{type="success"}[6h])) < 1
  annotations:
    summary: TiKV GC can not work
    description: 'cluster: test-cluster, values:{{ $value }}'
    value: '{{ $value }}'
# Fires when the 10-minute rate of "unreachable" failure reports, summed per
# store_id, stays above 10 for 1m.
# The aggregation keeps only the store_id label, so the description reports
# store_id instead of the instance label (which was dropped and rendered empty).
# The `expr` label repeats the rule expression verbatim; keep the two in sync.
- alert: TiKV_server_report_failure_msg_total
  expr: sum(rate(tikv_server_report_failure_msg_total{type="unreachable"}[10m])) BY (store_id) > 10
  for: 1m
  labels:
    env: test-cluster
    level: critical
    expr: sum(rate(tikv_server_report_failure_msg_total{type="unreachable"}[10m])) BY (store_id) > 10
  annotations:
    summary: TiKV server_report_failure_msg_total error
    description: 'cluster: test-cluster, store_id: {{ $labels.store_id }}, values:{{ $value }}'
    value: '{{ $value }}'
# Fires when the 10-minute rate of tikv_channel_full_total, summed per
# (type, instance), is above zero for 1m.
# The `expr` label repeats the rule expression verbatim; keep the two in sync.
- alert: TiKV_channel_full_total
  expr: sum(rate(tikv_channel_full_total[10m])) BY (type, instance) > 0
  for: 1m
  labels:
    level: critical
    env: test-cluster
    expr: sum(rate(tikv_channel_full_total[10m])) BY (type, instance) > 0
  annotations:
    summary: TiKV channel full
    description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
    value: '{{ $value }}'
# Fires when tikv_engine_write_stall has increased over the last 10 minutes
# (delta > 0) and that holds for 1m. Evaluated per series, so the type and
# instance labels used in the description are preserved.
# The `expr` label repeats the rule expression verbatim; keep the two in sync.
- alert: TiKV_write_stall
  expr: delta( tikv_engine_write_stall[10m]) > 0
  for: 1m
  labels:
    level: critical
    env: test-cluster
    expr: delta( tikv_engine_write_stall[10m]) > 0
  annotations:
    summary: TiKV write stall
    description: 'cluster: test-cluster, type: {{ $labels.type }}, instance: {{ $labels.instance }}, values: {{ $value }}'
    value: '{{ $value }}'
# Fires when the 99th-percentile raftstore log lag, computed per (instance, job)
# from 1-minute bucket rates, exceeds 5000 for 1m.
# The `expr` label repeats the rule expression verbatim; keep the two in sync.
- alert: TiKV_raft_log_lag
  expr: >-
    histogram_quantile(0.99,
    sum(rate(tikv_raftstore_log_lag_bucket[1m])) by (le, instance, job))
    > 5000
  for: 1m
  labels:
    level: critical
    env: test-cluster
    expr: >-
      histogram_quantile(0.99,
      sum(rate(tikv_raftstore_log_lag_bucket[1m])) by (le, instance, job))
      > 5000
  annotations:
    summary: TiKV raftstore log lag more than 5000
    description: 'cluster: test-cluster, instance {{ $labels.instance }}, values: {{ $value }}'
    value: '{{ $value }}'
# Fires when the 99th-percentile duration of async "snapshot" requests, per
# (instance, job, type) from 1-minute bucket rates, exceeds 1 for 1m.
# The `expr` label repeats the rule expression verbatim; keep the two in sync.
- alert: TiKV_async_request_snapshot_duration_seconds
  expr: >-
    histogram_quantile(0.99,
    sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="snapshot"}[1m]))
    by (le, instance, job,type)) > 1
  for: 1m
  labels:
    level: critical
    env: test-cluster
    expr: >-
      histogram_quantile(0.99,
      sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="snapshot"}[1m]))
      by (le, instance, job,type)) > 1
  annotations:
    summary: TiKV async request snapshot duration seconds more than 1s
    description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
    value: '{{ $value }}'
# Fires when the 99th-percentile duration of async "write" requests, per
# (instance, job, type) from 1-minute bucket rates, exceeds 1 for 1m.
# The `expr` label repeats the rule expression verbatim; keep the two in sync.
- alert: TiKV_async_request_write_duration_seconds
  expr: >-
    histogram_quantile(0.99,
    sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="write"}[1m]))
    by (le, instance, job,type)) > 1
  for: 1m
  labels:
    level: critical
    env: test-cluster
    expr: >-
      histogram_quantile(0.99,
      sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="write"}[1m]))
      by (le, instance, job,type)) > 1
  annotations:
    summary: TiKV async request write duration seconds more than 1s
    description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
    value: '{{ $value }}'
# Fires when the 99.99th-percentile coprocessor request wait time, per
# (instance, job, req) from 1-minute bucket rates, exceeds 10 for 1m.
# The `expr` label repeats the rule expression verbatim; keep the two in sync.
- alert: TiKV_coprocessor_request_wait_seconds
  expr: >-
    histogram_quantile(0.9999,
    sum(rate(tikv_coprocessor_request_wait_seconds_bucket[1m]))
    by (le, instance, job,req)) > 10
  for: 1m
  labels:
    level: critical
    env: test-cluster
    expr: >-
      histogram_quantile(0.9999,
      sum(rate(tikv_coprocessor_request_wait_seconds_bucket[1m]))
      by (le, instance, job,req)) > 10
  annotations:
    summary: TiKV coprocessor request wait seconds more than 10s
    description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
    value: '{{ $value }}'
# Fires when a raftstore thread's CPU usage (1-minute rate of CPU seconds)
# exceeds 0.8 for 1m.
# `instance` is part of the grouping so usage is evaluated per TiKV instance:
# grouping by (job, name) alone summed same-named threads across every
# instance, and dropped the instance label that the description prints.
# The `expr` label repeats the rule expression verbatim; keep the two in sync.
- alert: TiKV_raftstore_thread_cpu_seconds_total
  expr: >-
    sum(rate(tikv_thread_cpu_seconds_total{name=~"raftstore_.*"}[1m]))
    by (instance, job, name) > 0.8
  for: 1m
  labels:
    env: test-cluster
    level: critical
    expr: >-
      sum(rate(tikv_thread_cpu_seconds_total{name=~"raftstore_.*"}[1m]))
      by (instance, job, name) > 0.8
  annotations:
    summary: TiKV raftstore thread CPU seconds is high
    description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
    value: '{{ $value }}'
# Fires when the 99th-percentile raftstore append-log duration, per
# (instance, job) from 1-minute bucket rates, exceeds 1 for 1m.
# The `expr` label repeats the rule expression verbatim; keep the two in sync.
- alert: TiKV_raft_append_log_duration_secs
  expr: >-
    histogram_quantile(0.99,
    sum(rate(tikv_raftstore_append_log_duration_seconds_bucket[1m]))
    by (le, instance, job)) > 1
  for: 1m
  labels:
    level: critical
    env: test-cluster
    expr: >-
      histogram_quantile(0.99,
      sum(rate(tikv_raftstore_append_log_duration_seconds_bucket[1m]))
      by (le, instance, job)) > 1
  annotations:
    summary: TiKV_raft_append_log_duration_secs
    description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
    value: '{{ $value }}'
# Fires when the 99th-percentile raftstore apply-log duration, per
# (instance, job) from 1-minute bucket rates, exceeds 1 for 1m.
# The `expr` label repeats the rule expression verbatim; keep the two in sync.
- alert: TiKV_raft_apply_log_duration_secs
  expr: >-
    histogram_quantile(0.99,
    sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket[1m]))
    by (le, instance, job)) > 1
  for: 1m
  labels:
    level: critical
    env: test-cluster
    expr: >-
      histogram_quantile(0.99,
      sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket[1m]))
      by (le, instance, job)) > 1
  annotations:
    summary: TiKV_raft_apply_log_duration_secs
    description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
    value: '{{ $value }}'
# Fires when the 99th-percentile scheduler latch wait duration, per
# (instance, job, type) from 1-minute bucket rates, exceeds 1 for 1m.
# The `expr` label repeats the rule expression verbatim; keep the two in sync.
- alert: TiKV_scheduler_latch_wait_duration_seconds
  expr: >-
    histogram_quantile(0.99,
    sum(rate(tikv_scheduler_latch_wait_duration_seconds_bucket[1m]))
    by (le, instance, job,type)) > 1
  for: 1m
  labels:
    level: critical
    env: test-cluster
    expr: >-
      histogram_quantile(0.99,
      sum(rate(tikv_scheduler_latch_wait_duration_seconds_bucket[1m]))
      by (le, instance, job,type)) > 1
  annotations:
    summary: TiKV scheduler latch wait duration seconds more than 1s
    description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
    value: '{{ $value }}'
# Fires when apply_worker thread CPU usage (1-minute rate of CPU seconds)
# exceeds 0.9 for 1m.
# `instance` is part of the grouping so usage is evaluated per TiKV instance:
# grouping by (job) alone summed the threads of every instance and dropped the
# instance label that the description prints. The description no longer prints
# a `type` label, since the aggregation never carries one.
# The `expr` label repeats the rule expression verbatim; keep the two in sync.
- alert: TiKV_thread_apply_worker_cpu_seconds
  expr: >-
    sum(rate(tikv_thread_cpu_seconds_total{name="apply_worker"}[1m]))
    by (instance, job) > 0.9
  for: 1m
  labels:
    env: test-cluster
    level: critical
    expr: >-
      sum(rate(tikv_thread_cpu_seconds_total{name="apply_worker"}[1m]))
      by (instance, job) > 0.9
  annotations:
    summary: TiKV thread apply worker cpu seconds is high
    description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values: {{ $value }}'
    value: '{{ $value }}'
# Fires when more than 10 failed GC actions were counted, summed over every
# reporting series, within the last minute (condition held for 1m).
# sum() without a `by` clause yields a single label-less series, so the
# description no longer prints the type/instance labels (they rendered empty).
# The `expr` label repeats the rule expression verbatim; keep the two in sync.
- alert: TiDB_tikvclient_gc_action_fail
  expr: sum(increase(tidb_tikvclient_gc_action_result{type="fail"}[1m])) > 10
  for: 1m
  labels:
    env: test-cluster
    level: critical
    expr: sum(increase(tidb_tikvclient_gc_action_result{type="fail"}[1m])) > 10
  annotations:
    summary: TiDB_tikvclient_gc_action_fail
    description: 'cluster: test-cluster, values: {{ $value }}'
    value: '{{ $value }}'
# Fires when tikv_pd_heartbeat_tick_total{type="leader"} has dropped by more
# than 10 over a 30-second window (delta < -10) and that holds for 1m.
# Evaluated per series, so the instance label is preserved.
# The `expr` label repeats the rule expression verbatim; keep the two in sync.
- alert: TiKV_leader_drops
  expr: delta(tikv_pd_heartbeat_tick_total{type="leader"}[30s]) < -10
  for: 1m
  labels:
    level: warning
    env: test-cluster
    expr: delta(tikv_pd_heartbeat_tick_total{type="leader"}[30s]) < -10
  annotations:
    summary: TiKV leader drops
    description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
    value: '{{ $value }}'
# Fires when the 99.9th-percentile raft "ready" processing duration, per
# (instance, job, type) from 1-minute bucket rates, exceeds 2 for 1m.
# The `expr` label repeats the rule expression verbatim; keep the two in sync.
- alert: TiKV_raft_process_ready_duration_secs
  expr: >-
    histogram_quantile(0.999,
    sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='ready'}[1m]))
    by (le, instance, job,type)) > 2
  for: 1m
  labels:
    level: warning
    env: test-cluster
    expr: >-
      histogram_quantile(0.999,
      sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='ready'}[1m]))
      by (le, instance, job,type)) > 2
  annotations:
    summary: TiKV_raft_process_ready_duration_secs
    description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values: {{ $value }}'
    value: '{{ $value }}'
# Fires when the 99.9th-percentile raft "tick" processing duration, per
# (instance, job, type) from 1-minute bucket rates, exceeds 2 for 1m.
# The `expr` label repeats the rule expression verbatim; keep the two in sync.
- alert: TiKV_raft_process_tick_duration_secs
  expr: >-
    histogram_quantile(0.999,
    sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='tick'}[1m]))
    by (le, instance, job,type)) > 2
  for: 1m
  labels:
    level: warning
    env: test-cluster
    expr: >-
      histogram_quantile(0.999,
      sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='tick'}[1m]))
      by (le, instance, job,type)) > 2
  annotations:
    summary: TiKV_raft_process_tick_duration_secs
    description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values: {{ $value }}'
    value: '{{ $value }}'
# Fires when the scheduler context gauge has moved by more than 1000 in either
# direction over 5 minutes (abs(delta) > 1000) and that holds for 1m.
# NOTE(review): the metric is spelled `contex` here while the alert name says
# `context` — verify the exporter's actual metric name before "correcting" it.
# The `expr` label repeats the rule expression verbatim; keep the two in sync.
- alert: TiKV_scheduler_context_total
  expr: abs(delta( tikv_scheduler_contex_total[5m])) > 1000
  for: 1m
  labels:
    level: warning
    env: test-cluster
    expr: abs(delta( tikv_scheduler_contex_total[5m])) > 1000
  annotations:
    summary: TiKV scheduler context total
    description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
    value: '{{ $value }}'
# Fires when the 99th-percentile scheduler command duration, per
# (instance, job, type) from 1-minute bucket rates, exceeds 1 for 1m.
# NOTE(review): the `/ 1000` scales every bucket rate equally, which should
# leave the computed quantile unchanged — confirm and consider removing it.
# The `expr` label repeats the rule expression verbatim; keep the two in sync.
- alert: TiKV_scheduler_command_duration_seconds
  expr: >-
    histogram_quantile(0.99,
    sum(rate(tikv_scheduler_command_duration_seconds_bucket[1m]))
    by (le, instance, job,type) / 1000) > 1
  for: 1m
  labels:
    level: warning
    env: test-cluster
    expr: >-
      histogram_quantile(0.99,
      sum(rate(tikv_scheduler_command_duration_seconds_bucket[1m]))
      by (le, instance, job,type) / 1000) > 1
  annotations:
    summary: TiKV scheduler command duration seconds more than 1s
    description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
    value: '{{ $value }}'
# Fires when the combined CPU usage of the storage scheduler threads
# (1-minute rate of CPU seconds) exceeds 0.8 for 1m.
# `instance` is part of the grouping so usage is evaluated per TiKV instance:
# grouping by (job) alone summed the threads of every instance and dropped the
# instance label that the description prints.
# The `expr` label repeats the rule expression verbatim; keep the two in sync.
- alert: TiKV_thread_storage_scheduler_cpu_seconds
  expr: >-
    sum(rate(tikv_thread_cpu_seconds_total{name=~"storage_schedul.*"}[1m]))
    by (instance, job) > 0.8
  for: 1m
  labels:
    env: test-cluster
    level: warning
    expr: >-
      sum(rate(tikv_thread_cpu_seconds_total{name=~"storage_schedul.*"}[1m]))
      by (instance, job) > 0.8
  annotations:
    summary: TiKV storage scheduler cpu seconds more than 80%
    description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values:{{ $value }}'
    value: '{{ $value }}'
# Fires when the count of outdated coprocessor requests has increased over the
# last 10 minutes (delta > 0) and that holds for 1m. Evaluated per series.
# The `expr` label repeats the rule expression verbatim; keep the two in sync.
- alert: TiKV_coprocessor_outdated_request_wait_seconds
  expr: delta( tikv_coprocessor_outdated_request_wait_seconds_count[10m] ) > 0
  for: 1m
  labels:
    level: warning
    env: test-cluster
    expr: delta( tikv_coprocessor_outdated_request_wait_seconds_count[10m] ) > 0
  annotations:
    summary: TiKV coprocessor outdated request wait seconds
    description: 'cluster: test-cluster, instance: {{ $labels.instance }}, values: {{ $value }}'
    value: '{{ $value }}'
# Fires when coprocessor request errors with any reason other than "lock"
# increase by more than 100 within 10 minutes, held for 1m. Evaluated per
# series, so the reason and instance labels are preserved.
# The `expr` label repeats the rule expression verbatim; keep the two in sync.
- alert: TiKV_coprocessor_request_error
  expr: increase(tikv_coprocessor_request_error{reason!="lock"}[10m]) > 100
  for: 1m
  labels:
    level: warning
    env: test-cluster
    expr: increase(tikv_coprocessor_request_error{reason!="lock"}[10m]) > 100
  annotations:
    summary: TiKV coprocessor request error
    description: 'cluster: test-cluster, reason: {{ $labels.reason }}, instance: {{ $labels.instance }}, values: {{ $value }}'
    value: '{{ $value }}'
# Fires when coprocessor request errors with reason "lock" increase by more
# than 10000 within 10 minutes, held for 1m. Evaluated per series, so the
# reason and instance labels are preserved.
# The `expr` label repeats the rule expression verbatim; keep the two in sync.
- alert: TiKV_coprocessor_request_lock_error
  expr: increase(tikv_coprocessor_request_error{reason="lock"}[10m]) > 10000
  for: 1m
  labels:
    level: warning
    env: test-cluster
    expr: increase(tikv_coprocessor_request_error{reason="lock"}[10m]) > 10000
  annotations:
    summary: TiKV coprocessor request lock error
    description: 'cluster: test-cluster, reason: {{ $labels.reason }}, instance: {{ $labels.instance }}, values: {{ $value }}'
    value: '{{ $value }}'
# Fires when the number of pending coprocessor requests has grown by more than
# 5000 over 10 minutes (delta > 5000) and that holds for 1m. Evaluated per
# series, so the type and instance labels are preserved.
# The summary is itself a template and interpolates the series' type label.
# The `expr` label repeats the rule expression verbatim; keep the two in sync.
- alert: TiKV_coprocessor_pending_request
  expr: delta( tikv_coprocessor_pending_request[10m]) > 5000
  for: 1m
  labels:
    level: warning
    env: test-cluster
    expr: delta( tikv_coprocessor_pending_request[10m]) > 5000
  annotations:
    summary: 'TiKV pending {{ $labels.type }} request is high'
    description: 'cluster: test-cluster, type: {{ $labels.type }}, instance: {{ $labels.instance }}, values: {{ $value }}'
    value: '{{ $value }}'
# Divides the per-job CPU rate of the "endpoint" threads by 0.9 times the
# number of endpoint thread series, then by the number of instances, and
# fires when the result is above zero for 1m.
# NOTE(review): the left operand keeps the `job` label while both count()
# operands carry no labels; under default one-to-one vector matching these
# may never pair, leaving the alert permanently silent. The `> 0` threshold
# also looks like a placeholder. Confirm the intended expression before
# relying on this rule; the description's type/instance labels are likewise
# not produced by this aggregation.
# The `expr` label repeats the rule expression verbatim; keep the two in sync.
- alert: TiKV_batch_request_snapshot_nums
  expr: >-
    sum(rate(tikv_thread_cpu_seconds_total{name=~"endpoint.*"}[1m])) by (job)
    / ( count(tikv_thread_cpu_seconds_total{name=~"endpoint.*"}) * 0.9 )
    / count(count(tikv_thread_cpu_seconds_total) by (instance)) > 0
  for: 1m
  labels:
    level: warning
    env: test-cluster
    expr: >-
      sum(rate(tikv_thread_cpu_seconds_total{name=~"endpoint.*"}[1m])) by (job)
      / ( count(tikv_thread_cpu_seconds_total{name=~"endpoint.*"}) * 0.9 )
      / count(count(tikv_thread_cpu_seconds_total) by (instance)) > 0
  annotations:
    summary: TiKV batch request snapshot nums is high
    description: 'cluster: test-cluster, type: {{ $labels.type }}, instance: {{ $labels.instance }}, values: {{ $value }}'
    value: '{{ $value }}'
# Fires when the number of pending worker tasks, summed per
# (job, instance, name), exceeds 1000 for 1m.
# The aggregation keeps job/instance/name only, so the description reports the
# worker `name` instead of a `type` label (which was dropped and rendered empty).
# The `expr` label repeats the rule expression verbatim; keep the two in sync.
- alert: TiKV_pending_task
  expr: sum(tikv_worker_pending_task_total) BY (job,instance,name) > 1000
  for: 1m
  labels:
    env: test-cluster
    level: warning
    expr: sum(tikv_worker_pending_task_total) BY (job,instance,name) > 1000
  annotations:
    summary: TiKV pending task too much
    description: 'cluster: test-cluster, name: {{ $labels.name }}, instance: {{ $labels.instance }}, values: {{ $value }}'
    value: '{{ $value }}'
# Fires when, for at least one job, available/capacity store size is below 0.2
# while the summed "applying" snapshot traffic is above zero, held for 1m.
# The outer count() collapses the result to one label-less series, so the
# description no longer prints the type/instance labels (they rendered empty).
# The `expr` label repeats the rule expression verbatim; keep the two in sync.
- alert: TiKV_low_space_and_add_region
  expr: >-
    count( (sum(tikv_store_size_bytes{type="available"}) by (job)
    / sum(tikv_store_size_bytes{type="capacity"}) by (job) < 0.2)
    and (sum(tikv_raftstore_snapshot_traffic_total{type="applying"}) by (job) > 0 ) ) > 0
  for: 1m
  labels:
    env: test-cluster
    level: warning
    expr: >-
      count( (sum(tikv_store_size_bytes{type="available"}) by (job)
      / sum(tikv_store_size_bytes{type="capacity"}) by (job) < 0.2)
      and (sum(tikv_raftstore_snapshot_traffic_total{type="applying"}) by (job) > 0 ) ) > 0
  annotations:
    summary: TiKV low_space and add_region
    description: 'cluster: test-cluster, values: {{ $value }}'
    value: '{{ $value }}'
# Fires when the 99th-percentile region size, computed across all series from
# 1-minute bucket rates, exceeds 1073741824 bytes (1 GiB) for 1m.
# The aggregation keeps only `le`, which histogram_quantile() consumes, so the
# result carries no labels; the description therefore no longer prints the
# type/instance labels (they rendered empty).
# The `expr` label repeats the rule expression verbatim; keep the two in sync.
- alert: TiKV_approximate_region_size
  expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_region_size_bucket[1m])) by (le)) > 1073741824
  for: 1m
  labels:
    env: test-cluster
    level: warning
    expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_region_size_bucket[1m])) by (le)) > 1073741824
  annotations:
    summary: TiKV approximate region size is more than 1GB
    description: 'cluster: test-cluster, values: {{ $value }}'
    value: '{{ $value }}'