# etcd3_alert.rules.yml
# Prometheus rule file: a single group of alerting and recording rules for etcd.
---
groups:
- name: etcd3_alert.rules
  rules:
  # Critical: for 3m, the number of etcd targets with up == 0 has been greater
  # than (total number of etcd targets / 2 - 1).
  - alert: InsufficientMembers
    expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
    for: 3m
    labels:
      severity: critical
    annotations:
      description: If one more etcd member goes down the cluster will be unavailable
      summary: etcd cluster insufficient members
  # Critical: an etcd member has reported etcd_server_has_leader == 0 for 1m.
  - alert: NoLeader
    expr: etcd_server_has_leader{job="etcd"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      description: 'etcd member {{ $labels.instance }} has no leader'
      summary: etcd member has no leader
  # Warning: the leader-change counter of an etcd instance increased by more
  # than 3 over the last hour. This rule sets no `for:` clause.
  - alert: HighNumberOfLeaderChanges
    expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
    labels:
      severity: warning
    annotations:
      description: >-
        etcd instance {{ $labels.instance }} has seen {{ $value }} leader
        changes within the last hour
      summary: a high number of leader changes within the etcd cluster are happening
  # Warning: for 10m, more than 1% of gRPC requests (5m rate) finished with a
  # grpc_code other than "OK", per instance / service / method.
  # `instance` is part of the grouping so that {{ $labels.instance }} in the
  # description resolves; an aggregation drops every label not listed in BY.
  - alert: HighNumberOfFailedGRPCRequests
    expr: >-
      100 * (sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m]))
      BY (instance, grpc_service, grpc_method)
      / sum(rate(grpc_server_handled_total{job="etcd"}[5m]))
      BY (instance, grpc_service, grpc_method)) > 1
    for: 10m
    labels:
      severity: warning
    annotations:
      description: >-
        {{ $value }}% of requests for {{ $labels.grpc_method }} failed
        on etcd instance {{ $labels.instance }}
      summary: a high number of gRPC requests are failing
  # Critical: for 5m, more than 5% of gRPC requests (5m rate) finished with a
  # grpc_code other than "OK", per instance / service / method.
  # `instance` is part of the grouping so that {{ $labels.instance }} in the
  # description resolves; an aggregation drops every label not listed in BY.
  - alert: HighNumberOfFailedGRPCRequests
    expr: >-
      100 * (sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m]))
      BY (instance, grpc_service, grpc_method)
      / sum(rate(grpc_server_handled_total{job="etcd"}[5m]))
      BY (instance, grpc_service, grpc_method)) > 5
    for: 5m
    labels:
      severity: critical
    annotations:
      description: >-
        {{ $value }}% of requests for {{ $labels.grpc_method }} failed
        on etcd instance {{ $labels.instance }}
      summary: a high number of gRPC requests are failing
  # Critical: for 10m, the 0.99 quantile of unary gRPC handling time (5m rate
  # of the histogram buckets) has been above 0.15.
  # `instance` is part of the grouping so that {{ $labels.instance }} in the
  # description resolves; `le` must stay in the grouping for histogram_quantile.
  - alert: GRPCRequestsSlow
    expr: >-
      histogram_quantile(0.99,
      sum(rate(grpc_server_handling_seconds_bucket{job="etcd",grpc_type="unary"}[5m]))
      by (instance, grpc_service, grpc_method, le))
      > 0.15
    for: 10m
    labels:
      severity: critical
    annotations:
      description: >-
        on etcd instance {{ $labels.instance }} gRPC requests to
        {{ $labels.grpc_method }} are slow
      summary: slow gRPC requests
  # Recording rule: ratio of open to maximum file descriptors, stored under
  # the series name instance:fd_utilization.
  - record: instance:fd_utilization
    expr: process_open_fds / process_max_fds
  # Warning: for 10m, a linear extrapolation over the last 1h of
  # instance:fd_utilization has predicted a value above 1 in 4 hours
  # (3600 * 4 seconds).
  - alert: FdExhaustionClose
    expr: predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
    for: 10m
    labels:
      severity: warning
    annotations:
      description: >-
        {{ $labels.job }} instance {{ $labels.instance }} will exhaust
        its file descriptors soon
      summary: file descriptors soon exhausted
  # Critical: for 10m, a linear extrapolation over the last 10m of
  # instance:fd_utilization has predicted a value above 1 in 1 hour
  # (3600 seconds).
  - alert: FdExhaustionClose
    expr: predict_linear(instance:fd_utilization[10m], 3600) > 1
    for: 10m
    labels:
      severity: critical
    annotations:
      description: >-
        {{ $labels.job }} instance {{ $labels.instance }} will exhaust
        its file descriptors soon
      summary: file descriptors soon exhausted
  # Warning: for 10m, the 0.99 quantile of the peer round-trip-time histogram
  # (5m rate) has been above 0.15. The expression has no job selector and no
  # aggregation, so the series labels (including `To`) reach the description.
  - alert: EtcdMemberCommunicationSlow
    expr: >-
      histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m]))
      > 0.15
    for: 10m
    labels:
      severity: warning
    annotations:
      description: >-
        etcd instance {{ $labels.instance }} member communication with
        {{ $labels.To }} is slow
      summary: etcd member communication is slow
  # Warning: the failed-proposal counter of an etcd instance increased by more
  # than 5 over the last hour. This rule sets no `for:` clause.
  - alert: HighNumberOfFailedProposals
    expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
    labels:
      severity: warning
    annotations:
      description: >-
        etcd instance {{ $labels.instance }} has seen {{ $value }} proposal
        failures within the last hour
      summary: a high number of proposals within the etcd cluster are failing
  # Warning: for 10m, the 0.99 quantile of the WAL fsync duration histogram
  # (5m rate) has been above 0.5.
  - alert: HighFsyncDurations
    expr: >-
      histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m]))
      > 0.5
    for: 10m
    labels:
      severity: warning
    annotations:
      description: 'etcd instance {{ $labels.instance }} fsync durations are high'
      summary: high fsync durations
  # Warning: for 10m, the 0.99 quantile of the backend commit duration
  # histogram (5m rate) has been above 0.25.
  - alert: HighCommitDurations
    expr: >-
      histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m]))
      > 0.25
    for: 10m
    labels:
      severity: warning
    annotations:
      description: 'etcd instance {{ $labels.instance }} commit durations are high'
      summary: high commit durations