# etcd3_alert.rules — alerting and recording rules for an etcd v3 cluster
# (Prometheus 1.x rule file format)
  1. # general cluster availability
  2. # alert if another failed member will result in an unavailable cluster
  3. ALERT InsufficientMembers
  4. IF count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
  5. FOR 3m
  6. LABELS {
  7. severity = "critical"
  8. }
  9. ANNOTATIONS {
  10. summary = "etcd cluster insufficient members",
  11. description = "If one more etcd member goes down the cluster will be unavailable",
  12. }
  13. # etcd leader alerts
  14. # ==================
  15. # alert if any etcd instance has no leader
  16. ALERT NoLeader
  17. IF etcd_server_has_leader{job="etcd"} == 0
  18. FOR 1m
  19. LABELS {
  20. severity = "critical"
  21. }
  22. ANNOTATIONS {
  23. summary = "etcd member has no leader",
  24. description = "etcd member {{ $labels.instance }} has no leader",
  25. }
  26. # alert if there are lots of leader changes
  27. ALERT HighNumberOfLeaderChanges
  28. IF increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
  29. LABELS {
  30. severity = "warning"
  31. }
  32. ANNOTATIONS {
  33. summary = "a high number of leader changes within the etcd cluster are happening",
  34. description = "etcd instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour",
  35. }
  36. # gRPC request alerts
  37. # ===================
  38. # alert if more than 1% of gRPC method calls have failed within the last 5 minutes
  39. ALERT HighNumberOfFailedGRPCRequests
  40. IF 100 * (sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
  41. / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m]))) > 1
  42. FOR 10m
  43. LABELS {
  44. severity = "warning"
  45. }
  46. ANNOTATIONS {
  47. summary = "a high number of gRPC requests are failing",
  48. description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
  49. }
  50. # alert if more than 5% of gRPC method calls have failed within the last 5 minutes
  51. ALERT HighNumberOfFailedGRPCRequests
  52. IF 100 * (sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
  53. / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m]))) > 5
  54. FOR 5m
  55. LABELS {
  56. severity = "critical"
  57. }
  58. ANNOTATIONS {
  59. summary = "a high number of gRPC requests are failing",
  60. description = "{{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}",
  61. }
  62. # alert if the 99th percentile of gRPC method calls take more than 150ms
  63. ALERT GRPCRequestsSlow
  64. IF histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le)) > 0.15
  65. FOR 10m
  66. LABELS {
  67. severity = "critical"
  68. }
  69. ANNOTATIONS {
  70. summary = "slow gRPC requests",
  71. description = "on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method }} are slow",
  72. }
  73. # HTTP requests alerts
  74. # ====================
  75. # alert if more than 1% of requests to an HTTP endpoint have failed within the last 5 minutes
  76. ALERT HighNumberOfFailedHTTPRequests
  77. IF 100 * (sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
  78. / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method)) > 1
  79. FOR 10m
  80. LABELS {
  81. severity = "warning"
  82. }
  83. ANNOTATIONS {
  84. summary = "a high number of HTTP requests are failing",
  85. description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
  86. }
  87. # alert if more than 5% of requests to an HTTP endpoint have failed within the last 5 minutes
  88. ALERT HighNumberOfFailedHTTPRequests
  89. IF 100 * (sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
  90. / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method)) > 5
  91. FOR 5m
  92. LABELS {
  93. severity = "critical"
  94. }
  95. ANNOTATIONS {
  96. summary = "a high number of HTTP requests are failing",
  97. description = "{{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}",
  98. }
  99. # alert if the 99th percentile of HTTP requests take more than 150ms
  100. ALERT HTTPRequestsSlow
  101. IF histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15
  102. FOR 10m
  103. LABELS {
  104. severity = "warning"
  105. }
  106. ANNOTATIONS {
  107. summary = "slow HTTP requests",
  108. description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow",
  109. }
  110. # file descriptor alerts
  111. # ======================
  112. instance:fd_utilization = process_open_fds / process_max_fds
  113. # alert if file descriptors are likely to exhaust within the next 4 hours
  114. ALERT FdExhaustionClose
  115. IF predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
  116. FOR 10m
  117. LABELS {
  118. severity = "warning"
  119. }
  120. ANNOTATIONS {
  121. summary = "file descriptors soon exhausted",
  122. description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors soon",
  123. }
  124. # alert if file descriptors are likely to exhaust within the next hour
  125. ALERT FdExhaustionClose
  126. IF predict_linear(instance:fd_utilization[10m], 3600) > 1
  127. FOR 10m
  128. LABELS {
  129. severity = "critical"
  130. }
  131. ANNOTATIONS {
  132. summary = "file descriptors soon exhausted",
  133. description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors soon",
  134. }
  135. # etcd member communication alerts
  136. # ================================
  137. # alert if 99th percentile of round trips take 150ms
  138. ALERT EtcdMemberCommunicationSlow
  139. IF histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) > 0.15
  140. FOR 10m
  141. LABELS {
  142. severity = "warning"
  143. }
  144. ANNOTATIONS {
  145. summary = "etcd member communication is slow",
  146. description = "etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow",
  147. }
  148. # etcd proposal alerts
  149. # ====================
  150. # alert if there are several failed proposals within an hour
  151. ALERT HighNumberOfFailedProposals
  152. IF increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
  153. LABELS {
  154. severity = "warning"
  155. }
  156. ANNOTATIONS {
  157. summary = "a high number of proposals within the etcd cluster are failing",
  158. description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour",
  159. }
  160. # etcd disk io latency alerts
  161. # ===========================
  162. # alert if 99th percentile of fsync durations is higher than 500ms
  163. ALERT HighFsyncDurations
  164. IF histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5
  165. FOR 10m
  166. LABELS {
  167. severity = "warning"
  168. }
  169. ANNOTATIONS {
  170. summary = "high fsync durations",
  171. description = "etcd instance {{ $labels.instance }} fync durations are high",
  172. }
  173. # alert if 99th percentile of commit durations is higher than 250ms
  174. ALERT HighCommitDurations
  175. IF histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25
  176. FOR 10m
  177. LABELS {
  178. severity = "warning"
  179. }
  180. ANNOTATIONS {
  181. summary = "high commit durations",
  182. description = "etcd instance {{ $labels.instance }} commit durations are high",
  183. }