# etcd3_alert.rules.yml
  1. groups:
  2. - name: etcd3_alert.rules
  3. rules:
  # Critical: fires once the number of etcd targets with up == 0 has been
  # greater than (total etcd targets / 2 - 1) for 3 minutes.
  - alert: InsufficientMembers
    expr: 'count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)'
    for: 3m
    labels:
      severity: critical
    annotations:
      summary: etcd cluster insufficient members
      description: >-
        If one more etcd member goes down the cluster will be unavailable
  # Critical: a series of etcd_server_has_leader for job "etcd" has been 0
  # for 1 minute.
  - alert: NoLeader
    expr: 'etcd_server_has_leader{job="etcd"} == 0'
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: etcd member has no leader
      description: 'etcd member {{ $labels.instance }} has no leader'
  # Warning: the leader-change counter grew by more than 3 over the last
  # hour. There is no `for` clause, so the alert fires on first evaluation.
  - alert: HighNumberOfLeaderChanges
    expr: >-
      increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
    labels:
      severity: warning
    annotations:
      summary: >-
        a high number of leader changes within the etcd cluster are happening
      description: >-
        etcd instance {{ $labels.instance }} has seen {{ $value }} leader
        changes within the last hour
  # Warning: more than 1% of gRPC requests (grpc_code != "OK") failed over
  # 5m windows, sustained for 10 minutes.
  # `instance` is part of the grouping because the description renders
  # {{ $labels.instance }}; aggregating only by service/method drops that
  # label and leaves the placeholder empty.
  - alert: HighNumberOfFailedGRPCRequests
    expr: >-
      100 * (
      sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m]))
      BY (instance, grpc_service, grpc_method)
      /
      sum(rate(grpc_server_handled_total{job="etcd"}[5m]))
      BY (instance, grpc_service, grpc_method)
      ) > 1
    for: 10m
    labels:
      severity: warning
    annotations:
      description: >-
        {{ $value }}% of requests for {{ $labels.grpc_method }} failed
        on etcd instance {{ $labels.instance }}
      summary: a high number of gRPC requests are failing
  # Critical: more than 5% of gRPC requests (grpc_code != "OK") failed over
  # 5m windows, sustained for 5 minutes.
  # `instance` is part of the grouping because the description renders
  # {{ $labels.instance }}; aggregating only by service/method drops that
  # label and leaves the placeholder empty.
  - alert: HighNumberOfFailedGRPCRequests
    expr: >-
      100 * (
      sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m]))
      BY (instance, grpc_service, grpc_method)
      /
      sum(rate(grpc_server_handled_total{job="etcd"}[5m]))
      BY (instance, grpc_service, grpc_method)
      ) > 5
    for: 5m
    labels:
      severity: critical
    annotations:
      description: >-
        {{ $value }}% of requests for {{ $labels.grpc_method }} failed
        on etcd instance {{ $labels.instance }}
      summary: a high number of gRPC requests are failing
  # Critical: the 0.99 quantile of unary gRPC handling time, computed from
  # 5m bucket rates, has been above 0.15 for 10 minutes.
  # `instance` is kept in the `by` clause because the description renders
  # {{ $labels.instance }}; without it the label is aggregated away and the
  # placeholder is empty. `le` must stay for histogram_quantile.
  - alert: GRPCRequestsSlow
    expr: >-
      histogram_quantile(0.99,
      sum(rate(grpc_server_handling_seconds_bucket{job="etcd",grpc_type="unary"}[5m]))
      by (instance, grpc_service, grpc_method, le))
      > 0.15
    for: 10m
    labels:
      severity: critical
    annotations:
      description: >-
        on etcd instance {{ $labels.instance }} gRPC requests to
        {{ $labels.grpc_method }} are slow
      summary: slow gRPC requests
  # Warning: more than 1% of HTTP requests failed over 5m windows,
  # sustained for 10 minutes.
  # `instance` is part of the grouping because the description renders
  # {{ $labels.instance }}; aggregating only by method drops that label and
  # leaves the placeholder empty.
  - alert: HighNumberOfFailedHTTPRequests
    expr: >-
      100 * (
      sum(rate(etcd_http_failed_total{job="etcd"}[5m]))
      BY (instance, method)
      /
      sum(rate(etcd_http_received_total{job="etcd"}[5m]))
      BY (instance, method)
      ) > 1
    for: 10m
    labels:
      severity: warning
    annotations:
      description: >-
        {{ $value }}% of requests for {{ $labels.method }} failed on etcd
        instance {{ $labels.instance }}
      summary: a high number of HTTP requests are failing
  # Critical: more than 5% of HTTP requests failed over 5m windows,
  # sustained for 5 minutes.
  # `instance` is part of the grouping because the description renders
  # {{ $labels.instance }}; aggregating only by method drops that label and
  # leaves the placeholder empty.
  - alert: HighNumberOfFailedHTTPRequests
    expr: >-
      100 * (
      sum(rate(etcd_http_failed_total{job="etcd"}[5m]))
      BY (instance, method)
      /
      sum(rate(etcd_http_received_total{job="etcd"}[5m]))
      BY (instance, method)
      ) > 5
    for: 5m
    labels:
      severity: critical
    annotations:
      description: >-
        {{ $value }}% of requests for {{ $labels.method }} failed on etcd
        instance {{ $labels.instance }}
      summary: a high number of HTTP requests are failing
  # Warning: the 0.99 quantile of successful HTTP request duration, computed
  # from 5m bucket rates, has been above 0.15 for 10 minutes.
  - alert: HTTPRequestsSlow
    expr: >-
      histogram_quantile(0.99,
      rate(etcd_http_successful_duration_seconds_bucket[5m]))
      > 0.15
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: slow HTTP requests
      description: >-
        on etcd instance {{ $labels.instance }} HTTP requests to
        {{ $labels.method }} are slow
  # Recording rule: ratio of open to maximum file descriptors per series.
  # The FdExhaustionClose alerts in this group read it via predict_linear.
  - record: 'instance:fd_utilization'
    expr: 'process_open_fds / process_max_fds'
  # Warning: a linear extrapolation of the last 1h of fd utilization,
  # projected 3600 * 4 seconds ahead, exceeds 1; sustained for 10 minutes.
  - alert: FdExhaustionClose
    expr: 'predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1'
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: file descriptors soon exhausted
      description: >-
        {{ $labels.job }} instance {{ $labels.instance }} will exhaust
        its file descriptors soon
  # Critical: a linear extrapolation of the last 10m of fd utilization,
  # projected 3600 seconds ahead, exceeds 1; sustained for 10 minutes.
  - alert: FdExhaustionClose
    expr: 'predict_linear(instance:fd_utilization[10m], 3600) > 1'
    for: 10m
    labels:
      severity: critical
    annotations:
      summary: file descriptors soon exhausted
      description: >-
        {{ $labels.job }} instance {{ $labels.instance }} will exhaust
        its file descriptors soon
  # Warning: the 0.99 quantile of peer round-trip time, computed from 5m
  # bucket rates, has been above 0.15 for 10 minutes.
  - alert: EtcdMemberCommunicationSlow
    expr: >-
      histogram_quantile(0.99,
      rate(etcd_network_peer_round_trip_time_seconds_bucket[5m]))
      > 0.15
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: etcd member communication is slow
      description: >-
        etcd instance {{ $labels.instance }} member communication with
        {{ $labels.To }} is slow
  # Warning: the failed-proposal counter grew by more than 5 over the last
  # hour. There is no `for` clause, so the alert fires on first evaluation.
  - alert: HighNumberOfFailedProposals
    expr: >-
      increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
    labels:
      severity: warning
    annotations:
      summary: a high number of proposals within the etcd cluster are failing
      description: >-
        etcd instance {{ $labels.instance }} has seen {{ $value }} proposal
        failures within the last hour
  # Warning: the 0.99 quantile of WAL fsync duration, computed from 5m
  # bucket rates, has been above 0.5 for 10 minutes.
  # The description previously read "fync"; corrected to "fsync" so the
  # notification text matches the summary and the metric name.
  - alert: HighFsyncDurations
    expr: >-
      histogram_quantile(0.99,
      rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m]))
      > 0.5
    for: 10m
    labels:
      severity: warning
    annotations:
      description: >-
        etcd instance {{ $labels.instance }} fsync durations are high
      summary: high fsync durations
  # Warning: the 0.99 quantile of backend commit duration, computed from 5m
  # bucket rates, has been above 0.25 for 10 minutes.
  - alert: HighCommitDurations
    expr: >-
      histogram_quantile(0.99,
      rate(etcd_disk_backend_commit_duration_seconds_bucket[5m]))
      > 0.25
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: high commit durations
      description: >-
        etcd instance {{ $labels.instance }} commit durations are high