# etcd3_alert.rules.yml (5.2 KB)
  1. # these rules synced manually from https://github.com/etcd-io/etcd/blob/master/Documentation/etcd-mixin/mixin.libsonnet
  2. groups:
  3. - name: etcd
  4. rules:
  # Quorum check: per job, compares the number of etcd targets whose `up`
  # series equals 1 against (total target count + 1) / 2. The condition must
  # hold for 3 minutes before the alert fires.
  - alert: etcdInsufficientMembers
    expr: |
      sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2)
    for: 3m
    labels:
      severity: critical
    annotations:
      message: >-
        etcd cluster "{{ $labels.job }}": insufficient members
        ({{ $value }}).
  # Leader check: matches every etcd series where etcd_server_has_leader
  # is 0, sustained for 1 minute.
  - alert: etcdNoLeader
    expr: |
      etcd_server_has_leader{job=~".*etcd.*"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      message: >-
        etcd cluster "{{ $labels.job }}": member {{ $labels.instance }}
        has no leader.
  # Leader-election churn: fires when a member has observed more than 3
  # leader changes over the trailing 15 minutes, sustained for 15 minutes.
  # increase() is used instead of rate(): rate() yields a per-second value,
  # so "> 3" would require more than three leader changes per second, while
  # the message reports $value as a count of changes over the window.
  - alert: etcdHighNumberOfLeaderChanges
    expr: |
      increase(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3
    for: 15m
    labels:
      severity: warning
    annotations:
      message: >-
        etcd cluster "{{ $labels.job }}": instance {{ $labels.instance }}
        has seen {{ $value }} leader changes within the last 15 minutes.
  # gRPC failure ratio (warning tier): percentage of handled gRPC requests
  # whose grpc_code is not "OK", computed per job/instance/service/method
  # from 5-minute rates. Fires above 1% sustained for 10 minutes.
  - alert: etcdHighNumberOfFailedGRPCRequests
    expr: |
      100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
      /
      sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
      > 1
    for: 10m
    labels:
      severity: warning
    annotations:
      message: >-
        etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
        {{ $labels.grpc_method }} failed on etcd instance
        {{ $labels.instance }}.
  # gRPC failure ratio (critical tier): same percentage as the warning-level
  # rule of the same name, but fires above 5% sustained for 5 minutes.
  - alert: etcdHighNumberOfFailedGRPCRequests
    expr: |
      100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
      /
      sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
      > 5
    for: 5m
    labels:
      severity: critical
    annotations:
      message: >-
        etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
        {{ $labels.grpc_method }} failed on etcd instance
        {{ $labels.instance }}.
  # gRPC latency: 99th-percentile handling time of unary gRPC requests,
  # derived from 5-minute bucket rates grouped by
  # job/instance/service/method. Fires above 0.15s sustained for 10 minutes.
  - alert: etcdGRPCRequestsSlow
    expr: |
      histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le))
      > 0.15
    for: 10m
    labels:
      severity: critical
    annotations:
      message: >-
        etcd cluster "{{ $labels.job }}": gRPC requests to
        {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance
        {{ $labels.instance }}.
  # Peer latency: 99th-percentile peer round-trip time from 5-minute bucket
  # rates. Fires above 0.15s sustained for 10 minutes. The message reads the
  # peer from the `To` label of the series.
  - alert: etcdMemberCommunicationSlow
    expr: |
      histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
      > 0.15
    for: 10m
    labels:
      severity: warning
    annotations:
      message: >-
        etcd cluster "{{ $labels.job }}": member communication with
        {{ $labels.To }} is taking {{ $value }}s on etcd instance
        {{ $labels.instance }}.
  # Failed proposals: fires when a member has accumulated more than 5 failed
  # proposals over the trailing 15 minutes, sustained for 15 minutes.
  # increase() is used instead of rate() so that $value is a count of
  # failures over the window, which is what the message reports; rate()
  # would produce a per-second figure.
  # NOTE(review): this makes the threshold "5 failures per 15m" rather than
  # "5 failures per second" - confirm the intended sensitivity.
  - alert: etcdHighNumberOfFailedProposals
    expr: |
      increase(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
    for: 15m
    labels:
      severity: warning
    annotations:
      message: >-
        etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures
        within the last 15 minutes on etcd instance {{ $labels.instance }}.
  # WAL fsync latency: 99th-percentile WAL fsync duration from 5-minute
  # bucket rates. Fires above 0.5s sustained for 10 minutes.
  # The message spelling "fync" was corrected to "fsync".
  - alert: etcdHighFsyncDurations
    expr: |
      histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
      > 0.5
    for: 10m
    labels:
      severity: warning
    annotations:
      message: >-
        etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
        are {{ $value }}s on etcd instance {{ $labels.instance }}.
  # Backend commit latency: 99th-percentile backend commit duration from
  # 5-minute bucket rates. Fires above 0.25s sustained for 10 minutes.
  # The message now includes the verb "are", matching the wording of the
  # fsync-duration alert.
  - alert: etcdHighCommitDurations
    expr: |
      histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
      > 0.25
    for: 10m
    labels:
      severity: warning
    annotations:
      message: >-
        etcd cluster "{{ $labels.job }}": 99th percentile commit durations
        are {{ $value }}s on etcd instance {{ $labels.instance }}.
  # HTTP failure ratio (warning tier): percentage of received HTTP requests
  # that failed with a code other than 404, from 5-minute rates. Fires above
  # 1% sustained for 10 minutes.
  # The ratio is multiplied by 100 so that $value is a real percentage (the
  # message prints it followed by "%"); the threshold moves from 0.01 to 1
  # accordingly. Aggregation keeps `job` and `instance` so that
  # {{ $labels.instance }} in the message is populated; this evaluates the
  # ratio per instance rather than across all instances.
  - alert: etcdHighNumberOfFailedHTTPRequests
    expr: |
      100 * sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (job, instance, method)
      /
      sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) BY (job, instance, method)
      > 1
    for: 10m
    labels:
      severity: warning
    annotations:
      message: >-
        {{ $value }}% of requests for {{ $labels.method }} failed on etcd
        instance {{ $labels.instance }}.
  # HTTP failure ratio (critical tier): percentage of received HTTP requests
  # that failed with a code other than 404, from 5-minute rates. Fires above
  # 5% sustained for 10 minutes.
  # The ratio is multiplied by 100 so that $value is a real percentage (the
  # message prints it followed by "%"); the threshold moves from 0.05 to 5
  # accordingly. Aggregation keeps `job` and `instance` so that
  # {{ $labels.instance }} in the message is populated; this evaluates the
  # ratio per instance rather than across all instances.
  - alert: etcdHighNumberOfFailedHTTPRequests
    expr: |
      100 * sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (job, instance, method)
      /
      sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) BY (job, instance, method)
      > 5
    for: 10m
    labels:
      severity: critical
    annotations:
      message: >-
        {{ $value }}% of requests for {{ $labels.method }} failed on etcd
        instance {{ $labels.instance }}.
  # HTTP latency: 99th-percentile duration of successful HTTP requests from
  # 5-minute bucket rates. Fires above 0.15s sustained for 10 minutes.
  # Unlike the other rules in this group, the selector carries no job matcher.
  - alert: etcdHTTPRequestsSlow
    expr: |
      histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
      > 0.15
    for: 10m
    labels:
      severity: warning
    annotations:
      message: >-
        etcd instance {{ $labels.instance }} HTTP requests to
        {{ $labels.method }} are slow.