etcd_alert.rules 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121
  ### General cluster availability ###

  # Fire when losing one more member would make the cluster unavailable.
  # A cluster of n members needs floor(n/2)+1 healthy members for quorum, so it
  # tolerates ceil(n/2)-1 failures. The alert fires once the number of down
  # members has reached that tolerance. Using ceil() keeps the threshold correct
  # for even-sized clusters as well (e.g. 4 members with 1 down), not only for
  # odd-sized ones.
  # count(up == 0) yields no series while every member is up, so a fully healthy
  # cluster can never trigger this rule.
  ALERT InsufficientMembers
    IF count(up{job="etcd"} == 0) >= (ceil(count(up{job="etcd"}) / 2) - 1)
    FOR 3m
    LABELS {
      severity = "critical"
    }
    ANNOTATIONS {
      summary = "etcd cluster insufficient members",
      description = "If one more etcd member goes down the cluster will be unavailable",
    }
  ### HTTP requests alerts ###

  # Warn when, for any HTTP method, more than 1% of requests over the last
  # 5 minutes failed with a non-4xx response, sustained for 10 minutes.
  # The ratio is multiplied by 100 so that $value is a real percentage and the
  # "%" in the description is accurate.
  # Both operands are aggregated by(method), so "method" is the only label the
  # expression leaves on the alert; per-instance labels are not available and
  # therefore are not referenced in the annotations.
  ALERT HighNumberOfFailedHTTPRequests
    IF 100 * sum by(method) (rate(etcd_http_failed_total{job="etcd", code!~"4[0-9]{2}"}[5m]))
      / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 1
    FOR 10m
    LABELS {
      severity = "warning"
    }
    ANNOTATIONS {
      summary = "a high number of HTTP requests are failing",
      description = "{{ $value }}% of requests for {{ $labels.method }} failed across the etcd cluster",
    }
  # Page when, for any HTTP method, more than 5% of requests over the last
  # 5 minutes failed with a non-4xx response, sustained for 5 minutes.
  # The ratio is multiplied by 100 so that $value is a real percentage and the
  # "%" in the description is accurate.
  # Both operands are aggregated by(method), so "method" is the only label the
  # expression leaves on the alert; per-instance labels are not available and
  # therefore are not referenced in the annotations.
  ALERT HighNumberOfFailedHTTPRequests
    IF 100 * sum by(method) (rate(etcd_http_failed_total{job="etcd", code!~"4[0-9]{2}"}[5m]))
      / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 5
    FOR 5m
    LABELS {
      severity = "critical"
    }
    ANNOTATIONS {
      summary = "a high number of HTTP requests are failing",
      description = "{{ $value }}% of requests for {{ $labels.method }} failed across the etcd cluster",
    }
  # Page when, for any HTTP method, more than 50% of requests over the last
  # 5 minutes were answered with a 4xx response, sustained for 10 minutes.
  # The ratio is multiplied by 100 so that $value is a real percentage and the
  # "%" in the description is accurate.
  # Both operands are aggregated by(method), so "method" is the only label the
  # expression leaves on the alert; per-instance labels are not available and
  # therefore are not referenced in the annotations.
  ALERT HighNumberOfFailedHTTPRequests
    IF 100 * sum by(method) (rate(etcd_http_failed_total{job="etcd", code=~"4[0-9]{2}"}[5m]))
      / sum by(method) (rate(etcd_http_received_total{job="etcd"}[5m])) > 50
    FOR 10m
    LABELS {
      severity = "critical"
    }
    ANNOTATIONS {
      summary = "a high number of HTTP requests are failing",
      description = "{{ $value }}% of requests for {{ $labels.method }} failed with 4xx responses across the etcd cluster",
    }
  # Warn when the 99th-percentile duration of successful HTTP requests, estimated
  # from the 5-minute rate of the histogram buckets, stays above 0.15 (150ms)
  # for 10 minutes. No aggregation is applied, so the series' own labels (other
  # than the bucket label) are available to the annotations.
  # NOTE(review): the metric name ends in "_second_bucket" (singular) - confirm
  # it matches the name exported by the monitored etcd version.
  ALERT HTTPRequestsSlow
    IF histogram_quantile(0.99, rate(etcd_http_successful_duration_second_bucket[5m])) > 0.15
    FOR 10m
    LABELS { severity = "warning" }
    ANNOTATIONS {
      summary = "slow HTTP requests",
      description = "on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method }} are slow"
    }
  ### File descriptor alerts ###

  # Recording rule: open file descriptors divided by the process limit, for
  # every series exposing both metrics. A value of 1 means the limit is reached.
  instance:fd_utilization =
    process_open_fds / process_max_fds
  # Warn when a linear extrapolation of the last hour of fd utilization exceeds
  # 1 (i.e. 100% of the limit) four hours ahead, and keeps doing so for 10 minutes.
  ALERT FdExhaustionClose
    IF predict_linear(instance:fd_utilization[1h], 4 * 3600) > 1
    FOR 10m
    LABELS { severity = "warning" }
    ANNOTATIONS {
      summary = "file descriptors soon exhausted",
      description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors soon"
    }
  # Page when a linear extrapolation of the last 10 minutes of fd utilization
  # exceeds 1 (i.e. 100% of the limit) one hour ahead, and keeps doing so for
  # 10 minutes.
  ALERT FdExhaustionClose
    IF predict_linear(instance:fd_utilization[10m], 3600) > 1
    FOR 10m
    LABELS { severity = "critical" }
    ANNOTATIONS {
      summary = "file descriptors soon exhausted",
      description = "{{ $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors soon"
    }
  ### etcd proposal alerts ###

  # Warn when the failed-proposal counter of an etcd series grew by more than 5
  # over the trailing hour. There is no FOR clause, so the alert fires on the
  # first evaluation in which the condition holds.
  ALERT HighNumberOfFailedProposals
    IF increase(etcd_server_proposal_failed_total{job="etcd"}[1h]) > 5
    LABELS { severity = "warning" }
    ANNOTATIONS {
      summary = "a high number of proposals within the etcd cluster are failing",
      description = "etcd instance {{ $labels.instance }} has seen {{ $value }} proposal failures within the last hour"
    }
  ### etcd disk io latency alerts ###

  # Warn when the 99th-percentile WAL fsync duration, estimated from the
  # 5-minute rate of the histogram buckets, stays above 0.5 (500ms) for
  # 10 minutes.
  ALERT HighFsyncDurations
    IF histogram_quantile(0.99, rate(etcd_wal_fsync_durations_seconds_bucket[5m])) > 0.5
    FOR 10m
    LABELS {
      severity = "warning"
    }
    ANNOTATIONS {
      summary = "high fsync durations",
      description = "etcd instance {{ $labels.instance }} fsync durations are high",
    }