---
# etcd_alert.rules.yml
# Prometheus rule group for etcd: alerting rules plus one recording rule.
groups:
- name: etcd_alert.rules
  rules:
  # Critical: the number of etcd targets with up == 0 has stayed above
  # (total etcd targets / 2 - 1) for 3 minutes.
  - alert: InsufficientMembers
    expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
    for: 3m
    labels:
      severity: critical
    annotations:
      description: >-
        If one more etcd member goes down the cluster will be unavailable
      summary: etcd cluster insufficient members
  # Warning: per HTTP method, the ratio of failed requests with a non-4xx
  # status code to received requests (5m rates) has stayed above 0.01 for
  # 10 minutes.
  - alert: HighNumberOfFailedHTTPRequests
    expr: >-
      sum(rate(etcd_http_failed_total{code!~"^(?:4[0-9]{2})$",job="etcd"}[5m]))
      BY (method)
      / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method)
      > 0.01
    for: 10m
    labels:
      severity: warning
    annotations:
      # The expression aggregates BY (method), so its result carries only the
      # `method` label (no `instance`). $value is a ratio, so it is rendered
      # as a percentage with humanizePercentage instead of a bare "%" suffix.
      description: >-
        {{ $value | humanizePercentage }} of requests for
        {{ $labels.method }} failed on etcd
      summary: a high number of HTTP requests are failing
  # Critical: per HTTP method, the ratio of failed requests with a non-4xx
  # status code to received requests (5m rates) has stayed above 0.05 for
  # 5 minutes.
  - alert: HighNumberOfFailedHTTPRequests
    expr: >-
      sum(rate(etcd_http_failed_total{code!~"^(?:4[0-9]{2})$",job="etcd"}[5m]))
      BY (method)
      / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method)
      > 0.05
    for: 5m
    labels:
      severity: critical
    annotations:
      # The expression aggregates BY (method), so its result carries only the
      # `method` label (no `instance`). $value is a ratio, so it is rendered
      # as a percentage with humanizePercentage instead of a bare "%" suffix.
      description: >-
        {{ $value | humanizePercentage }} of requests for
        {{ $labels.method }} failed on etcd
      summary: a high number of HTTP requests are failing
  # Critical: per HTTP method, the ratio of requests failing with a 4xx
  # status code to received requests (5m rates) has stayed above 0.5 for
  # 10 minutes.
  - alert: HighNumberOfFailedHTTPRequests
    expr: >-
      sum(rate(etcd_http_failed_total{code=~"^(?:4[0-9]{2})$",job="etcd"}[5m]))
      BY (method)
      / sum(rate(etcd_http_received_total{job="etcd"}[5m])) BY (method)
      > 0.5
    for: 10m
    labels:
      severity: critical
    annotations:
      # The expression aggregates BY (method), so its result carries only the
      # `method` label (no `instance`). $value is a ratio, so it is rendered
      # as a percentage with humanizePercentage instead of a bare "%" suffix.
      description: >-
        {{ $value | humanizePercentage }} of requests for
        {{ $labels.method }} failed with 4xx responses on etcd
      summary: a high number of HTTP requests are failing
  # Warning: the 0.99 quantile computed from the successful-HTTP-request
  # duration histogram (5m rate) has stayed above 0.15 for 10 minutes.
  # NOTE(review): the metric is spelled `..._duration_second_bucket`
  # (singular "second") and, unlike the failed-request rules in this file,
  # has no job="etcd" matcher -- confirm both against the metrics the target
  # etcd version actually exposes.
  - alert: HTTPRequestsSlow
    expr: >-
      histogram_quantile(0.99,
      rate(etcd_http_successful_duration_second_bucket[5m]))
      > 0.15
    for: 10m
    labels:
      severity: warning
    annotations:
      description: >-
        on etcd instance {{ $labels.instance }} HTTP requests to
        {{ $labels.method }} are slow
      summary: slow HTTP requests
  # Recording rule: open file descriptors divided by the maximum allowed,
  # stored as instance:fd_utilization for the FdExhaustionClose alerts.
  - record: instance:fd_utilization
    expr: process_open_fds / process_max_fds
  # Warning: a linear projection of instance:fd_utilization, fitted over the
  # last hour and extended 3600 * 4 seconds ahead, has stayed above 1 for
  # 10 minutes.
  - alert: FdExhaustionClose
    expr: predict_linear(instance:fd_utilization[1h], 3600 * 4) > 1
    for: 10m
    labels:
      severity: warning
    annotations:
      description: >-
        {{ $labels.job }} instance {{ $labels.instance }} will exhaust
        its file descriptors soon
      summary: file descriptors soon exhausted
  # Critical: a linear projection of instance:fd_utilization, fitted over the
  # last 10 minutes and extended 3600 seconds ahead, has stayed above 1 for
  # 10 minutes.
  - alert: FdExhaustionClose
    expr: predict_linear(instance:fd_utilization[10m], 3600) > 1
    for: 10m
    labels:
      severity: critical
    annotations:
      description: >-
        {{ $labels.job }} instance {{ $labels.instance }} will exhaust
        its file descriptors soon
      summary: file descriptors soon exhausted
  # Warning: etcd_server_proposal_failed_total increased by more than 5 over
  # the last hour. There is no `for:` clause, so no hold period applies
  # before the alert fires.
  - alert: HighNumberOfFailedProposals
    expr: increase(etcd_server_proposal_failed_total{job="etcd"}[1h]) > 5
    labels:
      severity: warning
    annotations:
      description: >-
        etcd instance {{ $labels.instance }} has seen {{ $value }} proposal
        failures within the last hour
      summary: a high number of proposals within the etcd cluster are failing
  # Warning: the 0.99 quantile computed from the WAL fsync duration histogram
  # (5m rate) has stayed above 0.5 for 10 minutes.
  # NOTE(review): confirm that `etcd_wal_fsync_durations_seconds_bucket` is
  # the name the target etcd version exposes; this rule also has no
  # job="etcd" matcher.
  - alert: HighFsyncDurations
    expr: >-
      histogram_quantile(0.99,
      rate(etcd_wal_fsync_durations_seconds_bucket[5m]))
      > 0.5
    for: 10m
    labels:
      severity: warning
    annotations:
      description: >-
        etcd instance {{ $labels.instance }} fsync durations are high
      summary: high fsync durations