mixin.libsonnet 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231
  1. {
  // Mixin configuration. `+::` merges into an inherited `_config` and keeps the field hidden.
  // `etcd_selector` is the label matcher substituted into the rule expressions via `%(etcd_selector)s`.
  _config+:: { etcd_selector: 'job=~".*etcd.*"' },
  5. prometheusAlerts+:: {
  6. groups+: [
  7. {
  8. name: "etcd",
  9. rules: [
  // Critical: per job, the number of targets with `up == 0` exceeds
  // (total number of targets / 2) - 1, sustained for 3 minutes.
  {
    alert: 'EtcdInsufficientMembers',
    annotations: {
      message: 'Etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).',
    },
    // Left operand: count of down targets; right operand: half the target count minus one.
    expr: |||
      count(up{%(etcd_selector)s} == 0) by (job) > (count(up{%(etcd_selector)s}) by (job) / 2 - 1)
    ||| % $._config,
    'for': '3m',
    labels: { severity: 'critical' },
  },
  // Critical: a member reports `etcd_server_has_leader == 0` for 1 minute.
  {
    alert: 'EtcdNoLeader',
    annotations: {
      message: 'Etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.',
    },
    // The trailing "\n" reproduces the newline a `|||` text block would append.
    expr: 'etcd_server_has_leader{%(etcd_selector)s} == 0\n' % $._config,
    'for': '1m',
    labels: { severity: 'critical' },
  },
  // Warning: an instance has seen more than 3 leader changes within the last hour,
  // sustained for 15 minutes.
  {
    alert: "EtcdHighNumberOfLeaderChanges",
    // increase() over 1h yields the number of changes in that window, which is what
    // the threshold of 3 and the annotation text ("within the last hour") describe.
    // rate() would return a per-second average, so a threshold of 3 would mean
    // three leader changes every second.
    expr: |||
      increase(etcd_server_leader_changes_seen_total{%(etcd_selector)s}[1h]) > 3
    ||| % $._config,
    "for": "15m",
    labels: {
      severity: "warning",
    },
    annotations: {
      message: 'Etcd cluster "{{ $labels.job }}": instance {{ $labels.instance }} has seen {{ $value }} leader changes within the last hour.',
    },
  },
  // Warning tier: more than 1% of gRPC requests finished with a non-OK code,
  // per (job, instance, grpc_service, grpc_method), sustained for 10 minutes.
  {
    alert: 'EtcdHighNumberOfFailedGRPCRequests',
    annotations: {
      message: 'Etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.',
    },
    // Ratio of non-OK handled requests to all handled requests, scaled to a percentage.
    expr: |||
      100 * sum(rate(grpc_server_handled_total{%(etcd_selector)s, grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
      /
      sum(rate(grpc_server_handled_total{%(etcd_selector)s}[5m])) BY (job, instance, grpc_service, grpc_method)
      > 1
    ||| % $._config,
    'for': '10m',
    labels: { severity: 'warning' },
  },
  // Critical tier: more than 5% of gRPC requests finished with a non-OK code,
  // per (job, instance, grpc_service, grpc_method), sustained for 5 minutes.
  {
    alert: 'EtcdHighNumberOfFailedGRPCRequests',
    annotations: {
      message: 'Etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.',
    },
    // Ratio of non-OK handled requests to all handled requests, scaled to a percentage.
    expr: |||
      100 * sum(rate(grpc_server_handled_total{%(etcd_selector)s, grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
      /
      sum(rate(grpc_server_handled_total{%(etcd_selector)s}[5m])) BY (job, instance, grpc_service, grpc_method)
      > 5
    ||| % $._config,
    'for': '5m',
    labels: { severity: 'critical' },
  },
  // Critical: the 0.99 quantile of unary gRPC handling time exceeds 0.15,
  // per (job, instance, grpc_service, grpc_method), sustained for 10 minutes.
  {
    alert: 'EtcdGRPCRequestsSlow',
    annotations: {
      message: 'Etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.',
    },
    // `le` stays in the grouping so histogram_quantile still sees the bucket boundaries.
    expr: |||
      histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{%(etcd_selector)s, grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le))
      > 0.15
    ||| % $._config,
    'for': '10m',
    labels: { severity: 'critical' },
  },
  // Warning tier: more than 1% of HTTP requests failed,
  // per (job, instance, method), sustained for 10 minutes.
  {
    alert: "EtcdHighNumberOfFailedHTTPRequests",
    // Ratio of failed to received HTTP requests, scaled to a percentage.
    expr: |||
      100 * sum(rate(etcd_http_failed_total{%(etcd_selector)s}[5m])) BY (job, instance, method)
      /
      sum(rate(etcd_http_received_total{%(etcd_selector)s}[5m])) BY (job, instance, method)
      > 1
    ||| % $._config,
    "for": "10m",
    labels: {
      severity: "warning",
    },
    annotations: {
      // This string is not passed through `%` formatting, so the percent sign must be
      // a single `%`; an escaped `%%` would be rendered literally in the alert text.
      message: 'Etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.',
    },
  },
  // Critical tier: more than 5% of HTTP requests failed,
  // per (job, instance, method), sustained for 5 minutes.
  {
    alert: "EtcdHighNumberOfFailedHTTPRequests",
    // Ratio of failed to received HTTP requests, scaled to a percentage.
    expr: |||
      100 * sum(rate(etcd_http_failed_total{%(etcd_selector)s}[5m])) BY (job, instance, method)
      /
      sum(rate(etcd_http_received_total{%(etcd_selector)s}[5m])) BY (job, instance, method)
      > 5
    ||| % $._config,
    "for": "5m",
    labels: {
      severity: "critical",
    },
    annotations: {
      // This string is not passed through `%` formatting, so the percent sign must be
      // a single `%`; an escaped `%%` would be rendered literally in the alert text.
      message: 'Etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.',
    },
  },
  // Warning: the 0.99 quantile of successful HTTP request duration exceeds 0.15
  // for 10 minutes.
  {
    alert: "EtcdHTTPRequestsSlow",
    expr: |||
      histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket{%(etcd_selector)s}[5m]))
      > 0.15
    ||| % $._config,
    "for": "10m",
    labels: {
      severity: "warning",
    },
    annotations: {
      // The metric is measured in seconds; the value carries the "s" unit like the
      // other latency alerts in this group.
      message: 'Etcd cluster "{{ $labels.job }}": HTTP requests to {{ $labels.method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.',
    },
  },
  // Warning: the 0.99 quantile of peer round-trip time exceeds 0.15 for 10 minutes.
  {
    alert: 'EtcdMemberCommunicationSlow',
    annotations: {
      message: 'Etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.',
    },
    expr: |||
      histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{%(etcd_selector)s}[5m]))
      > 0.15
    ||| % $._config,
    'for': '10m',
    labels: { severity: 'warning' },
  },
  // Warning: an instance has recorded more than 5 failed proposals within the last hour,
  // sustained for 15 minutes.
  {
    alert: "EtcdHighNumberOfFailedProposals",
    // increase() over 1h yields the number of failures in that window, matching the
    // count threshold and the annotation text ("within the last hour"). rate() would
    // return a per-second average instead of a count.
    expr: |||
      increase(etcd_server_proposals_failed_total{%(etcd_selector)s}[1h]) > 5
    ||| % $._config,
    "for": "15m",
    labels: {
      severity: "warning",
    },
    annotations: {
      message: 'Etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last hour on etcd instance {{ $labels.instance }}.',
    },
  },
  // Warning: the 0.99 quantile of WAL fsync duration exceeds 0.5 for 10 minutes.
  {
    alert: "EtcdHighFsyncDurations",
    expr: |||
      histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{%(etcd_selector)s}[5m]))
      > 0.5
    ||| % $._config,
    "for": "10m",
    labels: {
      severity: "warning",
    },
    annotations: {
      // Spelling corrected from "fync" to "fsync".
      message: 'Etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.',
    },
  },
  // Warning: the 0.99 quantile of backend commit duration exceeds 0.25 for 10 minutes.
  {
    alert: 'EtcdHighCommitDurations',
    annotations: {
      message: 'Etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.',
    },
    expr: |||
      histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{%(etcd_selector)s}[5m]))
      > 0.25
    ||| % $._config,
    'for': '10m',
    labels: { severity: 'warning' },
  },
  // Recording rule: ratio of open file descriptors to the process limit.
  // The expression has no label matcher, so it is not restricted by `etcd_selector`.
  { record: 'instance:fd_utilization', expr: 'process_open_fds / process_max_fds' },
  // Warning tier: a linear fit over the last hour of `instance:fd_utilization`
  // predicts a value above 1 in 4 hours (3600 * 4 seconds); sustained for 10 minutes.
  {
    alert: 'FdExhaustionClose',
    annotations: {
      message: '{{ $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors soon',
    },
    // The trailing "\n" reproduces the newline a `|||` text block would append.
    expr: 'predict_linear(instance:fd_utilization{%(etcd_selector)s}[1h], 3600 * 4) > 1\n' % $._config,
    'for': '10m',
    labels: { severity: 'warning' },
  },
  // Critical tier: a linear fit over the last 10 minutes of `instance:fd_utilization`
  // predicts a value above 1 in one hour (3600 seconds); sustained for 10 minutes.
  {
    alert: "FdExhaustionClose",
    expr: |||
      predict_linear(instance:fd_utilization{%(etcd_selector)s}[10m], 3600) > 1
    ||| % $._config,
    "for": "10m",
    labels: {
      severity: "critical",
    },
    annotations: {
      // Uses the `message` annotation key like every other alert in this group,
      // so templates that read `message` also render this alert's text.
      message: '{{ $labels.job }} instance {{ $labels.instance }} will exhaust its file descriptors soon',
    },
  }
  226. ],
  227. },
  228. ],
  229. },
  230. }