Parcourir la source

Documentation/etcd-mixin: Add an alert for down etcd members

An etcd member being down is an important failure state. While
normal admin operations such as rotations may cause transient
outages, the cluster is operating in a degraded fashion whenever
any member is down. Add an alert that fires when any member is down
so that administrators know whether the next failure is fatal.

The rule is more complicated than `up{...} == 0` because some etcd
failure modes leave no `up{...}` entry for the failed member.
For instance, a Kubernetes service in front of a three-member etcd
cluster might only have 2 endpoints recorded in `up` because the
third pod was evicted by the kubelet: the cluster is degraded, but
`count(up{...})` would not return the full member count. To cover
this case, also use network peer send failures as a failure detector
and return the max of down instances or failing peers. We may
undercount the total number of failures, but we will at least
alert that a member is down.
Clayton Coleman il y a 6 ans
Parent
commit
465592a718
2 fichiers modifiés avec 76 ajouts et 6 suppressions
  1. 20 0
      Documentation/etcd-mixin/mixin.libsonnet
  2. 56 6
      Documentation/etcd-mixin/test.yaml

+ 20 - 0
Documentation/etcd-mixin/mixin.libsonnet

@@ -8,6 +8,26 @@
       {
         name: 'etcd',
         rules: [
+          {
+            alert: 'etcdMembersDown',
+            expr: |||
+              max by (job) (
+                sum by (job) (up{%(etcd_selector)s} == bool 0)
+              or
+                count by (job,endpoint) (
+                  sum by (job,endpoint,To) (rate(etcd_network_peer_sent_failures_total{%(etcd_selector)s}[3m])) > 0.01
+                )
+              )
+              > 0
+            ||| % $._config,
+            'for': '3m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              message: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).',
+            },
+          },
           {
             alert: 'etcdInsufficientMembers',
             expr: |||

+ 56 - 6
Documentation/etcd-mixin/test.yaml

@@ -14,22 +14,72 @@ tests:
         values: '1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0'
     alert_rule_test:
       - eval_time: 3m
-        alertname: EtcdInsufficientMembers
+        alertname: etcdInsufficientMembers
+      - eval_time: 5m
+        alertname: etcdInsufficientMembers
+      - eval_time: 5m
+        alertname: etcdMembersDown
       - eval_time: 7m
-        alertname: EtcdInsufficientMembers
+        alertname: etcdMembersDown
+        exp_alerts:
+          - exp_labels:
+              job: etcd
+              severity: critical
+            exp_annotations:
+              message: 'etcd cluster "etcd": members are down (1).'
+      - eval_time: 7m
+        alertname: etcdInsufficientMembers
       - eval_time: 11m
-        alertname: EtcdInsufficientMembers
+        alertname: etcdInsufficientMembers
         exp_alerts:
           - exp_labels:
               job: etcd
               severity: critical
             exp_annotations:
-              message: 'Etcd cluster "etcd": insufficient members (1).'
+              message: 'etcd cluster "etcd": insufficient members (1).'
       - eval_time: 15m
-        alertname: EtcdInsufficientMembers
+        alertname: etcdInsufficientMembers
+        exp_alerts:
+          - exp_labels:
+              job: etcd
+              severity: critical
+            exp_annotations:
+              message: 'etcd cluster "etcd": insufficient members (0).'
+
+  - interval: 1m
+    input_series:
+      - series: 'up{job="etcd",instance="10.10.10.0"}'
+        values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0'
+      - series: 'up{job="etcd",instance="10.10.10.1"}'
+        values: '1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0'
+      - series: 'up{job="etcd",instance="10.10.10.2"}'
+        values: '1 1 1 1 0 0 0 0'
+    alert_rule_test:
+      - eval_time: 10m
+        alertname: etcdMembersDown
+        exp_alerts:
+          - exp_labels:
+              job: etcd
+              severity: critical
+            exp_annotations:
+              message: 'etcd cluster "etcd": members are down (2).'
+
+  - interval: 1m
+    input_series:
+      - series: 'up{job="etcd",instance="10.10.10.0"}'
+        values: '1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0'
+      - series: 'up{job="etcd",instance="10.10.10.1"}'
+        values: '1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0'
+      - series: 'etcd_network_peer_sent_failures_total{To="member-1",job="etcd",endpoint="test"}'
+        values: '0 0 1 2 3 4 5 6 7 8 9 10'
+    alert_rule_test:
+      - eval_time: 4m
+        alertname: etcdMembersDown
+      - eval_time: 6m
+        alertname: etcdMembersDown
         exp_alerts:
           - exp_labels:
               job: etcd
               severity: critical
             exp_annotations:
-              message: 'Etcd cluster "etcd": insufficient members (0).'
+              message: 'etcd cluster "etcd": members are down (1).'