Browse Source

Merge pull request #9202 from olvesh/patch-1

Documentation/op-guide: integer value of 1 is 100% not 1% for failing reqs
Xiang Li 8 years ago
parent
commit
216247570c

+ 8 - 8
Documentation/op-guide/etcd3_alert.rules

@@ -43,8 +43,8 @@ ANNOTATIONS {
 
 
 # alert if more than 1% of gRPC method calls have failed within the last 5 minutes
 # alert if more than 1% of gRPC method calls have failed within the last 5 minutes
 ALERT HighNumberOfFailedGRPCRequests
 ALERT HighNumberOfFailedGRPCRequests
-IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
-  / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.01
+IF 100 * (sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
+  / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m]))) > 1
 FOR 10m
 FOR 10m
 LABELS {
 LABELS {
   severity = "warning"
   severity = "warning"
@@ -56,8 +56,8 @@ ANNOTATIONS {
 
 
 # alert if more than 5% of gRPC method calls have failed within the last 5 minutes
 # alert if more than 5% of gRPC method calls have failed within the last 5 minutes
 ALERT HighNumberOfFailedGRPCRequests
 ALERT HighNumberOfFailedGRPCRequests
-IF sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
-  / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m])) > 0.05
+IF 100 * (sum by(grpc_method) (rate(etcd_grpc_requests_failed_total{job="etcd"}[5m]))
+  / sum by(grpc_method) (rate(etcd_grpc_total{job="etcd"}[5m]))) > 5
 FOR 5m
 FOR 5m
 LABELS {
 LABELS {
   severity = "critical"
   severity = "critical"
@@ -84,8 +84,8 @@ ANNOTATIONS {
 
 
 # alert if more than 1% of requests to an HTTP endpoint have failed within the last 5 minutes
 # alert if more than 1% of requests to an HTTP endpoint have failed within the last 5 minutes
 ALERT HighNumberOfFailedHTTPRequests
 ALERT HighNumberOfFailedHTTPRequests
-IF sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
-  / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.01
+IF 100 * (sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
+  / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method)) > 1
 FOR 10m
 FOR 10m
 LABELS {
 LABELS {
   severity = "warning"
   severity = "warning"
@@ -97,8 +97,8 @@ ANNOTATIONS {
 
 
 # alert if more than 5% of requests to an HTTP endpoint have failed within the last 5 minutes
 # alert if more than 5% of requests to an HTTP endpoint have failed within the last 5 minutes
 ALERT HighNumberOfFailedHTTPRequests
 ALERT HighNumberOfFailedHTTPRequests
-IF sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
-  / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.05
+IF 100 * (sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
+  / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method))  > 5
 FOR 5m
 FOR 5m
 LABELS {
 LABELS {
   severity = "critical"
   severity = "critical"

+ 8 - 8
Documentation/op-guide/etcd3_alert.rules.yml

@@ -26,8 +26,8 @@ groups:
         changes within the last hour
         changes within the last hour
       summary: a high number of leader changes within the etcd cluster are happening
       summary: a high number of leader changes within the etcd cluster are happening
   - alert: HighNumberOfFailedGRPCRequests
   - alert: HighNumberOfFailedGRPCRequests
-    expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
-      / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.01
+    expr: 100 * (sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
+      / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method)) > 1
     for: 10m
     for: 10m
     labels:
     labels:
       severity: warning
       severity: warning
@@ -36,8 +36,8 @@ groups:
         on etcd instance {{ $labels.instance }}'
         on etcd instance {{ $labels.instance }}'
       summary: a high number of gRPC requests are failing
       summary: a high number of gRPC requests are failing
   - alert: HighNumberOfFailedGRPCRequests
   - alert: HighNumberOfFailedGRPCRequests
-    expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
-      / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.05
+    expr: 100 * (sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
+      / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method)) > 5
     for: 5m
     for: 5m
     labels:
     labels:
       severity: critical
       severity: critical
@@ -56,8 +56,8 @@ groups:
         }} are slow
         }} are slow
       summary: slow gRPC requests
       summary: slow gRPC requests
   - alert: HighNumberOfFailedHTTPRequests
   - alert: HighNumberOfFailedHTTPRequests
-    expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
-      BY (method) > 0.01
+    expr: 100 * (sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
+      BY (method)) > 1
     for: 10m
     for: 10m
     labels:
     labels:
       severity: warning
       severity: warning
@@ -66,8 +66,8 @@ groups:
         instance {{ $labels.instance }}'
         instance {{ $labels.instance }}'
       summary: a high number of HTTP requests are failing
       summary: a high number of HTTP requests are failing
   - alert: HighNumberOfFailedHTTPRequests
   - alert: HighNumberOfFailedHTTPRequests
-    expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
-      BY (method) > 0.05
+    expr: 100 * (sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
+      BY (method)) > 5
     for: 5m
     for: 5m
     labels:
     labels:
       severity: critical
       severity: critical