|
|
@@ -93,52 +93,6 @@
|
|
|
message: 'Etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method }} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.',
|
|
|
},
|
|
|
},
|
|
|
- {
|
|
|
- alert: 'EtcdHighNumberOfFailedHTTPRequests',
|
|
|
- expr: |||
|
|
|
- 100 * sum(rate(etcd_http_failed_total{%(etcd_selector)s}[5m])) BY (job, instance, method)
|
|
|
- /
|
|
|
- sum(rate(etcd_http_received_total{%(etcd_selector)s}[5m])) BY (job, instance, method)
|
|
|
- > 1
|
|
|
- ||| % $._config,
|
|
|
- 'for': '10m',
|
|
|
- labels: {
|
|
|
- severity: 'warning',
|
|
|
- },
|
|
|
- annotations: {
|
|
|
- message: 'Etcd cluster "{{ $labels.job }}": {{ $value }}%% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.',
|
|
|
- },
|
|
|
- },
|
|
|
- {
|
|
|
- alert: 'EtcdHighNumberOfFailedHTTPRequests',
|
|
|
- expr: |||
|
|
|
- 100 * sum(rate(etcd_http_failed_total{%(etcd_selector)s}[5m])) BY (job, instance, method)
|
|
|
- /
|
|
|
- sum(rate(etcd_http_received_total{%(etcd_selector)s}[5m])) BY (job, instance, method)
|
|
|
- > 5
|
|
|
- ||| % $._config,
|
|
|
- 'for': '5m',
|
|
|
- labels: {
|
|
|
- severity: 'critical',
|
|
|
- },
|
|
|
- annotations: {
|
|
|
- message: 'Etcd cluster "{{ $labels.job }}": {{ $value }}%% of requests for {{ $labels.method }} failed on etcd instance {{ $labels.instance }}.',
|
|
|
- },
|
|
|
- },
|
|
|
- {
|
|
|
- alert: 'EtcdHTTPRequestsSlow',
|
|
|
- expr: |||
|
|
|
- histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket{%(etcd_selector)s}[5m]))
|
|
|
- > 0.15
|
|
|
- ||| % $._config,
|
|
|
- 'for': '10m',
|
|
|
- labels: {
|
|
|
- severity: 'warning',
|
|
|
- },
|
|
|
- annotations: {
|
|
|
- message: 'Etcd cluster "{{ $labels.job }}": HTTP requests to {{ $labels.method }} are taking {{ $value }} on etcd instance {{ $labels.instance }}.',
|
|
|
- },
|
|
|
- },
|
|
|
{
|
|
|
alert: 'EtcdMemberCommunicationSlow',
|
|
|
expr: |||
|