Browse Source

Merge pull request #6851 from gyuho/metrics

v3rpc: replace grpc metrics w/ go-grpc-prometheus
Gyu-Ho Lee 9 years ago
parent
commit
677606da7d

+ 2 - 25
Documentation/metrics.md

@@ -82,31 +82,7 @@ All these metrics are prefixed with `etcd_network_`
 
 ### gRPC requests
 
-These metrics describe the requests served by a specific etcd member: total received requests, total failed requests, and processing latency. They are useful for tracking user-generated traffic hitting the etcd cluster.
-
-All these metrics are prefixed with `etcd_grpc_`
-
-| Name                           | Description                                                                         | Type                   |
-|--------------------------------|-------------------------------------------------------------------------------------|------------------------|
-| requests_total                 | Total number of received requests                                                   | Counter(method)        |
-| requests_failed_total                   | Total number of failed requests.                                                    | Counter(method,error)  |
-| active_streams                   | Total number of active streams.                                                    | Gauge(method)  |
-| unary_requests_duration_seconds     | Bucketed handling duration of the requests.                                         | Histogram(method)      |
-
-
-Example Prometheus queries that may be useful from these metrics (across all etcd members):
- 
- * `sum(rate(etcd_grpc_requests_failed_total{job="etcd"}[1m]) by (grpc_method) / sum(rate(etcd_grpc_total{job="etcd"})[1m]) by (grpc_method)` 
-    
-    Shows the fraction of events that failed by gRPC method across all members, across a time window of `1m`.
- 
- * `sum(rate(etcd_grpc_requests_total{job="etcd",grpc_method="PUT"})[1m]) by (grpc_method)`
-    
-    Shows the rate of PUT requests across all members, across a time window of `1m`.
-    
- * `histogram_quantile(0.9, sum(rate(etcd_grpc_unary_requests_duration_seconds{job="etcd",grpc_method="PUT"}[5m]) ) by (le))`
-    
-    Show the 0.90-tile latency (in seconds) of PUT request handling across all members, with a window of `5m`. 
+These metrics are exposed via [go-grpc-prometheus][go-grpc-prometheus].
 
 ## etcd_debugging namespace metrics
 
@@ -137,3 +113,4 @@ Heavy file descriptor (`process_open_fds`) usage (i.e., near the process's file
 [prometheus-getting-started]: http://prometheus.io/docs/introduction/getting_started/
 [prometheus-naming]: http://prometheus.io/docs/practices/naming/
 [v2-http-metrics]: v2/metrics.md#http-requests
+[go-grpc-prometheus]: https://github.com/grpc-ecosystem/go-grpc-prometheus

+ 10 - 10
Documentation/op-guide/grafana.json

@@ -115,20 +115,20 @@
                     "stack": false,
                     "steppedLine": false,
                     "targets": [{
-                            "expr": "sum(rate(etcd_grpc_requests_total [1m]))",
+                            "expr": "sum(rate(grpc_server_started_total{grpc_type=\"unary\"} [1m]))",
                             "intervalFactor": 2,
                             "legendFormat": "{{instance}} RPC Rate",
-                            "metric": "etcd_grpc_requests_total",
+                            "metric": "grpc_server_started_total",
                             "refId": "A",
-                            "step": 4
+                            "step": 2
                         },
                         {
-                            "expr": "sum(rate(etcd_grpc_requests_failed_total [1m]))",
+                            "expr": "sum(rate(grpc_server_handled_total{grpc_type=\"unary\",grpc_code!=\"OK\"} [1m]))",
                             "intervalFactor": 2,
                             "legendFormat": "{{instance}} RPC Failed Rate",
-                            "metric": "etcd_grpc_requests_failed_total",
+                            "metric": "grpc_server_handled_total",
                             "refId": "B",
-                            "step": 4
+                            "step": 2
                         }
                     ],
                     "thresholds": [],
@@ -197,18 +197,18 @@
                     "stack": true,
                     "steppedLine": false,
                     "targets": [{
-                            "expr": "sum(etcd_grpc_active_streams {grpc_service=\"etcdserverpb.Watch\"})",
+                            "expr": "sum(grpc_server_started_total {grpc_service=\"etcdserverpb.Watch\",grpc_type=\"bidi_stream\"}) - sum(grpc_server_handled_total {grpc_service=\"etcdserverpb.Watch\",grpc_type=\"bidi_stream\"})",
                             "intervalFactor": 2,
                             "legendFormat": "Watch Streams",
-                            "metric": "etcd_grpc_active_streams",
+                            "metric": "grpc_server_handled_total",
                             "refId": "A",
                             "step": 4
                         },
                         {
-                            "expr": "sum(etcd_grpc_active_streams {grpc_service=\"etcdserverpb.Lease\"})",
+                            "expr": "sum(grpc_server_started_total {grpc_service=\"etcdserverpb.Lease\",grpc_type=\"bidi_stream\"}) - sum(grpc_server_handled_total {grpc_service=\"etcdserverpb.Lease\",grpc_type=\"bidi_stream\"})",
                             "intervalFactor": 2,
                             "legendFormat": "Lease Streams",
-                            "metric": "etcd_grpc_active_streams",
+                            "metric": "grpc_server_handled_total",
                             "refId": "B",
                             "step": 4
                         }

+ 4 - 29
etcdserver/api/v3rpc/interceptor.go

@@ -25,6 +25,7 @@ import (
 	"github.com/coreos/etcd/pkg/types"
 	"github.com/coreos/etcd/raft"
 
+	prometheus "github.com/grpc-ecosystem/go-grpc-prometheus"
 	"golang.org/x/net/context"
 	"google.golang.org/grpc"
 	"google.golang.org/grpc/metadata"
@@ -53,7 +54,8 @@ func newUnaryInterceptor(s *etcdserver.EtcdServer) grpc.UnaryServerInterceptor {
 				}
 			}
 		}
-		return metricsUnaryInterceptor(ctx, req, info, handler)
+
+		return prometheus.UnaryServerInterceptor(ctx, req, info, handler)
 	}
 }
 
@@ -88,36 +90,9 @@ func newStreamInterceptor(s *etcdserver.EtcdServer) grpc.StreamServerInterceptor
 
 			}
 		}
-		return metricsStreamInterceptor(srv, ss, info, handler)
-	}
-}
-
-func metricsUnaryInterceptor(ctx context.Context, req interface{}, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (resp interface{}, err error) {
-	service, method := splitMethodName(info.FullMethod)
-	receivedCounter.WithLabelValues(service, method).Inc()
-
-	start := time.Now()
-	resp, err = handler(ctx, req)
-	if err != nil {
-		failedCounter.WithLabelValues(service, method, grpc.Code(err).String()).Inc()
-	}
-	handlingDuration.WithLabelValues(service, method).Observe(time.Since(start).Seconds())
-
-	return resp, err
-}
 
-func metricsStreamInterceptor(srv interface{}, ss grpc.ServerStream, info *grpc.StreamServerInfo, handler grpc.StreamHandler) error {
-	service, method := splitMethodName(info.FullMethod)
-	receivedCounter.WithLabelValues(service, method).Inc()
-
-	streamsGauage.WithLabelValues(service, method).Inc()
-	err := handler(srv, ss)
-	streamsGauage.WithLabelValues(service, method).Dec()
-	if err != nil {
-		failedCounter.WithLabelValues(service, method, grpc.Code(err).String()).Inc()
+		return prometheus.StreamServerInterceptor(srv, ss, info, handler)
 	}
-
-	return err
 }
 
 func splitMethodName(fullMethodName string) (string, string) {

+ 0 - 38
etcdserver/api/v3rpc/metrics.go

@@ -17,39 +17,6 @@ package v3rpc
 import "github.com/prometheus/client_golang/prometheus"
 
 var (
-	receivedCounter = prometheus.NewCounterVec(
-		prometheus.CounterOpts{
-			Namespace: "etcd",
-			Subsystem: "grpc",
-			Name:      "requests_total",
-			Help:      "Counter of received requests.",
-		}, []string{"grpc_service", "grpc_method"})
-
-	failedCounter = prometheus.NewCounterVec(
-		prometheus.CounterOpts{
-			Namespace: "etcd",
-			Subsystem: "grpc",
-			Name:      "requests_failed_total",
-			Help:      "Counter of failed requests.",
-		}, []string{"grpc_service", "grpc_method", "grpc_code"})
-
-	streamsGauage = prometheus.NewGaugeVec(
-		prometheus.GaugeOpts{
-			Namespace: "etcd",
-			Subsystem: "grpc",
-			Name:      "active_streams",
-			Help:      "Number of active streams.",
-		}, []string{"grpc_service", "grpc_method"})
-
-	handlingDuration = prometheus.NewHistogramVec(
-		prometheus.HistogramOpts{
-			Namespace: "etcd",
-			Subsystem: "grpc",
-			Name:      "unary_requests_duration_seconds",
-			Help:      "Bucketed histogram of processing time (s) of handled unary (non-stream) requests.",
-			Buckets:   prometheus.ExponentialBuckets(0.0005, 2, 13),
-		}, []string{"grpc_service", "grpc_method"})
-
 	sentBytes = prometheus.NewCounter(prometheus.CounterOpts{
 		Namespace: "etcd",
 		Subsystem: "network",
@@ -66,11 +33,6 @@ var (
 )
 
 func init() {
-	prometheus.MustRegister(receivedCounter)
-	prometheus.MustRegister(failedCounter)
-	prometheus.MustRegister(streamsGauage)
-	prometheus.MustRegister(handlingDuration)
-
 	prometheus.MustRegister(sentBytes)
 	prometheus.MustRegister(receivedBytes)
 }