
Merge pull request #3700 from xiang90/metrics_hi

Replace Summary with Histogram for all metrics
Xiang Li · 10 years ago
commit ff36b9d9bc
8 changed files with 46 additions and 37 deletions

  1. Documentation/metrics.md (+21 -21)
  2. etcdserver/metrics.go (+3 -2)
  3. etcdserver/server.go (+1 -1)
  4. rafthttp/metrics.go (+9 -4)
  5. snap/metrics.go (+6 -4)
  6. snap/snapshotter.go (+2 -2)
  7. wal/metrics.go (+3 -2)
  8. wal/wal.go (+1 -1)

+ 21 - 21
Documentation/metrics.md

@@ -15,16 +15,16 @@ etcd now exposes the following metrics:
 
 ## etcdserver
 
-| Name                                    | Description                                      | Type    |
-|-----------------------------------------|--------------------------------------------------|---------|
-| file_descriptors_used_total             | The total number of file descriptors used        | Gauge   |
-| proposal_durations_milliseconds         | The latency distributions of committing proposal | Summary |
-| pending_proposal_total                  | The total number of pending proposals            | Gauge   |
-| proposal_failed_total                   | The total number of failed proposals             | Counter |
+| Name                                    | Description                                      | Type      |
+|-----------------------------------------|--------------------------------------------------|-----------|
+| file_descriptors_used_total             | The total number of file descriptors used        | Gauge     |
+| proposal_durations_seconds              | The latency distributions of committing proposal | Histogram |
+| pending_proposal_total                  | The total number of pending proposals            | Gauge     |
+| proposal_failed_total                   | The total number of failed proposals             | Counter   |
 
 High file descriptor usage (`file_descriptors_used_total`) near the process's file descriptor limit indicates a potential out-of-file-descriptors issue, which might cause etcd to fail to create new WAL files and panic.
 
-[Proposal](glossary.md#proposal) durations (`proposal_durations_milliseconds`) give you an summary about the proposal commit latency. Latency can be introduced into this process by network and disk IO.
+[Proposal](glossary.md#proposal) durations (`proposal_durations_seconds`) give you a histogram about the proposal commit latency. Latency can be introduced into this process by network and disk IO.
 
 Pending proposals (`pending_proposal_total`) give you an idea of how many proposals are in the queue waiting to be committed. An increasing pending count indicates a high client load or an unstable cluster.
 
@@ -32,12 +32,12 @@ Failed proposals (`proposal_failed_total`) are normally related to two issues: t
 
 ## wal
 
-| Name                               | Description                                      | Type    |
-|------------------------------------|--------------------------------------------------|---------|
-| fsync_durations_microseconds       | The latency distributions of fsync called by wal | Summary |
-| last_index_saved                   | The index of the last entry saved by wal         | Gauge   |
+| Name                               | Description                                      | Type      |
+|------------------------------------|--------------------------------------------------|-----------|
+| fsync_durations_seconds            | The latency distributions of fsync called by wal | Histogram |
+| last_index_saved                   | The index of the last entry saved by wal         | Gauge     |
 
-Abnormally high fsync duration (`fsync_durations_microseconds`) indicates disk issues and might cause the cluster to be unstable.
+Abnormally high fsync duration (`fsync_durations_seconds`) indicates disk issues and might cause the cluster to be unstable.
 
 
 ## http requests
@@ -73,22 +73,22 @@ Example Prometheus queries that may be useful from these metrics (across all etc
 
 ## snapshot
 
-| Name                                       | Description                                                | Type    |
-|--------------------------------------------|------------------------------------------------------------|---------|
-| snapshot_save_total_durations_microseconds | The total latency distributions of save called by snapshot | Summary |
+| Name                                       | Description                                                | Type      |
+|--------------------------------------------|------------------------------------------------------------|-----------|
+| snapshot_save_total_durations_seconds      | The total latency distributions of save called by snapshot | Histogram |
 
-Abnormally high snapshot duration (`snapshot_save_total_durations_microseconds`) indicates disk issues and might cause the cluster to be unstable.
+Abnormally high snapshot duration (`snapshot_save_total_durations_seconds`) indicates disk issues and might cause the cluster to be unstable.
 
 
 ## rafthttp
 
-| Name                              | Description                                | Type    | Labels                         |
-|-----------------------------------|--------------------------------------------|---------|--------------------------------|
-| message_sent_latency_microseconds | The latency distributions of messages sent | Summary | sendingType, msgType, remoteID |
-| message_sent_failed_total         | The total number of failed messages sent   | Summary | sendingType, msgType, remoteID |
+| Name                              | Description                                | Type         | Labels                         |
+|-----------------------------------|--------------------------------------------|--------------|--------------------------------|
+| message_sent_latency_seconds      | The latency distributions of messages sent | HistogramVec | sendingType, msgType, remoteID |
+| message_sent_failed_total         | The total number of failed messages sent   | CounterVec   | sendingType, msgType, remoteID |
 
 
-Abnormally high message duration (`message_sent_latency_microseconds`) indicates network issues and might cause the cluster to be unstable.
+Abnormally high message duration (`message_sent_latency_seconds`) indicates network issues and might cause the cluster to be unstable.
 
 An increase in message failures (`message_sent_failed_total`) indicates more severe network issues and might cause the cluster to be unstable.
 

+ 3 - 2
etcdserver/metrics.go

@@ -23,11 +23,12 @@ import (
 
 var (
 	// TODO: with label in v3?
-	proposeDurations = prometheus.NewSummary(prometheus.SummaryOpts{
+	proposeDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
 		Namespace: "etcd",
 		Subsystem: "server",
-		Name:      "proposal_durations_milliseconds",
+		Name:      "proposal_durations_seconds",
 		Help:      "The latency distributions of committing proposal.",
+		Buckets:   prometheus.ExponentialBuckets(0.001, 2, 14),
 	})
 	proposePending = prometheus.NewGauge(prometheus.GaugeOpts{
 		Namespace: "etcd",

+ 1 - 1
etcdserver/server.go

@@ -622,7 +622,7 @@ func (s *EtcdServer) Do(ctx context.Context, r pb.Request) (Response, error) {
 
 		select {
 		case x := <-ch:
-			proposeDurations.Observe(float64(time.Since(start).Nanoseconds() / int64(time.Millisecond)))
+			proposeDurations.Observe(float64(time.Since(start)) / float64(time.Second))
 			resp := x.(Response)
 			return resp, resp.err
 		case <-ctx.Done():
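
A note on the conversion: `float64(time.Since(start)) / float64(time.Second)` is equivalent to `time.Since(start).Seconds()`. A minimal, self-contained sketch of the observe-in-seconds pattern (all names here are hypothetical, not etcd's):

```go
package main

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// Hypothetical histogram mirroring the options used in this change.
	h := prometheus.NewHistogram(prometheus.HistogramOpts{
		Name:    "example_op_durations_seconds",
		Help:    "Example latency distribution in seconds.",
		Buckets: prometheus.ExponentialBuckets(0.001, 2, 14),
	})
	prometheus.MustRegister(h)

	start := time.Now()
	time.Sleep(3 * time.Millisecond) // stand-in for the real work
	// Duration.Seconds() yields the same float64 value as
	// float64(d) / float64(time.Second).
	h.Observe(time.Since(start).Seconds())
}
```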

+ 9 - 4
rafthttp/metrics.go

@@ -23,12 +23,17 @@ import (
 )
 
 var (
-	msgSentDuration = prometheus.NewSummaryVec(
-		prometheus.SummaryOpts{
+	// TODO: create a separate histogram for recording
+	// snapshot sending metric. snapshot can be large and
+	// take a long time to send. So it needs a different
+	// time range than other type of messages.
+	msgSentDuration = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
 			Namespace: "etcd",
 			Subsystem: "rafthttp",
-			Name:      "message_sent_latency_microseconds",
+			Name:      "message_sent_latency_seconds",
 			Help:      "message sent latency distributions.",
+			Buckets:   prometheus.ExponentialBuckets(0.0005, 2, 13),
 		},
 		[]string{"sendingType", "remoteID", "msgType"},
 	)
@@ -53,7 +58,7 @@ func reportSentDuration(sendingType string, m raftpb.Message, duration time.Dura
 	if isLinkHeartbeatMessage(m) {
 		typ = "MsgLinkHeartbeat"
 	}
-	msgSentDuration.WithLabelValues(sendingType, types.ID(m.To).String(), typ).Observe(float64(duration.Nanoseconds() / int64(time.Microsecond)))
+	msgSentDuration.WithLabelValues(sendingType, types.ID(m.To).String(), typ).Observe(float64(duration) / float64(time.Second))
 }
 
 func reportSentFailure(sendingType string, m raftpb.Message) {
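
For the labeled case, `NewHistogramVec` keeps one histogram per label combination and `WithLabelValues` selects it before `Observe`. A minimal sketch with hypothetical metric and label values (not etcd's actual wiring):

```go
package main

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// Hypothetical vec shaped like msgSentDuration: one histogram per
	// (sendingType, remoteID, msgType) combination.
	v := prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Name:    "example_sent_latency_seconds",
		Help:    "Example per-label latency distribution in seconds.",
		Buckets: prometheus.ExponentialBuckets(0.0005, 2, 13),
	}, []string{"sendingType", "remoteID", "msgType"})
	prometheus.MustRegister(v)

	start := time.Now()
	time.Sleep(time.Millisecond) // stand-in for sending a message
	v.WithLabelValues("message", "8e9e05c52164694d", "MsgApp").
		Observe(time.Since(start).Seconds())
}
```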

+ 6 - 4
snap/metrics.go

@@ -18,18 +18,20 @@ import "github.com/coreos/etcd/Godeps/_workspace/src/github.com/prometheus/clien
 
 var (
 	// TODO: save_fsync latency?
-	saveDurations = prometheus.NewSummary(prometheus.SummaryOpts{
+	saveDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
 		Namespace: "etcd",
 		Subsystem: "snapshot",
-		Name:      "save_total_durations_microseconds",
+		Name:      "save_total_durations_seconds",
 		Help:      "The total latency distributions of save called by snapshot.",
+		Buckets:   prometheus.ExponentialBuckets(0.001, 2, 14),
 	})
 
-	marshallingDurations = prometheus.NewSummary(prometheus.SummaryOpts{
+	marshallingDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
 		Namespace: "etcd",
 		Subsystem: "snapshot",
-		Name:      "save_marshalling_durations_microseconds",
+		Name:      "save_marshalling_durations_seconds",
 		Help:      "The marshalling cost distributions of save called by snapshot.",
+		Buckets:   prometheus.ExponentialBuckets(0.001, 2, 14),
 	})
 )
 

+ 2 - 2
snap/snapshotter.go

@@ -74,12 +74,12 @@ func (s *Snapshotter) save(snapshot *raftpb.Snapshot) error {
 	if err != nil {
 		return err
 	} else {
-		marshallingDurations.Observe(float64(time.Since(start).Nanoseconds() / int64(time.Microsecond)))
+		marshallingDurations.Observe(float64(time.Since(start)) / float64(time.Second))
 	}
 
 	err = ioutil.WriteFile(path.Join(s.dir, fname), d, 0666)
 	if err == nil {
-		saveDurations.Observe(float64(time.Since(start).Nanoseconds() / int64(time.Microsecond)))
+		saveDurations.Observe(float64(time.Since(start)) / float64(time.Second))
 	}
 	return err
 }

+ 3 - 2
wal/metrics.go

@@ -17,11 +17,12 @@ package wal
 import "github.com/coreos/etcd/Godeps/_workspace/src/github.com/prometheus/client_golang/prometheus"
 
 var (
-	syncDurations = prometheus.NewSummary(prometheus.SummaryOpts{
+	syncDurations = prometheus.NewHistogram(prometheus.HistogramOpts{
 		Namespace: "etcd",
 		Subsystem: "wal",
-		Name:      "fsync_durations_microseconds",
+		Name:      "fsync_durations_seconds",
 		Help:      "The latency distributions of fsync called by wal.",
+		Buckets:   prometheus.ExponentialBuckets(0.001, 2, 14),
 	})
 	lastIndexSaved = prometheus.NewGauge(prometheus.GaugeOpts{
 		Namespace: "etcd",

+ 1 - 1
wal/wal.go

@@ -403,7 +403,7 @@ func (w *WAL) sync() error {
 	}
 	start := time.Now()
 	err := w.f.Sync()
-	syncDurations.Observe(float64(time.Since(start).Nanoseconds() / int64(time.Microsecond)))
+	syncDurations.Observe(float64(time.Since(start)) / float64(time.Second))
 	return err
 }