Browse Source

Merge pull request #6843 from gyuho/docs

Documentation/op-guide: add 'monitoring' guide
Gyu-Ho Lee, 9 năm trước
mục cha
commit
70fd684843
3 tập tin đã thay đổi với 1089 bổ sung và 1 xóa
  1. 2 1
      Documentation/docs.md
  2. 1012 0
      Documentation/op-guide/grafana.json
  3. 75 0
      Documentation/op-guide/monitoring.md

+ 2 - 1
Documentation/docs.md

@@ -28,7 +28,7 @@ Administrators who need to create reliable and scalable key-value stores for the
  - [Run etcd clusters inside containers][container]
  - [Configuration][conf]
  - [Security][security]
- - Monitoring
+ - [Monitoring][monitoring]
  - [Maintenance][maintenance]
  - [Understand failures][failures]
  - [Disaster recovery][recovery]
@@ -72,6 +72,7 @@ To learn more about the concepts and internals behind etcd, read the following p
 [recovery]: op-guide/recovery.md
 [maintenance]: op-guide/maintenance.md
 [security]: op-guide/security.md
+[monitoring]: op-guide/monitoring.md
 [v2_migration]: op-guide/v2-migration.md
 [container]: op-guide/container.md
 [understand_apis]: learning/api.md

+ 1012 - 0
Documentation/op-guide/grafana.json

@@ -0,0 +1,1012 @@
+{
+    "id": 6,
+    "title": "test-etcd",
+    "description": "etcd sample Grafana dashboard with Prometheus",
+    "tags": [],
+    "style": "dark",
+    "timezone": "browser",
+    "editable": true,
+    "hideControls": false,
+    "sharedCrosshair": false,
+    "rows": [{
+            "collapse": false,
+            "editable": true,
+            "height": "250px",
+            "panels": [{
+                    "cacheTimeout": null,
+                    "colorBackground": false,
+                    "colorValue": false,
+                    "colors": [
+                        "rgba(245, 54, 54, 0.9)",
+                        "rgba(237, 129, 40, 0.89)",
+                        "rgba(50, 172, 45, 0.97)"
+                    ],
+                    "datasource": "test-etcd",
+                    "editable": true,
+                    "error": false,
+                    "format": "none",
+                    "gauge": {
+                        "maxValue": 100,
+                        "minValue": 0,
+                        "show": false,
+                        "thresholdLabels": false,
+                        "thresholdMarkers": true
+                    },
+                    "id": 28,
+                    "interval": null,
+                    "isNew": true,
+                    "links": [],
+                    "mappingType": 1,
+                    "mappingTypes": [{
+                            "name": "value to text",
+                            "value": 1
+                        },
+                        {
+                            "name": "range to text",
+                            "value": 2
+                        }
+                    ],
+                    "maxDataPoints": 100,
+                    "nullPointMode": "connected",
+                    "nullText": null,
+                    "postfix": "",
+                    "postfixFontSize": "50%",
+                    "prefix": "",
+                    "prefixFontSize": "50%",
+                    "rangeMaps": [{
+                        "from": "null",
+                        "text": "N/A",
+                        "to": "null"
+                    }],
+                    "span": 3,
+                    "sparkline": {
+                        "fillColor": "rgba(31, 118, 189, 0.18)",
+                        "full": false,
+                        "lineColor": "rgb(31, 120, 193)",
+                        "show": false
+                    },
+                    "targets": [{
+                        "expr": "sum(etcd_server_has_leader)",
+                        "intervalFactor": 2,
+                        "legendFormat": "",
+                        "metric": "etcd_server_has_leader",
+                        "refId": "A",
+                        "step": 20
+                    }],
+                    "thresholds": "",
+                    "title": "Up",
+                    "type": "singlestat",
+                    "valueFontSize": "200%",
+                    "valueMaps": [{
+                        "op": "=",
+                        "text": "N/A",
+                        "value": "null"
+                    }],
+                    "valueName": "avg"
+                },
+                {
+                    "aliasColors": {},
+                    "bars": false,
+                    "datasource": "test-etcd",
+                    "editable": true,
+                    "error": false,
+                    "fill": 0,
+                    "id": 23,
+                    "isNew": true,
+                    "legend": {
+                        "avg": false,
+                        "current": false,
+                        "max": false,
+                        "min": false,
+                        "show": false,
+                        "total": false,
+                        "values": false
+                    },
+                    "lines": true,
+                    "linewidth": 2,
+                    "links": [],
+                    "nullPointMode": "connected",
+                    "percentage": false,
+                    "pointradius": 5,
+                    "points": false,
+                    "renderer": "flot",
+                    "seriesOverrides": [],
+                    "span": 5,
+                    "stack": false,
+                    "steppedLine": false,
+                    "targets": [{
+                            "expr": "sum(rate(etcd_grpc_requests_total [1m]))",
+                            "intervalFactor": 2,
+                            "legendFormat": "{{instance}} RPC Rate",
+                            "metric": "etcd_grpc_requests_total",
+                            "refId": "A",
+                            "step": 4
+                        },
+                        {
+                            "expr": "sum(rate(etcd_grpc_requests_failed_total [1m]))",
+                            "intervalFactor": 2,
+                            "legendFormat": "{{instance}} RPC Failed Rate",
+                            "metric": "etcd_grpc_requests_failed_total",
+                            "refId": "B",
+                            "step": 4
+                        }
+                    ],
+                    "thresholds": [],
+                    "timeFrom": null,
+                    "timeShift": null,
+                    "title": "RPC Rate",
+                    "tooltip": {
+                        "msResolution": false,
+                        "shared": true,
+                        "sort": 0,
+                        "value_type": "individual"
+                    },
+                    "type": "graph",
+                    "xaxis": {
+                        "mode": "time",
+                        "name": null,
+                        "show": true,
+                        "values": []
+                    },
+                    "yaxes": [{
+                            "format": "ops",
+                            "label": null,
+                            "logBase": 1,
+                            "max": null,
+                            "min": null,
+                            "show": true
+                        },
+                        {
+                            "format": "short",
+                            "label": null,
+                            "logBase": 1,
+                            "max": null,
+                            "min": null,
+                            "show": true
+                        }
+                    ]
+                },
+                {
+                    "aliasColors": {},
+                    "bars": false,
+                    "datasource": "test-etcd",
+                    "editable": true,
+                    "error": false,
+                    "fill": 0,
+                    "id": 41,
+                    "isNew": true,
+                    "legend": {
+                        "avg": false,
+                        "current": false,
+                        "max": false,
+                        "min": false,
+                        "show": false,
+                        "total": false,
+                        "values": false
+                    },
+                    "lines": true,
+                    "linewidth": 2,
+                    "links": [],
+                    "nullPointMode": "connected",
+                    "percentage": false,
+                    "pointradius": 5,
+                    "points": false,
+                    "renderer": "flot",
+                    "seriesOverrides": [],
+                    "span": 4,
+                    "stack": true,
+                    "steppedLine": false,
+                    "targets": [{
+                            "expr": "sum(etcd_grpc_active_streams {grpc_service=\"etcdserverpb.Watch\"})",
+                            "intervalFactor": 2,
+                            "legendFormat": "Watch Streams",
+                            "metric": "etcd_grpc_active_streams",
+                            "refId": "A",
+                            "step": 4
+                        },
+                        {
+                            "expr": "sum(etcd_grpc_active_streams {grpc_service=\"etcdserverpb.Lease\"})",
+                            "intervalFactor": 2,
+                            "legendFormat": "Lease Streams",
+                            "metric": "etcd_grpc_active_streams",
+                            "refId": "B",
+                            "step": 4
+                        }
+                    ],
+                    "thresholds": [],
+                    "timeFrom": null,
+                    "timeShift": null,
+                    "title": "Active Streams",
+                    "tooltip": {
+                        "msResolution": false,
+                        "shared": true,
+                        "sort": 0,
+                        "value_type": "individual"
+                    },
+                    "type": "graph",
+                    "xaxis": {
+                        "mode": "time",
+                        "name": null,
+                        "show": true,
+                        "values": []
+                    },
+                    "yaxes": [{
+                            "format": "short",
+                            "label": "",
+                            "logBase": 1,
+                            "max": null,
+                            "min": null,
+                            "show": true
+                        },
+                        {
+                            "format": "short",
+                            "label": null,
+                            "logBase": 1,
+                            "max": null,
+                            "min": null,
+                            "show": true
+                        }
+                    ]
+                }
+            ],
+            "showTitle": false,
+            "title": "Row"
+        },
+        {
+            "collapse": false,
+            "editable": true,
+            "height": "250px",
+            "panels": [{
+                    "aliasColors": {},
+                    "bars": false,
+                    "datasource": "test-etcd",
+                    "decimals": null,
+                    "editable": true,
+                    "error": false,
+                    "fill": 0,
+                    "grid": {},
+                    "id": 1,
+                    "legend": {
+                        "avg": false,
+                        "current": false,
+                        "max": false,
+                        "min": false,
+                        "show": false,
+                        "total": false,
+                        "values": false
+                    },
+                    "lines": true,
+                    "linewidth": 2,
+                    "links": [],
+                    "nullPointMode": "connected",
+                    "percentage": false,
+                    "pointradius": 5,
+                    "points": false,
+                    "renderer": "flot",
+                    "seriesOverrides": [],
+                    "span": 4,
+                    "stack": false,
+                    "steppedLine": false,
+                    "targets": [{
+                        "expr": "etcd_debugging_mvcc_db_total_size_in_bytes",
+                        "hide": false,
+                        "interval": "",
+                        "intervalFactor": 2,
+                        "legendFormat": "{{instance}} DB Size",
+                        "metric": "",
+                        "refId": "A",
+                        "step": 4
+                    }],
+                    "thresholds": [],
+                    "timeFrom": null,
+                    "timeShift": null,
+                    "title": "DB Size",
+                    "tooltip": {
+                        "msResolution": false,
+                        "shared": true,
+                        "sort": 0,
+                        "value_type": "cumulative"
+                    },
+                    "type": "graph",
+                    "xaxis": {
+                        "mode": "time",
+                        "name": null,
+                        "show": true,
+                        "values": []
+                    },
+                    "yaxes": [{
+                            "format": "bytes",
+                            "logBase": 1,
+                            "max": null,
+                            "min": null,
+                            "show": true
+                        },
+                        {
+                            "format": "short",
+                            "logBase": 1,
+                            "max": null,
+                            "min": null,
+                            "show": false
+                        }
+                    ]
+                },
+                {
+                    "aliasColors": {},
+                    "bars": false,
+                    "datasource": "test-etcd",
+                    "editable": true,
+                    "error": false,
+                    "fill": 0,
+                    "grid": {},
+                    "id": 3,
+                    "legend": {
+                        "avg": false,
+                        "current": false,
+                        "max": false,
+                        "min": false,
+                        "show": false,
+                        "total": false,
+                        "values": false
+                    },
+                    "lines": true,
+                    "linewidth": 2,
+                    "links": [],
+                    "nullPointMode": "connected",
+                    "percentage": false,
+                    "pointradius": 1,
+                    "points": false,
+                    "renderer": "flot",
+                    "seriesOverrides": [],
+                    "span": 4,
+                    "stack": false,
+                    "steppedLine": true,
+                    "targets": [{
+                            "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket [5m])) by (instance, le))",
+                            "hide": false,
+                            "intervalFactor": 2,
+                            "legendFormat": "{{instance}} WAL fsync",
+                            "metric": "etcd_disk_wal_fsync_duration_seconds_bucket",
+                            "refId": "A",
+                            "step": 4
+                        },
+                        {
+                            "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket [5m])) by (instance, le))",
+                            "intervalFactor": 2,
+                            "legendFormat": "{{instance}} DB fsync",
+                            "metric": "etcd_disk_backend_commit_duration_seconds_bucket",
+                            "refId": "B",
+                            "step": 4
+                        }
+                    ],
+                    "thresholds": [],
+                    "timeFrom": null,
+                    "timeShift": null,
+                    "title": "Disk Sync Duration",
+                    "tooltip": {
+                        "msResolution": false,
+                        "shared": true,
+                        "sort": 0,
+                        "value_type": "cumulative"
+                    },
+                    "type": "graph",
+                    "xaxis": {
+                        "mode": "time",
+                        "name": null,
+                        "show": true,
+                        "values": []
+                    },
+                    "yaxes": [{
+                            "format": "s",
+                            "logBase": 1,
+                            "max": null,
+                            "min": null,
+                            "show": true
+                        },
+                        {
+                            "format": "short",
+                            "logBase": 1,
+                            "max": null,
+                            "min": null,
+                            "show": false
+                        }
+                    ]
+                },
+                {
+                    "aliasColors": {},
+                    "bars": false,
+                    "datasource": "test-etcd",
+                    "editable": true,
+                    "error": false,
+                    "fill": 0,
+                    "id": 29,
+                    "isNew": true,
+                    "legend": {
+                        "avg": false,
+                        "current": false,
+                        "max": false,
+                        "min": false,
+                        "show": false,
+                        "total": false,
+                        "values": false
+                    },
+                    "lines": true,
+                    "linewidth": 2,
+                    "links": [],
+                    "nullPointMode": "connected",
+                    "percentage": false,
+                    "pointradius": 5,
+                    "points": false,
+                    "renderer": "flot",
+                    "seriesOverrides": [],
+                    "span": 4,
+                    "stack": false,
+                    "steppedLine": false,
+                    "targets": [{
+                        "expr": "process_resident_memory_bytes",
+                        "intervalFactor": 2,
+                        "legendFormat": "{{instance}} Resident Memory",
+                        "metric": "process_resident_memory_bytes",
+                        "refId": "A",
+                        "step": 4
+                    }],
+                    "thresholds": [],
+                    "timeFrom": null,
+                    "timeShift": null,
+                    "title": "Memory",
+                    "tooltip": {
+                        "msResolution": false,
+                        "shared": true,
+                        "sort": 0,
+                        "value_type": "individual"
+                    },
+                    "type": "graph",
+                    "xaxis": {
+                        "mode": "time",
+                        "name": null,
+                        "show": true,
+                        "values": []
+                    },
+                    "yaxes": [{
+                            "format": "bytes",
+                            "label": null,
+                            "logBase": 1,
+                            "max": null,
+                            "min": null,
+                            "show": true
+                        },
+                        {
+                            "format": "short",
+                            "label": null,
+                            "logBase": 1,
+                            "max": null,
+                            "min": null,
+                            "show": true
+                        }
+                    ]
+                }
+            ],
+            "title": "New row"
+        },
+        {
+            "collapse": false,
+            "editable": true,
+            "height": "250px",
+            "panels": [{
+                    "aliasColors": {},
+                    "bars": false,
+                    "datasource": "test-etcd",
+                    "editable": true,
+                    "error": false,
+                    "fill": 5,
+                    "id": 22,
+                    "isNew": true,
+                    "legend": {
+                        "avg": false,
+                        "current": false,
+                        "max": false,
+                        "min": false,
+                        "show": false,
+                        "total": false,
+                        "values": false
+                    },
+                    "lines": true,
+                    "linewidth": 2,
+                    "links": [],
+                    "nullPointMode": "connected",
+                    "percentage": false,
+                    "pointradius": 5,
+                    "points": false,
+                    "renderer": "flot",
+                    "seriesOverrides": [],
+                    "span": 3,
+                    "stack": true,
+                    "steppedLine": false,
+                    "targets": [{
+                        "expr": "rate(etcd_network_client_grpc_received_bytes_total [1m])",
+                        "intervalFactor": 2,
+                        "legendFormat": "{{instance}} Client Traffic In",
+                        "metric": "etcd_network_client_grpc_received_bytes_total",
+                        "refId": "A",
+                        "step": 4
+                    }],
+                    "thresholds": [],
+                    "timeFrom": null,
+                    "timeShift": null,
+                    "title": "Client Traffic In",
+                    "tooltip": {
+                        "msResolution": false,
+                        "shared": true,
+                        "sort": 0,
+                        "value_type": "individual"
+                    },
+                    "type": "graph",
+                    "xaxis": {
+                        "mode": "time",
+                        "name": null,
+                        "show": true,
+                        "values": []
+                    },
+                    "yaxes": [{
+                            "format": "short",
+                            "label": null,
+                            "logBase": 1,
+                            "max": null,
+                            "min": null,
+                            "show": true
+                        },
+                        {
+                            "format": "short",
+                            "label": null,
+                            "logBase": 1,
+                            "max": null,
+                            "min": null,
+                            "show": true
+                        }
+                    ]
+                },
+                {
+                    "aliasColors": {},
+                    "bars": false,
+                    "datasource": "test-etcd",
+                    "editable": true,
+                    "error": false,
+                    "fill": 5,
+                    "id": 21,
+                    "isNew": true,
+                    "legend": {
+                        "avg": false,
+                        "current": false,
+                        "max": false,
+                        "min": false,
+                        "show": false,
+                        "total": false,
+                        "values": false
+                    },
+                    "lines": true,
+                    "linewidth": 2,
+                    "links": [],
+                    "nullPointMode": "connected",
+                    "percentage": false,
+                    "pointradius": 5,
+                    "points": false,
+                    "renderer": "flot",
+                    "seriesOverrides": [],
+                    "span": 3,
+                    "stack": true,
+                    "steppedLine": false,
+                    "targets": [{
+                        "expr": "rate(etcd_network_client_grpc_sent_bytes_total [1m])",
+                        "intervalFactor": 2,
+                        "legendFormat": "{{instance}} Client Traffic Out",
+                        "metric": "etcd_network_client_grpc_sent_bytes_total",
+                        "refId": "A",
+                        "step": 4
+                    }],
+                    "thresholds": [],
+                    "timeFrom": null,
+                    "timeShift": null,
+                    "title": "Client Traffic Out",
+                    "tooltip": {
+                        "msResolution": false,
+                        "shared": true,
+                        "sort": 0,
+                        "value_type": "individual"
+                    },
+                    "type": "graph",
+                    "xaxis": {
+                        "mode": "time",
+                        "name": null,
+                        "show": true,
+                        "values": []
+                    },
+                    "yaxes": [{
+                            "format": "Bps",
+                            "label": null,
+                            "logBase": 1,
+                            "max": null,
+                            "min": null,
+                            "show": true
+                        },
+                        {
+                            "format": "short",
+                            "label": null,
+                            "logBase": 1,
+                            "max": null,
+                            "min": null,
+                            "show": true
+                        }
+                    ]
+                },
+                {
+                    "aliasColors": {},
+                    "bars": false,
+                    "datasource": "test-etcd",
+                    "editable": true,
+                    "error": false,
+                    "fill": 0,
+                    "id": 20,
+                    "isNew": true,
+                    "legend": {
+                        "avg": false,
+                        "current": false,
+                        "max": false,
+                        "min": false,
+                        "show": false,
+                        "total": false,
+                        "values": false
+                    },
+                    "lines": true,
+                    "linewidth": 2,
+                    "links": [],
+                    "nullPointMode": "connected",
+                    "percentage": false,
+                    "pointradius": 5,
+                    "points": false,
+                    "renderer": "flot",
+                    "seriesOverrides": [],
+                    "span": 3,
+                    "stack": false,
+                    "steppedLine": false,
+                    "targets": [{
+                        "expr": "sum(rate(etcd_network_peer_received_bytes_total [1m])) by (instance)",
+                        "intervalFactor": 2,
+                        "legendFormat": "{{instance}} Peer Traffic In",
+                        "metric": "etcd_network_peer_received_bytes_total",
+                        "refId": "A",
+                        "step": 4
+                    }],
+                    "thresholds": [],
+                    "timeFrom": null,
+                    "timeShift": null,
+                    "title": "Peer Traffic In",
+                    "tooltip": {
+                        "msResolution": false,
+                        "shared": true,
+                        "sort": 0,
+                        "value_type": "individual"
+                    },
+                    "type": "graph",
+                    "xaxis": {
+                        "mode": "time",
+                        "name": null,
+                        "show": true,
+                        "values": []
+                    },
+                    "yaxes": [{
+                            "format": "Bps",
+                            "label": null,
+                            "logBase": 1,
+                            "max": null,
+                            "min": null,
+                            "show": true
+                        },
+                        {
+                            "format": "short",
+                            "label": null,
+                            "logBase": 1,
+                            "max": null,
+                            "min": null,
+                            "show": true
+                        }
+                    ]
+                },
+                {
+                    "aliasColors": {},
+                    "bars": false,
+                    "datasource": "test-etcd",
+                    "decimals": null,
+                    "editable": true,
+                    "error": false,
+                    "fill": 0,
+                    "grid": {},
+                    "id": 16,
+                    "legend": {
+                        "avg": false,
+                        "current": false,
+                        "max": false,
+                        "min": false,
+                        "show": false,
+                        "total": false,
+                        "values": false
+                    },
+                    "lines": true,
+                    "linewidth": 2,
+                    "links": [],
+                    "nullPointMode": "connected",
+                    "percentage": false,
+                    "pointradius": 5,
+                    "points": false,
+                    "renderer": "flot",
+                    "seriesOverrides": [],
+                    "span": 3,
+                    "stack": false,
+                    "steppedLine": false,
+                    "targets": [{
+                        "expr": "sum(rate(etcd_network_peer_sent_bytes_total [1m])) by (instance)",
+                        "hide": false,
+                        "interval": "",
+                        "intervalFactor": 2,
+                        "legendFormat": "{{instance}} Peer Traffic Out",
+                        "metric": "etcd_network_peer_sent_bytes_total",
+                        "refId": "A",
+                        "step": 4
+                    }],
+                    "thresholds": [],
+                    "timeFrom": null,
+                    "timeShift": null,
+                    "title": "Peer Traffic Out",
+                    "tooltip": {
+                        "msResolution": false,
+                        "shared": true,
+                        "sort": 0,
+                        "value_type": "cumulative"
+                    },
+                    "type": "graph",
+                    "xaxis": {
+                        "mode": "time",
+                        "name": null,
+                        "show": true,
+                        "values": []
+                    },
+                    "yaxes": [{
+                            "format": "Bps",
+                            "logBase": 1,
+                            "max": null,
+                            "min": null,
+                            "show": true
+                        },
+                        {
+                            "format": "short",
+                            "logBase": 1,
+                            "max": null,
+                            "min": null,
+                            "show": true
+                        }
+                    ]
+                }
+            ],
+            "title": "New row"
+        },
+        {
+            "collapse": false,
+            "editable": true,
+            "height": "250px",
+            "panels": [{
+                    "aliasColors": {},
+                    "bars": false,
+                    "datasource": "test-etcd",
+                    "editable": true,
+                    "error": false,
+                    "fill": 0,
+                    "id": 40,
+                    "isNew": true,
+                    "legend": {
+                        "avg": false,
+                        "current": false,
+                        "max": false,
+                        "min": false,
+                        "show": false,
+                        "total": false,
+                        "values": false
+                    },
+                    "lines": true,
+                    "linewidth": 2,
+                    "links": [],
+                    "nullPointMode": "connected",
+                    "percentage": false,
+                    "pointradius": 5,
+                    "points": false,
+                    "renderer": "flot",
+                    "seriesOverrides": [],
+                    "span": 6,
+                    "stack": false,
+                    "steppedLine": false,
+                    "targets": [{
+                            "expr": "sum(rate(etcd_server_proposals_failed_total [1m]))",
+                            "intervalFactor": 2,
+                            "legendFormat": "Proposal Failure Rate",
+                            "metric": "etcd_server_proposals_failed_total",
+                            "refId": "A",
+                            "step": 2
+                        },
+                        {
+                            "expr": "sum(etcd_server_proposals_pending)",
+                            "intervalFactor": 2,
+                            "legendFormat": "Proposal Pending Total",
+                            "metric": "etcd_server_proposals_pending",
+                            "refId": "B",
+                            "step": 2
+                        },
+                        {
+                            "expr": "sum(rate(etcd_server_proposals_committed_total [1m]))",
+                            "intervalFactor": 2,
+                            "legendFormat": "Proposal Commit Rate",
+                            "metric": "etcd_server_proposals_committed_total",
+                            "refId": "C",
+                            "step": 2
+                        },
+                        {
+                            "expr": "sum(rate(etcd_server_proposals_applied_total [1m]))",
+                            "intervalFactor": 2,
+                            "legendFormat": "Proposal Apply Rate",
+                            "refId": "D",
+                            "step": 2
+                        }
+                    ],
+                    "thresholds": [],
+                    "timeFrom": null,
+                    "timeShift": null,
+                    "title": "Raft Proposals",
+                    "tooltip": {
+                        "msResolution": false,
+                        "shared": true,
+                        "sort": 0,
+                        "value_type": "individual"
+                    },
+                    "type": "graph",
+                    "xaxis": {
+                        "mode": "time",
+                        "name": null,
+                        "show": true,
+                        "values": []
+                    },
+                    "yaxes": [{
+                            "format": "short",
+                            "label": "",
+                            "logBase": 1,
+                            "max": null,
+                            "min": null,
+                            "show": true
+                        },
+                        {
+                            "format": "short",
+                            "label": null,
+                            "logBase": 1,
+                            "max": null,
+                            "min": null,
+                            "show": true
+                        }
+                    ]
+                },
+                {
+                    "aliasColors": {},
+                    "bars": false,
+                    "datasource": "test-etcd",
+                    "decimals": 0,
+                    "editable": true,
+                    "error": false,
+                    "fill": 0,
+                    "id": 19,
+                    "isNew": true,
+                    "legend": {
+                        "alignAsTable": false,
+                        "avg": false,
+                        "current": false,
+                        "max": false,
+                        "min": false,
+                        "rightSide": false,
+                        "show": false,
+                        "total": false,
+                        "values": false
+                    },
+                    "lines": true,
+                    "linewidth": 2,
+                    "links": [],
+                    "nullPointMode": "connected",
+                    "percentage": false,
+                    "pointradius": 5,
+                    "points": false,
+                    "renderer": "flot",
+                    "seriesOverrides": [],
+                    "span": 6,
+                    "stack": false,
+                    "steppedLine": false,
+                    "targets": [{
+                        "expr": "etcd_server_leader_changes_seen_total",
+                        "intervalFactor": 2,
+                        "legendFormat": "{{instance}} Leader Change Seen",
+                        "metric": "etcd_server_leader_changes_seen_total",
+                        "refId": "A",
+                        "step": 2
+                    }],
+                    "thresholds": [],
+                    "timeFrom": null,
+                    "timeShift": null,
+                    "title": "Total Leader Changes Seen",
+                    "tooltip": {
+                        "msResolution": false,
+                        "shared": true,
+                        "sort": 0,
+                        "value_type": "individual"
+                    },
+                    "type": "graph",
+                    "xaxis": {
+                        "mode": "time",
+                        "name": null,
+                        "show": true,
+                        "values": []
+                    },
+                    "yaxes": [{
+                            "format": "short",
+                            "label": null,
+                            "logBase": 1,
+                            "max": null,
+                            "min": null,
+                            "show": true
+                        },
+                        {
+                            "format": "short",
+                            "label": null,
+                            "logBase": 1,
+                            "max": null,
+                            "min": null,
+                            "show": true
+                        }
+                    ]
+                }
+            ],
+            "title": "New row"
+        }
+    ],
+    "time": {
+        "from": "now-15m",
+        "to": "now"
+    },
+    "timepicker": {
+        "now": true,
+        "refresh_intervals": [
+            "5s",
+            "10s",
+            "30s",
+            "1m",
+            "5m",
+            "15m",
+            "30m",
+            "1h",
+            "2h",
+            "1d"
+        ],
+        "time_options": [
+            "5m",
+            "15m",
+            "1h",
+            "6h",
+            "12h",
+            "24h",
+            "2d",
+            "7d",
+            "30d"
+        ]
+    },
+    "templating": {
+        "list": []
+    },
+    "annotations": {
+        "list": []
+    },
+    "refresh": false,
+    "schemaVersion": 13,
+    "version": 215,
+    "links": [],
+    "gnetId": null
+}

+ 75 - 0
Documentation/op-guide/monitoring.md

@@ -0,0 +1,75 @@
+# Monitoring etcd
+
+Each etcd server exports metrics under the `/metrics` path on its client port.
+
+The metrics can be fetched with `curl`:
+
+```sh
+$ curl -L http://localhost:2379/metrics
+
+# HELP etcd_debugging_mvcc_keys_total Total number of keys.
+# TYPE etcd_debugging_mvcc_keys_total gauge
+etcd_debugging_mvcc_keys_total 0
+# HELP etcd_debugging_mvcc_pending_events_total Total number of pending events to be sent.
+# TYPE etcd_debugging_mvcc_pending_events_total gauge
+etcd_debugging_mvcc_pending_events_total 0
+...
+```
+
+
+## Prometheus
+
+Running a [Prometheus][prometheus] monitoring service is the easiest way to ingest and record etcd's metrics.
+
+First, install Prometheus:
+
+```sh
+PROMETHEUS_VERSION="1.3.1"
+wget https://github.com/prometheus/prometheus/releases/download/v$PROMETHEUS_VERSION/prometheus-$PROMETHEUS_VERSION.linux-amd64.tar.gz -O /tmp/prometheus-$PROMETHEUS_VERSION.linux-amd64.tar.gz
+tar -xvzf /tmp/prometheus-$PROMETHEUS_VERSION.linux-amd64.tar.gz --directory /tmp/ --strip-components=1
+/tmp/prometheus -version
+```
+
+Set Prometheus's scraper to target the etcd cluster endpoints:
+
+```sh
+cat > /tmp/test-etcd.yaml <<EOF
+global:
+  scrape_interval: 10s
+scrape_configs:
+  - job_name: test-etcd
+    static_configs:
+    - targets: ['10.240.0.32:2379','10.240.0.33:2379','10.240.0.34:2379']
+EOF
+cat /tmp/test-etcd.yaml
+```
+
+Start the Prometheus server in the background, logging to `/tmp/test-etcd.log`:
+
+```sh
+nohup /tmp/prometheus \
+    -config.file /tmp/test-etcd.yaml \
+    -web.listen-address ":9090" \
+    -storage.local.path "test-etcd.data" >> /tmp/test-etcd.log  2>&1 &
+```
+
+Now Prometheus will scrape etcd metrics every 10 seconds.
+
+
+## Grafana
+
+[Grafana][grafana] has built-in Prometheus support; just add a Prometheus data source:
+
+```
+Name:   test-etcd
+Type:   Prometheus
+Url:    http://localhost:9090
+Access: proxy
+```
+
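+Then import the default [etcd dashboard template][template] and customize it as needed; see the [demo][demo] for an example. The template's panels reference a data source named `test-etcd`, so either keep that name for the data source or update the `datasource` fields in the template.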
+
+[prometheus]: https://prometheus.io/
+[grafana]: https://grafana.com/
+[template]: ./grafana.json
+[demo]: http://dash.etcd.io/dashboard/db/test-etcd