|
@@ -23,6 +23,10 @@ import (
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
const (
|
|
const (
|
|
|
|
|
+ // MaxHeartbeatTimeoutBackoff is the maximum number of seconds before we warn
|
|
|
|
|
+ // the user again about a peer not accepting heartbeats.
|
|
|
|
|
+ MaxHeartbeatTimeoutBackoff = 15 * time.Second
|
|
|
|
|
+
|
|
|
// ThresholdMonitorTimeout is the time between log notifications that the
|
|
// ThresholdMonitorTimeout is the time between log notifications that the
|
|
|
// Raft heartbeat is too close to the election timeout.
|
|
// Raft heartbeat is too close to the election timeout.
|
|
|
ThresholdMonitorTimeout = 5 * time.Second
|
|
ThresholdMonitorTimeout = 5 * time.Second
|
|
@@ -70,10 +74,18 @@ type PeerServer struct {
|
|
|
routineGroup sync.WaitGroup
|
|
routineGroup sync.WaitGroup
|
|
|
timeoutThresholdChan chan interface{}
|
|
timeoutThresholdChan chan interface{}
|
|
|
|
|
|
|
|
|
|
+ logBackoffs map[string]*logBackoff
|
|
|
|
|
+
|
|
|
metrics *metrics.Bucket
|
|
metrics *metrics.Bucket
|
|
|
sync.Mutex
|
|
sync.Mutex
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+type logBackoff struct {
|
|
|
|
|
+ next time.Time
|
|
|
|
|
+ backoff time.Duration
|
|
|
|
|
+ count int
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
// TODO: find a good policy to do snapshot
|
|
// TODO: find a good policy to do snapshot
|
|
|
type snapshotConf struct {
|
|
type snapshotConf struct {
|
|
|
// Etcd will check if snapshot is need every checkingInterval
|
|
// Etcd will check if snapshot is need every checkingInterval
|
|
@@ -97,6 +109,7 @@ func NewPeerServer(psConfig PeerServerConfig, client *Client, registry *Registry
|
|
|
serverStats: serverStats,
|
|
serverStats: serverStats,
|
|
|
|
|
|
|
|
timeoutThresholdChan: make(chan interface{}, 1),
|
|
timeoutThresholdChan: make(chan interface{}, 1),
|
|
|
|
|
+ logBackoffs: make(map[string]*logBackoff),
|
|
|
|
|
|
|
|
metrics: mb,
|
|
metrics: mb,
|
|
|
}
|
|
}
|
|
@@ -687,11 +700,12 @@ func (s *PeerServer) raftEventLogger(event raft.Event) {
|
|
|
case raft.RemovePeerEventType:
|
|
case raft.RemovePeerEventType:
|
|
|
log.Infof("%s: peer removed: '%v'", s.Config.Name, value)
|
|
log.Infof("%s: peer removed: '%v'", s.Config.Name, value)
|
|
|
case raft.HeartbeatIntervalEventType:
|
|
case raft.HeartbeatIntervalEventType:
|
|
|
- var name = "<unknown>"
|
|
|
|
|
- if peer, ok := value.(*raft.Peer); ok {
|
|
|
|
|
- name = peer.Name
|
|
|
|
|
|
|
+ peer, ok := value.(*raft.Peer);
|
|
|
|
|
+ if !ok {
|
|
|
|
|
+ log.Warnf("%s: heatbeat timeout from unknown peer", s.Config.Name)
|
|
|
|
|
+ return
|
|
|
}
|
|
}
|
|
|
- log.Infof("%s: warning: heartbeat timed out: '%v'", s.Config.Name, name)
|
|
|
|
|
|
|
+ s.logHeartbeatTimeout(peer)
|
|
|
case raft.ElectionTimeoutThresholdEventType:
|
|
case raft.ElectionTimeoutThresholdEventType:
|
|
|
select {
|
|
select {
|
|
|
case s.timeoutThresholdChan <- value:
|
|
case s.timeoutThresholdChan <- value:
|
|
@@ -701,6 +715,35 @@ func (s *PeerServer) raftEventLogger(event raft.Event) {
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+// logHeartbeatTimeout logs about the edge triggered heartbeat timeout event
|
|
|
|
|
+// only if we haven't warned within a reasonable interval.
|
|
|
|
|
+func (s *PeerServer) logHeartbeatTimeout(peer *raft.Peer) {
|
|
|
|
|
+ b, ok := s.logBackoffs[peer.Name]
|
|
|
|
|
+ if !ok {
|
|
|
|
|
+ b = &logBackoff{time.Time{}, time.Second, 1}
|
|
|
|
|
+ s.logBackoffs[peer.Name] = b
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ if peer.LastActivity().After(b.next) {
|
|
|
|
|
+ b.next = time.Time{}
|
|
|
|
|
+ b.backoff = time.Second
|
|
|
|
|
+ b.count = 1
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ if b.next.After(time.Now()) {
|
|
|
|
|
+ b.count++
|
|
|
|
|
+ return
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ b.backoff = 2 * b.backoff
|
|
|
|
|
+ if b.backoff > MaxHeartbeatTimeoutBackoff {
|
|
|
|
|
+ b.backoff = MaxHeartbeatTimeoutBackoff
|
|
|
|
|
+ }
|
|
|
|
|
+ b.next = time.Now().Add(b.backoff)
|
|
|
|
|
+
|
|
|
|
|
+ log.Infof("%s: warning: heartbeat time out peer=%q missed=%d backoff=%q", s.Config.Name, peer.Name, b.count, b.backoff)
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
func (s *PeerServer) recordMetricEvent(event raft.Event) {
|
|
func (s *PeerServer) recordMetricEvent(event raft.Event) {
|
|
|
name := fmt.Sprintf("raft.event.%s", event.Type())
|
|
name := fmt.Sprintf("raft.event.%s", event.Type())
|
|
|
value := event.Value().(time.Duration)
|
|
value := event.Value().(time.Duration)
|