Browse Source

Merge pull request #6072 from xiang90/tls_err

Log TLS error in health checking
Xiang Li 9 years ago
parent
commit
a2715e3bda

+ 1 - 1
cmd/Godeps/Godeps.json

@@ -197,7 +197,7 @@
 		},
 		},
 		{
 		{
 			"ImportPath": "github.com/xiang90/probing",
 			"ImportPath": "github.com/xiang90/probing",
-			"Rev": "6a0cc1ae81b4cc11db5e491e030e4b98fba79c19"
+			"Rev": "95bd620af35406ab93d7f5bf320dba4b4667982e"
 		},
 		},
 		{
 		{
 			"ImportPath": "golang.org/x/crypto/bcrypt",
 			"ImportPath": "golang.org/x/crypto/bcrypt",

+ 2 - 2
cmd/vendor/github.com/xiang90/probing/prober.go

@@ -61,7 +61,7 @@ func (p *prober) AddHTTP(id string, probingInterval time.Duration, endpoints []s
 				}
 				}
 				resp, err := p.tr.RoundTrip(req)
 				resp, err := p.tr.RoundTrip(req)
 				if err != nil {
 				if err != nil {
-					s.recordFailure()
+					s.recordFailure(err)
 					pinned = (pinned + 1) % len(endpoints)
 					pinned = (pinned + 1) % len(endpoints)
 					continue
 					continue
 				}
 				}
@@ -71,7 +71,7 @@ func (p *prober) AddHTTP(id string, probingInterval time.Duration, endpoints []s
 				err = d.Decode(&hh)
 				err = d.Decode(&hh)
 				resp.Body.Close()
 				resp.Body.Close()
 				if err != nil || !hh.OK {
 				if err != nil || !hh.OK {
-					s.recordFailure()
+					s.recordFailure(err)
 					pinned = (pinned + 1) % len(endpoints)
 					pinned = (pinned + 1) % len(endpoints)
 					continue
 					continue
 				}
 				}

+ 12 - 1
cmd/vendor/github.com/xiang90/probing/status.go

@@ -14,6 +14,7 @@ type Status interface {
 	Total() int64
 	Total() int64
 	Loss() int64
 	Loss() int64
 	Health() bool
 	Health() bool
+	Err() error
 	// Estimated smoothed round trip time
 	// Estimated smoothed round trip time
 	SRTT() time.Duration
 	SRTT() time.Duration
 	// Estimated clock difference
 	// Estimated clock difference
@@ -27,6 +28,7 @@ type status struct {
 	total     int64
 	total     int64
 	loss      int64
 	loss      int64
 	health    bool
 	health    bool
+	err       error
 	clockdiff time.Duration
 	clockdiff time.Duration
 	stopC     chan struct{}
 	stopC     chan struct{}
 }
 }
@@ -56,6 +58,12 @@ func (s *status) Health() bool {
 	return s.health
 	return s.health
 }
 }
 
 
+func (s *status) Err() error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	return s.err
+}
+
 func (s *status) ClockDiff() time.Duration {
 func (s *status) ClockDiff() time.Duration {
 	s.mu.Lock()
 	s.mu.Lock()
 	defer s.mu.Unlock()
 	defer s.mu.Unlock()
@@ -74,15 +82,17 @@ func (s *status) record(rtt time.Duration, when time.Time) {
 	s.health = true
 	s.health = true
 	s.srtt = time.Duration((1-α)*float64(s.srtt) + α*float64(rtt))
 	s.srtt = time.Duration((1-α)*float64(s.srtt) + α*float64(rtt))
 	s.clockdiff = time.Now().Sub(when) - s.srtt/2
 	s.clockdiff = time.Now().Sub(when) - s.srtt/2
+	s.err = nil
 }
 }
 
 
-func (s *status) recordFailure() {
+func (s *status) recordFailure(err error) {
 	s.mu.Lock()
 	s.mu.Lock()
 	defer s.mu.Unlock()
 	defer s.mu.Unlock()
 
 
 	s.total++
 	s.total++
 	s.health = false
 	s.health = false
 	s.loss += 1
 	s.loss += 1
+	s.err = err
 }
 }
 
 
 func (s *status) reset() {
 func (s *status) reset() {
@@ -93,4 +103,5 @@ func (s *status) reset() {
 	s.total = 0
 	s.total = 0
 	s.health = false
 	s.health = false
 	s.clockdiff = 0
 	s.clockdiff = 0
+	s.err = nil
 }
 }

+ 8 - 2
rafthttp/probing_status.go

@@ -25,6 +25,7 @@ var (
 	// Or the connection will time-out.
 	// Or the connection will time-out.
 	proberInterval           = ConnReadTimeout - time.Second
 	proberInterval           = ConnReadTimeout - time.Second
 	statusMonitoringInterval = 30 * time.Second
 	statusMonitoringInterval = 30 * time.Second
+	statusErrorInterval      = 5 * time.Second
 )
 )
 
 
 func addPeerToProber(p probing.Prober, id string, us []string) {
 func addPeerToProber(p probing.Prober, id string, us []string) {
@@ -44,11 +45,16 @@ func addPeerToProber(p probing.Prober, id string, us []string) {
 }
 }
 
 
 func monitorProbingStatus(s probing.Status, id string) {
 func monitorProbingStatus(s probing.Status, id string) {
+	// set the first interval short to log error early.
+	interval := statusErrorInterval
 	for {
 	for {
 		select {
 		select {
-		case <-time.After(statusMonitoringInterval):
+		case <-time.After(interval):
 			if !s.Health() {
 			if !s.Health() {
-				plog.Warningf("health check for peer %s could not connect", id)
+				plog.Warningf("health check for peer %s could not connect: %v", id, s.Err())
+				interval = statusErrorInterval
+			} else {
+				interval = statusMonitoringInterval
 			}
 			}
 			if s.ClockDiff() > time.Second {
 			if s.ClockDiff() > time.Second {
 				plog.Warningf("the clock difference against peer %s is too high [%v > %v]", id, s.ClockDiff(), time.Second)
 				plog.Warningf("the clock difference against peer %s is too high [%v > %v]", id, s.ClockDiff(), time.Second)