浏览代码

etcdserver: reject member removal that breaks the current active quorum

Anthony Romano 9 年之前
父节点
当前提交
a1ce07a321
共有 2 个文件被更改,包括 40 次插入14 次删除
  1. 29 4
      etcdserver/server.go
  2. 11 10
      etcdserver/util.go

+ 29 - 4
etcdserver/server.go

@@ -844,10 +844,9 @@ func (s *EtcdServer) AddMember(ctx context.Context, memb membership.Member) erro
 }
 
 func (s *EtcdServer) RemoveMember(ctx context.Context, id uint64) error {
-	if s.Cfg.StrictReconfigCheck && !s.cluster.IsReadyToRemoveMember(id) {
-		// If s.cfg.StrictReconfigCheck is false, it means the option --strict-reconfig-check isn't passed to etcd.
-		// In such a case removing a member is allowed unconditionally
-		return ErrNotEnoughStartedMembers
+	// by default StrictReconfigCheck is enabled; reject removal if leads to quorum loss
+	if err := s.mayRemoveMember(types.ID(id)); err != nil {
+		return err
 	}
 
 	cc := raftpb.ConfChange{
@@ -857,6 +856,32 @@ func (s *EtcdServer) RemoveMember(ctx context.Context, id uint64) error {
 	return s.configure(ctx, cc)
 }
 
+func (s *EtcdServer) mayRemoveMember(id types.ID) error {
+	if !s.Cfg.StrictReconfigCheck {
+		return nil
+	}
+
+	if !s.cluster.IsReadyToRemoveMember(uint64(id)) {
+		plog.Warningf("not enough started members, rejecting remove member %s", id)
+		return ErrNotEnoughStartedMembers
+	}
+
+	// downed member is safe to remove since it's not part of the active quorum
+	if t := s.r.transport.ActiveSince(id); id != s.ID() && t.IsZero() {
+		return nil
+	}
+
+	// protect quorum if some members are down
+	m := s.cluster.Members()
+	active := numConnectedSince(s.r.transport, time.Now().Add(-HealthInterval), s.ID(), m)
+	if (active - 1) < 1+((len(m)-1)/2) {
+		plog.Warningf("reconfigure breaks active quorum, rejecting remove member %s", id)
+		return ErrUnhealthy
+	}
+
+	return nil
+}
+
 func (s *EtcdServer) UpdateMember(ctx context.Context, memb membership.Member) error {
 	b, err := json.Marshal(memb)
 	if err != nil {

+ 11 - 10
etcdserver/util.go

@@ -25,13 +25,7 @@ import (
 // isConnectedToQuorumSince checks whether the local member is connected to the
 // quorum of the cluster since the given time.
 func isConnectedToQuorumSince(transport rafthttp.Transporter, since time.Time, self types.ID, members []*membership.Member) bool {
-	var connectedNum int
-	for _, m := range members {
-		if m.ID == self || isConnectedSince(transport, since, m.ID) {
-			connectedNum++
-		}
-	}
-	return connectedNum >= (len(members)+1)/2
+	return numConnectedSince(transport, since, self, members) >= (len(members)/2)+1
 }
 
 // isConnectedSince checks whether the local member is connected to the
@@ -44,10 +38,17 @@ func isConnectedSince(transport rafthttp.Transporter, since time.Time, remote ty
 // isConnectedFullySince checks whether the local member is connected to all
 // members in the cluster since the given time.
 func isConnectedFullySince(transport rafthttp.Transporter, since time.Time, self types.ID, members []*membership.Member) bool {
+	return numConnectedSince(transport, since, self, members) == len(members)
+}
+
+// numConnectedSince counts how many members are connected to the local member
+// since the given time.
+func numConnectedSince(transport rafthttp.Transporter, since time.Time, self types.ID, members []*membership.Member) int {
+	connectedNum := 0
 	for _, m := range members {
-		if m.ID != self && !isConnectedSince(transport, since, m.ID) {
-			return false
+		if m.ID == self || isConnectedSince(transport, since, m.ID) {
+			connectedNum++
 		}
 	}
-	return true
+	return connectedNum
 }