Browse Source

etcdserver: add mayPromote check

宇慕 6 years ago
parent
commit
dfe296ac3c

+ 11 - 17
clientv3/integration/cluster_test.go

@@ -214,14 +214,13 @@ func TestMemberAddForLearner(t *testing.T) {
 	}
 }
 
-func TestMemberPromoteForLearner(t *testing.T) {
-	// TODO test not ready learner promotion.
+func TestMemberPromoteForNotReadyLearner(t *testing.T) {
 	defer testutil.AfterTest(t)
 
-	clus := integration.NewClusterV3(t, &integration.ClusterConfig{Size: 3})
+	clus := integration.NewClusterV3(t, &integration.ClusterConfig{Size: 1})
 	defer clus.Terminate(t)
-	// TODO change the random client to client that talk to leader directly.
-	capi := clus.RandClient()
+	// first client is talked to leader because cluster size is 1
+	capi := clus.Client(0)
 
 	urls := []string{"http://127.0.0.1:1234"}
 	memberAddResp, err := capi.MemberAddAsLearner(context.Background(), urls)
@@ -244,19 +243,14 @@ func TestMemberPromoteForLearner(t *testing.T) {
 		t.Fatalf("Added 1 learner node to cluster, got %d", numberOfLearners)
 	}
 
-	memberPromoteResp, err := capi.MemberPromote(context.Background(), learnerID)
-	if err != nil {
-		t.Fatalf("failed to promote member: %v", err)
-	}
-
-	numberOfLearners = 0
-	for _, m := range memberPromoteResp.Members {
-		if m.IsLearner {
-			numberOfLearners++
-		}
+	// since we do not start learner, learner must be not ready.
+	_, err = capi.MemberPromote(context.Background(), learnerID)
+	expectedErrKeywords := "can only promote a learner member which catches up with leader"
+	if err == nil {
+		t.Fatalf("expecting promote not ready learner to fail, got no error")
 	}
-	if numberOfLearners != 0 {
-		t.Errorf("learner promoted, expect 0 learner, got %d", numberOfLearners)
+	if !strings.Contains(err.Error(), expectedErrKeywords) {
+		t.Errorf("expecting error to contain %s, got %s", expectedErrKeywords, err.Error())
 	}
 }
 

+ 30 - 0
etcdserver/api/membership/cluster.go

@@ -652,6 +652,36 @@ func (c *RaftCluster) IsReadyToRemoveVotingMember(id uint64) bool {
 	return true
 }
 
+func (c *RaftCluster) IsReadyToPromoteMember(id uint64) bool {
+	nmembers := 1
+	nstarted := 0
+
+	for _, member := range c.VotingMembers() {
+		if member.IsStarted() {
+			nstarted++
+		}
+		nmembers++
+	}
+
+	nquorum := nmembers/2 + 1
+	if nstarted < nquorum {
+		if c.lg != nil {
+			c.lg.Warn(
+				"rejecting member promote; started member will be less than quorum",
+				zap.Int("number-of-started-member", nstarted),
+				zap.Int("quorum", nquorum),
+				zap.String("cluster-id", c.cid.String()),
+				zap.String("local-member-id", c.localID.String()),
+			)
+		} else {
+			plog.Warningf("Reject promote member request: the number of started member (%d) will be less than the quorum number of the cluster (%d)", nstarted, nquorum)
+		}
+		return false
+	}
+
+	return true
+}
+
 func membersFromStore(lg *zap.Logger, st v2store.Store) (map[types.ID]*Member, map[types.ID]bool) {
 	members := make(map[types.ID]*Member)
 	removed := make(map[types.ID]bool)

+ 1 - 0
etcdserver/api/v3rpc/util.go

@@ -39,6 +39,7 @@ var toGRPCErrorMap = map[error]error{
 	membership.ErrLearnerNotReady:         rpctypes.ErrGRPCLearnerNotReady,
 	membership.ErrTooManyLearners:         rpctypes.ErrGRPCTooManyLearners,
 	etcdserver.ErrNotEnoughStartedMembers: rpctypes.ErrMemberNotEnoughStarted,
+	etcdserver.ErrLearnerNotReady:         rpctypes.ErrGRPCLearnerNotReady,
 
 	mvcc.ErrCompacted:             rpctypes.ErrGRPCCompacted,
 	mvcc.ErrFutureRev:             rpctypes.ErrGRPCFutureRev,

+ 1 - 0
etcdserver/errors.go

@@ -29,6 +29,7 @@ var (
 	ErrTimeoutLeaderTransfer      = errors.New("etcdserver: request timed out, leader transfer took too long")
 	ErrLeaderChanged              = errors.New("etcdserver: leader changed")
 	ErrNotEnoughStartedMembers    = errors.New("etcdserver: re-configuration failed due to not enough started members")
+	ErrLearnerNotReady            = errors.New("etcdserver: can only promote a learner member which catches up with leader")
 	ErrNoLeader                   = errors.New("etcdserver: no leader")
 	ErrNotLeader                  = errors.New("etcdserver: not leader")
 	ErrRequestTooLarge            = errors.New("etcdserver: request is too large")

+ 59 - 2
etcdserver/server.go

@@ -97,6 +97,8 @@ const (
 	maxPendingRevokes = 16
 
 	recommendedMaxRequestBytes = 10 * 1024 * 1024
+
+	readyPercent = 0.9
 )
 
 var (
@@ -1637,7 +1639,7 @@ func (s *EtcdServer) PromoteMember(ctx context.Context, id uint64) ([]*membershi
 		return nil, err
 	}
 
-	// check if we can promote this learner
+	// check if we can promote this learner.
 	if err := s.mayPromoteMember(types.ID(id)); err != nil {
 		return nil, err
 	}
@@ -1665,10 +1667,65 @@ func (s *EtcdServer) PromoteMember(ctx context.Context, id uint64) ([]*membershi
 }
 
 func (s *EtcdServer) mayPromoteMember(id types.ID) error {
+	err := isLearnerReady(uint64(id))
+	if err != nil {
+		return err
+	}
+
 	if !s.Cfg.StrictReconfigCheck {
 		return nil
 	}
-	// TODO add more checks whether the member can be promoted.
+	if !s.cluster.IsReadyToPromoteMember(uint64(id)) {
+		if lg := s.getLogger(); lg != nil {
+			lg.Warn(
+				"rejecting member promote request; not enough healthy members",
+				zap.String("local-member-id", s.ID().String()),
+				zap.String("requested-member-remove-id", id.String()),
+				zap.Error(ErrNotEnoughStartedMembers),
+			)
+		} else {
+			plog.Warningf("not enough started members, rejecting promote member %s", id)
+		}
+		return ErrNotEnoughStartedMembers
+	}
+
+	return nil
+}
+
+// check whether the learner catches up with leader or not.
+// Note: it will return nil if member is not found in cluster or if member is not learner.
+// These two conditions will be checked before apply phase later.
+func isLearnerReady(id uint64) error {
+	// sanity check, this can happen in the unit test when we do not start node.
+	if raftStatus == nil {
+		return nil
+	}
+	rs := raftStatus()
+
+	// leader's raftStatus.Progress is not nil
+	if rs.Progress == nil {
+		return ErrNotLeader
+	}
+
+	var learnerMatch uint64
+	isFound := false
+	leaderID := rs.ID
+	for memberID, progress := range rs.Progress {
+		if id == memberID {
+			// check its status
+			learnerMatch = progress.Match
+			isFound = true
+			break
+		}
+	}
+
+	if isFound {
+		leaderMatch := rs.Progress[leaderID].Match
+		// the learner's Match not caught up with leader yet
+		if float64(learnerMatch) < float64(leaderMatch)*readyPercent {
+			return ErrLearnerNotReady
+		}
+	}
 
 	return nil
 }