Browse Source

functional/tester: delay after injecting "kill" to trigger election

Signed-off-by: Gyuho Lee <gyuhox@gmail.com>
Gyuho Lee 7 years ago
parent
commit
d8a2d3a209

+ 56 - 28
functional/tester/cluster.go

@@ -143,69 +143,97 @@ func (clus *Cluster) updateFailures() {
 	for _, cs := range clus.Tester.FailureCases {
 		switch cs {
 		case "KILL_ONE_FOLLOWER":
-			clus.failures = append(clus.failures, newFailureKillOneFollower())
+			clus.failures = append(clus.failures,
+				newFailureKillOneFollower(clus))
 		case "KILL_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT":
-			clus.failures = append(clus.failures, newFailureKillOneFollowerUntilTriggerSnapshot())
+			clus.failures = append(clus.failures,
+				newFailureKillOneFollowerUntilTriggerSnapshot(clus))
 		case "KILL_LEADER":
-			clus.failures = append(clus.failures, newFailureKillLeader())
+			clus.failures = append(clus.failures,
+				newFailureKillLeader(clus))
 		case "KILL_LEADER_UNTIL_TRIGGER_SNAPSHOT":
-			clus.failures = append(clus.failures, newFailureKillLeaderUntilTriggerSnapshot())
+			clus.failures = append(clus.failures,
+				newFailureKillLeaderUntilTriggerSnapshot(clus))
 		case "KILL_QUORUM":
-			clus.failures = append(clus.failures, newFailureKillQuorum())
+			clus.failures = append(clus.failures,
+				newFailureKillQuorum(clus))
 		case "KILL_ALL":
-			clus.failures = append(clus.failures, newFailureKillAll())
+			clus.failures = append(clus.failures,
+				newFailureKillAll(clus))
 
 		case "BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER":
-			clus.failures = append(clus.failures, newFailureBlackholePeerPortTxRxOneFollower(clus))
+			clus.failures = append(clus.failures,
+				newFailureBlackholePeerPortTxRxOneFollower(clus))
 		case "BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT":
-			clus.failures = append(clus.failures, newFailureBlackholePeerPortTxRxOneFollowerUntilTriggerSnapshot())
+			clus.failures = append(clus.failures,
+				newFailureBlackholePeerPortTxRxOneFollowerUntilTriggerSnapshot())
 		case "BLACKHOLE_PEER_PORT_TX_RX_LEADER":
-			clus.failures = append(clus.failures, newFailureBlackholePeerPortTxRxLeader(clus))
+			clus.failures = append(clus.failures,
+				newFailureBlackholePeerPortTxRxLeader(clus))
 		case "BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT":
-			clus.failures = append(clus.failures, newFailureBlackholePeerPortTxRxLeaderUntilTriggerSnapshot())
+			clus.failures = append(clus.failures,
+				newFailureBlackholePeerPortTxRxLeaderUntilTriggerSnapshot())
 		case "BLACKHOLE_PEER_PORT_TX_RX_QUORUM":
-			clus.failures = append(clus.failures, newFailureBlackholePeerPortTxRxQuorum(clus))
+			clus.failures = append(clus.failures,
+				newFailureBlackholePeerPortTxRxQuorum(clus))
 		case "BLACKHOLE_PEER_PORT_TX_RX_ALL":
-			clus.failures = append(clus.failures, newFailureBlackholePeerPortTxRxAll(clus))
+			clus.failures = append(clus.failures,
+				newFailureBlackholePeerPortTxRxAll(clus))
 
 		case "DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER":
-			clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxOneFollower(clus, false))
+			clus.failures = append(clus.failures,
+				newFailureDelayPeerPortTxRxOneFollower(clus, false))
 		case "RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER":
-			clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxOneFollower(clus, true))
+			clus.failures = append(clus.failures,
+				newFailureDelayPeerPortTxRxOneFollower(clus, true))
 		case "DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT":
-			clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxOneFollowerUntilTriggerSnapshot(clus, false))
+			clus.failures = append(clus.failures,
+				newFailureDelayPeerPortTxRxOneFollowerUntilTriggerSnapshot(clus, false))
 		case "RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT":
-			clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxOneFollowerUntilTriggerSnapshot(clus, true))
+			clus.failures = append(clus.failures,
+				newFailureDelayPeerPortTxRxOneFollowerUntilTriggerSnapshot(clus, true))
 		case "DELAY_PEER_PORT_TX_RX_LEADER":
-			clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxLeader(clus, false))
+			clus.failures = append(clus.failures,
+				newFailureDelayPeerPortTxRxLeader(clus, false))
 		case "RANDOM_DELAY_PEER_PORT_TX_RX_LEADER":
-			clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxLeader(clus, true))
+			clus.failures = append(clus.failures,
+				newFailureDelayPeerPortTxRxLeader(clus, true))
 		case "DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT":
-			clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxLeaderUntilTriggerSnapshot(clus, false))
+			clus.failures = append(clus.failures,
+				newFailureDelayPeerPortTxRxLeaderUntilTriggerSnapshot(clus, false))
 		case "RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT":
-			clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxLeaderUntilTriggerSnapshot(clus, true))
+			clus.failures = append(clus.failures,
+				newFailureDelayPeerPortTxRxLeaderUntilTriggerSnapshot(clus, true))
 		case "DELAY_PEER_PORT_TX_RX_QUORUM":
-			clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxQuorum(clus, false))
+			clus.failures = append(clus.failures,
+				newFailureDelayPeerPortTxRxQuorum(clus, false))
 		case "RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM":
-			clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxQuorum(clus, true))
+			clus.failures = append(clus.failures,
+				newFailureDelayPeerPortTxRxQuorum(clus, true))
 		case "DELAY_PEER_PORT_TX_RX_ALL":
-			clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxAll(clus, false))
+			clus.failures = append(clus.failures,
+				newFailureDelayPeerPortTxRxAll(clus, false))
 		case "RANDOM_DELAY_PEER_PORT_TX_RX_ALL":
-			clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxAll(clus, true))
+			clus.failures = append(clus.failures,
+				newFailureDelayPeerPortTxRxAll(clus, true))
 
 		case "NO_FAIL_WITH_STRESS":
-			clus.failures = append(clus.failures, newFailureNoFailWithStress(clus))
+			clus.failures = append(clus.failures,
+				newFailureNoFailWithStress(clus))
 		case "NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS":
-			clus.failures = append(clus.failures, newFailureNoFailWithNoStressForLiveness(clus))
+			clus.failures = append(clus.failures,
+				newFailureNoFailWithNoStressForLiveness(clus))
 
 		case "EXTERNAL":
-			clus.failures = append(clus.failures, newFailureExternal(clus.Tester.ExternalExecPath))
+			clus.failures = append(clus.failures,
+				newFailureExternal(clus.Tester.ExternalExecPath))
 		case "FAILPOINTS":
 			fpFailures, fperr := failpointFailures(clus)
 			if len(fpFailures) == 0 {
 				clus.lg.Info("no failpoints found!", zap.Error(fperr))
 			}
-			clus.failures = append(clus.failures, fpFailures...)
+			clus.failures = append(clus.failures,
+				fpFailures...)
 		}
 	}
 }

+ 4 - 0
functional/tester/cluster_read_config.go

@@ -40,6 +40,10 @@ func read(lg *zap.Logger, fpath string) (*Cluster, error) {
 		return nil, err
 	}
 
+	if len(clus.Members) < 3 {
+		return nil, fmt.Errorf("len(clus.Members) expects at least 3, got %d", len(clus.Members))
+	}
+
 	for i, mem := range clus.Members {
 		if mem.BaseDir == "" {
 			return nil, fmt.Errorf("BaseDir cannot be empty (got %q)", mem.BaseDir)

+ 0 - 3
functional/tester/failure.go

@@ -242,9 +242,6 @@ func (f *failureUntilSnapshot) Inject(clus *Cluster) error {
 	if err := f.Failure.Inject(clus); err != nil {
 		return err
 	}
-	if len(clus.Members) < 3 {
-		return nil
-	}
 
 	snapshotCount := clus.Members[0].Etcd.SnapshotCount
 

+ 2 - 2
functional/tester/failure_case_delay.go

@@ -31,9 +31,9 @@ func (f *failureDelay) Inject(clus *Cluster) error {
 	}
 	if f.delayDuration > 0 {
 		clus.lg.Info(
-			"sleeping in failureDelay",
+			"wait after inject",
 			zap.Duration("delay", f.delayDuration),
-			zap.String("case", f.Failure.Desc()),
+			zap.String("desc", f.Failure.Desc()),
 		)
 		time.Sleep(f.delayDuration)
 	}

+ 28 - 12
functional/tester/failure_case_kill.go

@@ -24,50 +24,66 @@ func recoverKill(clus *Cluster, idx int) error {
 	return clus.sendOperation(idx, rpcpb.Operation_RestartEtcd)
 }
 
-func newFailureKillOneFollower() Failure {
+func newFailureKillOneFollower(clus *Cluster) Failure {
 	ff := failureByFunc{
 		failureCase:   rpcpb.FailureCase_KILL_ONE_FOLLOWER,
 		injectMember:  injectKill,
 		recoverMember: recoverKill,
 	}
-	return &failureFollower{ff, -1, -1}
+	f := &failureFollower{ff, -1, -1}
+	return &failureDelay{
+		Failure:       f,
+		delayDuration: clus.GetFailureDelayDuration(),
+	}
 }
 
-func newFailureKillLeader() Failure {
+func newFailureKillLeader(clus *Cluster) Failure {
 	ff := failureByFunc{
 		failureCase:   rpcpb.FailureCase_KILL_LEADER,
 		injectMember:  injectKill,
 		recoverMember: recoverKill,
 	}
-	return &failureLeader{ff, -1, -1}
+	f := &failureLeader{ff, -1, -1}
+	return &failureDelay{
+		Failure:       f,
+		delayDuration: clus.GetFailureDelayDuration(),
+	}
 }
 
-func newFailureKillQuorum() Failure {
-	return &failureQuorum{
+func newFailureKillQuorum(clus *Cluster) Failure {
+	f := &failureQuorum{
 		failureCase:   rpcpb.FailureCase_KILL_QUORUM,
 		injectMember:  injectKill,
 		recoverMember: recoverKill,
 	}
+	return &failureDelay{
+		Failure:       f,
+		delayDuration: clus.GetFailureDelayDuration(),
+	}
 }
 
-func newFailureKillAll() Failure {
-	return &failureAll{
+func newFailureKillAll(clus *Cluster) Failure {
+	f := &failureAll{
 		failureCase:   rpcpb.FailureCase_KILL_ALL,
 		injectMember:  injectKill,
 		recoverMember: recoverKill,
 	}
+	return &failureDelay{
+		Failure:       f,
+		delayDuration: clus.GetFailureDelayDuration(),
+	}
 }
 
-func newFailureKillOneFollowerUntilTriggerSnapshot() Failure {
+func newFailureKillOneFollowerUntilTriggerSnapshot(clus *Cluster) Failure {
 	return &failureUntilSnapshot{
 		failureCase: rpcpb.FailureCase_KILL_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT,
-		Failure:     newFailureKillOneFollower(),
+		Failure:     newFailureKillOneFollower(clus),
 	}
 }
 
-func newFailureKillLeaderUntilTriggerSnapshot() Failure {
+func newFailureKillLeaderUntilTriggerSnapshot(clus *Cluster) Failure {
 	return &failureUntilSnapshot{
 		failureCase: rpcpb.FailureCase_KILL_LEADER_UNTIL_TRIGGER_SNAPSHOT,
-		Failure:     newFailureKillLeader(),
+		Failure:     newFailureKillLeader(clus),
 	}
 }

+ 0 - 12
functional/tester/failure_case_network_delay.go

@@ -52,13 +52,11 @@ func newFailureDelayPeerPortTxRxOneFollower(clus *Cluster, random bool) Failure
 		injectMember:  injectDelayPeerPortTxRx,
 		recoverMember: recoverDelayPeerPortTxRx,
 	}
-
 	clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
 	if random {
 		clus.UpdateDelayLatencyMs()
 		ff.failureCase = rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER
 	}
-
 	f := &failureFollower{ff, -1, -1}
 	return &failureDelay{
 		Failure:       f,
@@ -72,13 +70,11 @@ func newFailureDelayPeerPortTxRxOneFollowerUntilTriggerSnapshot(clus *Cluster, r
 		injectMember:  injectDelayPeerPortTxRx,
 		recoverMember: recoverDelayPeerPortTxRx,
 	}
-
 	clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
 	if random {
 		clus.UpdateDelayLatencyMs()
 		ff.failureCase = rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT
 	}
-
 	f := &failureFollower{ff, -1, -1}
 	return &failureUntilSnapshot{
 		failureCase: ff.failureCase,
@@ -92,13 +88,11 @@ func newFailureDelayPeerPortTxRxLeader(clus *Cluster, random bool) Failure {
 		injectMember:  injectDelayPeerPortTxRx,
 		recoverMember: recoverDelayPeerPortTxRx,
 	}
-
 	clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
 	if random {
 		clus.UpdateDelayLatencyMs()
 		ff.failureCase = rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_LEADER
 	}
-
 	f := &failureLeader{ff, -1, -1}
 	return &failureDelay{
 		Failure:       f,
@@ -112,13 +106,11 @@ func newFailureDelayPeerPortTxRxLeaderUntilTriggerSnapshot(clus *Cluster, random
 		injectMember:  injectDelayPeerPortTxRx,
 		recoverMember: recoverDelayPeerPortTxRx,
 	}
-
 	clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
 	if random {
 		clus.UpdateDelayLatencyMs()
 		ff.failureCase = rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT
 	}
-
 	f := &failureLeader{ff, -1, -1}
 	return &failureUntilSnapshot{
 		failureCase: ff.failureCase,
@@ -132,13 +124,11 @@ func newFailureDelayPeerPortTxRxQuorum(clus *Cluster, random bool) Failure {
 		injectMember:  injectDelayPeerPortTxRx,
 		recoverMember: recoverDelayPeerPortTxRx,
 	}
-
 	clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
 	if random {
 		clus.UpdateDelayLatencyMs()
 		f.failureCase = rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM
 	}
-
 	return &failureDelay{
 		Failure:       f,
 		delayDuration: clus.GetFailureDelayDuration(),
@@ -151,13 +141,11 @@ func newFailureDelayPeerPortTxRxAll(clus *Cluster, random bool) Failure {
 		injectMember:  injectDelayPeerPortTxRx,
 		recoverMember: recoverDelayPeerPortTxRx,
 	}
-
 	clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
 	if random {
 		clus.UpdateDelayLatencyMs()
 		f.failureCase = rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_ALL
 	}
-
 	return &failureDelay{
 		Failure:       f,
 		delayDuration: clus.GetFailureDelayDuration(),