Browse Source

functional-tester/tester: add randomize delay

Signed-off-by: Gyuho Lee <gyuhox@gmail.com>
Gyuho Lee 7 years ago
parent
commit
ea4effce84

+ 35 - 8
tools/functional-tester/tester/cluster.go

@@ -170,8 +170,11 @@ func newCluster(lg *zap.Logger, fpath string) (*Cluster, error) {
 		}
 		}
 	}
 	}
 
 
-	if clus.Tester.DelayLatencyMs <= clus.Tester.DelayLatencyMsRv {
-		return nil, fmt.Errorf("delay latency %d ms must be greater than delay latency random variable %d ms", clus.Tester.DelayLatencyMs, clus.Tester.DelayLatencyMsRv)
+	if clus.Tester.DelayLatencyMs <= clus.Tester.DelayLatencyMsRv*5 {
+		return nil, fmt.Errorf("delay latency %d ms must be greater than 5x of delay latency random variable %d ms", clus.Tester.DelayLatencyMs, clus.Tester.DelayLatencyMsRv)
+	}
+	if clus.Tester.UpdatedDelayLatencyMs == 0 {
+		clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
 	}
 	}
 
 
 	for _, v := range clus.Tester.FailureCases {
 	for _, v := range clus.Tester.FailureCases {
@@ -303,17 +306,29 @@ func (clus *Cluster) updateFailures() {
 			clus.failures = append(clus.failures, newFailureBlackholePeerPortTxRxAll(clus))
 			clus.failures = append(clus.failures, newFailureBlackholePeerPortTxRxAll(clus))
 
 
 		case "DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER":
 		case "DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER":
-			clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxOneFollower(clus))
+			clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxOneFollower(clus, false))
+		case "RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER":
+			clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxOneFollower(clus, true))
 		case "DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT":
 		case "DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT":
-			clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxOneFollowerUntilTriggerSnapshot())
+			clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxOneFollowerUntilTriggerSnapshot(clus, false))
+		case "RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT":
+			clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxOneFollowerUntilTriggerSnapshot(clus, true))
 		case "DELAY_PEER_PORT_TX_RX_LEADER":
 		case "DELAY_PEER_PORT_TX_RX_LEADER":
-			clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxLeader(clus))
+			clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxLeader(clus, false))
+		case "RANDOM_DELAY_PEER_PORT_TX_RX_LEADER":
+			clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxLeader(clus, true))
 		case "DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT":
 		case "DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT":
-			clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxLeaderUntilTriggerSnapshot())
+			clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxLeaderUntilTriggerSnapshot(clus, false))
+		case "RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT":
+			clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxLeaderUntilTriggerSnapshot(clus, true))
 		case "DELAY_PEER_PORT_TX_RX_QUORUM":
 		case "DELAY_PEER_PORT_TX_RX_QUORUM":
-			clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxQuorum(clus))
+			clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxQuorum(clus, false))
+		case "RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM":
+			clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxQuorum(clus, true))
 		case "DELAY_PEER_PORT_TX_RX_ALL":
 		case "DELAY_PEER_PORT_TX_RX_ALL":
-			clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxAll(clus))
+			clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxAll(clus, false))
+		case "RANDOM_DELAY_PEER_PORT_TX_RX_ALL":
+			clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxAll(clus, true))
 
 
 		case "NO_FAIL_WITH_STRESS":
 		case "NO_FAIL_WITH_STRESS":
 			clus.failures = append(clus.failures, newFailureNoFailWithStress(clus))
 			clus.failures = append(clus.failures, newFailureNoFailWithStress(clus))
@@ -340,6 +355,18 @@ func (clus *Cluster) failureStrings() (fs []string) {
 	return fs
 	return fs
 }
 }
 
 
+// UpdateDelayLatencyMs sets UpdatedDelayLatencyMs to a random value below the first member's
+// election timeout, adding 1.2x DelayLatencyMsRv when the draw does not exceed that margin.
+func (clus *Cluster) UpdateDelayLatencyMs() {
+	rand.Seed(time.Now().UnixNano())
+	clus.Tester.UpdatedDelayLatencyMs = uint32(rand.Int63n(clus.Members[0].Etcd.ElectionTimeoutMs))
+
+	minLatRv := clus.Tester.DelayLatencyMsRv + clus.Tester.DelayLatencyMsRv/5
+	if clus.Tester.UpdatedDelayLatencyMs <= minLatRv {
+		clus.Tester.UpdatedDelayLatencyMs += minLatRv
+	}
+}
+
 func (clus *Cluster) shuffleFailures() {
 func (clus *Cluster) shuffleFailures() {
 	rand.Seed(time.Now().UnixNano())
 	rand.Seed(time.Now().UnixNano())
 	offset := rand.Intn(1000)
 	offset := rand.Intn(1000)

+ 9 - 8
tools/functional-tester/tester/cluster_test.go

@@ -116,14 +116,15 @@ func Test_newCluster(t *testing.T) {
 			},
 			},
 		},
 		},
 		Tester: &rpcpb.Tester{
 		Tester: &rpcpb.Tester{
-			TesterNetwork:    "tcp",
-			TesterAddr:       "127.0.0.1:9028",
-			DelayLatencyMs:   5000,
-			DelayLatencyMsRv: 150,
-			RoundLimit:       1,
-			ExitOnFailure:    true,
-			ConsistencyCheck: true,
-			EnablePprof:      true,
+			TesterNetwork:         "tcp",
+			TesterAddr:            "127.0.0.1:9028",
+			DelayLatencyMs:        5000,
+			DelayLatencyMsRv:      500,
+			UpdatedDelayLatencyMs: 5000,
+			RoundLimit:            1,
+			ExitOnFailure:         true,
+			ConsistencyCheck:      true,
+			EnablePprof:           true,
 			FailureCases: []string{
 			FailureCases: []string{
 				"KILL_ONE_FOLLOWER",
 				"KILL_ONE_FOLLOWER",
 				"KILL_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT",
 				"KILL_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT",

+ 1 - 0
tools/functional-tester/tester/cluster_tester.go

@@ -111,6 +111,7 @@ func (clus *Cluster) doRound() error {
 		"round START",
 		"round START",
 		zap.Int("round", clus.rd),
 		zap.Int("round", clus.rd),
 		zap.Strings("failures", clus.failureStrings()),
 		zap.Strings("failures", clus.failureStrings()),
+		zap.Int("total-failures", len(clus.failures)),
 	)
 	)
 	for i, fa := range clus.failures {
 	for i, fa := range clus.failures {
 		clus.cs = i
 		clus.cs = i

+ 12 - 3
tools/functional-tester/tester/failure.go

@@ -224,9 +224,18 @@ type failureUntilSnapshot struct {
 	Failure
 	Failure
 }
 }
 
 
+// slowCases lists the delay failure cases that are given extra snapshot retries:
+// all delay cases except the ones whose latency is greater than the election
+// timeout (those trigger a leader election and the cluster keeps operating anyway).
 var slowCases = map[rpcpb.FailureCase]bool{
 var slowCases = map[rpcpb.FailureCase]bool{
-	rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT: true,
-	rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT:       true,
+	rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER:                        true,
+	rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT:        true,
+	rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT: true,
+	rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_LEADER:                              true,
+	rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT:              true,
+	rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT:       true,
+	rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM:                              true,
+	rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_ALL:                                 true,
 }
 }
 
 
 func (f *failureUntilSnapshot) Inject(clus *Cluster) error {
 func (f *failureUntilSnapshot) Inject(clus *Cluster) error {
@@ -263,7 +272,7 @@ func (f *failureUntilSnapshot) Inject(clus *Cluster) error {
 	retries := int(snapshotCount) / 1000 * 3
 	retries := int(snapshotCount) / 1000 * 3
 	if v, ok := slowCases[f.FailureCase()]; v && ok {
 	if v, ok := slowCases[f.FailureCase()]; v && ok {
 		// slow network takes more retries
 		// slow network takes more retries
-		retries *= 2
+		retries *= 5
 	}
 	}
 
 
 	for i := 0; i < retries; i++ {
 	for i := 0; i < retries; i++ {

+ 58 - 8
tools/functional-tester/tester/failure_case_network_delay.go

@@ -18,6 +18,8 @@ import (
 	"time"
 	"time"
 
 
 	"github.com/coreos/etcd/tools/functional-tester/rpcpb"
 	"github.com/coreos/etcd/tools/functional-tester/rpcpb"
+
+	"go.uber.org/zap"
 )
 )
 
 
 const (
 const (
@@ -29,6 +31,12 @@ const (
 )
 )
 
 
 func injectDelayPeerPortTxRx(clus *Cluster, idx int) error {
 func injectDelayPeerPortTxRx(clus *Cluster, idx int) error {
+	clus.lg.Info(
+		"injecting delay latency",
+		zap.Duration("latency", time.Duration(clus.Tester.UpdatedDelayLatencyMs)*time.Millisecond),
+		zap.Duration("latency-rv", time.Duration(clus.Tester.DelayLatencyMsRv)*time.Millisecond),
+		zap.String("endpoint", clus.Members[idx].EtcdClientEndpoint),
+	)
 	return clus.sendOperation(idx, rpcpb.Operation_DelayPeerPortTxRx)
 	return clus.sendOperation(idx, rpcpb.Operation_DelayPeerPortTxRx)
 }
 }
 
 
@@ -38,12 +46,19 @@ func recoverDelayPeerPortTxRx(clus *Cluster, idx int) error {
 	return err
 	return err
 }
 }
 
 
-func newFailureDelayPeerPortTxRxOneFollower(clus *Cluster) Failure {
+func newFailureDelayPeerPortTxRxOneFollower(clus *Cluster, random bool) Failure {
 	ff := failureByFunc{
 	ff := failureByFunc{
 		failureCase:   rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER,
 		failureCase:   rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER,
 		injectMember:  injectDelayPeerPortTxRx,
 		injectMember:  injectDelayPeerPortTxRx,
 		recoverMember: recoverDelayPeerPortTxRx,
 		recoverMember: recoverDelayPeerPortTxRx,
 	}
 	}
+
+	clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
+	if random {
+		clus.UpdateDelayLatencyMs()
+		ff.failureCase = rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER
+	}
+
 	f := &failureFollower{ff, -1, -1}
 	f := &failureFollower{ff, -1, -1}
 	return &failureDelay{
 	return &failureDelay{
 		Failure:       f,
 		Failure:       f,
@@ -51,25 +66,39 @@ func newFailureDelayPeerPortTxRxOneFollower(clus *Cluster) Failure {
 	}
 	}
 }
 }
 
 
-func newFailureDelayPeerPortTxRxOneFollowerUntilTriggerSnapshot() Failure {
+func newFailureDelayPeerPortTxRxOneFollowerUntilTriggerSnapshot(clus *Cluster, random bool) Failure {
 	ff := failureByFunc{
 	ff := failureByFunc{
 		failureCase:   rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT,
 		failureCase:   rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT,
 		injectMember:  injectDelayPeerPortTxRx,
 		injectMember:  injectDelayPeerPortTxRx,
 		recoverMember: recoverDelayPeerPortTxRx,
 		recoverMember: recoverDelayPeerPortTxRx,
 	}
 	}
+
+	clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
+	if random {
+		clus.UpdateDelayLatencyMs()
+		ff.failureCase = rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT
+	}
+
 	f := &failureFollower{ff, -1, -1}
 	f := &failureFollower{ff, -1, -1}
 	return &failureUntilSnapshot{
 	return &failureUntilSnapshot{
-		failureCase: rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT,
+		failureCase: ff.failureCase,
 		Failure:     f,
 		Failure:     f,
 	}
 	}
 }
 }
 
 
-func newFailureDelayPeerPortTxRxLeader(clus *Cluster) Failure {
+func newFailureDelayPeerPortTxRxLeader(clus *Cluster, random bool) Failure {
 	ff := failureByFunc{
 	ff := failureByFunc{
 		failureCase:   rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_LEADER,
 		failureCase:   rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_LEADER,
 		injectMember:  injectDelayPeerPortTxRx,
 		injectMember:  injectDelayPeerPortTxRx,
 		recoverMember: recoverDelayPeerPortTxRx,
 		recoverMember: recoverDelayPeerPortTxRx,
 	}
 	}
+
+	clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
+	if random {
+		clus.UpdateDelayLatencyMs()
+		ff.failureCase = rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_LEADER
+	}
+
 	f := &failureLeader{ff, -1, -1}
 	f := &failureLeader{ff, -1, -1}
 	return &failureDelay{
 	return &failureDelay{
 		Failure:       f,
 		Failure:       f,
@@ -77,37 +106,58 @@ func newFailureDelayPeerPortTxRxLeader(clus *Cluster) Failure {
 	}
 	}
 }
 }
 
 
-func newFailureDelayPeerPortTxRxLeaderUntilTriggerSnapshot() Failure {
+func newFailureDelayPeerPortTxRxLeaderUntilTriggerSnapshot(clus *Cluster, random bool) Failure {
 	ff := failureByFunc{
 	ff := failureByFunc{
 		failureCase:   rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT,
 		failureCase:   rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT,
 		injectMember:  injectDelayPeerPortTxRx,
 		injectMember:  injectDelayPeerPortTxRx,
 		recoverMember: recoverDelayPeerPortTxRx,
 		recoverMember: recoverDelayPeerPortTxRx,
 	}
 	}
+
+	clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
+	if random {
+		clus.UpdateDelayLatencyMs()
+		ff.failureCase = rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT
+	}
+
 	f := &failureLeader{ff, -1, -1}
 	f := &failureLeader{ff, -1, -1}
 	return &failureUntilSnapshot{
 	return &failureUntilSnapshot{
-		failureCase: rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT,
+		failureCase: ff.failureCase,
 		Failure:     f,
 		Failure:     f,
 	}
 	}
 }
 }
 
 
-func newFailureDelayPeerPortTxRxQuorum(clus *Cluster) Failure {
+func newFailureDelayPeerPortTxRxQuorum(clus *Cluster, random bool) Failure {
 	f := &failureQuorum{
 	f := &failureQuorum{
 		failureCase:   rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_QUORUM,
 		failureCase:   rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_QUORUM,
 		injectMember:  injectDelayPeerPortTxRx,
 		injectMember:  injectDelayPeerPortTxRx,
 		recoverMember: recoverDelayPeerPortTxRx,
 		recoverMember: recoverDelayPeerPortTxRx,
 	}
 	}
+
+	clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
+	if random {
+		clus.UpdateDelayLatencyMs()
+		f.failureCase = rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM
+	}
+
 	return &failureDelay{
 	return &failureDelay{
 		Failure:       f,
 		Failure:       f,
 		delayDuration: clus.GetFailureDelayDuration(),
 		delayDuration: clus.GetFailureDelayDuration(),
 	}
 	}
 }
 }
 
 
-func newFailureDelayPeerPortTxRxAll(clus *Cluster) Failure {
+func newFailureDelayPeerPortTxRxAll(clus *Cluster, random bool) Failure {
 	f := &failureAll{
 	f := &failureAll{
 		failureCase:   rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_ALL,
 		failureCase:   rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_ALL,
 		injectMember:  injectDelayPeerPortTxRx,
 		injectMember:  injectDelayPeerPortTxRx,
 		recoverMember: recoverDelayPeerPortTxRx,
 		recoverMember: recoverDelayPeerPortTxRx,
 	}
 	}
+
+	clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs
+	if random {
+		clus.UpdateDelayLatencyMs()
+		f.failureCase = rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_ALL
+	}
+
 	return &failureDelay{
 	return &failureDelay{
 		Failure:       f,
 		Failure:       f,
 		delayDuration: clus.GetFailureDelayDuration(),
 		delayDuration: clus.GetFailureDelayDuration(),

+ 1 - 1
tools/functional-tester/tester/local-test.yaml

@@ -84,7 +84,7 @@ tester-config:
 
 
   # slow enough to trigger election
   # slow enough to trigger election
   delay-latency-ms: 5000
   delay-latency-ms: 5000
-  delay-latency-ms-rv: 150
+  delay-latency-ms-rv: 500
 
 
   round-limit: 1
   round-limit: 1
   exit-on-failure: true
   exit-on-failure: true