|
|
@@ -52,11 +52,17 @@ const (
|
|
|
// thus need to archive etcd data directories.
|
|
|
Operation_FailArchive Operation = 4
|
|
|
// DestroyEtcdAgent destroys etcd process, etcd data, and agent server.
|
|
|
- Operation_DestroyEtcdAgent Operation = 5
|
|
|
- Operation_BlackholePeerPortTxRx Operation = 100
|
|
|
+ Operation_DestroyEtcdAgent Operation = 5
|
|
|
+ // BlackholePeerPortTxRx drops all outgoing/incoming packets from/to
|
|
|
+ // the target member's peer port.
|
|
|
+ Operation_BlackholePeerPortTxRx Operation = 100
|
|
|
+ // UnblackholePeerPortTxRx removes outgoing/incoming packet dropping.
|
|
|
Operation_UnblackholePeerPortTxRx Operation = 101
|
|
|
- Operation_DelayPeerPortTxRx Operation = 102
|
|
|
- Operation_UndelayPeerPortTxRx Operation = 103
|
|
|
+ // DelayPeerPortTxRx delays all outgoing/incoming packets from/to
|
|
|
+ // the target member's peer port.
|
|
|
+ Operation_DelayPeerPortTxRx Operation = 102
|
|
|
+ // UndelayPeerPortTxRx removes all outgoing/incoming delays.
|
|
|
+ Operation_UndelayPeerPortTxRx Operation = 103
|
|
|
)
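// The blackhole/delay operations above are meant to be paired with their
// "un-" counterparts. Below is a minimal, illustrative sketch of how a caller
// might drive such a pair; the "send" helper and the delay value (standing in
// for "failure-delay-ms") are assumptions for illustration, not part of this
// package's API:
//
//	func blackholeFor(send func(Operation) error, d time.Duration) error {
//		// drop all peer-port traffic on the target member
//		if err := send(Operation_BlackholePeerPortTxRx); err != nil {
//			return err
//		}
//		time.Sleep(d) // wait "failure-delay-ms" before recovery
//		// remove the packet dropping so the member can rejoin the cluster
//		return send(Operation_UnblackholePeerPortTxRx)
//	}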
|
|
|
|
|
|
var Operation_name = map[int32]string{
|
|
|
@@ -89,41 +95,235 @@ func (x Operation) String() string {
|
|
|
}
|
|
|
func (Operation) EnumDescriptor() ([]byte, []int) { return fileDescriptorRpc, []int{0} }
|
|
|
|
|
|
+// FailureCase defines various system faults to be injected into an etcd
|
|
|
+// cluster, in order to verify correct behavior of etcd servers and clients.
|
|
|
type FailureCase int32
|
|
|
|
|
|
const (
|
|
|
- FailureCase_KILL_ONE_FOLLOWER FailureCase = 0
|
|
|
- FailureCase_KILL_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT FailureCase = 1
|
|
|
- FailureCase_KILL_LEADER FailureCase = 2
|
|
|
- FailureCase_KILL_LEADER_UNTIL_TRIGGER_SNAPSHOT FailureCase = 3
|
|
|
- FailureCase_KILL_QUORUM FailureCase = 4
|
|
|
- FailureCase_KILL_ALL FailureCase = 5
|
|
|
- FailureCase_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER FailureCase = 100
|
|
|
- FailureCase_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT FailureCase = 101
|
|
|
- FailureCase_BLACKHOLE_PEER_PORT_TX_RX_LEADER FailureCase = 102
|
|
|
- FailureCase_BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT FailureCase = 103
|
|
|
- FailureCase_BLACKHOLE_PEER_PORT_TX_RX_QUORUM FailureCase = 104
|
|
|
- FailureCase_BLACKHOLE_PEER_PORT_TX_RX_ALL FailureCase = 105
|
|
|
- FailureCase_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER FailureCase = 200
|
|
|
- FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER FailureCase = 201
|
|
|
- FailureCase_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT FailureCase = 202
|
|
|
+ // KILL_ONE_FOLLOWER stops a randomly chosen follower (non-leader)
|
|
|
+ // but does not delete its data directories on disk for next restart.
|
|
|
+ // It waits "failure-delay-ms" before recovering this failure.
|
|
|
+ // The expected behavior is that the follower comes back online
|
|
|
+ // and rejoins the cluster, and then each member continues to process
|
|
|
+ // client requests (e.g. 'Put' requests, which require Raft consensus).
|
|
|
+ FailureCase_KILL_ONE_FOLLOWER FailureCase = 0
|
|
|
+ // KILL_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT stops a randomly chosen
|
|
|
+ // follower but does not delete its data directories on disk for next
|
|
|
+ // restart. It then waits until the most up-to-date node (leader) applies the
|
|
|
+ // snapshot count of entries since the stop operation.
|
|
|
+ // The expected behavior is that the follower comes back online and
|
|
|
+ // rejoins the cluster, and then the active leader sends a snapshot
|
|
|
+ // to the follower to force it to follow the leader's log.
|
|
|
+ // As always, after recovery, each member must be able to process
|
|
|
+ // client requests.
|
|
|
+ FailureCase_KILL_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT FailureCase = 1
|
|
|
+ // KILL_LEADER stops the active leader node but does not delete its
|
|
|
+ // data directories on disk for next restart. Then it waits
|
|
|
+ // "failure-delay-ms" before recovering this failure, in order to
|
|
|
+ // trigger election timeouts.
|
|
|
+ // The expected behavior is that a new leader gets elected, and the
|
|
|
+ // old leader comes back online and rejoins the cluster as a follower.
|
|
|
+ // As always, after recovery, each member must be able to process
|
|
|
+ // client requests.
|
|
|
+ FailureCase_KILL_LEADER FailureCase = 2
|
|
|
+ // KILL_LEADER_UNTIL_TRIGGER_SNAPSHOT stops the active leader node
|
|
|
+ // but does not delete its data directories on disk for next restart.
|
|
|
+ // It then waits until the most up-to-date node ("new" leader) applies the
|
|
|
+ // snapshot count of entries since the stop operation.
|
|
|
+ // The expected behavior is that the cluster elects a new leader, and the
|
|
|
+ // old leader comes back online and rejoins the cluster as a follower.
|
|
|
+ // It then receives a snapshot from the new leader to overwrite its
|
|
|
+ // store. As always, after recovery, each member must be able to
|
|
|
+ // process client requests.
|
|
|
+ FailureCase_KILL_LEADER_UNTIL_TRIGGER_SNAPSHOT FailureCase = 3
|
|
|
+ // KILL_QUORUM stops a majority of the nodes to make the whole cluster
|
|
|
+ // inoperable but does not delete data directories on stopped nodes
|
|
|
+ // for next restart. It then waits "failure-delay-ms" before recovering
|
|
|
+ // this failure.
|
|
|
+ // The expected behavior is that the nodes come back online and the cluster
|
|
|
+ // becomes operative again. As always, after recovery, each member
|
|
|
+ // must be able to process client requests.
|
|
|
+ FailureCase_KILL_QUORUM FailureCase = 4
|
|
|
+ // KILL_ALL stops the whole cluster but does not delete data directories
|
|
|
+ // on disk for next restart. It then waits "failure-delay-ms" before
|
|
|
+ // recovering this failure.
|
|
|
+ // The expected behavior is that the nodes come back online and the cluster
|
|
|
+ // becomes operative again. As always, after recovery, each member
|
|
|
+ // must be able to process client requests.
|
|
|
+ FailureCase_KILL_ALL FailureCase = 5
|
|
|
+ // BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER drops all outgoing/incoming
|
|
|
+ // packets from/to the peer port on a randomly chosen follower
|
|
|
+ // (non-leader), and waits for "failure-delay-ms" until recovery.
|
|
|
+ // The expected behavior is that once the dropping operation is undone,
|
|
|
+ // each member must be able to process client requests.
|
|
|
+ FailureCase_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER FailureCase = 100
|
|
|
+ // BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT drops
|
|
|
+ // all outgoing/incoming packets from/to the peer port on a randomly
|
|
|
+ // chosen follower (non-leader), and waits until the most up-to-date
|
|
|
+ // node (leader) applies the snapshot count of entries since the
|
|
|
+ // blackhole operation.
|
|
|
+ // The expected behavior is that once the packet drop operation is undone,
|
|
|
+ // the slow follower tries to catch up, possibly receiving the snapshot
|
|
|
+ // from the active leader. As always, after recovery, each member must
|
|
|
+ // be able to process client requests.
|
|
|
+ FailureCase_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT FailureCase = 101
|
|
|
+ // BLACKHOLE_PEER_PORT_TX_RX_LEADER drops all outgoing/incoming packets
|
|
|
+ // from/to the peer port on the active leader (isolated), and waits for
|
|
|
+ // "failure-delay-ms" until recovery, in order to trigger election timeout.
|
|
|
+ // The expected behavior is that after election timeout, a new leader gets
|
|
|
+ // elected, and once the dropping operation is undone, the old leader comes
|
|
|
+ // back and rejoins the cluster as a follower. As always, after recovery,
|
|
|
+ // each member must be able to process client requests.
|
|
|
+ FailureCase_BLACKHOLE_PEER_PORT_TX_RX_LEADER FailureCase = 102
|
|
|
+ // BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT drops all
|
|
|
+ // outgoing/incoming packets from/to the peer port on the active leader,
|
|
|
+ // and waits until the most up-to-date node (leader) applies the
|
|
|
+ // snapshot count of entries since the blackhole operation.
|
|
|
+ // The expected behavior is that the cluster elects a new leader, and once
|
|
|
+ // the dropping operation is undone, the old leader comes back and rejoins
|
|
|
+ // the cluster as a follower. The slow follower tries to catch up, likely
|
|
|
+ // receiving the snapshot from the new active leader. As always, after
|
|
|
+ // recovery, each member must be able to process client requests.
|
|
|
+ FailureCase_BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT FailureCase = 103
|
|
|
+ // BLACKHOLE_PEER_PORT_TX_RX_QUORUM drops all outgoing/incoming packets
|
|
|
+ // from/to the peer ports on a majority of the nodes, so the cluster loses
|
|
|
+ // its leader and becomes inoperable. It then waits for "failure-delay-ms"
|
|
|
+ // until recovery.
|
|
|
+ // The expected behavior is that once the packet drop operation is undone,
|
|
|
+ // nodes come back online and the cluster becomes operative again. As always,
|
|
|
+ // after recovery, each member must be able to process client requests.
|
|
|
+ FailureCase_BLACKHOLE_PEER_PORT_TX_RX_QUORUM FailureCase = 104
|
|
|
+ // BLACKHOLE_PEER_PORT_TX_RX_ALL drops all outgoing/incoming packets
|
|
|
+ // from/to the peer ports on all nodes, thus making the cluster totally
|
|
|
+ // inoperable. It waits for "failure-delay-ms" until recovery.
|
|
|
+ // The expected behavior is that once the packet drop operation is undone,
|
|
|
+ // nodes come back online and the cluster becomes operative again. As always,
|
|
|
+ // after recovery, each member must be able to process client requests.
|
|
|
+ FailureCase_BLACKHOLE_PEER_PORT_TX_RX_ALL FailureCase = 105
|
|
|
+ // DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER delays outgoing/incoming packets
|
|
|
+ // from/to the peer port on a randomly chosen follower (non-leader).
|
|
|
+ // It waits for "failure-delay-ms" until recovery.
|
|
|
+ // The expected behavior is that once the packet delay operation is undone,
|
|
|
+ // the follower comes back and tries to catch up with the latest changes
|
|
|
+ // from the cluster. As always, after recovery, each member must be able to
|
|
|
+ // process client requests.
|
|
|
+ FailureCase_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER FailureCase = 200
|
|
|
+ // RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER delays outgoing/incoming
|
|
|
+ // packets from/to the peer port on a randomly chosen follower
|
|
|
+ // (non-leader) with a randomized time duration (thus isolated). It waits
|
|
|
+ // for "failure-delay-ms" until recovery.
|
|
|
+ // The expected behavior is that once the packet delay operation is undone,
|
|
|
+ // each member must be able to process client requests.
|
|
|
+ FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER FailureCase = 201
|
|
|
+ // DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT delays
|
|
|
+ // outgoing/incoming packets from/to the peer port on a randomly chosen
|
|
|
+ // follower (non-leader), and waits until the most up-to-date node (leader)
|
|
|
+ // applies the snapshot count of entries since the delay operation.
|
|
|
+ // The expected behavior is that the delayed follower gets isolated
|
|
|
+ // and falls behind the current active leader, and once the delay operation
|
|
|
+ // is undone, the slow follower comes back and catches up, possibly
|
|
|
+ // receiving a snapshot from the active leader. As always, after recovery,
|
|
|
+ // each member must be able to process client requests.
|
|
|
+ FailureCase_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT FailureCase = 202
|
|
|
+ // RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT delays
|
|
|
+ // outgoing/incoming packets from/to the peer port on a randomly chosen
|
|
|
+ // follower (non-leader) with a randomized time duration, and waits until
|
|
|
+ // the most up-to-date node (leader) applies the snapshot count of entries
|
|
|
+ // since the delay operation.
|
|
|
+ // The expected behavior is that the delayed follower gets isolated
|
|
|
+ // and falls behind the current active leader, and once the delay operation is undone,
|
|
|
+ // the slow follower comes back and catches up, possibly receiving a
|
|
|
+ // snapshot from the active leader. As always, after recovery, each member
|
|
|
+ // must be able to process client requests.
|
|
|
FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT FailureCase = 203
|
|
|
- FailureCase_DELAY_PEER_PORT_TX_RX_LEADER FailureCase = 204
|
|
|
- FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_LEADER FailureCase = 205
|
|
|
- FailureCase_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT FailureCase = 206
|
|
|
- FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT FailureCase = 207
|
|
|
- FailureCase_DELAY_PEER_PORT_TX_RX_QUORUM FailureCase = 208
|
|
|
- FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM FailureCase = 209
|
|
|
- FailureCase_DELAY_PEER_PORT_TX_RX_ALL FailureCase = 210
|
|
|
- FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_ALL FailureCase = 211
|
|
|
- // NO_FAIL_WITH_STRESS runs no-op failure injection for specified period
|
|
|
- // while stressers are still sending requests.
|
|
|
+ // DELAY_PEER_PORT_TX_RX_LEADER delays outgoing/incoming packets from/to
|
|
|
+ // the peer port on the active leader. It then waits for "failure-delay-ms"
|
|
|
+ // until recovery.
|
|
|
+ // The expected behavior is that the cluster may elect a new leader, and
|
|
|
+ // once the packet delay operation is undone, the (old) leader comes back
|
|
|
+ // and tries to catch up with the latest changes from the cluster. As always,
|
|
|
+ // after recovery, each member must be able to process client requests.
|
|
|
+ FailureCase_DELAY_PEER_PORT_TX_RX_LEADER FailureCase = 204
|
|
|
+ // RANDOM_DELAY_PEER_PORT_TX_RX_LEADER delays outgoing/incoming packets
|
|
|
+ // from/to the peer port on the active leader with a randomized time
|
|
|
+ // duration. It then waits for "failure-delay-ms" until recovery.
|
|
|
+ // The expected behavior is that the cluster may elect a new leader, and
|
|
|
+ // once the packet delay operation is undone, the (old) leader comes back
|
|
|
+ // and tries to catch up with the latest changes from the cluster. As always,
|
|
|
+ // after recovery, each member must be able to process client requests.
|
|
|
+ FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_LEADER FailureCase = 205
|
|
|
+ // DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT delays
|
|
|
+ // outgoing/incoming packets from/to the peer port on the active leader,
|
|
|
+ // and waits until the most up-to-date node (current or new leader) applies the
|
|
|
+ // snapshot count of entries since the delay operation.
|
|
|
+ // The expected behavior is that the cluster may elect a new leader, and
|
|
|
+ // the old leader gets isolated and falls behind the current active leader,
|
|
|
+ // and once the delay operation is undone, the slow follower comes back
|
|
|
+ // and catches up, likely receiving a snapshot from the active leader.
|
|
|
+ // As always, after recovery, each member must be able to process client
|
|
|
+ // requests.
|
|
|
+ FailureCase_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT FailureCase = 206
|
|
|
+ // RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT delays
|
|
|
+ // outgoing/incoming packets from/to the peer port on the active leader,
|
|
|
+ // with a randomized time duration. It then waits until the most up-to-date
|
|
|
+ // node (current or new leader) applies the snapshot count of entries
|
|
|
+ // since the delay operation.
|
|
|
+ // The expected behavior is that the cluster may elect a new leader, and
|
|
|
+ // the old leader gets isolated and falls behind the current active leader,
|
|
|
+ // and once the delay operation is undone, the slow follower comes back
|
|
|
+ // and catches up, likely receiving a snapshot from the active leader.
|
|
|
+ // As always, after recovery, each member must be able to process client
|
|
|
+ // requests.
|
|
|
+ FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT FailureCase = 207
|
|
|
+ // DELAY_PEER_PORT_TX_RX_QUORUM delays outgoing/incoming packets from/to
|
|
|
+ // the peer ports on a majority of the nodes. It then waits for
|
|
|
+ // "failure-delay-ms" until recovery, likely to trigger election timeouts.
|
|
|
+ // The expected behavior is that the cluster may elect a new leader while
|
|
|
+ // a quorum of nodes struggles with slow networks, and once the delay operation
|
|
|
+ // is undone, nodes come back and the cluster becomes operative again. As always,
|
|
|
+ // after recovery, each member must be able to process client requests.
|
|
|
+ FailureCase_DELAY_PEER_PORT_TX_RX_QUORUM FailureCase = 208
|
|
|
+ // RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM delays outgoing/incoming packets
|
|
|
+ // from/to the peer ports on a majority of the nodes, with randomized
|
|
|
+ // time durations. It then waits for "failure-delay-ms" until recovery,
|
|
|
+ // likely to trigger election timeouts.
|
|
|
+ // The expected behavior is that the cluster may elect a new leader while
|
|
|
+ // a quorum of nodes struggles with slow networks, and once the delay operation
|
|
|
+ // is undone, nodes come back and the cluster becomes operative again. As always,
|
|
|
+ // after recovery, each member must be able to process client requests.
|
|
|
+ FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM FailureCase = 209
|
|
|
+ // DELAY_PEER_PORT_TX_RX_ALL delays outgoing/incoming packets from/to the
|
|
|
+ // peer ports on all nodes. It then waits for "failure-delay-ms" until
|
|
|
+ // recovery, likely to trigger election timeouts.
|
|
|
+ // The expected behavior is that the cluster may become totally inoperable,
|
|
|
+ // struggling with slow networks across the whole cluster. Once the delay
|
|
|
+ // operation is undone, nodes come back and the cluster becomes operative again.
|
|
|
+ // As always, after recovery, each member must be able to process client
|
|
|
+ // requests.
|
|
|
+ FailureCase_DELAY_PEER_PORT_TX_RX_ALL FailureCase = 210
|
|
|
+ // RANDOM_DELAY_PEER_PORT_TX_RX_ALL delays outgoing/incoming packets
|
|
|
+ // from/to the peer ports on all nodes, with randomized time durations.
|
|
|
+ // It then waits for "failure-delay-ms" until recovery, likely to trigger
|
|
|
+ // election timeouts.
|
|
|
+ // The expected behavior is that the cluster may become totally inoperable,
|
|
|
+ // struggling with slow networks across the whole cluster. Once the delay
|
|
|
+ // operation is undone, nodes come back and the cluster becomes operative again.
|
|
|
+ // As always, after recovery, each member must be able to process client
|
|
|
+ // requests.
|
|
|
+ FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_ALL FailureCase = 211
|
|
|
+ // NO_FAIL_WITH_STRESS runs no-op failure injection that leaves the
|
|
|
+ // cluster untouched for the "failure-delay-ms" duration, while
|
|
|
+ // stressers are still sending requests.
|
|
|
FailureCase_NO_FAIL_WITH_STRESS FailureCase = 300
|
|
|
// NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS runs no-op failure injection
|
|
|
- // with all stressers stopped.
|
|
|
+ // that leaves the cluster untouched for the "failure-delay-ms"
|
|
|
+ // duration, while all stressers are stopped.
|
|
|
FailureCase_NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS FailureCase = 301
|
|
|
- FailureCase_FAILPOINTS FailureCase = 400
|
|
|
- FailureCase_EXTERNAL FailureCase = 500
|
|
|
+ // FAILPOINTS injects failpoints into the etcd server runtime, triggering panics
|
|
|
+ // in critical code paths.
|
|
|
+ FailureCase_FAILPOINTS FailureCase = 400
|
|
|
+ // EXTERNAL runs external failure injection scripts.
|
|
|
+ FailureCase_EXTERNAL FailureCase = 500
|
|
|
)
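// The network-related FailureCase values above map naturally onto the
// blackhole/delay Operation values defined earlier in this file. Below is a
// minimal, illustrative sketch of one possible mapping; the "agents" slice,
// the leader index, and the follower-selection logic are assumptions for
// illustration only, not part of this package's API:
//
//	func inject(fc FailureCase, agents []func(Operation) error, leader int) error {
//		switch fc {
//		case FailureCase_BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER:
//			// pick some non-leader member (a random choice in practice) and
//			// drop all of its peer-port traffic; recovery later sends
//			// Operation_UnblackholePeerPortTxRx
//			follower := (leader + 1) % len(agents)
//			return agents[follower](Operation_BlackholePeerPortTxRx)
//		case FailureCase_DELAY_PEER_PORT_TX_RX_LEADER:
//			// slow down the active leader's peer-port traffic; recovery later
//			// sends Operation_UndelayPeerPortTxRx
//			return agents[leader](Operation_DelayPeerPortTxRx)
//		case FailureCase_DELAY_PEER_PORT_TX_RX_ALL:
//			// delay peer-port traffic on every member
//			for _, send := range agents {
//				if err := send(Operation_DelayPeerPortTxRx); err != nil {
//					return err
//				}
//			}
//		}
//		return nil
//	}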
|
|
|
|
|
|
var FailureCase_name = map[int32]string{
|