// rpc.proto
  syntax = "proto3";

  package rpcpb;

  import "github.com/gogo/protobuf/gogoproto/gogo.proto";

  // gogoproto file-level options; each "*_all" option applies to every
  // message declared in this file.
  // Generate Marshal, Size, and Unmarshal methods for all messages.
  option (gogoproto.marshaler_all) = true;
  option (gogoproto.sizer_all) = true;
  option (gogoproto.unmarshaler_all) = true;
  // Do not generate getter methods for message fields.
  option (gogoproto.goproto_getters_all) = false;
  // Request is the message streamed into the Transport RPC. It carries an
  // Operation together with the Member and Tester configuration.
  message Request {
    // Operation is the Operation enum value carried by this request.
    Operation Operation = 1;
    // Member contains the same Member object from tester configuration.
    Member Member = 2;
    // Tester contains tester configuration.
    Tester Tester = 3;
  }
  // Response is the message streamed back from the Transport RPC.
  message Response {
    // Success presumably reports whether the requested operation succeeded.
    // NOTE(review): confirm against the agent implementation.
    bool Success = 1;
    // Status is a status string accompanying Success.
    // NOTE(review): its format is not defined in this file.
    string Status = 2;
    // Member contains the same Member object from tester request.
    Member Member = 3;
  }
  // Transport is the service over which Request and Response messages are
  // exchanged.
  service Transport {
    // Transport is a bidirectional-streaming RPC: it accepts a stream of
    // Request messages and returns a stream of Response messages.
    rpc Transport(stream Request) returns (stream Response) {}
  }
  // Member describes one etcd member managed by an agent: agent addresses,
  // directories, proxy settings, etcd flags, and TLS material.
  // Every field carries a gogoproto "moretags" option that adds a yaml tag.
  message Member {
    // EtcdExecPath is the executable etcd binary path in agent server.
    string EtcdExecPath = 1 [(gogoproto.moretags) = "yaml:\"etcd-exec-path\""];
    // TODO: support embedded etcd

    // AgentAddr is the agent HTTP server address.
    string AgentAddr = 11 [(gogoproto.moretags) = "yaml:\"agent-addr\""];
    // FailpointHTTPAddr is the agent's failpoints HTTP server address.
    string FailpointHTTPAddr = 12 [(gogoproto.moretags) = "yaml:\"failpoint-http-addr\""];

    // BaseDir is the base directory where all logs and etcd data are stored.
    string BaseDir = 101 [(gogoproto.moretags) = "yaml:\"base-dir\""];
    // EtcdLogPath is the log file to store current etcd server logs.
    string EtcdLogPath = 102 [(gogoproto.moretags) = "yaml:\"etcd-log-path\""];

    // EtcdClientProxy is true when client traffic needs to be proxied.
    // If true, listen client URL port must be different than advertise client URL port.
    bool EtcdClientProxy = 201 [(gogoproto.moretags) = "yaml:\"etcd-client-proxy\""];
    // EtcdPeerProxy is true when peer traffic needs to be proxied.
    // If true, listen peer URL port must be different than advertise peer URL port.
    bool EtcdPeerProxy = 202 [(gogoproto.moretags) = "yaml:\"etcd-peer-proxy\""];

    // EtcdClientEndpoint is the etcd client endpoint.
    string EtcdClientEndpoint = 301 [(gogoproto.moretags) = "yaml:\"etcd-client-endpoint\""];
    // Etcd defines etcd binary configuration flags.
    Etcd Etcd = 302 [(gogoproto.moretags) = "yaml:\"etcd\""];

    // ClientCertData contains cert file contents from this member's etcd server.
    string ClientCertData = 401 [(gogoproto.moretags) = "yaml:\"client-cert-data\""];
    // ClientCertPath is presumably the file path paired with ClientCertData.
    // NOTE(review): undocumented in the original; confirm.
    string ClientCertPath = 402 [(gogoproto.moretags) = "yaml:\"client-cert-path\""];
    // ClientKeyData contains key file contents from this member's etcd server.
    string ClientKeyData = 403 [(gogoproto.moretags) = "yaml:\"client-key-data\""];
    // ClientKeyPath is presumably the file path paired with ClientKeyData.
    // NOTE(review): undocumented in the original; confirm.
    string ClientKeyPath = 404 [(gogoproto.moretags) = "yaml:\"client-key-path\""];
    // ClientTrustedCAData contains trusted CA file contents from this member's etcd server.
    string ClientTrustedCAData = 405 [(gogoproto.moretags) = "yaml:\"client-trusted-ca-data\""];
    // ClientTrustedCAPath is presumably the file path paired with
    // ClientTrustedCAData. NOTE(review): undocumented in the original; confirm.
    string ClientTrustedCAPath = 406 [(gogoproto.moretags) = "yaml:\"client-trusted-ca-path\""];

    // PeerCertData contains cert file contents from this member's etcd server.
    string PeerCertData = 501 [(gogoproto.moretags) = "yaml:\"peer-cert-data\""];
    // PeerCertPath is presumably the file path paired with PeerCertData.
    // NOTE(review): undocumented in the original; confirm.
    string PeerCertPath = 502 [(gogoproto.moretags) = "yaml:\"peer-cert-path\""];
    // PeerKeyData contains key file contents from this member's etcd server.
    string PeerKeyData = 503 [(gogoproto.moretags) = "yaml:\"peer-key-data\""];
    // PeerKeyPath is presumably the file path paired with PeerKeyData.
    // NOTE(review): undocumented in the original; confirm.
    string PeerKeyPath = 504 [(gogoproto.moretags) = "yaml:\"peer-key-path\""];
    // PeerTrustedCAData contains trusted CA file contents from this member's etcd server.
    string PeerTrustedCAData = 505 [(gogoproto.moretags) = "yaml:\"peer-trusted-ca-data\""];
    // PeerTrustedCAPath is presumably the file path paired with
    // PeerTrustedCAData. NOTE(review): undocumented in the original; confirm.
    string PeerTrustedCAPath = 506 [(gogoproto.moretags) = "yaml:\"peer-trusted-ca-path\""];
  }
  // Tester holds the tester configuration: network delay settings, round and
  // failure-injection settings, external binaries, and stresser parameters.
  // Every field carries a gogoproto "moretags" option that adds a yaml tag.
  message Tester {
    // NOTE(review): DataDir, Network, and Addr are undocumented in the
    // original; presumably the tester's data directory and its network
    // type/address. Confirm against the tester implementation.
    string DataDir = 1 [(gogoproto.moretags) = "yaml:\"data-dir\""];
    string Network = 2 [(gogoproto.moretags) = "yaml:\"network\""];
    string Addr = 3 [(gogoproto.moretags) = "yaml:\"addr\""];

    // DelayLatencyMs is the delay latency in milliseconds,
    // to inject to simulated slow network.
    uint32 DelayLatencyMs = 11 [(gogoproto.moretags) = "yaml:\"delay-latency-ms\""];
    // DelayLatencyMsRv is the delay latency random variable in milliseconds.
    uint32 DelayLatencyMsRv = 12 [(gogoproto.moretags) = "yaml:\"delay-latency-ms-rv\""];
    // UpdatedDelayLatencyMs is the updated delay latency in milliseconds,
    // to inject to simulated slow network. It's the final latency to apply,
    // in case the latency numbers are randomly generated from given delay latency field.
    uint32 UpdatedDelayLatencyMs = 13 [(gogoproto.moretags) = "yaml:\"updated-delay-latency-ms\""];

    // RoundLimit is the limit of rounds to run failure set (-1 to run without limits).
    int32 RoundLimit = 21 [(gogoproto.moretags) = "yaml:\"round-limit\""];
    // If ExitOnFailure is true, exit tester on first failure.
    bool ExitOnFailure = 22 [(gogoproto.moretags) = "yaml:\"exit-on-failure\""];
    // ConsistencyCheck is true to check consistency (revision, hash).
    bool ConsistencyCheck = 23 [(gogoproto.moretags) = "yaml:\"consistency-check\""];
    // EnablePprof is true to enable profiler.
    bool EnablePprof = 24 [(gogoproto.moretags) = "yaml:\"enable-pprof\""];

    // FailureDelayMs is the delay duration after failure is injected.
    // Useful when triggering snapshot or no-op failure cases.
    uint32 FailureDelayMs = 31 [(gogoproto.moretags) = "yaml:\"failure-delay-ms\""];
    // FailureShuffle is true to randomize failure injecting order.
    bool FailureShuffle = 32 [(gogoproto.moretags) = "yaml:\"failure-shuffle\""];
    // FailureCases is the selected test cases to schedule.
    // If empty, run all failure cases.
    repeated string FailureCases = 33 [(gogoproto.moretags) = "yaml:\"failure-cases\""];
    // FailpointCommands is the list of "gofail" commands
    // (e.g. panic("etcd-tester"), 1*sleep(1000)).
    repeated string FailpointCommands = 34 [(gogoproto.moretags) = "yaml:\"failpoint-commands\""];

    // RunnerExecPath is a path of etcd-runner binary.
    string RunnerExecPath = 41 [(gogoproto.moretags) = "yaml:\"runner-exec-path\""];
    // ExternalExecPath is a path of script for enabling/disabling an external fault injector.
    string ExternalExecPath = 42 [(gogoproto.moretags) = "yaml:\"external-exec-path\""];

    // StressTypes is the list of stresser names:
    // keys, lease, nop, election-runner, watch-runner, lock-racer-runner, lease-runner.
    repeated string StressTypes = 101 [(gogoproto.moretags) = "yaml:\"stress-types\""];
    // StressKeySize is the size of each small key written into etcd.
    int32 StressKeySize = 102 [(gogoproto.moretags) = "yaml:\"stress-key-size\""];
    // StressKeySizeLarge is the size of each large key written into etcd.
    int32 StressKeySizeLarge = 103 [(gogoproto.moretags) = "yaml:\"stress-key-size-large\""];
    // StressKeySuffixRange is the count of key range written into etcd.
    // Stress keys are created with "fmt.Sprintf("foo%016x", rand.Intn(keySuffixRange))".
    int32 StressKeySuffixRange = 104 [(gogoproto.moretags) = "yaml:\"stress-key-suffix-range\""];
    // StressKeySuffixRangeTxn is the count of key range written into etcd txn (max 100).
    // Stress keys are created with "fmt.Sprintf("/k%03d", i)".
    int32 StressKeySuffixRangeTxn = 105 [(gogoproto.moretags) = "yaml:\"stress-key-suffix-range-txn\""];
    // StressKeyTxnOps is the number of operations per a transaction (max 64).
    int32 StressKeyTxnOps = 106 [(gogoproto.moretags) = "yaml:\"stress-key-txn-ops\""];

    // StressClients is the number of concurrent stressing clients
    // with "one" shared TCP connection.
    int32 StressClients = 201 [(gogoproto.moretags) = "yaml:\"stress-clients\""];
    // StressQPS is the maximum number of stresser requests per second.
    int32 StressQPS = 202 [(gogoproto.moretags) = "yaml:\"stress-qps\""];
  }
  // Etcd defines etcd binary configuration flags.
  // Every field carries a gogoproto "moretags" option that adds a yaml tag.
  message Etcd {
    string Name = 1 [(gogoproto.moretags) = "yaml:\"name\""];
    string DataDir = 2 [(gogoproto.moretags) = "yaml:\"data-dir\""];
    string WALDir = 3 [(gogoproto.moretags) = "yaml:\"wal-dir\""];

    // HeartbeatIntervalMs is the time (in milliseconds) of a heartbeat interval.
    // Default value is 100, which is 100ms.
    // Note: the yaml key is "heartbeat-interval", without an "-ms" suffix.
    int64 HeartbeatIntervalMs = 11 [(gogoproto.moretags) = "yaml:\"heartbeat-interval\""];
    // ElectionTimeoutMs is the time (in milliseconds) for an election to timeout.
    // Default value is 1000, which is 1s.
    // Note: the yaml key is "election-timeout", without an "-ms" suffix.
    int64 ElectionTimeoutMs = 12 [(gogoproto.moretags) = "yaml:\"election-timeout\""];

    // Client-facing URLs and TLS settings.
    repeated string ListenClientURLs = 21 [(gogoproto.moretags) = "yaml:\"listen-client-urls\""];
    repeated string AdvertiseClientURLs = 22 [(gogoproto.moretags) = "yaml:\"advertise-client-urls\""];
    bool ClientAutoTLS = 23 [(gogoproto.moretags) = "yaml:\"auto-tls\""];
    bool ClientCertAuth = 24 [(gogoproto.moretags) = "yaml:\"client-cert-auth\""];
    string ClientCertFile = 25 [(gogoproto.moretags) = "yaml:\"cert-file\""];
    string ClientKeyFile = 26 [(gogoproto.moretags) = "yaml:\"key-file\""];
    string ClientTrustedCAFile = 27 [(gogoproto.moretags) = "yaml:\"trusted-ca-file\""];

    // Peer-facing URLs and TLS settings.
    repeated string ListenPeerURLs = 31 [(gogoproto.moretags) = "yaml:\"listen-peer-urls\""];
    repeated string AdvertisePeerURLs = 32 [(gogoproto.moretags) = "yaml:\"initial-advertise-peer-urls\""];
    bool PeerAutoTLS = 33 [(gogoproto.moretags) = "yaml:\"peer-auto-tls\""];
    bool PeerClientCertAuth = 34 [(gogoproto.moretags) = "yaml:\"peer-client-cert-auth\""];
    string PeerCertFile = 35 [(gogoproto.moretags) = "yaml:\"peer-cert-file\""];
    string PeerKeyFile = 36 [(gogoproto.moretags) = "yaml:\"peer-key-file\""];
    string PeerTrustedCAFile = 37 [(gogoproto.moretags) = "yaml:\"peer-trusted-ca-file\""];

    // Initial cluster bootstrap settings.
    string InitialCluster = 41 [(gogoproto.moretags) = "yaml:\"initial-cluster\""];
    string InitialClusterState = 42 [(gogoproto.moretags) = "yaml:\"initial-cluster-state\""];
    string InitialClusterToken = 43 [(gogoproto.moretags) = "yaml:\"initial-cluster-token\""];

    int64 SnapshotCount = 51 [(gogoproto.moretags) = "yaml:\"snapshot-count\""];
    int64 QuotaBackendBytes = 52 [(gogoproto.moretags) = "yaml:\"quota-backend-bytes\""];

    bool PreVote = 63 [(gogoproto.moretags) = "yaml:\"pre-vote\""];
    bool InitialCorruptCheck = 64 [(gogoproto.moretags) = "yaml:\"initial-corrupt-check\""];
  }
  // Operation enumerates the operations that a Request can carry.
  enum Operation {
    // NOT_STARTED is the agent status before etcd first start.
    // It is the zero value, so it is also what an unset Operation reads as.
    NOT_STARTED = 0;

    // INITIAL_START_ETCD is only called to start etcd, the very first time.
    INITIAL_START_ETCD = 10;
    // RESTART_ETCD is sent to restart killed etcd.
    RESTART_ETCD = 11;

    // SIGTERM_ETCD stops etcd process while keeping data directories
    // and previous etcd configurations.
    SIGTERM_ETCD = 20;
    // SIGQUIT_ETCD_AND_REMOVE_DATA kills etcd process and removes all data
    // directories to simulate destroying the whole machine.
    SIGQUIT_ETCD_AND_REMOVE_DATA = 21;

    // SIGQUIT_ETCD_AND_ARCHIVE_DATA is sent when consistency check failed,
    // thus need to archive etcd data directories.
    SIGQUIT_ETCD_AND_ARCHIVE_DATA = 30;
    // SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT destroys etcd process,
    // etcd data, and agent server.
    SIGQUIT_ETCD_AND_REMOVE_DATA_AND_STOP_AGENT = 31;

    // BLACKHOLE_PEER_PORT_TX_RX drops all outgoing/incoming packets from/to
    // the target member's peer port.
    BLACKHOLE_PEER_PORT_TX_RX = 100;
    // UNBLACKHOLE_PEER_PORT_TX_RX removes outgoing/incoming packet dropping.
    UNBLACKHOLE_PEER_PORT_TX_RX = 101;

    // DELAY_PEER_PORT_TX_RX delays all outgoing/incoming packets from/to
    // the target member's peer port.
    DELAY_PEER_PORT_TX_RX = 200;
    // UNDELAY_PEER_PORT_TX_RX removes all outgoing/incoming delays.
    UNDELAY_PEER_PORT_TX_RX = 201;
  }
  // FailureCase defines various system faults in distributed systems,
  // in order to verify correct behavior of etcd servers and clients.
  //
  // NOTE(review): the zero value (SIGTERM_ONE_FOLLOWER) carries real meaning,
  // so in proto3 an unset FailureCase is indistinguishable from it.
  enum FailureCase {
    // SIGTERM_ONE_FOLLOWER stops a randomly chosen follower (non-leader)
    // but does not delete its data directories on disk for next restart.
    // It waits "failure-delay-ms" before recovering this failure.
    // The expected behavior is that the follower comes back online
    // and rejoins the cluster, and then each member continues to process
    // client requests ('Put' request that requires Raft consensus).
    SIGTERM_ONE_FOLLOWER = 0;

    // SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT stops a randomly chosen
    // follower but does not delete its data directories on disk for next
    // restart. And waits until most up-to-date node (leader) applies the
    // snapshot count of entries since the stop operation.
    // The expected behavior is that the follower comes back online and
    // rejoins the cluster, and then active leader sends snapshot
    // to the follower to force it to follow the leader's log.
    // As always, after recovery, each member must be able to process
    // client requests.
    SIGTERM_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 1;

    // SIGTERM_LEADER stops the active leader node but does not delete its
    // data directories on disk for next restart. Then it waits
    // "failure-delay-ms" before recovering this failure, in order to
    // trigger election timeouts.
    // The expected behavior is that a new leader gets elected, and the
    // old leader comes back online and rejoins the cluster as a follower.
    // As always, after recovery, each member must be able to process
    // client requests.
    SIGTERM_LEADER = 2;

    // SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT stops the active leader node
    // but does not delete its data directories on disk for next restart.
    // And waits until most up-to-date node ("new" leader) applies the
    // snapshot count of entries since the stop operation.
    // The expected behavior is that cluster elects a new leader, and the
    // old leader comes back online and rejoins the cluster as a follower.
    // And it receives the snapshot from the new leader to overwrite its
    // store. As always, after recovery, each member must be able to
    // process client requests.
    SIGTERM_LEADER_UNTIL_TRIGGER_SNAPSHOT = 3;

    // SIGTERM_QUORUM stops majority number of nodes to make the whole cluster
    // inoperable but does not delete data directories on stopped nodes
    // for next restart. And it waits "failure-delay-ms" before recovering
    // this failure.
    // The expected behavior is that nodes come back online, thus cluster
    // comes back operative as well. As always, after recovery, each member
    // must be able to process client requests.
    SIGTERM_QUORUM = 4;

    // SIGTERM_ALL stops the whole cluster but does not delete data directories
    // on disk for next restart. And it waits "failure-delay-ms" before
    // recovering this failure.
    // The expected behavior is that nodes come back online, thus cluster
    // comes back operative as well. As always, after recovery, each member
    // must be able to process client requests.
    SIGTERM_ALL = 5;

    // BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER drops all outgoing/incoming
    // packets from/to the peer port on a randomly chosen follower
    // (non-leader), and waits for "failure-delay-ms" until recovery.
    // The expected behavior is that once dropping operation is undone,
    // each member must be able to process client requests.
    BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER = 100;

    // BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT drops
    // all outgoing/incoming packets from/to the peer port on a randomly
    // chosen follower (non-leader), and waits until most up-to-date node
    // (leader) applies the snapshot count of entries since the blackhole
    // operation.
    // The expected behavior is that once packet drop operation is undone,
    // the slow follower tries to catch up, possibly receiving the snapshot
    // from the active leader. As always, after recovery, each member must
    // be able to process client requests.
    BLACKHOLE_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 101;

    // BLACKHOLE_PEER_PORT_TX_RX_LEADER drops all outgoing/incoming packets
    // from/to the peer port on the active leader (isolated), and waits for
    // "failure-delay-ms" until recovery, in order to trigger election timeout.
    // The expected behavior is that after election timeout, a new leader gets
    // elected, and once dropping operation is undone, the old leader comes
    // back and rejoins the cluster as a follower. As always, after recovery,
    // each member must be able to process client requests.
    BLACKHOLE_PEER_PORT_TX_RX_LEADER = 102;

    // BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT drops all
    // outgoing/incoming packets from/to the peer port on the active leader,
    // and waits until most up-to-date node (leader) applies the snapshot
    // count of entries since the blackhole operation.
    // The expected behavior is that cluster elects a new leader, and once
    // dropping operation is undone, the old leader comes back and rejoins
    // the cluster as a follower. The slow follower tries to catch up, likely
    // receiving the snapshot from the new active leader. As always, after
    // recovery, each member must be able to process client requests.
    BLACKHOLE_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT = 103;

    // BLACKHOLE_PEER_PORT_TX_RX_QUORUM drops all outgoing/incoming packets
    // from/to the peer ports on majority nodes of cluster, thus losing its
    // leader and cluster being inoperable. And it waits for "failure-delay-ms"
    // until recovery.
    // The expected behavior is that once packet drop operation is undone,
    // nodes come back online, thus cluster comes back operative. As always,
    // after recovery, each member must be able to process client requests.
    BLACKHOLE_PEER_PORT_TX_RX_QUORUM = 104;

    // BLACKHOLE_PEER_PORT_TX_RX_ALL drops all outgoing/incoming packets
    // from/to the peer ports on all nodes, thus making cluster totally
    // inoperable. It waits for "failure-delay-ms" until recovery.
    // The expected behavior is that once packet drop operation is undone,
    // nodes come back online, thus cluster comes back operative. As always,
    // after recovery, each member must be able to process client requests.
    BLACKHOLE_PEER_PORT_TX_RX_ALL = 105;

    // DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER delays outgoing/incoming packets
    // from/to the peer port on a randomly chosen follower (non-leader).
    // It waits for "failure-delay-ms" until recovery.
    // The expected behavior is that once packet delay operation is undone,
    // the follower comes back and tries to catch up with latest changes from
    // cluster. And as always, after recovery, each member must be able to
    // process client requests.
    DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER = 200;

    // RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER delays outgoing/incoming
    // packets from/to the peer port on a randomly chosen follower
    // (non-leader) with a randomized time duration (thus isolated). It waits
    // for "failure-delay-ms" until recovery.
    // The expected behavior is that once packet delay operation is undone,
    // each member must be able to process client requests.
    RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER = 201;

    // DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT delays
    // outgoing/incoming packets from/to the peer port on a randomly chosen
    // follower (non-leader), and waits until most up-to-date node (leader)
    // applies the snapshot count of entries since the delay operation.
    // The expected behavior is that the delayed follower gets isolated
    // and behind the current active leader, and once delay operation is undone,
    // the slow follower comes back and catches up possibly receiving snapshot
    // from the active leader. As always, after recovery, each member must be
    // able to process client requests.
    DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 202;

    // RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT delays
    // outgoing/incoming packets from/to the peer port on a randomly chosen
    // follower (non-leader) with a randomized time duration, and waits until
    // most up-to-date node (leader) applies the snapshot count of entries
    // since the delay operation.
    // The expected behavior is that the delayed follower gets isolated
    // and behind the current active leader, and once delay operation is undone,
    // the slow follower comes back and catches up, possibly receiving a
    // snapshot from the active leader. As always, after recovery, each member
    // must be able to process client requests.
    RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT = 203;

    // DELAY_PEER_PORT_TX_RX_LEADER delays outgoing/incoming packets from/to
    // the peer port on the active leader. And waits for "failure-delay-ms"
    // until recovery.
    // The expected behavior is that cluster may elect a new leader, and
    // once packet delay operation is undone, the (old) leader comes back
    // and tries to catch up with latest changes from cluster. As always,
    // after recovery, each member must be able to process client requests.
    DELAY_PEER_PORT_TX_RX_LEADER = 204;

    // RANDOM_DELAY_PEER_PORT_TX_RX_LEADER delays outgoing/incoming packets
    // from/to the peer port on the active leader with a randomized time
    // duration. And waits for "failure-delay-ms" until recovery.
    // The expected behavior is that cluster may elect a new leader, and
    // once packet delay operation is undone, the (old) leader comes back
    // and tries to catch up with latest changes from cluster. As always,
    // after recovery, each member must be able to process client requests.
    RANDOM_DELAY_PEER_PORT_TX_RX_LEADER = 205;

    // DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT delays
    // outgoing/incoming packets from/to the peer port on the active leader,
    // and waits until most up-to-date node (current or new leader) applies
    // the snapshot count of entries since the delay operation.
    // The expected behavior is that cluster may elect a new leader, and
    // the old leader gets isolated and behind the current active leader,
    // and once delay operation is undone, the slow follower comes back
    // and catches up, likely receiving a snapshot from the active leader.
    // As always, after recovery, each member must be able to process client
    // requests.
    DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT = 206;

    // RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT delays
    // outgoing/incoming packets from/to the peer port on the active leader,
    // with a randomized time duration. And it waits until most up-to-date
    // node (current or new leader) applies the snapshot count of entries
    // since the delay operation.
    // The expected behavior is that cluster may elect a new leader, and
    // the old leader gets isolated and behind the current active leader,
    // and once delay operation is undone, the slow follower comes back
    // and catches up, likely receiving a snapshot from the active leader.
    // As always, after recovery, each member must be able to process client
    // requests.
    RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT = 207;

    // DELAY_PEER_PORT_TX_RX_QUORUM delays outgoing/incoming packets from/to
    // the peer ports on majority nodes of cluster. And it waits for
    // "failure-delay-ms" until recovery, likely to trigger election timeouts.
    // The expected behavior is that cluster may elect a new leader, while
    // quorum of nodes struggle with slow networks, and once delay operation
    // is undone, nodes come back and cluster comes back operative. As always,
    // after recovery, each member must be able to process client requests.
    DELAY_PEER_PORT_TX_RX_QUORUM = 208;

    // RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM delays outgoing/incoming packets
    // from/to the peer ports on majority nodes of cluster, with randomized
    // time durations. And it waits for "failure-delay-ms" until recovery,
    // likely to trigger election timeouts.
    // The expected behavior is that cluster may elect a new leader, while
    // quorum of nodes struggle with slow networks, and once delay operation
    // is undone, nodes come back and cluster comes back operative. As always,
    // after recovery, each member must be able to process client requests.
    RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM = 209;

    // DELAY_PEER_PORT_TX_RX_ALL delays outgoing/incoming packets from/to the
    // peer ports on all nodes. And it waits for "failure-delay-ms" until
    // recovery, likely to trigger election timeouts.
    // The expected behavior is that cluster may become totally inoperable,
    // struggling with slow networks across the whole cluster. Once delay
    // operation is undone, nodes come back and cluster comes back operative.
    // As always, after recovery, each member must be able to process client
    // requests.
    DELAY_PEER_PORT_TX_RX_ALL = 210;

    // RANDOM_DELAY_PEER_PORT_TX_RX_ALL delays outgoing/incoming packets
    // from/to the peer ports on all nodes, with randomized time durations.
    // And it waits for "failure-delay-ms" until recovery, likely to trigger
    // election timeouts.
    // The expected behavior is that cluster may become totally inoperable,
    // struggling with slow networks across the whole cluster. Once delay
    // operation is undone, nodes come back and cluster comes back operative.
    // As always, after recovery, each member must be able to process client
    // requests.
    RANDOM_DELAY_PEER_PORT_TX_RX_ALL = 211;

    // NO_FAIL_WITH_STRESS runs no-op failure injection that does not do
    // anything against cluster for "failure-delay-ms" duration, while
    // stressers are still sending requests.
    NO_FAIL_WITH_STRESS = 300;

    // NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS runs no-op failure injection
    // that does not do anything against cluster for "failure-delay-ms"
    // duration, while all stressers are stopped.
    NO_FAIL_WITH_NO_STRESS_FOR_LIVENESS = 301;

    // FAILPOINTS injects failpoints to etcd server runtime, triggering panics
    // in critical code paths.
    FAILPOINTS = 400;

    // EXTERNAL runs external failure injection scripts.
    EXTERNAL = 500;
  }
  // StressType enumerates stresser kinds.
  //
  // NOTE(review): the Tester.StressTypes comment lists the stresser names
  // "keys" and "nop", which have no identically named value here (KV is
  // presumably the counterpart of "keys") — confirm the mapping.
  // NOTE(review): the zero value (KV) carries real meaning, so in proto3 an
  // unset StressType is indistinguishable from it.
  enum StressType {
    KV = 0;
    LEASE = 1;
    ELECTION_RUNNER = 2;
    WATCH_RUNNER = 3;
    LOCK_RACER_RUNNER = 4;
    LEASE_RUNNER = 5;
  }